Getting Started
#include <taro.hpp>
#include <iostream>

int main() {
  taro::Taro taro{4}; // number of threads
  auto task_a = taro.emplace([](){
    std::cout << "task a\n";
  });
  auto task_b = taro.emplace([](){
    std::cout << "task b\n";
  });
  auto task_c = taro.emplace([](){
    std::cout << "task c\n";
  });
  auto task_d = taro.emplace([](){
    std::cout << "task d\n";
  });

  // dependency
  // A -> C
  // B -> C
  // C -> D
  task_a.precede(task_c);
  task_b.precede(task_c);
  task_c.precede(task_d);

  taro.schedule();
  taro.wait();
}
Taro is a header-only library. To compile the program, clone the Taro project and include taro.hpp:
~$ git clone https://github.com/dian-lun-lin/taro.git
~$ cd taro
~/taro$ g++ -std=c++20 examples/simple.cpp -I. -O2 -o simple
~/taro$ ./simple
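Tasks a and b have no dependency between each other, so their two lines may appear in either order; task c always prints after both, and task d prints last. One possible output:
task a
task b
task c
task d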
However, this example does not showcase the power of Taro. Taro's real strength is enabling multitasking within a task graph. In the following, we use semaphores and CUDA as examples.
#include <taro.hpp>
#include <taro/await/semaphore.hpp>
#include <cstddef>
#include <vector>

void simple(size_t num_threads, size_t num_semaphores) {
  taro::Taro taro{num_threads};
  // create num_semaphores binary semaphores
  auto semaphores = taro.semaphore_await<1>(num_semaphores);
  std::vector<int> ans(num_semaphores, 0);

  for(size_t i = 0; i < num_semaphores; ++i) {
    // incrementer: suspends instead of blocking if semaphore i is taken
    taro.emplace([&, i]() -> taro::Coro {
      co_await semaphores.acquire(i);
      ans[i]++;
      semaphores.release(i);
    });
    // decrementer for the same counter
    taro.emplace([&, i]() -> taro::Coro {
      co_await semaphores.acquire(i);
      ans[i]--;
      semaphores.release(i);
    });
  }

  taro.schedule();
  taro.wait();
}
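The function above has no entry point; a minimal driver (the thread and semaphore counts here are arbitrary) could be:
int main() {
  simple(4, 100); // 4 worker threads, 100 counters
}
After taro.wait() returns, every ans[i] is back to 0: each counter is incremented exactly once and decremented exactly once, and its binary semaphore serializes the two accesses.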
Traditionally, a CPU thread that fails to acquire a semaphore must block until another thread releases it. In Taro, the thread instead multitasks: the coroutine suspends at the co_await and the thread switches to other available tasks.
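For contrast, the same increment written against plain C++20 std::binary_semaphore (standard C++, not Taro code) pins the whole OS thread while it waits:
#include <semaphore>

std::binary_semaphore sem{1};
int counter = 0;

void increment() {
  sem.acquire(); // blocks the entire OS thread until release()
  ++counter;
  sem.release();
}
To compile the Taro code, you need C++20 for C++ Coroutine support.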
~$ git clone https://github.com/dian-lun-lin/taro.git
~$ cd taro
~/taro$ g++ -std=c++20 examples/semaphore.cpp -I. -O2 -o semaphore
~/taro$ ./semaphore
Taro incorporates CUDA's asynchronous model with C++ Coroutines. We offer three methods to offload GPU operations: until_polling, until_callback, and wait. wait blocks the calling worker thread until the enqueued GPU work completes, while until_polling and until_callback suspend the calling coroutine so the thread can run other tasks, resuming it by polling the stream or through a CUDA completion callback, respectively.
#include <taro.hpp>
#include <taro/await/cuda.hpp>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  taro::Taro taro{4};             // number of threads
  auto cuda = taro.cuda_await(4); // number of cuda streams

  int* d_a;
  size_t N{100};
  std::vector<int> h_a(N * N);
  std::vector<int> h_b(N * N);

  auto task_a = taro.emplace([&]() {
    std::iota(h_a.begin(), h_a.end(), 0);
    std::cout << "task a\n";
  });

  // malloc; d_a is captured by reference so the allocation is visible to later tasks
  auto task_b = taro.emplace([&]() {
    cuda.wait([&](cudaStream_t stream) { // wait method
      cudaMallocAsync((void**)&d_a, N * N * sizeof(int), stream);
    });
    std::cout << "task b uses wait method\n";
  });

  // H2D
  auto task_c = taro.emplace([&]() -> taro::Coro {
    co_await cuda.until_polling([&](cudaStream_t stream) { // polling method
      cudaMemcpyAsync(d_a, h_a.data(), N * N * sizeof(int), cudaMemcpyHostToDevice, stream);
    });
    std::cout << "task c uses polling method\n";
  });

  // D2H and free
  auto task_d = taro.emplace([&]() -> taro::Coro {
    co_await cuda.until_callback([&](cudaStream_t stream) { // callback method
      cudaMemcpyAsync(h_b.data(), d_a, N * N * sizeof(int), cudaMemcpyDeviceToHost, stream);
      cudaFreeAsync(d_a, stream);
    });
    std::cout << "task d uses callback method\n";
  });

  // dependency
  // A -> C
  // B -> C
  // C -> D
  task_a.precede(task_c);
  task_b.precede(task_c);
  task_c.precede(task_d);

  taro.schedule();
  taro.wait();
  return 0;
}
After a CPU thread offloads GPU operations using until_callback or until_polling, it switches to other available tasks instead of idling until the GPU finishes.
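A kernel launch fits the same pattern. The sketch below is our own illustration rather than part of the Taro repository: a hypothetical scale kernel doubles each element of d_a between the H2D copy (task c) and the D2H copy (task d).
__global__ void scale(int* data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n) {
    data[i] *= 2;
  }
}

// inside main(), alongside the other tasks:
auto task_k = taro.emplace([&]() -> taro::Coro {
  unsigned block = 256;
  unsigned grid = (N * N + block - 1) / block;
  co_await cuda.until_polling([&](cudaStream_t stream) { // suspend until the kernel finishes
    scale<<<grid, block, 0, stream>>>(d_a, N * N);
  });
  std::cout << "task k\n";
});
task_c.precede(task_k); // H2D before kernel
task_k.precede(task_d); // kernel before D2H
The existing C -> D edge remains; it is simply redundant once task k sits between them.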
To compile the code, you need CUDA v12+ and C++20 for C++ Coroutine support.
~$ git clone https://github.com/dian-lun-lin/taro.git
~$ cd taro
~/taro$ nvcc -std=c++20 examples/simple.cu -I. -O2 -o simple
~/taro$ ./simple