Skip to content

Getting Started

Dian-Lun Lin (Luan) edited this page Dec 18, 2023 · 9 revisions

Write Your First Taro Program

#include <taro.hpp>
#include <iostream>
int main() {
  // A scheduler backed by four worker threads.
  taro::Taro taro{4};

  // Four tasks; each one just prints its own label.
  auto task_a = taro.emplace([]() { std::cout << "task a\n"; });
  auto task_b = taro.emplace([]() { std::cout << "task b\n"; });
  auto task_c = taro.emplace([]() { std::cout << "task c\n"; });
  auto task_d = taro.emplace([]() { std::cout << "task d\n"; });

  // Wire up the graph:
  //   A -> C, B -> C  (C waits for both A and B)
  //   C -> D          (D runs last)
  task_a.precede(task_c);
  task_b.precede(task_c);
  task_c.precede(task_d);

  // Launch the graph and block until every task has finished.
  taro.schedule();
  taro.wait();
}

Taro is a header-only library. To compile the program, clone the Taro project and include taro.hpp.

~$ git clone https://github.com/dian-lun-lin/taro.git  
~$ cd taro
~/taro$ g++ -std=c++20 examples/simple.cpp -I. -O2 -o simple
~/taro$ ./simple

However, this does not showcase the power of Taro. Taro's real strength is that it enables multitasking within a task graph. In the following, we use semaphores and CUDA as examples.

Semaphore

#include <taro.hpp>
#include <taro/await/semaphore.hpp>
#include <vector>
#include <algorithm>


// Demonstrates coroutine-based semaphore waiting: for each semaphore there is
// one task that increments and one that decrements the matching counter, so
// every entry of `ans` ends at 0 once the graph completes. A task that cannot
// acquire its semaphore suspends (co_await) instead of blocking the thread.
void simple(size_t num_threads, size_t num_semaphores) {
  taro::Taro taro{num_threads};
  // Binary semaphores (initial count 1), one per counter slot.
  auto semaphores = taro.semaphore_await<1>(num_semaphores);

  std::vector<int> ans(num_semaphores, 0);

  for(size_t idx = 0; idx < num_semaphores; ++idx) {
    // Incrementer for slot idx.
    taro.emplace([&, idx]() -> taro::Coro {
      co_await semaphores.acquire(idx);
      ++ans[idx];
      semaphores.release(idx);
    });

    // Decrementer for the same slot.
    taro.emplace([&, idx]() -> taro::Coro {
      co_await semaphores.acquire(idx);
      --ans[idx];
      semaphores.release(idx);
    });
  }

  taro.schedule();
  taro.wait();
}

Traditionally, a CPU thread that fails to acquire a semaphore must block until other threads release the semaphore. However, in Taro, we enable the CPU thread to multitask and switch to other available tasks instead of blocking. To compile the code, you need C++20 for C++ Coroutine support.

~$ git clone https://github.com/dian-lun-lin/taro.git  
~$ cd taro
~/taro$ g++ -std=c++20 examples/semaphore.cpp -I. -O2 -o semaphore
~/taro$ ./semaphore

CUDA

Taro incorporates CUDA's asynchronous model with C++ Coroutine. We offer three methods to offload GPU operations: until_polling, until_callback, and wait.

#include <iostream>
#include <numeric>
#include <vector>

#include <taro.hpp>
#include <taro/await/cuda.hpp>

int main() {
  taro::Taro taro{4}; // number of threads
  auto cuda = taro.cuda_await(4); // number of cuda streams
  int* d_a;
  size_t N{100};
  size_t BLOCK_SIZE{128};

  std::vector<int> h_a(N * N);
  std::vector<int> h_b(N * N);
  // NOTE(review): dim_grid/dim_block are unused in this example (no kernel
  // launch is shown). A 128x128 block would also exceed CUDA's limit of
  // 1024 threads per block — choose a smaller 2D block (e.g. 32x32) before
  // launching a real kernel.
  dim3 dim_grid((N - 1) / BLOCK_SIZE + 1, (N - 1) / BLOCK_SIZE + 1, 1);
  dim3 dim_block(BLOCK_SIZE, BLOCK_SIZE, 1);

  // Fill the host input buffer on a plain CPU task.
  auto task_a = taro.emplace([&]() {
    std::iota(h_a.begin(), h_a.end(), 0);
    std::cout << "task a\n";
  });

  // malloc — d_a MUST be captured by reference: with [=] the allocation
  // would land in the lambda's private copy and the outer d_a (read by
  // tasks c and d) would stay uninitialized.
  auto task_b = taro.emplace([=, &d_a, &cuda]() {
    cuda.wait([&](cudaStream_t stream) {  // wait method: blocks this thread
      cudaMallocAsync((void**)&d_a, N * N * sizeof(int), stream);
    });
    std::cout << "task b use wait method\n";
  });

  // H2D copy — capture d_a by reference so we see the pointer task_b set.
  // The inner lambda may copy it ([=]) because the coroutine body only runs
  // after task_b completed (A -> C, B -> C dependencies below).
  auto task_c = taro.emplace([=, &d_a, &h_a, &cuda]() -> taro::Coro {
    co_await cuda.until_polling([=, &h_a](cudaStream_t stream) { // polling method
      cudaMemcpyAsync(d_a, h_a.data(), N * N * sizeof(int), cudaMemcpyHostToDevice, stream);
    });
    std::cout << "task c use polling method\n";
  });

  // D2H copy and free — same reference-capture of d_a as task_c.
  auto task_d = taro.emplace([=, &d_a, &h_b, &cuda]() -> taro::Coro {
    co_await cuda.until_callback([=, &h_b](cudaStream_t stream) {   // callback method
      cudaMemcpyAsync(h_b.data(), d_a, N * N * sizeof(int), cudaMemcpyDeviceToHost, stream);
      cudaFreeAsync(d_a, stream);
    });
    std::cout << "task d use callback method\n";
  });

  // dependency
  // A -> C
  // B -> C
  // C -> D
  task_a.precede(task_c);
  task_b.precede(task_c);
  task_c.precede(task_d);

  taro.schedule();
  taro.wait();

  return 0;
}

After a CPU thread offloads GPU operations using until_callback or until_polling, it will multitask to other available tasks.

To compile the code, you need CUDA v12+ and C++20 for C++ Coroutine support.

~$ git clone https://github.com/dian-lun-lin/taro.git  
~$ cd taro
~/taro$ nvcc -std=c++20 examples/simple.cu -I. -O2 -o simple
~/taro$ ./simple

Introduction


Documentation


Problems with Taro

TBD

Release Schedule

TBD

Clone this wiki locally