From b83c049e9f7b58e5b1d4c2844ccf5948ca0d92c7 Mon Sep 17 00:00:00 2001
From: Josh Romero <romero.josh@gmail.com>
Date: Tue, 30 Aug 2022 14:02:42 -0700
Subject: [PATCH 01/35] Pass custom relative_peak_threshold and
 min_separation_angle to GPUTracker.

---
 cuslines/cuslines.cpp                 |  9 +++++
 cuslines/generate_streamlines_cuda.cu | 52 ++++++++++++++++++++++-----
 cuslines/generate_streamlines_cuda.h  |  1 +
 run_dipy_cpu_hardi.py                 | 37 +++++++++++++------
 run_dipy_gpu_hardi.py                 | 36 ++++++++++++-------
 5 files changed, 103 insertions(+), 32 deletions(-)
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index c190019..b38262f 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -64,6 +64,8 @@ class GPUTracker {
                double min_signal,
                double tc_threshold,
                double step_size,
+               double relative_peak_thresh,
+               double min_separation_angle,
                np_array_cast dataf,
                np_array H,
                np_array R,
@@ -125,6 +127,8 @@ class GPUTracker {
       min_signal_ = min_signal;
       tc_threshold_ = tc_threshold;
       step_size_ = step_size;
+      relative_peak_thresh_ = relative_peak_thresh,
+      min_separation_angle_ = min_separation_angle,
 
       // Allocate/copy constant problem data on GPUs
       dataf_d.resize(ngpus_, nullptr);
@@ -221,6 +225,7 @@ class GPUTracker {
 
       // Call GPU routine
       generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
+                                     relative_peak_thresh_, min_separation_angle_,
                                      nseeds, seeds_d,
                                      dimx_, dimy_, dimz_, dimt_,
                                      dataf_d, H_d, R_d, delta_nr_, delta_b_d, delta_q_d, b0s_mask_d, metric_map_d, samplm_nr_, sampling_matrix_d,
@@ -285,6 +290,8 @@ class GPUTracker {
     double tc_threshold_;
     double min_signal_;
     double step_size_;
+    double relative_peak_thresh_;
+    double min_separation_angle_;
 
     std::vector<int> nSlines_old_;
     std::vector<REAL*> slines_;
@@ -309,6 +316,7 @@ class GPUTracker {
 PYBIND11_MODULE(cuslines, m) {
   py::class_<GPUTracker>(m, "GPUTracker")
     .def(py::init<double, double, double, double,
+                  double, double,
 		  np_array_cast, np_array,
                   np_array, np_array,
                   np_array, np_array_int,
@@ -321,6 +329,7 @@ PYBIND11_MODULE(cuslines, m) {
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
+                  py::arg().noconvert(), py::arg().noconvert(),
                   py::arg("ngpus") = 1, py::arg("rng_seed") = 0,
                   py::arg("rng_offset") = 0)
 
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 700860a..a270e31 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -56,13 +56,23 @@
 
 using namespace cuwsort;
 
-//#define USE_FIXED_PERMUTATION
+#define USE_FIXED_PERMUTATION
 #ifdef USE_FIXED_PERMUTATION
-__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
-                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
-                                    13,  8,  9, 20, 51, 16, 51,  5, 15, 47,  0, 18, 35, 
-                                    24, 49, 51, 29, 19, 19, 14, 39, 32,  1,  9, 32, 31,
-                                    10, 52, 23};
+//__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
+//                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
+//                                    13,  8,  9, 20, 51, 16, 51,  5, 15, 47,  0, 18, 35, 
+//                                    24, 49, 51, 29, 19, 19, 14, 39, 32,  1,  9, 32, 31,
+//                                    10, 52, 23};
+__device__ const int fixedPerm[] = {
+  47, 117,  67, 103,   9,  21,  36,  87,  70,  88, 140,  58,  39,  87,  88,  81,  25,  77,
+  72,   9, 148, 115,  79,  82,  99,  29, 147, 147, 142,  32,   9, 127,  32,  31, 114,  28,
+  34, 128, 128,  53, 133,  38,  17,  79, 132, 105,  42,  31, 120,   1,  65,  57,  35, 102,
+ 119,  11,  82,  91, 128, 142,  99,  53, 140, 121,  84,  68,   6,  47, 127, 131, 100,  78,
+ 143, 148,  23, 141, 117,  85,  48,  49,  69,  95,  94,   0, 113,  36,  48,  93, 131,  98,
+  42, 112, 149, 127,   0, 138, 114,  43, 127,  23, 130, 121,  98,  62, 123,  82, 148,  50,
+  14,  41,  58,  36,  10,  86,  43, 104,  11,   2,  51,  80,  32, 128,  38,  19,  42, 115,
+  77,  30,  24, 125,   2,   3,  94, 107,  13, 112,  40,  72,  19,  95,  72,  67,  61,  14,
+  96,   4, 139,  86, 121, 109};
 #endif
 
 
@@ -480,8 +490,8 @@ __device__ int peak_directions_d(const REAL_T  *__restrict__ odf,
                                  const int num_edges,
 				 int samplm_nr,
 				 int *__restrict__ __shInd,
-                                 const REAL_T relative_peak_thres=static_cast<REAL_T>(0.5),
-                                 const REAL_T min_separation_angle=static_cast<REAL_T>(0.4363323129985824)) { // 20 degrees in rads
+				 const REAL_T relative_peak_thres,
+				 const REAL_T min_separation_angle) {
 
         const int tidx = threadIdx.x;
 
@@ -808,6 +818,8 @@ template<int BDIM_X,
 __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
                                const REAL_T max_angle,
 			       const REAL_T min_signal,
+			       const REAL_T relative_peak_thres,
+			       const REAL_T min_separation_angle,
 			       REAL3_T dir,
                                const int dimx,
                                const int dimy,
@@ -901,6 +913,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
                                 if (j+tidx < hr_side) {
 #ifdef USE_FIXED_PERMUTATION
                                         const int srcPermInd = fixedPerm[j+tidx];
+                                         printf("srcPermInd: %d\n", srcPermInd);
 #else
                                         const int srcPermInd = curand(st) % hr_side;
 //                                        if (srcPermInd < 0 || srcPermInd >= hr_side) {
@@ -1015,7 +1028,9 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
                                                            sphere_edges,
                                                            num_edges,
 							   samplm_nr,
-							   reinterpret_cast<int *>(__r_sh)); // reuse __r_sh as shInd in func which is large enough
+							   reinterpret_cast<int *>(__r_sh), // reuse __r_sh as shInd in func which is large enough
+							   relative_peak_thres,
+							   min_separation_angle);
                 if (NATTEMPTS == 1) { // init=True...
                         return ndir; // and dirs;
                 } else { // init=False...
@@ -1083,6 +1098,8 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 			 const REAL_T min_signal,
 			 const REAL_T tc_threshold,
 			 const REAL_T step_size,
+			 const REAL_T relative_peak_thres,
+			 const REAL_T min_separation_angle,
                          REAL3_T seed,
                          REAL3_T first_step,
                          REAL3_T voxel_size,
@@ -1144,6 +1161,8 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                            5>(st,
 					      max_angle,
 					      min_signal,
+					      relative_peak_thres,
+					      min_separation_angle,
                                               direction,
                                               dimx, dimy, dimz, dimt, dataf,
                                               b0s_mask /* !dwi_mask */,
@@ -1210,6 +1229,8 @@ template<int BDIM_X,
          typename REAL3_T>
 __global__ void getNumStreamlines_k(const REAL_T max_angle,
 				    const REAL_T min_signal,
+				    const REAL_T relative_peak_thres,
+				    const REAL_T min_separation_angle,
 				    const long long rndSeed,
                                     const int rndOffset,
                                     const int nseed,
@@ -1264,6 +1285,8 @@ __global__ void getNumStreamlines_k(const REAL_T max_angle,
                                    1>(&st,
 				      max_angle,
 				      min_signal,
+				      relative_peak_thres,
+				      min_separation_angle,
                                       MAKE_REAL3(0,0,0),
                                       dimx, dimy, dimz, dimt, dataf,
                                       b0s_mask /* !dwi_mask */,
@@ -1304,6 +1327,8 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
 				      const REAL_T min_signal,
 				      const REAL_T tc_threshold,
 				      const REAL_T step_size,
+				      const REAL_T relative_peak_thres,
+				      const REAL_T min_separation_angle,
 				      const long long rndSeed,
                                       const int rndOffset,
                                       const int nseed,
@@ -1386,6 +1411,8 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
 						            min_signal,
 							    tc_threshold,
 							    step_size,
+							    relative_peak_thres,
+							    min_separation_angle,
                                                             seed,
                                                             MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
                                                             MAKE_REAL3(1, 1, 1),
@@ -1423,6 +1450,8 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
 						            min_signal,
 							    tc_threshold,
 							    step_size,
+							    relative_peak_thres,
+							    min_separation_angle,
                                                             seed,
                                                             first_step,
                                                             MAKE_REAL3(1, 1, 1),
@@ -1466,6 +1495,7 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
 }
 
 void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+                                    const REAL relative_peak_thresh, const REAL min_separation_angle,
                                     const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
@@ -1517,6 +1547,8 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
                         THR_X_BL/THR_X_SL>
                         <<<grid, block, shSizeGNS>>>(max_angle,
 						     min_signal,
+						     relative_peak_thresh,
+						     min_separation_angle,
 						     rng_seed,
 						     rng_offset + n*nseeds_per_gpu,
 						     nseeds_gpu,
@@ -1626,6 +1658,8 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
 								   min_signal,
 								   tc_threshold,
 								   step_size,
+								   relative_peak_thresh,
+								   min_separation_angle,
 								   rng_seed,
 								   rng_offset + n*nseeds_per_gpu,
 								   nseeds_gpu,
diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
index 96a60e0..4ad90d7 100644
--- a/cuslines/generate_streamlines_cuda.h
+++ b/cuslines/generate_streamlines_cuda.h
@@ -34,6 +34,7 @@
 #include "globals.h"
 
 void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+                                    const REAL relative_peak_thresh, const REAL min_separation_angle,
 				    const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
index dabbc63..1437884 100644
--- a/run_dipy_cpu_hardi.py
+++ b/run_dipy_cpu_hardi.py
@@ -32,6 +32,7 @@
 import time
 
 import numpy as np
+import numpy.linalg as npl
 
 import dipy.reconst.dti as dti
 from dipy.io import read_bvals_bvecs
@@ -41,18 +42,19 @@
 from dipy.core.gradients import gradient_table
 from dipy.data import small_sphere
 from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
+from dipy.reconst.shm import OpdtModel, CsaOdfModel
 from dipy.tracking.local_tracking import LocalTracking
 from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
 from dipy.reconst import shm
 from dipy.data import get_fnames
 from dipy.data import read_stanford_pve_maps
+from dipy.tracking.streamline import transform_streamlines
 
 import nibabel as nib
 from nibabel.orientations import aff2axcodes
 
 # Import custom module
-import cuslines.cuslines as cuslines
+#import cuslines.cuslines as cuslines
 
 t0 = time.time()
 
@@ -75,6 +77,13 @@ def get_img(ep2_seq):
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
 parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
 parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
+parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
+parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
+parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
+parser.add_argument("--min_separation-angle",type=float,default=45,help="min separation angle (in degrees)")
+parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
+parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
+
 args = parser.parse_args()
 
 # Get Stanford HARDI data
@@ -84,6 +93,7 @@ def get_img(ep2_seq):
 
 img = get_img(hardi_nifti_fname)
 voxel_order = "".join(aff2axcodes(img.affine))
+img_affine = img.affine
 
 gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
 #roi = get_img(hardi_nifti_fname)
@@ -100,22 +110,22 @@ def get_img(ep2_seq):
 FA[np.isnan(FA)] = 0
 
 # Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
+tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
 metric_map = np.asarray(FA, 'float64')
 
 # Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
+#seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
+seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
+#seed_mask = seed_mask[1000:args.nseeds]
+seed_mask = seed_mask[500:501]
 
 # Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+#model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
+model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
 
 # Setup direction getter args
 print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-
+boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 
 print('streamline gen')
 global_chunk_size = args.chunk_size
@@ -127,6 +137,7 @@ def get_img(ep2_seq):
 for idx in range(int(nchunks)):
   # Main streamline computation
   ts = time.time()
+  #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=img_affine, step_size=.5)
   streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=.5)
   streamlines = [s for s in streamline_generator]
   te = time.time()
@@ -141,6 +152,12 @@ def get_img(ep2_seq):
     ts = time.time()
     #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
     #save_tractogram(fname, streamlines)
+    #sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
+    #print(seed_mask)
+    #print(streamlines)
+
+    #inv_affine = npl.inv(img_affine)
+    #streamlines = transform_streamlines(streamlines,inv_affine)
     sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
     save_tractogram(sft, fname)
     te = time.time()
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
index 9386631..832fbfb 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_dipy_gpu_hardi.py
@@ -76,10 +76,15 @@ def get_img(ep2_seq):
 parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
 parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
 parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
+parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
 parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
 parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
+parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
+parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
+parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
+parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
+parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
+parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
 args = parser.parse_args()
 
 # Get Stanford HARDI data
@@ -108,21 +113,22 @@ def get_img(ep2_seq):
 FA[np.isnan(FA)] = 0
 
 # Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
+#tissue_classifier = ThresholdTissueClassifier(FA, args.fa_threshold)
 metric_map = np.asarray(FA, 'float64')
 
 # Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
+#seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
+seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
+#seed_mask = seed_mask[0:args.nseeds]
+seed_mask = seed_mask[500:501]
 
 # Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+#model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
+model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
 
 # Setup direction getter args
 print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 b0s_mask = gtab.b0s_mask
 dwi_mask = ~b0s_mask
 
@@ -134,14 +140,14 @@ def get_img(ep2_seq):
 sphere = small_sphere
 theta = sphere.theta
 phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
 
 ## from BootPmfGen __init__
 # setup H and R matrices
 # TODO: figure out how to get H, R matrices from direction getter object
 x, y, z = model.gtab.gradients[dwi_mask].T
 r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
 H = shm.hat(B)
 R = shm.lcr_matrix(H)
 
@@ -154,10 +160,12 @@ def get_img(ep2_seq):
 
 #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
 
-gpu_tracker = cuslines.GPUTracker(args.max_angle,
+gpu_tracker = cuslines.GPUTracker(args.max_angle * np.pi/180,
                                   args.min_signal,
-                                  args.tc_threshold,
+                                  args.fa_threshold,
                                   args.step_size,
+                                  args.relative_peak_threshold,
+                                  args.min_separation_angle * np.pi/180,
                                   dataf, H, R, delta_b, delta_q,
                                   b0s_mask.astype(np.int32), metric_map, sampling_matrix,
                                   sphere.vertices, sphere.edges.astype(np.int32),
@@ -177,6 +185,8 @@ def get_img(ep2_seq):
 
   # Save tracklines file
   if args.output_prefix:
+    #print(seed_mask)
+    #print(streamlines)
     if args.use_fast_write:
       prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
       ts = time.time()

From 3a57a3af954400029210ea1ffcd5d53afea4a033 Mon Sep 17 00:00:00 2001
From: Josh Romero <romero.josh@gmail.com>
Date: Tue, 30 Aug 2022 14:05:56 -0700
Subject: [PATCH 02/35] Adjust some script arguments.

---
 run_dipy_cpu_hardi.py | 17 ++++++++++-------
 run_dipy_gpu_hardi.py |  8 ++++----
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
index 1437884..a2d8d5d 100644
--- a/run_dipy_cpu_hardi.py
+++ b/run_dipy_cpu_hardi.py
@@ -77,6 +77,9 @@ def get_img(ep2_seq):
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
 parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
 parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
+parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
+parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
+parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
 parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
 parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
 parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
@@ -120,12 +123,12 @@ def get_img(ep2_seq):
 seed_mask = seed_mask[500:501]
 
 # Setup model
-#model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
-model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
+#model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
 
 # Setup direction getter args
 print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 
 print('streamline gen')
 global_chunk_size = args.chunk_size
@@ -137,8 +140,8 @@ def get_img(ep2_seq):
 for idx in range(int(nchunks)):
   # Main streamline computation
   ts = time.time()
-  #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=img_affine, step_size=.5)
-  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=.5)
+  #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=img_affine, step_size=args.step_size)
+  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
   streamlines = [s for s in streamline_generator]
   te = time.time()
   streamline_time += (te-ts)
@@ -153,8 +156,8 @@ def get_img(ep2_seq):
     #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
     #save_tractogram(fname, streamlines)
     #sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-    #print(seed_mask)
-    #print(streamlines)
+    print(seed_mask)
+    print(streamlines)
 
     #inv_affine = npl.inv(img_affine)
     #streamlines = transform_streamlines(streamlines,inv_affine)
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
index 832fbfb..3e333b3 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_dipy_gpu_hardi.py
@@ -128,7 +128,7 @@ def get_img(ep2_seq):
 
 # Setup direction getter args
 print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 b0s_mask = gtab.b0s_mask
 dwi_mask = ~b0s_mask
 
@@ -158,7 +158,7 @@ def get_img(ep2_seq):
 global_chunk_size = args.chunk_size * args.ngpus
 nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
 
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
+#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
 
 gpu_tracker = cuslines.GPUTracker(args.max_angle * np.pi/180,
                                   args.min_signal,
@@ -185,8 +185,8 @@ def get_img(ep2_seq):
 
   # Save tracklines file
   if args.output_prefix:
-    #print(seed_mask)
-    #print(streamlines)
+    print(seed_mask)
+    print(streamlines)
     if args.use_fast_write:
       prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
       ts = time.time()

From 0fbca110c3a4d1d8e66e8c42713ec6224c6618ce Mon Sep 17 00:00:00 2001
From: Josh Romero <romero.josh@gmail.com>
Date: Tue, 30 Aug 2022 16:34:25 -0700
Subject: [PATCH 03/35] Initial draft commit of CsaOdf changes for GPU. Changes
 to sample scripts to enable usage.

---
 cuslines/cuslines.cpp                 | 20 ++++--
 cuslines/generate_streamlines_cuda.cu | 96 ++++++++++++++++++++++-----
 cuslines/generate_streamlines_cuda.h  |  2 +-
 run_dipy_cpu.py                       | 40 ++++++++---
 run_dipy_cpu_hardi.py                 | 18 +++--
 run_dipy_gpu.py                       | 50 +++++++++-----
 run_dipy_gpu_hardi.py                 | 31 ++++++---
 7 files changed, 190 insertions(+), 67 deletions(-)

diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index b38262f..2a9c58b 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -58,9 +58,15 @@ py::capsule cleanup(T* ptr) {
          });
 }
 
+enum ModelType {
+  OPDT = 0,
+  CSAODF = 1,
+};
+
 class GPUTracker {
   public:
-    GPUTracker(double max_angle,
+    GPUTracker(ModelType model_type,
+               double max_angle,
                double min_signal,
                double tc_threshold,
                double step_size,
@@ -123,6 +129,7 @@ class GPUTracker {
       std::cerr << "Creating GPUTracker with " << ngpus << " GPUs..." << std::endl;
       ngpus_ = ngpus;
 
+      model_type_ = model_type;
       max_angle_ = max_angle;
       min_signal_ = min_signal;
       tc_threshold_ = tc_threshold;
@@ -224,7 +231,7 @@ class GPUTracker {
       std::vector<int> nSlines(ngpus_);
 
       // Call GPU routine
-      generate_streamlines_cuda_mgpu(max_angle_, min_signal_, tc_threshold_, step_size_,
+      generate_streamlines_cuda_mgpu(model_type_, max_angle_, min_signal_, tc_threshold_, step_size_,
                                      relative_peak_thresh_, min_separation_angle_,
                                      nseeds, seeds_d,
                                      dimx_, dimy_, dimz_, dimt_,
@@ -286,6 +293,7 @@ class GPUTracker {
     int nedges_;
     int delta_nr_, samplm_nr_;
 
+    ModelType model_type_;
     double max_angle_;
     double tc_threshold_;
     double min_signal_;
@@ -314,8 +322,12 @@ class GPUTracker {
 
 
 PYBIND11_MODULE(cuslines, m) {
+  py::enum_<ModelType>(m, "ModelType")
+    .value("OPDT", OPDT)
+    .value("CSAODF", CSAODF);
+
   py::class_<GPUTracker>(m, "GPUTracker")
-    .def(py::init<double, double, double, double,
+    .def(py::init<ModelType, double, double, double, double,
                   double, double,
 		  np_array_cast, np_array,
                   np_array, np_array,
@@ -323,7 +335,7 @@ PYBIND11_MODULE(cuslines, m) {
                   np_array, np_array,
                   np_array, np_array_int,
                   int, int, int>(),
-                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
+                  py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index a270e31..e9dbb6a 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -56,7 +56,7 @@
 
 using namespace cuwsort;
 
-#define USE_FIXED_PERMUTATION
+//#define USE_FIXED_PERMUTATION
 #ifdef USE_FIXED_PERMUTATION
 //__device__ const int fixedPerm[] = {44, 47, 53,  0,  3,  3, 39,  9, 19, 21, 50, 36, 23,
 //                                     6, 24, 24, 12,  1, 38, 39, 23, 46, 24, 17, 37, 25, 
@@ -368,6 +368,52 @@ __device__ void ndotp_log_d(const int N,
         return;
 }
 
+template<int BDIM_X,
+	 typename VAL_T>
+__device__ void ndotp_log_csa_d(const int N,
+				const int M,
+				const VAL_T *__restrict__ srcV,
+				const VAL_T *__restrict__ srcM,
+				VAL_T *__restrict__ dstV) {
+
+	const int tidx = threadIdx.x;
+
+	const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+	const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+	// Clamp values
+	constexpr VAL_T min = .001;
+	constexpr VAL_T max = .999;
+
+	//#pragma unroll
+	for(int i = 0; i < N; i++) {
+
+		VAL_T __tmp = 0;
+
+		//#pragma unroll
+		for(int j = 0; j < M; j += BDIM_X) {
+			if (j+tidx < M) {
+				const VAL_T v = MIN(MAX(srcV[j+tidx], min), max);
+				__tmp += LOG(-LOG(v)) * srcM[i*M + j+tidx];
+			}
+		}
+		#pragma unroll
+		for(int j = BDIM_X/2; j; j /= 2) {
+#if 0
+			__tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
+#else
+			__tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
+#endif
+		}
+		// values could be held by BDIM_X threads and written
+		// together every BDIM_X iterations...
+
+		if (tidx == 0) {
+			dstV[i] = __tmp;
+		}
+	}
+	return;
+}
+
 template<int BDIM_X,
          typename LEN_T,
          typename VAL_T>
@@ -816,6 +862,7 @@ template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
 __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
+			       const int model_type,
                                const REAL_T max_angle,
 			       const REAL_T min_signal,
 			       const REAL_T relative_peak_thres,
@@ -913,7 +960,6 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
                                 if (j+tidx < hr_side) {
 #ifdef USE_FIXED_PERMUTATION
                                         const int srcPermInd = fixedPerm[j+tidx];
-                                         printf("srcPermInd: %d\n", srcPermInd);
 #else
                                         const int srcPermInd = curand(st) % hr_side;
 //                                        if (srcPermInd < 0 || srcPermInd >= hr_side) {
@@ -962,16 +1008,25 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 			maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 			__syncwarp(WMASK);
 
-                        ndotp_log_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
-                        ndotp_d    <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
-
-                        __syncwarp(WMASK);
+			if (model_type == 0) {
+				ndotp_log_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
+				ndotp_d    <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
+				__syncwarp(WMASK);
+				#pragma unroll
+				for(int j = tidx; j < delta_nr; j += BDIM_X) {
+					__r_sh[j] -= __h_sh[j];
+				}
+				__syncwarp(WMASK);
+			} else if (model_type == 1) {
+				constexpr REAL_T _n0_const = 0.28209479177387814; // .5 / sqrt(pi)
+				ndotp_log_csa_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
+				__syncwarp(WMASK);
+				if (tidx == 0) {
+					__r_sh[0] = _n0_const;
+				}
+				__syncwarp(WMASK);
+			}
 
-                        #pragma unroll
-                        for(int j = tidx; j < delta_nr; j += BDIM_X) {
-				__r_sh[j] -= __h_sh[j];
-                        }
-                        __syncwarp(WMASK);
                         // __r_sh[tidy] <- python 'coef'
 
                         ndotp_d<BDIM_X>(samplm_nr, delta_nr, __r_sh, sampling_matrix, __h_sh);
@@ -1094,6 +1149,7 @@ template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
 __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
+			 const int model_type,
 			 const REAL_T max_angle,
 			 const REAL_T min_signal,
 			 const REAL_T tc_threshold,
@@ -1159,6 +1215,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                 int ndir = get_direction_d<BDIM_X,
                                            BDIM_Y,
                                            5>(st,
+					      model_type,
 					      max_angle,
 					      min_signal,
 					      relative_peak_thres,
@@ -1227,7 +1284,8 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void getNumStreamlines_k(const REAL_T max_angle,
+__global__ void getNumStreamlines_k(const int model_type,
+                                    const REAL_T max_angle,
 				    const REAL_T min_signal,
 				    const REAL_T relative_peak_thres,
 				    const REAL_T min_separation_angle,
@@ -1283,6 +1341,7 @@ __global__ void getNumStreamlines_k(const REAL_T max_angle,
         int ndir = get_direction_d<BDIM_X,
                                    BDIM_Y,
                                    1>(&st,
+				      model_type,
 				      max_angle,
 				      min_signal,
 				      relative_peak_thres,
@@ -1323,7 +1382,8 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void genStreamlinesMerge_k(const REAL_T max_angle,
+__global__ void genStreamlinesMerge_k(const int model_type,
+				      const REAL_T max_angle,
 				      const REAL_T min_signal,
 				      const REAL_T tc_threshold,
 				      const REAL_T step_size,
@@ -1407,6 +1467,7 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
                 int stepsB;
                 const int tissue_classB = tracker_d<BDIM_X,
                                                     BDIM_Y>(&st,
+							    model_type,
 						            max_angle,
 						            min_signal,
 							    tc_threshold,
@@ -1446,6 +1507,7 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
                                                     BDIM_Y>(&st,
+							    model_type,
 						            max_angle,
 						            min_signal,
 							    tc_threshold,
@@ -1494,7 +1556,7 @@ __global__ void genStreamlinesMerge_k(const REAL_T max_angle,
         return;
 }
 
-void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+void generate_streamlines_cuda_mgpu(const int model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
                                     const REAL relative_peak_thresh, const REAL min_separation_angle,
                                     const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
@@ -1545,7 +1607,8 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
     // Precompute number of streamlines before allocating memory
     getNumStreamlines_k<THR_X_SL,
                         THR_X_BL/THR_X_SL>
-                        <<<grid, block, shSizeGNS>>>(max_angle,
+                        <<<grid, block, shSizeGNS>>>(model_type,
+                                                     max_angle,
 						     min_signal,
 						     relative_peak_thresh,
 						     min_separation_angle,
@@ -1654,7 +1717,8 @@ void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal,
     //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
     genStreamlinesMerge_k<THR_X_SL,
                           THR_X_BL/THR_X_SL>
-                          <<<grid, block, shSizeGNS, streams[n]>>>(max_angle,
+                          <<<grid, block, shSizeGNS, streams[n]>>>(model_type,
+								   max_angle,
 								   min_signal,
 								   tc_threshold,
 								   step_size,
diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
index 4ad90d7..ba3c239 100644
--- a/cuslines/generate_streamlines_cuda.h
+++ b/cuslines/generate_streamlines_cuda.h
@@ -33,7 +33,7 @@
 
 #include "globals.h"
 
-void generate_streamlines_cuda_mgpu(const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+void generate_streamlines_cuda_mgpu(const int model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
                                     const REAL relative_peak_thresh, const REAL min_separation_angle,
 				    const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
diff --git a/run_dipy_cpu.py b/run_dipy_cpu.py
index 9b32d98..67cdc3b 100644
--- a/run_dipy_cpu.py
+++ b/run_dipy_cpu.py
@@ -32,6 +32,7 @@
 import time
 
 import numpy as np
+import numpy.linalg as npl
 
 import dipy.reconst.dti as dti
 from dipy.io import read_bvals_bvecs
@@ -41,7 +42,7 @@
 from dipy.core.gradients import gradient_table
 from dipy.data import small_sphere
 from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
+from dipy.reconst.shm import OpdtModel, CsaOdfModel
 from dipy.tracking.local_tracking import LocalTracking
 from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
 from dipy.reconst import shm
@@ -50,7 +51,7 @@
 from nibabel.orientations import aff2axcodes
 
 # Import custom module
-import cuslines.cuslines as cuslines
+#import cuslines.cuslines as cuslines
 
 t0 = time.time()
 
@@ -76,7 +77,19 @@ def get_img(ep2_seq):
 parser.add_argument("mask_nifti", help="path to the mask file")
 parser.add_argument("roi_nifti", help="path to the ROI file")
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
+parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
+parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
+parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
+parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
+parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
+parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
+parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
+parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
+parser.add_argument("--min_separation-angle",type=float,default=45,help="min separation angle (in degrees)")
+parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
+parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
+parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
+
 args = parser.parse_args()
 
 img = get_img(args.nifti_file)
@@ -96,33 +109,38 @@ def get_img(ep2_seq):
 FA[np.isnan(FA)] = 0
 
 # Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
+tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
 metric_map = np.asarray(FA, 'float64')
 
 # Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
+#seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
+seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
+seed_mask = seed_mask[0:args.nseeds]
 
 # Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+if args.model == "opdt":
+  print("Running OPDT model...")
+  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+else:
+  print("Running CSAODF model...")
+  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
 
 # Setup direction getter args
 print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 
 print('streamline gen')
 global_chunk_size = args.chunk_size
 nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
 
-streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-
 t1 = time.time()
 streamline_time = 0
 io_time = 0
 for idx in range(int(nchunks)):
   # Main streamline computation
   ts = time.time()
+  #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=img_affine, step_size=args.step_size)
+  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
   streamlines = [s for s in streamline_generator]
   te = time.time()
   streamline_time += (te-ts)
diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
index a2d8d5d..2e2b0b6 100644
--- a/run_dipy_cpu_hardi.py
+++ b/run_dipy_cpu_hardi.py
@@ -83,9 +83,10 @@ def get_img(ep2_seq):
 parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
 parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
 parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
-parser.add_argument("--min_separation-angle",type=float,default=45,help="min separation angle (in degrees)")
+parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
 parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
 parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
+parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
 
 args = parser.parse_args()
 
@@ -119,12 +120,15 @@ def get_img(ep2_seq):
 # Create seeds for ROI
 #seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
 seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-#seed_mask = seed_mask[1000:args.nseeds]
-seed_mask = seed_mask[500:501]
+seed_mask = seed_mask[0:args.nseeds]
 
 # Setup model
-#model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+if args.model == "opdt":
+  print("Running OPDT model...")
+  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+else:
+  print("Running CSAODF model...")
+  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
 
 # Setup direction getter args
 print('Bootstrap direction getter')
@@ -156,8 +160,8 @@ def get_img(ep2_seq):
     #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
     #save_tractogram(fname, streamlines)
     #sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-    print(seed_mask)
-    print(streamlines)
+    #print(seed_mask)
+    #print(streamlines)
 
     #inv_affine = npl.inv(img_affine)
     #streamlines = transform_streamlines(streamlines,inv_affine)
diff --git a/run_dipy_gpu.py b/run_dipy_gpu.py
index ce109b9..26dbab4 100644
--- a/run_dipy_gpu.py
+++ b/run_dipy_gpu.py
@@ -43,7 +43,7 @@
 from dipy.core.gradients import gradient_table
 from dipy.data import small_sphere
 #from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
+from dipy.reconst.shm import OpdtModel, CsaOdfModel
 #from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
 from dipy.reconst import shm
 
@@ -79,13 +79,18 @@ def get_img(ep2_seq):
 parser.add_argument("--roi_nifti", default=None, help="path to the ROI file")
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
 parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
 parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
 parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
+parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
 parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
 parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
+parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
+parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
+parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
+parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
+parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
+parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
+parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
 args = parser.parse_args()
 
 img = get_img(args.nifti_file)
@@ -122,7 +127,7 @@ def get_img(ep2_seq):
         np.save(args.fa_numpy, FA)
 
 # Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
+#tissue_classifier = ThresholdTissueClassifier(FA, args.fa_threshold)
 metric_map = np.asarray(FA, 'float64')
 
 # resample roi if necessary
@@ -137,32 +142,39 @@ def get_img(ep2_seq):
 seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
 
 # Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+if args.model == "opdt":
+  print("Running OPDT model...")
+  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+  fit_matrix = model._fit_matrix
+  delta_b, delta_q = fit_matrix
+else:
+  print("Running CSAODF model...")
+  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+  fit_matrix = model._fit_matrix
+  # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for
+  # now.
+  delta_b = fit_matrix
+  delta_q = fit_matrix
 
 # Setup direction getter args
 print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 b0s_mask = gtab.b0s_mask
 dwi_mask = ~b0s_mask
 
-# get fit_matrix from model
-fit_matrix = model._fit_matrix
-delta_b, delta_q = fit_matrix
 
 # setup sampling matrix
 sphere = small_sphere
 theta = sphere.theta
 phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
 
 ## from BootPmfGen __init__
 # setup H and R matrices
 # TODO: figure out how to get H, R matrices from direction getter object
 x, y, z = model.gtab.gradients[dwi_mask].T
 r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
 H = shm.hat(B)
 R = shm.lcr_matrix(H)
 
@@ -173,11 +185,15 @@ def get_img(ep2_seq):
 global_chunk_size = args.chunk_size * args.ngpus
 nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
 
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-gpu_tracker = cuslines.GPUTracker(args.max_angle,
+#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
+
+gpu_tracker = cuslines.GPUTracker(cuslines.ModelType.OPDT if args.model == "opdt" else cuslines.ModelType.CSAODF,
+                                  args.max_angle * np.pi/180,
                                   args.min_signal,
-                                  args.tc_threshold,
+                                  args.fa_threshold,
                                   args.step_size,
+                                  args.relative_peak_threshold,
+                                  args.min_separation_angle * np.pi/180,
                                   dataf, H, R, delta_b, delta_q,
                                   b0s_mask.astype(np.int32), metric_map, sampling_matrix,
                                   sphere.vertices, sphere.edges.astype(np.int32),
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
index 3e333b3..863887d 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_dipy_gpu_hardi.py
@@ -41,7 +41,7 @@
 from dipy.core.gradients import gradient_table
 from dipy.data import small_sphere
 #from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
+from dipy.reconst.shm import OpdtModel, CsaOdfModel
 #from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
 from dipy.reconst import shm
 from dipy.data import get_fnames
@@ -85,6 +85,7 @@ def get_img(ep2_seq):
 parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
 parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
 parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
+parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
 args = parser.parse_args()
 
 # Get Stanford HARDI data
@@ -119,12 +120,22 @@ def get_img(ep2_seq):
 # Create seeds for ROI
 #seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
 seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-#seed_mask = seed_mask[0:args.nseeds]
-seed_mask = seed_mask[500:501]
+seed_mask = seed_mask[0:args.nseeds]
 
 # Setup model
-#model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
-model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=1)
+if args.model == "opdt":
+  print("Running OPDT model...")
+  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+  fit_matrix = model._fit_matrix
+  delta_b, delta_q = fit_matrix
+else:
+  print("Running CSAODF model...")
+  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
+  fit_matrix = model._fit_matrix
+  # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for
+  # now.
+  delta_b = fit_matrix
+  delta_q = fit_matrix
 
 # Setup direction getter args
 print('Bootstrap direction getter')
@@ -132,9 +143,6 @@ def get_img(ep2_seq):
 b0s_mask = gtab.b0s_mask
 dwi_mask = ~b0s_mask
 
-# get fit_matrix from model
-fit_matrix = model._fit_matrix
-delta_b, delta_q = fit_matrix
 
 # setup sampling matrix
 sphere = small_sphere
@@ -160,7 +168,8 @@ def get_img(ep2_seq):
 
 #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
 
-gpu_tracker = cuslines.GPUTracker(args.max_angle * np.pi/180,
+gpu_tracker = cuslines.GPUTracker(cuslines.ModelType.OPDT if args.model == "opdt" else cuslines.ModelType.CSAODF,
+                                  args.max_angle * np.pi/180,
                                   args.min_signal,
                                   args.fa_threshold,
                                   args.step_size,
@@ -185,8 +194,8 @@ def get_img(ep2_seq):
 
   # Save tracklines file
   if args.output_prefix:
-    print(seed_mask)
-    print(streamlines)
+    #print(seed_mask)
+    #print(streamlines)
     if args.use_fast_write:
       prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
       ts = time.time()

From 055982d2d3f6a0c382711888d95600ff7c250db9 Mon Sep 17 00:00:00 2001
From: Josh Romero <romero.josh@gmail.com>
Date: Tue, 7 Feb 2023 16:00:58 -0800
Subject: [PATCH 04/35] Debugging. Disable permutation in GPU implementation.

---
 cuslines/generate_streamlines_cuda.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index e9dbb6a..e0836f7 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -961,12 +961,13 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 #ifdef USE_FIXED_PERMUTATION
                                         const int srcPermInd = fixedPerm[j+tidx];
 #else
-                                        const int srcPermInd = curand(st) % hr_side;
+//                                        const int srcPermInd = curand(st) % hr_side;
 //                                        if (srcPermInd < 0 || srcPermInd >= hr_side) {
 //                                                printf("srcPermInd: %d\n", srcPermInd);
 //                                        }
 #endif
-					__h_sh[j+tidx] += __r_sh[srcPermInd];
+					//__h_sh[j+tidx] += __r_sh[srcPermInd];
+					__h_sh[j+tidx] += __r_sh[j+tidx];
                                 }
                         }
 			__syncwarp(WMASK);

From 1f9f10b0f9c19296dfbfd81956601b144e3d7350 Mon Sep 17 00:00:00 2001
From: Josh Romero <romero.josh@gmail.com>
Date: Tue, 7 Feb 2023 16:37:59 -0800
Subject: [PATCH 05/35] Better handling of GPU random numbers to ensure threads
 get unique subsequences. Slightly better, but still off from CPU.

---
 cuslines/generate_streamlines_cuda.cu | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index e0836f7..065329c 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -961,13 +961,13 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 #ifdef USE_FIXED_PERMUTATION
                                         const int srcPermInd = fixedPerm[j+tidx];
 #else
-//                                        const int srcPermInd = curand(st) % hr_side;
+                                        const int srcPermInd = curand(st) % hr_side;
 //                                        if (srcPermInd < 0 || srcPermInd >= hr_side) {
 //                                                printf("srcPermInd: %d\n", srcPermInd);
 //                                        }
 #endif
-					//__h_sh[j+tidx] += __r_sh[srcPermInd];
-					__h_sh[j+tidx] += __r_sh[j+tidx];
+					__h_sh[j+tidx] += __r_sh[srcPermInd];
+					//__h_sh[j+tidx] += __r_sh[j+tidx];
                                 }
                         }
 			__syncwarp(WMASK);
@@ -1315,6 +1315,7 @@ __global__ void getNumStreamlines_k(const int model_type,
 
         const int tidx = threadIdx.x;
         const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
 
         if (slid >= nseed) {
                 return;
@@ -1328,7 +1329,8 @@ __global__ void getNumStreamlines_k(const int model_type,
 	const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
-        curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+        //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
+        curand_init(rndSeed, gid, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
                                                                                    // elements of the same sequence
         // python:
         //directions = get_direction(None, dataf, dwi_mask, sphere, s, H, R, model, max_angle,
@@ -1429,7 +1431,10 @@ __global__ void genStreamlinesMerge_k(const int model_type,
 	const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
-        curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
+        const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
+        //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
+        curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
                                                                                  // elements of the same sequence
         if (slid >= nseed) {
                 return;

From 7812bd303fb69e26c785acdfc1df2ee7f2c2be03 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Thu, 27 Jun 2024 15:12:58 -0700
Subject: [PATCH 06/35] refactor model code into functions, begin csd function

---
 cuslines/cuslines.cpp                 |   5 -
 cuslines/generate_streamlines_cuda.cu | 226 ++++++++++----------------
 cuslines/generate_streamlines_cuda.h  |   8 +-
 cuslines/globals.h                    |   5 +
 4 files changed, 99 insertions(+), 145 deletions(-)

diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index 2a9c58b..da47caa 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -58,11 +58,6 @@ py::capsule cleanup(T* ptr) {
          });
 }
 
-enum ModelType {
-  OPDT = 0,
-  CSAODF = 1,
-};
-
 class GPUTracker {
   public:
     GPUTracker(ModelType model_type,
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 065329c..ff88436 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -170,68 +170,6 @@ __device__ int trilinear_interp_d(const int dimx,
         return 0;
 }
 
-template<int BDIM_X,
-         int N,
-         typename VAL_T>
-__device__ void copy_d(      VAL_T *__restrict__ dst,
-                       const VAL_T *__restrict__ src) {
-
-        const int tidx = threadIdx.x;
-
-        #pragma unroll
-        for(int j = 0; j < N; j+= BDIM_X) {
-                if (j+tidx < N) {
-                        dst[j+tidx] = src[j+tidx];
-                }
-        }
-        return;
-}
-
-template<int BDIM_X,
-         int N,
-         int M,
-         typename VAL_T>
-__device__ void ndotp_d(const VAL_T *__restrict__ srcV,
-                        const VAL_T *__restrict__ srcM,
-                              VAL_T *__restrict__ dstV) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        //#pragma unroll
-        for(int i = 0; i < N; i++) {
-
-                VAL_T __tmp = 0;
-
-                #pragma unroll
-                for(int j = 0; j < M; j += BDIM_X) {
-                        if (j+tidx < M) {
-                                __tmp += srcV[j+tidx]*srcM[i*M + j+tidx];
-                        }
-                }
-#if 0
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-                }
-#else
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-                }
-#endif
-                // values could be held by BDIM_X threads and written
-                // together every BDIM_X iterations...
-
-                if (tidx == 0) {
-                        dstV[i] = __tmp;
-                }
-        }
-        return;
-}
-
 template<int BDIM_X,
          typename VAL_T>
 __device__ void ndotp_d(const int N,
@@ -274,54 +212,6 @@ __device__ void ndotp_d(const int N,
         return;
 }
 
-template<int BDIM_X,
-         int N,
-         int M,
-         typename VAL_T>
-__device__ void ndotp_log_d(const VAL_T *__restrict__ srcV,
-                            const VAL_T *__restrict__ srcM,
-                                  VAL_T *__restrict__ dstV) {
-
-        const int tidx = threadIdx.x;
-
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-
-        const VAL_T ONEP5 = static_cast<VAL_T>(1.5);
-
-        //#pragma unroll
-        for(int i = 0; i < N; i++) {
-
-                VAL_T __tmp = 0;
-
-                #pragma unroll
-                for(int j = 0; j < M; j += BDIM_X) {
-                        if (j+tidx < M) {
-
-                                const VAL_T v = srcV[j+tidx];
-                                __tmp += -LOG(v)*(ONEP5+LOG(v))*v * srcM[i*M + j+tidx];
-                        }
-                }
-#if 0
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-                        __tmp += __shfl_xor_sync(WMASK, __tmp, j, BDIM_X);
-                }
-#else
-                #pragma unroll
-                for(int j = BDIM_X/2; j; j /= 2) {
-                        __tmp += __shfl_down_sync(WMASK, __tmp, j, BDIM_X);
-                }
-#endif
-                // values could be held by BDIM_X threads and written
-                // together every BDIM_X iterations...
-
-                if (tidx == 0) {
-                        dstV[i] = __tmp;
-                }
-        }
-        return;
-}
 
 template<int BDIM_X,
          typename VAL_T>
@@ -414,6 +304,87 @@ __device__ void ndotp_log_csa_d(const int N,
 	return;
 }
 
+template<int BDIM_X,
+         typename REAL_T>
+__device__ void fit_opdt(const int delta_nr,
+                         const int hr_side,
+                         const REAL_T *__restrict__ delta_q,
+                         const REAL_T *__restrict__ delta_b,
+                         const REAL_T *__restrict__ __msk_data_sh,
+                         REAL_T *__restrict__ __h_sh,
+                         REAL_T *__restrict__ __r_sh) {
+        const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        ndotp_log_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
+        ndotp_d    <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
+        __syncwarp(WMASK);
+        #pragma unroll
+        for(int j = tidx; j < delta_nr; j += BDIM_X) {
+                __r_sh[j] -= __h_sh[j];
+        }
+        __syncwarp(WMASK);
+}
+
+template<int BDIM_X, typename REAL_T>
+__device__ void fit_csa(const int delta_nr,
+                        const int hr_side,
+                        const REAL_T *__restrict__ fit_matrix,
+                        const REAL_T *__restrict__ __msk_data_sh,
+                        REAL_T *__restrict__ __r_sh) {
+        const int tidx = threadIdx.x;
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+        constexpr REAL _n0_const = 0.28209479177387814; // .5 / sqrt(pi)
+        ndotp_log_csa_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, fit_matrix, __r_sh);
+        __syncwarp(WMASK);
+        if (tidx == 0) {
+                __r_sh[0] = _n0_const;
+        }
+        __syncwarp(WMASK);
+}
+
+// Based on dipy implementation:
+// https://github.com/dipy/dipy/blob/104345de6dcbfd6627552b00d180e53db6c17381/dipy/reconst/csdeconv.py#L573C5-L573C13
+template<int BDIM_X, typename REAL_T>
+__device__ void fit_csa(const int delta_nr,
+                        const int hr_side,
+                        const REAL_T *__restrict__ X,
+                        const REAL_T *__restrict__ B_reg,
+                        const REAL_T *__restrict__ P, // TODO: P can be calculated beforehand
+                        const REAL_T *__restrict__ __msk_data_sh,
+                        REAL_T *__restrict__ __r_sh) {
+        constexpr REAL_T mu = 1e-5;
+        constexpr REAL_T tau = 0.1;
+
+    
+
+}
+
+template<int BDIM_X, typename REAL_T>
+__device__ void fit_model_coef(const ModelType model_type,
+                               const int delta_nr, // delta_nr is number of ODF directions
+                               const int hr_side, // hr_side is number of data directions
+                               const REAL_T *__restrict__ delta_q,
+                               const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model
+                               const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block
+                               REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned
+                               REAL_T *__restrict__ __r_sh) {
+        switch(model_type) {
+                case 0:
+                        fit_opdt<BDIM_X>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
+                        break;
+                case 1:
+                        fit_csa<BDIM_X>(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh);
+                        break;
+                default:
+                        printf("FATAL: Invalid Model Type.\n");
+                        break;
+        }
+}
+
 template<int BDIM_X,
          typename LEN_T,
          typename VAL_T>
@@ -862,7 +833,7 @@ template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
 __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
-			       const int model_type,
+			       const ModelType model_type,
                                const REAL_T max_angle,
 			       const REAL_T min_signal,
 			       const REAL_T relative_peak_thres,
@@ -1009,24 +980,7 @@ __device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
 			maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 			__syncwarp(WMASK);
 
-			if (model_type == 0) {
-				ndotp_log_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
-				ndotp_d    <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
-				__syncwarp(WMASK);
-				#pragma unroll
-				for(int j = tidx; j < delta_nr; j += BDIM_X) {
-					__r_sh[j] -= __h_sh[j];
-				}
-				__syncwarp(WMASK);
-			} else if (model_type == 1) {
-				constexpr REAL_T _n0_const = 0.28209479177387814; // .5 / sqrt(pi)
-				ndotp_log_csa_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
-				__syncwarp(WMASK);
-				if (tidx == 0) {
-					__r_sh[0] = _n0_const;
-				}
-				__syncwarp(WMASK);
-			}
+                        fit_model_coef<BDIM_X>(model_type, delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
 
                         // __r_sh[tidy] <- python 'coef'
 
@@ -1150,7 +1104,7 @@ template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
 __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
-			 const int model_type,
+			 const ModelType model_type,
 			 const REAL_T max_angle,
 			 const REAL_T min_signal,
 			 const REAL_T tc_threshold,
@@ -1285,7 +1239,7 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void getNumStreamlines_k(const int model_type,
+__global__ void getNumStreamlines_k(const ModelType model_type,
                                     const REAL_T max_angle,
 				    const REAL_T min_signal,
 				    const REAL_T relative_peak_thres,
@@ -1326,7 +1280,7 @@ __global__ void getNumStreamlines_k(const int model_type,
 
         REAL3_T *__restrict__ __shDir = shDir0+slid*samplm_nr;
 
-	const int hr_side = dimt-1;
+	// const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
         //curand_init(rndSeed, slid + rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
@@ -1385,7 +1339,7 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void genStreamlinesMerge_k(const int model_type,
+__global__ void genStreamlinesMerge_k(const ModelType model_type,
 				      const REAL_T max_angle,
 				      const REAL_T min_signal,
 				      const REAL_T tc_threshold,
@@ -1428,10 +1382,10 @@ __global__ void genStreamlinesMerge_k(const int model_type,
         const int lid = (tidy*BDIM_X + tidx) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-	const int hr_side = dimt-1;
+	// const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
-        const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
+        // const int gbid = blockIdx.y*gridDim.x + blockIdx.x;
         const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
         //curand_init(rndSeed, slid+rndOffset, DIV_UP(hr_side, BDIM_X)*tidx, &st); // each thread uses DIV_UP(HR_SIDE/BDIM_X)
         curand_init(rndSeed, gid+1, 0, &st); // each thread uses DIV_UP(hr_side/BDIM_X)
@@ -1562,7 +1516,7 @@ __global__ void genStreamlinesMerge_k(const int model_type,
         return;
 }
 
-void generate_streamlines_cuda_mgpu(const int model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
                                     const REAL relative_peak_thresh, const REAL min_separation_angle,
                                     const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
index ba3c239..70b3817 100644
--- a/cuslines/generate_streamlines_cuda.h
+++ b/cuslines/generate_streamlines_cuda.h
@@ -33,15 +33,15 @@
 
 #include "globals.h"
 
-void generate_streamlines_cuda_mgpu(const int model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
+void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
                                     const REAL relative_peak_thresh, const REAL min_separation_angle,
-				    const int nseeds, const std::vector<REAL*> &seeds_d,
+				    				const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
-			            const int delta_nr,
+			            			const int delta_nr,
                                     const std::vector<REAL*> &delta_b_d, const std::vector<REAL*> &delta_q_d,
                                     const std::vector<int*> &b0s_mask_d, const std::vector<REAL*> &metric_map_d,
-			            const int samplm_nr,
+			            			const int samplm_nr,
                                     const std::vector<REAL*> &sampling_matrix_d,
                                     const std::vector<REAL*> &sphere_vertices_d, const std::vector<int*> &sphere_edges_d, const int nedges,
                                     std::vector<REAL*> &slines_h, std::vector<int*> &slinesLen_h, std::vector<int> &nSlines_h,
diff --git a/cuslines/globals.h b/cuslines/globals.h
index 45432c2..dcf85bd 100644
--- a/cuslines/globals.h
+++ b/cuslines/globals.h
@@ -82,4 +82,9 @@
 
 #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 
+enum ModelType {
+  OPDT = 0,
+  CSAODF = 1,
+};
+
 #endif

From ee0b83caa7caf55dcb0269e1665093ba00a2122b Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Thu, 27 Jun 2024 15:20:08 -0700
Subject: [PATCH 07/35] spacing here should only uses spaces

---
 cuslines/generate_streamlines_cuda.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.h b/cuslines/generate_streamlines_cuda.h
index 70b3817..14105ce 100644
--- a/cuslines/generate_streamlines_cuda.h
+++ b/cuslines/generate_streamlines_cuda.h
@@ -35,13 +35,13 @@
 
 void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_angle, const REAL min_signal, const REAL tc_threshold, const REAL step_size,
                                     const REAL relative_peak_thresh, const REAL min_separation_angle,
-				    				const int nseeds, const std::vector<REAL*> &seeds_d,
+                                    const int nseeds, const std::vector<REAL*> &seeds_d,
                                     const int dimx, const int dimy, const int dimz, const int dimt,
                                     const std::vector<REAL*> &dataf_d, const std::vector<REAL*> &H_d, const std::vector<REAL*> &R_d,
-			            			const int delta_nr,
+                                    const int delta_nr,
                                     const std::vector<REAL*> &delta_b_d, const std::vector<REAL*> &delta_q_d,
                                     const std::vector<int*> &b0s_mask_d, const std::vector<REAL*> &metric_map_d,
-			            			const int samplm_nr,
+                                    const int samplm_nr,
                                     const std::vector<REAL*> &sampling_matrix_d,
                                     const std::vector<REAL*> &sphere_vertices_d, const std::vector<int*> &sphere_edges_d, const int nedges,
                                     std::vector<REAL*> &slines_h, std::vector<int*> &slinesLen_h, std::vector<int> &nSlines_h,

From fec04166de7d93e0ad4109054c461ad377881a5f Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 2 Jul 2024 10:27:56 -0700
Subject: [PATCH 08/35] add probabilistic tractography

---
 cuslines/cuslines.cpp                 |   4 +-
 cuslines/generate_streamlines_cuda.cu | 481 ++++++++++++++++++++------
 cuslines/globals.h                    |   8 +-
 run_dipy_gpu_hardi.py                 |  52 ++-
 4 files changed, 431 insertions(+), 114 deletions(-)

diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index da47caa..cebe9ad 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -319,7 +319,9 @@ class GPUTracker {
 PYBIND11_MODULE(cuslines, m) {
   py::enum_<ModelType>(m, "ModelType")
     .value("OPDT", OPDT)
-    .value("CSAODF", CSAODF);
+    .value("CSA", CSA)
+    .value("PROB", PROB)
+    .value("CSD", CSD);
 
   py::class_<GPUTracker>(m, "GPUTracker")
     .def(py::init<ModelType, double, double, double, double,
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index ff88436..a8b4fc0 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -215,7 +215,7 @@ __device__ void ndotp_d(const int N,
 
 template<int BDIM_X,
          typename VAL_T>
-__device__ void ndotp_log_d(const int N,
+__device__ void ndotp_log_opdt_d(const int N,
 			    const int M,
 			    const VAL_T *__restrict__ srcV,
                             const VAL_T *__restrict__ srcM,
@@ -304,6 +304,7 @@ __device__ void ndotp_log_csa_d(const int N,
 	return;
 }
 
+
 template<int BDIM_X,
          typename REAL_T>
 __device__ void fit_opdt(const int delta_nr,
@@ -317,8 +318,8 @@ __device__ void fit_opdt(const int delta_nr,
         const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-        ndotp_log_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
-        ndotp_d    <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
+        ndotp_log_opdt_d<BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_q, __r_sh);
+        ndotp_d         <BDIM_X>(delta_nr, hr_side, __msk_data_sh, delta_b, __h_sh);
         __syncwarp(WMASK);
         #pragma unroll
         for(int j = tidx; j < delta_nr; j += BDIM_X) {
@@ -348,20 +349,22 @@ __device__ void fit_csa(const int delta_nr,
 
 // Based on dipy implementation:
 // https://github.com/dipy/dipy/blob/104345de6dcbfd6627552b00d180e53db6c17381/dipy/reconst/csdeconv.py#L573C5-L573C13
-template<int BDIM_X, typename REAL_T>
-__device__ void fit_csa(const int delta_nr,
-                        const int hr_side,
-                        const REAL_T *__restrict__ X,
-                        const REAL_T *__restrict__ B_reg,
-                        const REAL_T *__restrict__ P, // TODO: P can be calculated beforehand
-                        const REAL_T *__restrict__ __msk_data_sh,
-                        REAL_T *__restrict__ __r_sh) {
-        constexpr REAL_T mu = 1e-5;
-        constexpr REAL_T tau = 0.1;
-
-    
-
-}
+// template<int BDIM_X, typename REAL_T>
+// __device__ void fit_csd(const int delta_nr,
+//                         const int hr_side,
+//                         const REAL_T *__restrict__ __X,
+//                         const REAL_T *__restrict__ B_reg,
+//                         const REAL_T *__restrict__ __msk_data_sh,
+//                         REAL_T *__restrict__ __r_sh) {
+//         constexpr REAL_T mu = 1e-5;
+//         constexpr REAL_T tau = 0.1;
+//         REAL* __P, __z; // TODO: find somewhere to alloc these, or using existing space
+//         nTmuln_d<BDIM_X>(delta_nr, hr_side, __X, __P); // TODO: P can be calculated beforehand
+//         ndotp_d <BDIM_X>(delta_nr, hr_side, __X, __msk_data_sh, __z);
+
+//         // TODO: finish this
+
+// }
 
 template<int BDIM_X, typename REAL_T>
 __device__ void fit_model_coef(const ModelType model_type,
@@ -373,10 +376,10 @@ __device__ void fit_model_coef(const ModelType model_type,
                                REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned
                                REAL_T *__restrict__ __r_sh) {
         switch(model_type) {
-                case 0:
+                case OPDT:
                         fit_opdt<BDIM_X>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
                         break;
-                case 1:
+                case CSA:
                         fit_csa<BDIM_X>(delta_nr, hr_side, delta_q, __msk_data_sh, __r_sh);
                         break;
                 default:
@@ -827,42 +830,273 @@ __device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
 	}
 }
 
+template<int BDIM_X,
+         int BDIM_Y,
+         bool IS_START,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
+                                    const REAL_T *__restrict__ pmf,
+                                    const REAL_T max_angle,
+                                    const REAL_T relative_peak_thres,
+                                    const REAL_T min_separation_angle,
+                                    REAL3_T dir,
+                                    const int dimx,
+                                    const int dimy,
+                                    const int dimz,
+                                    const int dimt,
+                                    const REAL3_T point,
+                                    const int samplm_nr,
+                                    const REAL3_T *__restrict__ sphere_vertices,
+                                    const int2 *__restrict__ sphere_edges,
+                                    const int num_edges,
+                                    REAL3_T *__restrict__ dirs) {
+        const int tidx = threadIdx.x;
+        const int tidy = threadIdx.y;
+	
+        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	const int n32dimt = ((dimt+31)/32)*32;
+
+	extern REAL_T __shared__ __sh[];
+
+	REAL_T *__pmf_data_sh = reinterpret_cast<REAL_T *>(__sh);
+	int *__shInd = reinterpret_cast<int *>(__pmf_data_sh) + BDIM_Y*n32dimt; // only used if IS_START is 1
+
+	__pmf_data_sh += tidy*n32dimt;
+	__shInd += tidy*n32dimt;
+
+        // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, pmf, point, __pmf_data_sh);
+        if (rv != 0) {
+                return 0;
+        }
+
+        // for i in range(_len):
+        //     if pmf[i] > max_pmf:
+        //         max_pmf = pmf[i]
+        // absolute_pmf_threshold = pmf_threshold * max_pmf
+        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(samplm_nr, __pmf_data_sh, REAL_MIN);
+        __syncwarp(WMASK);
+
+        // for i in range(_len):
+        //     if pmf[i] < absolute_pmf_threshold:
+        //         pmf[i] = 0.0
+        for(int i = tidx; i < samplm_nr; i += BDIM_X) {
+                if (__pmf_data_sh[i] < absolpmf_thresh) {
+                        __pmf_data_sh[i] = 0.0;
+                }
+        }
+        __syncwarp(WMASK);
+
+        if (IS_START) {
+                return peak_directions_d<BDIM_X,
+                                         BDIM_Y>(__pmf_data_sh,
+                                                 dirs,
+                                                 sphere_vertices,
+                                                 sphere_edges,
+                                                 num_edges,
+                                                 samplm_nr,
+                                                 __shInd,
+                                                 relative_peak_thres,
+                                                 min_separation_angle);
+        } else {
+                REAL_T __tmp;
+                #ifdef DEBUG
+                        __syncwarp();
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh initial", 8, samplm_nr, __pmf_data_sh);
+                                printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
+                                if (sqrt(dir.x*dir.x + dir.y*dir.y + dir.z*dir.z) >= 1.05){
+                                        printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
+                                }
+                        }
+                #endif
+
+                // // These should not be relevant
+                // if norm(&direction[0]) == 0:
+                //     return 1
+                // normalize(&direction[0])
+
+                // for i in range(_len):
+                //         cos_sim = self.vertices[i][0] * direction[0] \
+                //                 + self.vertices[i][1] * direction[1] \
+                //                 + self.vertices[i][2] * direction[2]
+                //         if cos_sim < 0:
+                //                 cos_sim = cos_sim * -1
+                //         if cos_sim < self.cos_similarity:
+                //                 pmf[i] = 0
+                const REAL_T cos_similarity = COS(max_angle);
+
+                for(int i = tidx; i < samplm_nr; i += BDIM_X) {
+                        const REAL_T dot = dir.x*sphere_vertices[i].x+
+                                           dir.y*sphere_vertices[i].y+
+                                           dir.z*sphere_vertices[i].z;
+
+                        if (FABS(dot) < cos_similarity) {
+                                __pmf_data_sh[i] = 0.0;
+                        }
+                }
+                __syncwarp(WMASK);
+
+                #ifdef DEBUG
+                        __syncwarp();
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh after filtering", 8, samplm_nr, __pmf_data_sh);
+                        }
+                #endif
+
+                // cumsum(pmf, pmf, _len)
+                for (int j = 0; j < samplm_nr; j += BDIM_X) {
+                        if ((tidx == 0) && (j != 0)) {
+                                __pmf_data_sh[j] += __pmf_data_sh[j-1];
+                        }
+                        __syncwarp(WMASK);
+
+                        #pragma unroll
+                        for(int i = 1; i < BDIM_X; i *= 2) {
+                                if ((tidx >= i) && (j+tidx < samplm_nr)) {
+                                        __tmp = __pmf_data_sh[j+tidx-i];
+                                }
+                                __syncwarp(WMASK);
+                                if ((tidx >= i) && (j+tidx < samplm_nr)) {
+                                        __pmf_data_sh[j+tidx] += __tmp;
+                                }
+                                __syncwarp(WMASK);
+                        }
+                }
+
+                #ifdef DEBUG
+                        __syncwarp();
+                        if (tidx == 0) {
+                                printArray("__pmf_data_sh after cumsum", 8, samplm_nr, __pmf_data_sh);
+                        }
+                #endif
+
+                // last_cdf = pmf[_len - 1]
+                // if last_cdf == 0:
+                //         return 1
+                if (tidx == 0) {
+                        __tmp = __pmf_data_sh[samplm_nr - 1];
+                }
+                REAL_T last_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
+                if (last_cdf == 0) {
+                        return 0;
+                }
+
+                // idx = where_to_insert(pmf, random() * last_cdf, _len)
+                if (tidx == 0) {
+                        __tmp = curand_uniform(st) * last_cdf;
+                }
+                REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
+                int indProb = samplm_nr;
+                for(int i = samplm_nr - 1 - tidx; i >= 0; i-= BDIM_X) {
+                        if (selected_cdf >= __pmf_data_sh[i]) {
+                                if ((i < samplm_nr) && (selected_cdf < __pmf_data_sh[i + 1])) {
+                                        indProb = i + 1;
+                                } else {
+                                        indProb = MIN(samplm_nr - 1, i + BDIM_X);
+                                }
+                                break;
+                        }
+                }
+
+                #pragma unroll
+                for(int i = BDIM_X/2; i; i /= 2) {
+                        __tmp = __shfl_xor_sync(WMASK, indProb, i, BDIM_X);
+                        indProb = MIN(indProb, __tmp);
+                }
+
+                #ifdef DEBUG
+                        __syncwarp();
+                        if (tidx == 0) {
+                                printf("last_cdf %10.8f\n", last_cdf);
+                                printf("selected_cdf %10.8f\n", selected_cdf);
+                                printf("indProb %i out of %i\n", indProb, samplm_nr);
+                        }
+                #endif
+
+                // newdir = self.vertices[idx]
+                // if (direction[0] * newdir[0]
+                //     + direction[1] * newdir[1]
+                //     + direction[2] * newdir[2] > 0):
+                //     copy_point(&newdir[0], &direction[0])
+                // else:
+                //     newdir[0] = newdir[0] * -1
+                //     newdir[1] = newdir[1] * -1
+                //     newdir[2] = newdir[2] * -1
+                //     copy_point(&newdir[0], &direction[0])
+                if (tidx == 0) {
+                        if ((dir.x * sphere_vertices[indProb].x +
+                             dir.y * sphere_vertices[indProb].y +
+                             dir.z * sphere_vertices[indProb].z) > 0) {
+                                dirs[0] = MAKE_REAL3(sphere_vertices[indProb].x,
+                                                     sphere_vertices[indProb].y,
+                                                     sphere_vertices[indProb].z);
+                        } else {
+                                dirs[0] = MAKE_REAL3(-sphere_vertices[indProb].x,
+                                                     -sphere_vertices[indProb].y,
+                                                     -sphere_vertices[indProb].z);
+                        }
+                }
+
+                #ifdef DEBUG
+                        if (tidx == 0) {
+                                if ((dirs[0].x == dir.x) && (dirs[0].y == dir.y) && (dirs[0].z == dir.z)) {
+                                printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dirs[0].x, dirs[0].y, dirs[0].z);
+                                printf("last_cdf %10.8f\n", last_cdf);
+                                printf("selected_cdf %10.8f\n", selected_cdf);
+                                printf("indProb %i out of %i\n", indProb, samplm_nr);
+                                }
+                                if (sqrt(dirs[0].x*dirs[0].x + dirs[0].y*dirs[0].y + dirs[0].z*dirs[0].z) >= 1.1) {
+                                        printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dirs[0].x, dirs[0].y, dirs[0].z);
+                                }
+                        }
+                #endif
+                return 1;
+        }
+}
+
 template<int BDIM_X,
          int BDIM_Y,
          int NATTEMPTS,
          typename REAL_T,
          typename REAL3_T>
-__device__ int get_direction_d(curandStatePhilox4_32_10_t *st,
-			       const ModelType model_type,
-                               const REAL_T max_angle,
-			       const REAL_T min_signal,
-			       const REAL_T relative_peak_thres,
-			       const REAL_T min_separation_angle,
-			       REAL3_T dir,
-                               const int dimx,
-                               const int dimy,
-                               const int dimz,
-                               const int dimt,
-                               const REAL_T *__restrict__ dataf,
-                               const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
-                                                                 // but not clear if it will never be needed so
-                                                                 // we'll keep it here for now...
-                               const REAL3_T point,
-                               const REAL_T *__restrict__ H, 
-                               const REAL_T *__restrict__ R,
-                               // model unused
-                               // max_angle, pmf_threshold from global defines
-                               // b0s_mask already passed
-                               // min_signal from global defines
-			       const int delta_nr,
-                               const REAL_T *__restrict__ delta_b,
-                               const REAL_T *__restrict__ delta_q, // fit_matrix
-			       const int samplm_nr,
-                               const REAL_T *__restrict__ sampling_matrix,
-                               const REAL3_T *__restrict__ sphere_vertices,
-                               const int2 *__restrict__ sphere_edges,
-                               const int num_edges,
-                                     REAL3_T *__restrict__ dirs) {
+__device__ int get_direction_boot_d(
+                                curandStatePhilox4_32_10_t *st,
+                                const ModelType model_type,
+                                const REAL_T max_angle,
+                                const REAL_T min_signal,
+                                const REAL_T relative_peak_thres,
+                                const REAL_T min_separation_angle,
+                                REAL3_T dir,
+                                const int dimx,
+                                const int dimy,
+                                const int dimz,
+                                const int dimt,
+                                const REAL_T *__restrict__ dataf,
+                                const int *__restrict__ b0s_mask, // not using this (and its opposite, dwi_mask)
+                                                                  // but not clear if it will never be needed so
+                                                                  // we'll keep it here for now...
+                                const REAL3_T point,
+                                const REAL_T *__restrict__ H, 
+                                const REAL_T *__restrict__ R,
+                                // model unused
+                                // max_angle, pmf_threshold from global defines
+                                // b0s_mask already passed
+                                // min_signal from global defines
+                                const int delta_nr,
+                                const REAL_T *__restrict__ delta_b,
+                                const REAL_T *__restrict__ delta_q, // fit_matrix
+                                const int samplm_nr,
+                                const REAL_T *__restrict__ sampling_matrix,
+                                const REAL3_T *__restrict__ sphere_vertices,
+                                const int2 *__restrict__ sphere_edges,
+                                const int num_edges,
+                                REAL3_T *__restrict__ dirs) {
 
         const int tidx = threadIdx.x;
         const int tidy = threadIdx.y;
@@ -1165,33 +1399,53 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 
         int i;
         for(i = 1; i < MAX_SLINE_LEN; i++) {
-
-		// call get_direction_d() with NATTEMPTS=5
-                int ndir = get_direction_d<BDIM_X,
-                                           BDIM_Y,
-                                           5>(st,
-					      model_type,
-					      max_angle,
-					      min_signal,
-					      relative_peak_thres,
-					      min_separation_angle,
-                                              direction,
-                                              dimx, dimy, dimz, dimt, dataf,
-                                              b0s_mask /* !dwi_mask */,
-                                              point,
-                                              H, R,
-                                              // model unused
-                                              // max_angle, pmf_threshold from global defines
-                                              // b0s_mask already passed
-                                              // min_signal from global defines
-					      delta_nr,
-                                              delta_b, delta_q, // fit_matrix
-					      samplm_nr,
-                                              sampling_matrix,
-                                              sphere_vertices,
-                                              sphere_edges,
-                                              num_edges,
-                                              __shDir);
+                int ndir;
+                if (model_type == PROB) {
+                        ndir = get_direction_prob_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    0>(
+                                                        st,
+                                                        dataf,
+                                                        max_angle,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt,
+                                                        point,
+                                                        samplm_nr,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        __shDir);
+                } else {
+                        // call get_direction_boot_d() with NATTEMPTS=5
+                        ndir = get_direction_boot_d<BDIM_X,
+                                                    BDIM_Y,
+                                                    5>(
+                                                        st,
+                                                        model_type,
+                                                        max_angle,
+                                                        min_signal,
+                                                        relative_peak_thres,
+                                                        min_separation_angle,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt, dataf,
+                                                        b0s_mask /* !dwi_mask */,
+                                                        point,
+                                                        H, R,
+                                                        // model unused
+                                                        // max_angle, pmf_threshold from global defines
+                                                        // b0s_mask already passed
+                                                        // min_signal from global defines
+                                                        delta_nr,
+                                                        delta_b, delta_q, // fit_matrix
+                                                        samplm_nr,
+                                                        sampling_matrix,
+                                                        sphere_vertices,
+                                                        sphere_edges,
+                                                        num_edges,
+                                                        __shDir);
+                }
                 __syncwarp(WMASK);
                 direction = __shDir[0];
                 __syncwarp(WMASK);
@@ -1295,31 +1549,52 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
 	//	printf("seed: %f, %f, %f\n", seed.x, seed.y, seed.z);
 	//}
 
-        int ndir = get_direction_d<BDIM_X,
-                                   BDIM_Y,
-                                   1>(&st,
-				      model_type,
-				      max_angle,
-				      min_signal,
-				      relative_peak_thres,
-				      min_separation_angle,
-                                      MAKE_REAL3(0,0,0),
-                                      dimx, dimy, dimz, dimt, dataf,
-                                      b0s_mask /* !dwi_mask */,
-                                      seed,
-                                      H, R,
-                                      // model unused
-                                      // max_angle, pmf_threshold from global defines
-                                      // b0s_mask already passed
-                                      // min_signal from global defines
-				      delta_nr,
-                                      delta_b, delta_q, // fit_matrix
-				      samplm_nr,
-                                      sampling_matrix,
-                                      sphere_vertices,
-                                      sphere_edges,
-                                      num_edges,
-                                      __shDir);
+        int ndir;
+        if (model_type == PROB) {
+                ndir = get_direction_prob_d<BDIM_X,
+                                            BDIM_Y,
+                                            1>(
+                                                &st,
+                                                dataf,
+                                                max_angle,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt,
+                                                seed,
+                                                samplm_nr,
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+        } else {
+                ndir = get_direction_boot_d<BDIM_X,
+                                            BDIM_Y,
+                                            1>(
+                                                &st,
+                                                model_type,
+                                                max_angle,
+                                                min_signal,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt, dataf,
+                                                b0s_mask /* !dwi_mask */,
+                                                seed,
+                                                H, R,
+                                                // model unused
+                                                // max_angle, pmf_threshold from global defines
+                                                // b0s_mask already passed
+                                                // min_signal from global defines
+                                                delta_nr,
+                                                delta_b, delta_q, // fit_matrix
+                                                samplm_nr,
+                                                sampling_matrix,
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+        }
         if (tidx == 0) {
                 slineOutOff[slid] = ndir;
 
@@ -1551,7 +1826,7 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
 
   int n32dimt = ((dimt+31)/32)*32;
 
-  size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_d
+  size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
 		     sizeof(int)*samplm_nr;						        // for peak_directions_d	
 
   //printf("shSizeGNS: %zu\n", shSizeGNS);
diff --git a/cuslines/globals.h b/cuslines/globals.h
index dcf85bd..9c924c1 100644
--- a/cuslines/globals.h
+++ b/cuslines/globals.h
@@ -82,9 +82,15 @@
 
 #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 
+#if 0
+  #define DEBUG
+#endif
+
 enum ModelType {
   OPDT = 0,
-  CSAODF = 1,
+  CSA = 1,
+  PROB = 2,
+  CSD = 3,
 };
 
 #endif
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
index 863887d..b6bb7de 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_dipy_gpu_hardi.py
@@ -38,10 +38,11 @@
 from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
 from dipy.io.streamline import save_tractogram
 from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
+from dipy.core.gradients import gradient_table, unique_bvals_magnitude
 from dipy.data import small_sphere
 #from dipy.direction import BootDirectionGetter
 from dipy.reconst.shm import OpdtModel, CsaOdfModel
+from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst
 #from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
 from dipy.reconst import shm
 from dipy.data import get_fnames
@@ -85,7 +86,8 @@ def get_img(ep2_seq):
 parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
 parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
 parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
-parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
+parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csa', 'csd'], help="model to use")
+parser.add_argument("--dg", type=str, default="boot", choices=['boot', 'prob'], help="direction getting scheme to use")
 args = parser.parse_args()
 
 # Get Stanford HARDI data
@@ -119,27 +121,59 @@ def get_img(ep2_seq):
 
 # Create seeds for ROI
 #seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
-seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
+# seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
+# seed_mask = seed_mask[0:args.nseeds]
+seed_mask = np.asarray(utils.random_seeds_from_mask(
+  roi_data, seeds_count=args.nseeds,
+  seed_count_per_voxel=False,
+  affine=np.eye(4)))
 
 # Setup model
 if args.model == "opdt":
+  model_type = cuslines.ModelType.OPDT
   print("Running OPDT model...")
   model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
   fit_matrix = model._fit_matrix
   delta_b, delta_q = fit_matrix
-else:
-  print("Running CSAODF model...")
+elif args.model == "csa":
+  model_type = cuslines.ModelType.CSA
+  print("Running CSA model...")
   model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
   fit_matrix = model._fit_matrix
   # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for
   # now.
   delta_b = fit_matrix
   delta_q = fit_matrix
+else:
+  model_type = cuslines.ModelType.CSD
+  print("Running CSD model...")
+  unique_bvals = unique_bvals_magnitude(gtab.bvals)
+  if len(unique_bvals[unique_bvals > 0]) > 1:
+    low_shell_idx = gtab.bvals <= unique_bvals[unique_bvals > 0][0]
+    response_gtab = gradient_table( # reinit as single shell for this CSD
+      gtab.bvals[low_shell_idx],
+      gtab.bvecs[low_shell_idx])
+    data = data[..., low_shell_idx]
+  else:
+    response_gtab = gtab
+  response, _ = auto_response_ssst(
+    response_gtab,
+    data,
+    roi_radii=10,
+    fa_thr=0.7)
+  model = ConstrainedSphericalDeconvModel(gtab, response, sh_order=args.sh_order)
+  # TODO: we shouldnt have to do this, also for CSA, but we populate delta_b, delta_q.
+  # we need to name change delta_b/delta_q and make it possible for them to be None, or something like this
+  delta_b = model._X
+  delta_q = model.B_reg
+
+if args.dg == "prob":
+  model_type = cuslines.ModelType.PROB
+  fit = model.fit(data, mask=(metric_map >= args.fa_threshold))
+  data = fit.odf(small_sphere).clip(min=0)
+  print(data.shape)
 
 # Setup direction getter args
-print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 b0s_mask = gtab.b0s_mask
 dwi_mask = ~b0s_mask
 
@@ -168,7 +202,7 @@ def get_img(ep2_seq):
 
 #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
 
-gpu_tracker = cuslines.GPUTracker(cuslines.ModelType.OPDT if args.model == "opdt" else cuslines.ModelType.CSAODF,
+gpu_tracker = cuslines.GPUTracker(model_type,
                                   args.max_angle * np.pi/180,
                                   args.min_signal,
                                   args.fa_threshold,

From c2108be7e29e120a947d38d37eef59fb38391bd4 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 2 Jul 2024 11:51:17 -0700
Subject: [PATCH 09/35] all of these could potentially be F-order depending on
 how things are setup.

---
 cuslines/cuslines.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index cebe9ad..55d47a3 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -48,6 +48,7 @@ using np_array_short = py::array_t<short>;
 
 using np_array_cast = py::array_t<REAL, py::array::c_style | py::array::forcecast>;
 using np_array_short_cast = py::array_t<short, py::array::c_style | py::array::forcecast>;
+using np_array_int_cast = py::array_t<int, py::array::c_style | py::array::forcecast>;
 
 // Handle to cleanup returned host allocations when associated Python object is destroyed
 template <typename T>
@@ -68,15 +69,15 @@ class GPUTracker {
                double relative_peak_thresh,
                double min_separation_angle,
                np_array_cast dataf,
-               np_array H,
-               np_array R,
-               np_array delta_b,
-               np_array delta_q,
-               np_array_int b0s_mask,
-               np_array metric_map,
-               np_array sampling_matrix,
-               np_array sphere_vertices,
-               np_array_int sphere_edges,
+               np_array_cast H,
+               np_array_cast R,
+               np_array_cast delta_b,
+               np_array_cast delta_q,
+               np_array_int_cast b0s_mask,
+               np_array_cast metric_map,
+               np_array_cast sampling_matrix,
+               np_array_cast sphere_vertices,
+               np_array_int_cast sphere_edges,
                int ngpus = 1,
                int rng_seed = 0,
                int rng_offset = 0) {

From ec4f1ea133b5be2884a57a1a9003bb86c2e40a38 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 2 Jul 2024 14:43:15 -0700
Subject: [PATCH 10/35] reasonable default chunk size

---
 cuslines/cuslines.cpp |  3 +++
 run_dipy_gpu_hardi.py | 28 ++++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index 55d47a3..429102c 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -318,6 +318,9 @@ class GPUTracker {
 
 
 PYBIND11_MODULE(cuslines, m) {
+  m.attr("MAX_SLINE_LEN") = py::int_(MAX_SLINE_LEN);
+  m.attr("REAL_SIZE") = py::int_(REAL_SIZE);
+
   py::enum_<ModelType>(m, "ModelType")
     .value("OPDT", OPDT)
     .value("CSA", CSA)
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
index b6bb7de..7b46535 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_dipy_gpu_hardi.py
@@ -30,6 +30,7 @@
 import argparse
 import random
 import time
+import subprocess
 
 import numpy as np
 
@@ -73,7 +74,7 @@ def get_img(ep2_seq):
 print("parsing arguments")
 parser = argparse.ArgumentParser()
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
+parser.add_argument("--chunk-size", type=int, default=None, help="how many seeds to process per sweep, per GPU. If None, will estimate based on free memory in GPU0")
 parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
 parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
 parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
@@ -196,8 +197,31 @@ def get_img(ep2_seq):
 # create floating point copy of data
 dataf = np.asarray(data, dtype=float)
 
+if args.chunk_size is None: # TODO: I think we will ultimately remove chunks altogether
+  try:
+    result = subprocess.run(
+      ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader', f'--id=0'], 
+      stdout=subprocess.PIPE, 
+      stderr=subprocess.PIPE, 
+      text=True)
+    if result.returncode != 0:
+      raise RuntimeError(f"Error: {result.stderr.strip()}")
+
+    free_memory = int(result.stdout.strip().split('\n')[0]) # This is in MB
+
+    chunk_size = free_memory * (1024.0 / (5 * cuslines.MAX_SLINE_LEN)) * (1024.0 / (3 * cuslines.REAL_SIZE))
+    chunk_size /= 2 # Allow for to 50% of GPU memory can be used for stuff other than storing streamlines
+    chunk_size = int((chunk_size // 1000) * 1000)
+
+    print(f"Chunk size estimated at {chunk_size}")
+  except Exception as e:
+    chunk_size = 100000
+    print("Failed to use nvidia-smi to estimate chunk size, using 100,000: " + str(e))
+else:
+  chunk_size = args.chunk_size
+
 print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
+global_chunk_size = chunk_size * args.ngpus
 nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
 
 #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)

From 5c1b7e39cedeb3cdd8f4c5814f2acc53c4c8e040 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 8 Jul 2024 14:54:05 -0700
Subject: [PATCH 11/35] BFs, improvements

---
 cuslines/cudamacro.h                  |  26 +++++++
 cuslines/cuslines.cpp                 |  23 +++---
 cuslines/generate_streamlines_cuda.cu | 106 ++++++++++++--------------
 cuslines/globals.h                    |   8 +-
 4 files changed, 90 insertions(+), 73 deletions(-)

diff --git a/cuslines/cudamacro.h b/cuslines/cudamacro.h
index c447529..49ac24c 100644
--- a/cuslines/cudamacro.h
+++ b/cuslines/cudamacro.h
@@ -45,4 +45,30 @@
         exit(EXIT_FAILURE);                                                  \
     }}
 
+#ifdef USE_NVTX
+#include "nvToolsExt.h"
+
+const uint32_t colors4[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff};
+const int num_colors4 = sizeof(colors4)/sizeof(colors4[0]);
+
+#define START_RANGE(name,cid) { \
+        int color_id = cid; \
+        color_id = color_id%num_colors4;\
+        nvtxEventAttributes_t eventAttrib = {0}; \
+        eventAttrib.version = NVTX_VERSION; \
+        eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+        eventAttrib.colorType = NVTX_COLOR_ARGB; \
+        eventAttrib.color = colors4[color_id]; \
+        eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+        eventAttrib.message.ascii = name; \
+        nvtxRangePushEx(&eventAttrib); \
+}
+#define END_RANGE { \
+        nvtxRangePop(); \
+}
+#else
+#define START_RANGE(name,cid)
+#define END_RANGE
+#endif
+
 #endif
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index 429102c..68a9954 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -37,17 +37,16 @@ namespace py = pybind11;
 
 #include <cuda_runtime.h>
 
+// #define USE_NVTX
+
 #include "globals.h"
 #include "cudamacro.h"
 #include "generate_streamlines_cuda.h"
 
 using np_array = py::array_t<REAL>;
-using np_array_bool = py::array_t<bool>;
 using np_array_int = py::array_t<int>;
-using np_array_short = py::array_t<short>;
 
 using np_array_cast = py::array_t<REAL, py::array::c_style | py::array::forcecast>;
-using np_array_short_cast = py::array_t<short, py::array::c_style | py::array::forcecast>;
 using np_array_int_cast = py::array_t<int, py::array::c_style | py::array::forcecast>;
 
 // Handle to cleanup returned host allocations when associated Python object is destroyed
@@ -271,6 +270,9 @@ class GPUTracker {
       auto roi_shape_info = roi_shape.request();
       auto voxel_size_info = voxel_size.request();
       auto vox_to_ras_info = vox_to_ras.request();
+
+      START_RANGE("filewrite", 0);
+
       //#pragma omp parallel for
       for (int n = 0; n < ngpus_; ++n) {
         std::stringstream ss;
@@ -279,6 +281,8 @@ class GPUTracker {
                   voxel_order.c_str(), reinterpret_cast<REAL *>(vox_to_ras_info.ptr), nSlines_old_[n], slinesLen_[n],
                   reinterpret_cast<REAL3 *>(slines_[n]));
       }
+
+      END_RANGE;
     }
 
   private:
@@ -324,17 +328,16 @@ PYBIND11_MODULE(cuslines, m) {
   py::enum_<ModelType>(m, "ModelType")
     .value("OPDT", OPDT)
     .value("CSA", CSA)
-    .value("PROB", PROB)
-    .value("CSD", CSD);
+    .value("PROB", PROB);
 
   py::class_<GPUTracker>(m, "GPUTracker")
     .def(py::init<ModelType, double, double, double, double,
                   double, double,
-		  np_array_cast, np_array,
-                  np_array, np_array,
-                  np_array, np_array_int,
-                  np_array, np_array,
-                  np_array, np_array_int,
+		              np_array_cast, np_array_cast,
+                  np_array_cast, np_array_cast,
+                  np_array_cast, np_array_int_cast,
+                  np_array_cast, np_array_cast,
+                  np_array_cast, np_array_int_cast,
                   int, int, int>(),
                   py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(), py::arg().noconvert(),
                   py::arg().noconvert(), py::arg().noconvert(),
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index a8b4fc0..9174348 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -347,25 +347,6 @@ __device__ void fit_csa(const int delta_nr,
         __syncwarp(WMASK);
 }
 
-// Based on dipy implementation:
-// https://github.com/dipy/dipy/blob/104345de6dcbfd6627552b00d180e53db6c17381/dipy/reconst/csdeconv.py#L573C5-L573C13
-// template<int BDIM_X, typename REAL_T>
-// __device__ void fit_csd(const int delta_nr,
-//                         const int hr_side,
-//                         const REAL_T *__restrict__ __X,
-//                         const REAL_T *__restrict__ B_reg,
-//                         const REAL_T *__restrict__ __msk_data_sh,
-//                         REAL_T *__restrict__ __r_sh) {
-//         constexpr REAL_T mu = 1e-5;
-//         constexpr REAL_T tau = 0.1;
-//         REAL* __P, __z; // TODO: find somewhere to alloc these, or using existing space
-//         nTmuln_d<BDIM_X>(delta_nr, hr_side, __X, __P); // TODO: P can be calculated beforehand
-//         ndotp_d <BDIM_X>(delta_nr, hr_side, __X, __msk_data_sh, __z);
-
-//         // TODO: finish this
-
-// }
-
 template<int BDIM_X, typename REAL_T>
 __device__ void fit_model_coef(const ModelType model_type,
                                const int delta_nr, // delta_nr is number of ODF directions
@@ -862,10 +843,15 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
 	extern REAL_T __shared__ __sh[];
 
 	REAL_T *__pmf_data_sh = reinterpret_cast<REAL_T *>(__sh);
-	int *__shInd = reinterpret_cast<int *>(__pmf_data_sh) + BDIM_Y*n32dimt; // only used if IS_START is 1
+        int *__shInd;
+        if (IS_START) {
+	        __shInd = reinterpret_cast<int *>(__pmf_data_sh + BDIM_Y*n32dimt);
+        }
 
 	__pmf_data_sh += tidy*n32dimt;
-	__shInd += tidy*n32dimt;
+        if (IS_START) {
+	        __shInd += tidy*n32dimt;
+        }
 
         // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
         const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, pmf, point, __pmf_data_sh);
@@ -904,7 +890,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         } else {
                 REAL_T __tmp;
                 #ifdef DEBUG
-                        __syncwarp();
+                        __syncwarp(WMASK);
                         if (tidx == 0) {
                                 printArray("__pmf_data_sh initial", 8, samplm_nr, __pmf_data_sh);
                                 printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
@@ -914,6 +900,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                         printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
                                 }
                         }
+                        __syncwarp(WMASK);
                 #endif
 
                 // // These should not be relevant
@@ -943,10 +930,11 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 __syncwarp(WMASK);
 
                 #ifdef DEBUG
-                        __syncwarp();
+                        __syncwarp(WMASK);
                         if (tidx == 0) {
                                 printArray("__pmf_data_sh after filtering", 8, samplm_nr, __pmf_data_sh);
                         }
+                        __syncwarp(WMASK);
                 #endif
 
                 // cumsum(pmf, pmf, _len)
@@ -956,33 +944,29 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                         }
                         __syncwarp(WMASK);
 
-                        #pragma unroll
-                        for(int i = 1; i < BDIM_X; i *= 2) {
-                                if ((tidx >= i) && (j+tidx < samplm_nr)) {
-                                        __tmp = __pmf_data_sh[j+tidx-i];
-                                }
-                                __syncwarp(WMASK);
+                        REAL_T __t_pmf = __pmf_data_sh[j+tidx];
+                        for (int i = 1; i < BDIM_X; i*=2) {
+                                __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X);
                                 if ((tidx >= i) && (j+tidx < samplm_nr)) {
-                                        __pmf_data_sh[j+tidx] += __tmp;
+                                        __t_pmf += __tmp;
                                 }
-                                __syncwarp(WMASK);
                         }
+                        __pmf_data_sh[j+tidx] = __t_pmf;
+                        __syncwarp(WMASK);
                 }
 
                 #ifdef DEBUG
-                        __syncwarp();
+                        __syncwarp(WMASK);
                         if (tidx == 0) {
                                 printArray("__pmf_data_sh after cumsum", 8, samplm_nr, __pmf_data_sh);
                         }
+                        __syncwarp(WMASK);
                 #endif
 
                 // last_cdf = pmf[_len - 1]
                 // if last_cdf == 0:
                 //         return 1
-                if (tidx == 0) {
-                        __tmp = __pmf_data_sh[samplm_nr - 1];
-                }
-                REAL_T last_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
+                REAL_T last_cdf = __pmf_data_sh[samplm_nr - 1];
                 if (last_cdf == 0) {
                         return 0;
                 }
@@ -992,31 +976,24 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                         __tmp = curand_uniform(st) * last_cdf;
                 }
                 REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
-                int indProb = samplm_nr;
-                for(int i = samplm_nr - 1 - tidx; i >= 0; i-= BDIM_X) {
-                        if (selected_cdf >= __pmf_data_sh[i]) {
-                                if ((i < samplm_nr) && (selected_cdf < __pmf_data_sh[i + 1])) {
-                                        indProb = i + 1;
-                                } else {
-                                        indProb = MIN(samplm_nr - 1, i + BDIM_X);
-                                }
+
+                int indProb = samplm_nr - 1;
+                for (int ii = 0; ii < samplm_nr; ii+=BDIM_X) {
+                        int __msk = __ballot_sync(WMASK, selected_cdf < __pmf_data_sh[ii+tidx]);
+                        if (__msk != 0) {
+                                indProb = ii + __ffs(__msk) - 1;
                                 break;
                         }
                 }
 
-                #pragma unroll
-                for(int i = BDIM_X/2; i; i /= 2) {
-                        __tmp = __shfl_xor_sync(WMASK, indProb, i, BDIM_X);
-                        indProb = MIN(indProb, __tmp);
-                }
-
                 #ifdef DEBUG
-                        __syncwarp();
+                        __syncwarp(WMASK);
                         if (tidx == 0) {
                                 printf("last_cdf %10.8f\n", last_cdf);
                                 printf("selected_cdf %10.8f\n", selected_cdf);
                                 printf("indProb %i out of %i\n", indProb, samplm_nr);
                         }
+                        __syncwarp(WMASK);
                 #endif
 
                 // newdir = self.vertices[idx]
@@ -1826,9 +1803,13 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
 
   int n32dimt = ((dimt+31)/32)*32;
 
-  size_t shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
-		     sizeof(int)*samplm_nr;						        // for peak_directions_d	
-
+  size_t shSizeGNS;
+  if (model_type == PROB) {
+    shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt;
+  } else {
+    shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
+                sizeof(int)*samplm_nr;						        // for peak_directions_d	
+  }
   //printf("shSizeGNS: %zu\n", shSizeGNS);
 
   //#pragma omp parallel for
@@ -1905,15 +1886,19 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
     CHECK_CUDA(cudaMemset(slineSeed_d[n], -1, sizeof(*slineSeed_d[n])*nSlines_h[n]));
 
     // Allocate/reallocate output arrays if necessary
-    if (nSlines_h[n] > nSlines_old_h[n]) {
+    if (nSlines_h[n] > EXCESS_ALLOC_FACT*nSlines_old_h[n]) {
       if(slines_h[n]) cudaFreeHost(slines_h[n]);
       if(slinesLen_h[n]) cudaFreeHost(slinesLen_h[n]);
       slines_h[n] = nullptr;
       slinesLen_h[n] = nullptr;
     }
 
-    if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], 2*3*MAX_SLINE_LEN*nSlines_h[n]*sizeof(*slines_h[n])));
-    if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], nSlines_h[n]*sizeof(*slinesLen_h[n])));
+#ifdef DEBUG
+    printf("buffer size %zu\n", sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]);
+#endif
+
+    if (!slines_h[n]) CHECK_CUDA(cudaMallocHost(&slines_h[n], sizeof(*slines_h[n])*EXCESS_ALLOC_FACT*2*3*MAX_SLINE_LEN*nSlines_h[n]));
+    if (!slinesLen_h[n]) CHECK_CUDA(cudaMallocHost(&slinesLen_h[n], sizeof(*slinesLen_h[n])*EXCESS_ALLOC_FACT*nSlines_h[n]));
   }
 
   //if (nSlines_h) {
@@ -1949,6 +1934,13 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
     std::cerr << "GPU " << n << ": ";
     std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; 
 #endif
+
+    if (model_type == PROB) {
+        // Shared memory requirements are smaller for probabilistic for main run
+        // than for preliminary run
+        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; 
+    }
+
     //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
     genStreamlinesMerge_k<THR_X_SL,
                           THR_X_BL/THR_X_SL>
diff --git a/cuslines/globals.h b/cuslines/globals.h
index 9c924c1..ab5459b 100644
--- a/cuslines/globals.h
+++ b/cuslines/globals.h
@@ -70,11 +70,6 @@
 #define MAX_SLINE_LEN	(501)
 #define PMF_THRESHOLD_P	((REAL)0.1)
 
-//#define TC_THRESHOLD_P	((REAL)0.1)
-//#define STEP_SIZE_P	((REAL)0.5)  // only for TRK generation
-//#define MAX_ANGLE_P	((REAL)1.0471975511965976) // 60 deg in radians
-//#define MIN_SIGNAL_P	((REAL)1.0)
-
 #define MAX_SLINES_PER_SEED (10)
 
 #define MIN(x,y) (((x)<(y))?(x):(y))
@@ -82,6 +77,8 @@
 
 #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 
+#define EXCESS_ALLOC_FACT 2
+
 #if 0
   #define DEBUG
 #endif
@@ -90,7 +87,6 @@ enum ModelType {
   OPDT = 0,
   CSA = 1,
   PROB = 2,
-  CSD = 3,
 };
 
 #endif

From d1cf0f67b0096e7b50e679c54700bf9eb51e9561 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 9 Jul 2024 16:31:35 -0700
Subject: [PATCH 12/35] this is not useful

---
 run_dipy_gpu_hardi.py | 28 ++--------------------------
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
index 7b46535..b60cb9a 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_dipy_gpu_hardi.py
@@ -30,7 +30,6 @@
 import argparse
 import random
 import time
-import subprocess
 
 import numpy as np
 
@@ -74,7 +73,7 @@ def get_img(ep2_seq):
 print("parsing arguments")
 parser = argparse.ArgumentParser()
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, default=None, help="how many seeds to process per sweep, per GPU. If None, will estimate based on free memory in GPU0")
+parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU.")
 parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
 parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
 parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
@@ -197,31 +196,8 @@ def get_img(ep2_seq):
 # create floating point copy of data
 dataf = np.asarray(data, dtype=float)
 
-if args.chunk_size is None: # TODO: I think we will ultimately remove chunks altogether
-  try:
-    result = subprocess.run(
-      ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader', f'--id=0'], 
-      stdout=subprocess.PIPE, 
-      stderr=subprocess.PIPE, 
-      text=True)
-    if result.returncode != 0:
-      raise RuntimeError(f"Error: {result.stderr.strip()}")
-
-    free_memory = int(result.stdout.strip().split('\n')[0]) # This is in MB
-
-    chunk_size = free_memory * (1024.0 / (5 * cuslines.MAX_SLINE_LEN)) * (1024.0 / (3 * cuslines.REAL_SIZE))
-    chunk_size /= 2 # Allow for to 50% of GPU memory can be used for stuff other than storing streamlines
-    chunk_size = int((chunk_size // 1000) * 1000)
-
-    print(f"Chunk size estimated at {chunk_size}")
-  except Exception as e:
-    chunk_size = 100000
-    print("Failed to use nvidia-smi to estimate chunk size, using 100,000: " + str(e))
-else:
-  chunk_size = args.chunk_size
-
 print('streamline gen')
-global_chunk_size = chunk_size * args.ngpus
+global_chunk_size = args.chunk_size * args.ngpus
 nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
 
 #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)

From f81fe8eef59cb10713f979a97a41e99ec81500a5 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Wed, 17 Jul 2024 11:15:42 -0700
Subject: [PATCH 13/35] ptt first draft; prob updates/fixes

---
 cuslines/Makefile                     |   10 +-
 cuslines/cuslines.cpp                 |    3 +-
 cuslines/disc.h                       | 1721 +++++++++++++++++++++++++
 cuslines/generate_streamlines_cuda.cu |  292 ++---
 cuslines/globals.h                    |    6 +
 cuslines/ptt.cu                       |  383 ++++++
 cuslines/ptt.cuh                      |   40 +
 cuslines/utils.cu                     |  144 +++
 8 files changed, 2441 insertions(+), 158 deletions(-)
 create mode 100644 cuslines/disc.h
 create mode 100644 cuslines/ptt.cu
 create mode 100644 cuslines/ptt.cuh
 create mode 100644 cuslines/utils.cu

diff --git a/cuslines/Makefile b/cuslines/Makefile
index d167bc1..67c9cbb 100644
--- a/cuslines/Makefile
+++ b/cuslines/Makefile
@@ -26,6 +26,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+# CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2
 CUDA_HOME=/usr/local/cuda
 CUDACC=$(CUDA_HOME)/bin/nvcc
 CXX=g++
@@ -38,17 +39,18 @@ CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
 LASTSM := $(lastword $(sort $(SMS)))
 CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM)
 CUDACFLAGS=-c -O3 -lineinfo -Xptxas=-v -std=c++11 -Xcompiler -fPIC -Xcompiler=-fopenmp $(CUDA_ARCH)
+LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
 
 all: cuslines
 
 cuslines: generate_streamlines_cuda.o cuslines.o
-	$(LD) -shared cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` -fopenmp -L$(CUDA_HOME)/lib64 -lcudart
+	$(LD) generate_streamlines_cuda.o cuslines.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)
 
-%.o: %.cu
-	$(CUDACC) $(CUDACFLAGS) $<
+%.o : %.cu
+	$(CUDACC) $(CUDACFLAGS) $< -o $@
 
 %.o: %.cpp
 	$(CXX) $(CXXFLAGS) $< -o $@
 
 clean:
-	rm *.o cuslines`python3-config --extension-suffix`
+	rm *.o cuslines`python3-config --extension-suffix` __pycache__/*.pyc
diff --git a/cuslines/cuslines.cpp b/cuslines/cuslines.cpp
index 68a9954..4e8bc30 100644
--- a/cuslines/cuslines.cpp
+++ b/cuslines/cuslines.cpp
@@ -328,7 +328,8 @@ PYBIND11_MODULE(cuslines, m) {
   py::enum_<ModelType>(m, "ModelType")
     .value("OPDT", OPDT)
     .value("CSA", CSA)
-    .value("PROB", PROB);
+    .value("PROB", PROB)
+    .value("PTT", PTT);
 
   py::class_<GPUTracker>(m, "GPUTracker")
     .def(py::init<ModelType, double, double, double, double,
diff --git a/cuslines/disc.h b/cuslines/disc.h
new file mode 100644
index 0000000..f5c3589
--- /dev/null
+++ b/cuslines/disc.h
@@ -0,0 +1,1721 @@
+
+/*
+This code from: https://github.com/nibrary/nibrary/blob/main/src/math/disc.h
+
+BSD 3-Clause License
+
+Copyright (c) 2024, Dogu Baran Aydogan All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define DISC_4_VERT_CNT 62
+#define DISC_4_FACE_CNT 97
+
+#define DISC_4_VERT {\
+	-0.99632399,0.08566510,\
+	-0.98618071,-0.16567317,\
+	-0.94150749,0.33699206,\
+	-0.91375624,-0.40626289,\
+	-0.82498245,0.56515834,\
+	-0.78016046,-0.62557946,\
+	-0.76768368,-0.12856657,\
+	-0.73146437,0.13012819,\
+	-0.65993870,0.75131945,\
+	-0.64923474,-0.36077026,\
+	-0.64404827,0.37054571,\
+	-0.59590501,-0.80305493,\
+	-0.52723292,-0.08735736,\
+	-0.50689203,0.59055453,\
+	-0.48847116,-0.55956115,\
+	-0.45296756,0.16857820,\
+	-0.44983881,0.89310976,\
+	-0.37980292,-0.92506742,\
+	-0.37431635,-0.30508770,\
+	-0.34347782,0.41024244,\
+	-0.28695359,-0.72298195,\
+	-0.26607611,-0.04337891,\
+	-0.26300954,0.69460622,\
+	-0.20716495,0.97830603,\
+	-0.19234589,-0.49854986,\
+	-0.17059736,0.21089158,\
+	-0.13571649,-0.99074771,\
+	-0.09401357,-0.25347966,\
+	-0.08264934,0.47807857,\
+	-0.02277615,-0.74173498,\
+	-0.00823427,0.74390934,\
+	0,0,\
+	0.04408226,0.99902790,\
+	0.07601333,-0.47847223,\
+	0.09881278,0.25627739,\
+	0.12027544,-0.99274056,\
+	0.17542943,-0.20988186,\
+	0.18324588,0.50353410,\
+	0.23456374,-0.70587816,\
+	0.25179308,0.73431645,\
+	0.27247058,0.04550104,\
+	0.28231740,0.95932106,\
+	0.33582739,-0.41640371,\
+	0.36311579,-0.93174402,\
+	0.37710908,0.31092054,\
+	0.45674883,-0.17177025,\
+	0.47051745,0.57347807,\
+	0.47748339,-0.61101450,\
+	0.51751229,0.85567577,\
+	0.53030761,0.08858984,\
+	0.57509454,-0.81808696,\
+	0.63141296,-0.38867064,\
+	0.64811006,0.37237321,\
+	0.71524914,0.69886956,\
+	0.73323663,-0.14081300,\
+	0.76190940,0.12763299,\
+	0.76512049,-0.64388713,\
+	0.85941151,0.51128451,\
+	0.90165136,-0.43246368,\
+	0.95650965,0.29170069,\
+	0.97794249,-0.20887431,\
+	0.99951822,0.03103749}
+
+
+
+#define DISC_4_FACE {\
+	39,32,30,\
+	30,22,28,\
+	28,22,19,\
+	52,59,57,\
+	14,11,20,\
+	47,56,51,\
+	50,56,47,\
+	47,43,50,\
+	41,39,48,\
+	32,39,41,\
+	10,4,2,\
+	37,39,30,\
+	30,28,37,\
+	25,28,19,\
+	25,21,31,\
+	53,52,57,\
+	44,52,46,\
+	46,37,44,\
+	39,37,46,\
+	52,53,46,\
+	48,39,46,\
+	46,53,48,\
+	61,59,55,\
+	59,52,55,\
+	42,47,51,\
+	35,29,26,\
+	26,29,20,\
+	12,21,15,\
+	19,10,15,\
+	15,25,19,\
+	21,25,15,\
+	18,21,12,\
+	18,9,14,\
+	12,9,18,\
+	6,9,12,\
+	0,1,6,\
+	5,11,14,\
+	14,9,5,\
+	51,56,58,\
+	16,22,23,\
+	30,32,23,\
+	23,22,30,\
+	34,40,44,\
+	44,37,34,\
+	31,40,34,\
+	34,25,31,\
+	34,37,28,\
+	28,25,34,\
+	51,58,54,\
+	54,58,60,\
+	54,60,61,\
+	61,55,54,\
+	49,52,44,\
+	49,55,52,\
+	44,40,49,\
+	49,54,55,\
+	36,40,31,\
+	33,42,36,\
+	43,47,38,\
+	35,43,38,\
+	38,29,35,\
+	33,29,38,\
+	38,42,33,\
+	47,42,38,\
+	17,20,11,\
+	17,26,20,\
+	21,18,27,\
+	33,36,27,\
+	31,21,27,\
+	27,36,31,\
+	14,20,24,\
+	24,18,14,\
+	24,27,18,\
+	33,27,24,\
+	24,29,33,\
+	20,29,24,\
+	7,6,12,\
+	12,15,7,\
+	7,15,10,\
+	0,6,7,\
+	7,2,0,\
+	7,10,2,\
+	3,5,9,\
+	3,6,1,\
+	9,6,3,\
+	4,10,13,\
+	13,8,4,\
+	13,10,19,\
+	19,22,13,\
+	13,22,16,\
+	16,8,13,\
+	45,49,40,\
+	40,36,45,\
+	45,36,42,\
+	45,42,51,\
+	51,54,45,\
+	54,49,45}
+
+
+
+
+#define DISC_5_VERT_CNT 88
+#define DISC_5_FACE_CNT 143
+
+#define DISC_5_VERT {\
+	-0.99971936,0.02368974,\
+	-0.98497387,-0.17270345,\
+	-0.97603282,0.21762338,\
+	-0.92922869,-0.36950514,\
+	-0.90708773,0.42094161,\
+	-0.83415725,-0.55152668,\
+	-0.79951365,0.60064792,\
+	-0.79931959,-0.15614114,\
+	-0.78301036,0.22417418,\
+	-0.73599072,0.03246227,\
+	-0.70693097,-0.70728255,\
+	-0.70138379,-0.35670071,\
+	-0.67496938,0.41799949,\
+	-0.65872201,0.75238641,\
+	-0.58146330,-0.53578945,\
+	-0.58133729,-0.15872598,\
+	-0.56876870,0.21601921,\
+	-0.55866507,-0.82939335,\
+	-0.55072833,0.60394640,\
+	-0.49682087,0.86785311,\
+	-0.48121950,0.02702752,\
+	-0.46260160,-0.34991875,\
+	-0.44852900,0.41519979,\
+	-0.43296059,-0.69074037,\
+	-0.37854350,-0.92558350,\
+	-0.35947581,0.64734787,\
+	-0.34709225,-0.16018397,\
+	-0.34353985,0.21908385,\
+	-0.32747735,-0.52839636,\
+	-0.31642072,0.94861896,\
+	-0.23858273,0.43081147,\
+	-0.23727114,0.03012835,\
+	-0.22120862,-0.34920779,\
+	-0.22093401,0.78635645,\
+	-0.21959193,-0.75538583,\
+	-0.18652372,-0.98245046,\
+	-0.11983054,0.22928120,\
+	-0.11608911,0.60000621,\
+	-0.11215562,-0.15690164,\
+	-0.10768432,-0.56628902,\
+	-0.10628279,0.99433594,\
+	-0.00000000,-1.00000000,\
+	-0.00000000,-0.78706952,\
+	0.00000000,0.41097842,\
+	0.00000000,-0.36210641,\
+	0.00000000,0.04030552,\
+	0.00000000,0.79503467,\
+	0.10628279,0.99433594,\
+	0.10768432,-0.56628902,\
+	0.11215563,-0.15690164,\
+	0.11608911,0.60000621,\
+	0.11983054,0.22928120,\
+	0.18652372,-0.98245046,\
+	0.21959193,-0.75538583,\
+	0.22093402,0.78635645,\
+	0.22120862,-0.34920779,\
+	0.23727114,0.03012835,\
+	0.23858273,0.43081146,\
+	0.31642072,0.94861896,\
+	0.32747735,-0.52839636,\
+	0.34353985,0.21908385,\
+	0.34709225,-0.16018397,\
+	0.35947581,0.64734786,\
+	0.37854350,-0.92558350,\
+	0.43296060,-0.69074037,\
+	0.44852900,0.41519979,\
+	0.46260160,-0.34991875,\
+	0.48121950,0.02702752,\
+	0.49682086,0.86785311,\
+	0.55072833,0.60394640,\
+	0.55866507,-0.82939336,\
+	0.56876870,0.21601921,\
+	0.58133729,-0.15872598,\
+	0.58146330,-0.53578945,\
+	0.65872201,0.75238641,\
+	0.67496938,0.41799949,\
+	0.70138379,-0.35670071,\
+	0.70693097,-0.70728255,\
+	0.73599072,0.03246227,\
+	0.78301036,0.22417418,\
+	0.79931959,-0.15614114,\
+	0.79951365,0.60064792,\
+	0.83415725,-0.55152668,\
+	0.90708773,0.42094162,\
+	0.92922869,-0.36950514,\
+	0.97603282,0.21762338,\
+	0.98497387,-0.17270345,\
+	0.99971936,0.02368974}
+
+
+
+#define DISC_5_FACE {\
+	12,6,4,\
+	69,74,68,\
+	1,7,0,\
+	81,74,69,\
+	75,81,69,\
+	83,81,75,\
+	41,52,42,\
+	42,35,41,\
+	42,52,53,\
+	53,48,42,\
+	73,64,77,\
+	77,64,70,\
+	45,31,38,\
+	56,61,67,\
+	42,48,39,\
+	59,73,66,\
+	64,73,59,\
+	59,53,64,\
+	48,53,59,\
+	40,46,47,\
+	36,31,45,\
+	30,36,43,\
+	54,47,46,\
+	18,6,12,\
+	9,0,7,\
+	27,36,30,\
+	31,36,27,\
+	72,61,66,\
+	67,61,72,\
+	5,11,3,\
+	3,7,1,\
+	3,11,7,\
+	17,23,10,\
+	24,23,17,\
+	10,23,14,\
+	14,5,10,\
+	14,11,5,\
+	24,35,34,\
+	34,23,24,\
+	34,35,42,\
+	42,39,34,\
+	83,75,79,\
+	79,85,83,\
+	75,71,79,\
+	80,72,76,\
+	66,73,76,\
+	76,72,66,\
+	63,53,52,\
+	64,53,63,\
+	63,70,64,\
+	44,38,32,\
+	32,39,44,\
+	44,39,48,\
+	66,61,55,\
+	55,59,66,\
+	48,59,55,\
+	55,44,48,\
+	30,43,37,\
+	51,36,45,\
+	51,43,36,\
+	45,56,51,\
+	57,43,51,\
+	58,54,68,\
+	47,54,58,\
+	69,68,62,\
+	68,54,62,\
+	50,43,57,\
+	57,62,50,\
+	50,62,54,\
+	50,54,46,\
+	46,37,50,\
+	50,37,43,\
+	6,18,13,\
+	13,18,19,\
+	12,4,8,\
+	4,2,8,\
+	8,2,0,\
+	0,9,8,\
+	7,11,15,\
+	15,9,7,\
+	12,8,16,\
+	16,8,9,\
+	28,14,23,\
+	28,39,32,\
+	23,34,28,\
+	28,34,39,\
+	85,79,87,\
+	87,80,86,\
+	86,80,84,\
+	80,76,84,\
+	38,44,49,\
+	44,55,49,\
+	45,38,49,\
+	49,56,45,\
+	61,56,49,\
+	49,55,61,\
+	19,18,25,\
+	30,37,25,\
+	60,51,56,\
+	60,56,67,\
+	67,71,60,\
+	57,51,60,\
+	32,38,26,\
+	26,38,31,\
+	22,27,30,\
+	22,16,27,\
+	30,25,22,\
+	22,25,18,\
+	22,18,12,\
+	12,16,22,\
+	9,15,20,\
+	20,16,9,\
+	15,26,20,\
+	20,26,31,\
+	31,27,20,\
+	27,16,20,\
+	14,28,21,\
+	11,14,21,\
+	21,15,11,\
+	21,26,15,\
+	21,28,32,\
+	32,26,21,\
+	80,87,78,\
+	78,87,79,\
+	67,72,78,\
+	78,72,80,\
+	78,71,67,\
+	78,79,71,\
+	82,84,76,\
+	82,73,77,\
+	82,76,73,\
+	33,29,19,\
+	19,25,33,\
+	33,25,37,\
+	33,37,46,\
+	33,46,40,\
+	40,29,33,\
+	57,60,65,\
+	65,60,71,\
+	69,62,65,\
+	65,62,57,\
+	65,75,69,\
+	65,71,75}
+
+
+
+
+#define DISC_6_VERT_CNT 93
+#define DISC_6_FACE_CNT 152
+
+#define DISC_6_VERT {\
+	-0.99999594,0.00284872,\
+	-0.98015885,-0.19821361,\
+	-0.97910452,0.20335765,\
+	-0.91824742,-0.39600716,\
+	-0.91642084,0.40021599,\
+	-0.82654691,0.00183534,\
+	-0.81562261,-0.57858427,\
+	-0.81341702,0.58168096,\
+	-0.77345426,-0.20215086,\
+	-0.77255314,0.20518626,\
+	-0.68941469,-0.39022484,\
+	-0.68791137,0.39257991,\
+	-0.68013707,-0.73308497,\
+	-0.67797282,0.73508697,\
+	-0.63913549,0.00098160,\
+	-0.57136133,-0.55867323,\
+	-0.56955891,0.56029565,\
+	-0.55841446,-0.19717430,\
+	-0.55776663,0.19858763,\
+	-0.52205785,-0.85291008,\
+	-0.52029081,0.85398915,\
+	-0.45456737,-0.37917968,\
+	-0.45349521,0.38007165,\
+	-0.43551427,0.00036391,\
+	-0.42468657,-0.70786237,\
+	-0.42282245,0.70855298,\
+	-0.34062665,-0.94019864,\
+	-0.33950518,0.94060419,\
+	-0.33587158,-0.19180791,\
+	-0.33542840,0.19211031,\
+	-0.32531753,-0.54899060,\
+	-0.32390201,0.54924250,\
+	-0.22054068,-0.00005872,\
+	-0.22037192,-0.37480908,\
+	-0.21959501,-0.76658281,\
+	-0.21957535,0.37471145,\
+	-0.21830443,0.76666288,\
+	-0.14314293,0.98970203,\
+	-0.14308842,-0.98970991,\
+	-0.11182136,-0.58019243,\
+	-0.11144867,-0.19011261,\
+	-0.11115009,0.18972064,\
+	-0.11083876,0.58001267,\
+	-0.01202610,-0.79058356,\
+	-0.01146133,0.79044198,\
+	-0.00296704,-0.38655576,\
+	-0.00251985,0.38610767,\
+	0,0,\
+	0.05602383,0.99842943,\
+	0.05753595,-0.99834343,\
+	0.09929965,-0.58445955,\
+	0.09978787,0.58401640,\
+	0.10720128,-0.19034642,\
+	0.10732262,0.18978004,\
+	0.19741560,0.77640808,\
+	0.19782431,-0.77666373,\
+	0.21311365,-0.37705130,\
+	0.21324783,0.37657351,\
+	0.21679316,-0.00027758,\
+	0.24086510,0.97055861,\
+	0.24384828,-0.96981339,\
+	0.30997719,-0.55399954,\
+	0.31009251,0.55373347,\
+	0.33113518,0.19349214,\
+	0.33116892,-0.19395431,\
+	0.39954032,0.71482630,\
+	0.40007135,-0.71505270,\
+	0.41738752,0.90872859,\
+	0.42074496,-0.90717897,\
+	0.43070234,-0.00023714,\
+	0.44655149,0.38465307,\
+	0.44677948,-0.38475532,\
+	0.55227292,0.20180373,\
+	0.55259284,-0.20199491,\
+	0.55434766,-0.56898461,\
+	0.55436109,0.56926283,\
+	0.59389421,0.80454314,\
+	0.59763878,-0.80176548,\
+	0.63149306,0.00014085,\
+	0.68111571,0.39992460,\
+	0.68195669,-0.39979424,\
+	0.73619953,0.67676455,\
+	0.74034292,-0.67222940,\
+	0.76516557,0.20592197,\
+	0.76545629,-0.20472387,\
+	0.81831151,0.00107473,\
+	0.84515629,0.53451927,\
+	0.84868724,-0.52889504,\
+	0.92889024,0.37035513,\
+	0.93106261,-0.36485945,\
+	0.98208177,0.18845531,\
+	0.98320733,-0.18249206,\
+	0.99999396,0.00347432}
+
+
+
+#define DISC_6_FACE {\
+	47,41,32,\
+	29,35,22,\
+	23,32,29,\
+	29,41,35,\
+	29,32,41,\
+	8,1,3,\
+	6,12,15,\
+	15,12,24,\
+	71,80,73,\
+	73,80,84,\
+	19,24,12,\
+	87,80,82,\
+	48,37,44,\
+	22,11,18,\
+	18,29,22,\
+	23,29,18,\
+	44,37,36,\
+	22,35,31,\
+	31,36,25,\
+	16,11,22,\
+	22,31,16,\
+	16,31,25,\
+	16,25,13,\
+	13,7,16,\
+	16,7,11,\
+	11,7,4,\
+	13,25,20,\
+	69,73,78,\
+	78,73,84,\
+	49,60,55,\
+	60,68,55,\
+	47,32,40,\
+	45,33,39,\
+	45,40,33,\
+	0,1,5,\
+	1,8,5,\
+	39,33,30,\
+	33,21,30,\
+	15,24,30,\
+	30,21,15,\
+	39,30,34,\
+	34,30,24,\
+	24,19,26,\
+	26,34,24,\
+	74,80,71,\
+	74,82,80,\
+	77,82,74,\
+	0,5,2,\
+	27,36,37,\
+	25,36,27,\
+	27,20,25,\
+	89,91,84,\
+	89,80,87,\
+	84,80,89,\
+	85,83,78,\
+	85,78,84,\
+	92,90,85,\
+	85,90,83,\
+	85,91,92,\
+	84,91,85,\
+	69,58,64,\
+	64,73,69,\
+	64,56,71,\
+	71,73,64,\
+	81,76,75,\
+	63,58,69,\
+	78,83,72,\
+	70,63,72,\
+	69,78,72,\
+	72,63,69,\
+	35,41,46,\
+	28,21,33,\
+	28,32,23,\
+	28,40,32,\
+	33,40,28,\
+	15,21,10,\
+	10,6,15,\
+	10,8,3,\
+	3,6,10,\
+	47,40,52,\
+	52,58,47,\
+	52,45,56,\
+	40,45,52,\
+	56,64,52,\
+	52,64,58,\
+	56,45,50,\
+	50,45,39,\
+	9,18,11,\
+	9,2,5,\
+	11,4,9,\
+	4,2,9,\
+	23,18,14,\
+	14,5,8,\
+	18,9,14,\
+	14,9,5,\
+	34,26,38,\
+	83,90,88,\
+	79,86,81,\
+	81,75,79,\
+	79,88,86,\
+	83,88,79,\
+	79,75,70,\
+	79,72,83,\
+	70,72,79,\
+	65,76,67,\
+	65,75,76,\
+	44,36,42,\
+	36,31,42,\
+	42,31,35,\
+	35,46,42,\
+	53,46,41,\
+	58,63,53,\
+	53,41,47,\
+	47,58,53,\
+	66,55,68,\
+	66,68,77,\
+	77,74,66,\
+	55,50,43,\
+	49,55,43,\
+	39,34,43,\
+	43,50,39,\
+	43,38,49,\
+	34,38,43,\
+	21,28,17,\
+	17,14,8,\
+	17,28,23,\
+	23,14,17,\
+	8,10,17,\
+	17,10,21,\
+	67,59,54,\
+	54,65,67,\
+	54,48,44,\
+	54,59,48,\
+	56,50,61,\
+	71,56,61,\
+	61,50,55,\
+	55,66,61,\
+	61,74,71,\
+	61,66,74,\
+	65,54,62,\
+	70,75,62,\
+	75,65,62,\
+	57,63,70,\
+	70,62,57,\
+	57,53,63,\
+	46,53,57,\
+	51,62,54,\
+	44,42,51,\
+	51,54,44,\
+	51,42,46,\
+	46,57,51,\
+	51,57,62}
+
+
+
+#define DISC_7_VERT_CNT 362
+#define DISC_7_FACE_CNT 661
+
+#define DISC_7_VERT {\
+	-0.99985012,-0.01731283,\
+	-0.99556874,0.09403658,\
+	-0.99269568,-0.12064526,\
+	-0.98039504,0.19704206,\
+	-0.97660422,-0.21504466,\
+	-0.95669950,0.29107744,\
+	-0.94719722,-0.32065158,\
+	-0.91977079,0.03476785,\
+	-0.91823426,0.39603769,\
+	-0.90908498,-0.41661072,\
+	-0.90313047,-0.06293630,\
+	-0.89624049,0.13116999,\
+	-0.88570063,-0.24444376,\
+	-0.87285522,-0.15209581,\
+	-0.87078228,0.49166881,\
+	-0.86621517,0.31178476,\
+	-0.86248349,-0.50608521,\
+	-0.86017585,0.21863311,\
+	-0.84170474,-0.33621087,\
+	-0.83205774,0.03180995,\
+	-0.81606482,0.40088958,\
+	-0.81347977,0.58159321,\
+	-0.80880317,-0.58807944,\
+	-0.80047290,-0.06419645,\
+	-0.79753133,-0.42481233,\
+	-0.79434394,0.12621664,\
+	-0.79156344,-0.23872410,\
+	-0.77379916,0.30089105,\
+	-0.76620500,0.48702609,\
+	-0.75497372,-0.15055590,\
+	-0.74996331,-0.66147943,\
+	-0.74615778,0.66576915,\
+	-0.74611496,-0.50995249,\
+	-0.74439196,0.21110739,\
+	-0.73952765,-0.33215782,\
+	-0.73635629,0.02967200,\
+	-0.71613187,0.39251295,\
+	-0.70989537,0.56969253,\
+	-0.69311231,-0.06262365,\
+	-0.68888730,-0.42215438,\
+	-0.68873812,-0.23996840,\
+	-0.68789855,0.12079206,\
+	-0.68775964,-0.59169999,\
+	-0.68761735,-0.72607326,\
+	-0.67300273,0.29907892,\
+	-0.66617715,0.74579354,\
+	-0.66034349,0.48247936,\
+	-0.64930335,0.65326738,\
+	-0.64251884,-0.15112779,\
+	-0.63423915,-0.33167241,\
+	-0.63367295,0.20893327,\
+	-0.63361786,0.02834503,\
+	-0.63317442,-0.50922682,\
+	-0.62397583,-0.67129702,\
+	-0.61668934,-0.78720662,\
+	-0.61382861,0.39179584,\
+	-0.60234812,0.57529738,\
+	-0.58817577,0.80873313,\
+	-0.58517097,-0.06261579,\
+	-0.58292803,-0.24113073,\
+	-0.58101739,0.11895848,\
+	-0.58024100,-0.42164527,\
+	-0.57342995,-0.59754731,\
+	-0.57023518,0.30017942,\
+	-0.56388698,0.69225657,\
+	-0.55806447,0.48787902,\
+	-0.54436708,-0.83884712,\
+	-0.53471317,-0.71163670,\
+	-0.53277416,-0.15208460,\
+	-0.52845305,0.02779153,\
+	-0.52768053,-0.33226318,\
+	-0.52626233,0.20945175,\
+	-0.52436621,-0.51257293,\
+	-0.51322968,0.39629252,\
+	-0.51105321,0.59594241,\
+	-0.50070339,0.77148078,\
+	-0.49284021,0.87011983,\
+	-0.47766585,-0.06283164,\
+	-0.47709422,-0.61587203,\
+	-0.47617996,-0.24250847,\
+	-0.47492414,0.11881178,\
+	-0.47332691,-0.42421072,\
+	-0.46891581,-0.78503521,\
+	-0.46830639,0.30307673,\
+	-0.46266258,0.49810707,\
+	-0.45708829,-0.88942133,\
+	-0.45576672,0.68708769,\
+	-0.42465108,-0.15303707,\
+	-0.42294844,-0.52165201,\
+	-0.42277498,0.02774444,\
+	-0.42158212,-0.33447625,\
+	-0.42154903,-0.70155621,\
+	-0.42118179,0.21073835,\
+	-0.41518847,0.40003793,\
+	-0.41014166,0.59223339,\
+	-0.40595577,0.79572144,\
+	-0.40144096,0.91588490,\
+	-0.37410706,-0.80603587,\
+	-0.37388864,-0.92747360,\
+	-0.37084162,-0.06308696,\
+	-0.36985500,-0.24430894,\
+	-0.36964213,0.11912678,\
+	-0.36961888,-0.42913077,\
+	-0.36950835,-0.61160154,\
+	-0.36692104,0.30426278,\
+	-0.36331898,0.49302705,\
+	-0.36031066,0.69223951,\
+	-0.32677443,0.86283108,\
+	-0.32032475,-0.70802805,\
+	-0.31779837,-0.15404597,\
+	-0.31719767,0.02776004,\
+	-0.31689202,0.21090281,\
+	-0.31681891,-0.33738863,\
+	-0.31666303,-0.52066428,\
+	-0.31403760,0.39611874,\
+	-0.31388083,0.58661750,\
+	-0.30063918,-0.86564249,\
+	-0.29840707,0.77598627,\
+	-0.29118453,0.95666691,\
+	-0.27578713,-0.96121874,\
+	-0.26535904,-0.61453199,\
+	-0.26467581,-0.06343296,\
+	-0.26450873,0.11893627,\
+	-0.26436076,-0.24601298,\
+	-0.26386624,-0.42936315,\
+	-0.26351074,0.30183375,\
+	-0.26320615,0.48755038,\
+	-0.26150637,0.67168140,\
+	-0.25770896,-0.78444210,\
+	-0.22690147,0.87620642,\
+	-0.21182740,-0.15487168,\
+	-0.21176386,0.20951421,\
+	-0.21163724,0.02750973,\
+	-0.21161899,-0.52224104,\
+	-0.21148579,0.57469681,\
+	-0.21131707,0.39241729,\
+	-0.21124026,-0.33783321,\
+	-0.20788702,-0.69983910,\
+	-0.20780059,0.75906391,\
+	-0.20086952,-0.88206044,\
+	-0.18395374,0.98293490,\
+	-0.18078187,-0.98352322,\
+	-0.15939234,0.66160521,\
+	-0.15922292,0.48080959,\
+	-0.15893189,0.29974615,\
+	-0.15886894,0.11819996,\
+	-0.15886163,-0.06382544,\
+	-0.15865438,-0.24633350,\
+	-0.15863082,-0.43008689,\
+	-0.15661700,-0.61168462,\
+	-0.15237093,-0.79492683,\
+	-0.14769055,0.83062127,\
+	-0.11501977,0.91942011,\
+	-0.10707064,0.56845652,\
+	-0.10625023,0.38888883,\
+	-0.10607360,0.20858766,\
+	-0.10591811,-0.15506199,\
+	-0.10588439,0.02705488,\
+	-0.10583335,-0.33807967,\
+	-0.10494986,-0.52098388,\
+	-0.10231280,-0.70603494,\
+	-0.10202215,-0.89648205,\
+	-0.10135860,0.74182236,\
+	-0.08783976,-0.99613462,\
+	-0.06094913,0.99814087,\
+	-0.05373065,0.47736208,\
+	-0.05303096,0.29846240,\
+	-0.05301161,0.11775088,\
+	-0.05297384,-0.24644770,\
+	-0.05294683,-0.06402902,\
+	-0.05272878,-0.42928088,\
+	-0.05216873,0.65274562,\
+	-0.05171351,-0.61420930,\
+	-0.05042272,-0.80478224,\
+	-0.04981932,0.83515353,\
+	-0.00000007,0.92251132,\
+	-0.00000000,-1.00000000,\
+	-0.00000000,-0.52141188,\
+	-0.00000000,-0.33767237,\
+	0.00000000,-0.70961850,\
+	0.00000000,-0.15521184,\
+	0,0,\
+	0.00000000,-0.90399448,\
+	0.00000001,0.74344973,\
+	0.00000001,0.20812464,\
+	0.00000002,0.38816625,\
+	0.00000004,0.56389175,\
+	0.04981924,0.83515359,\
+	0.05042272,-0.80478224,\
+	0.05171351,-0.61420930,\
+	0.05216879,0.65274567,\
+	0.05272878,-0.42928088,\
+	0.05294684,-0.06402902,\
+	0.05297384,-0.24644770,\
+	0.05301163,0.11775088,\
+	0.05303099,0.29846241,\
+	0.05373071,0.47736209,\
+	0.06094900,0.99814088,\
+	0.08783975,-0.99613462,\
+	0.10135861,0.74182246,\
+	0.10202215,-0.89648204,\
+	0.10231280,-0.70603494,\
+	0.10494986,-0.52098388,\
+	0.10583335,-0.33807967,\
+	0.10588440,0.02705488,\
+	0.10591811,-0.15506198,\
+	0.10607362,0.20858767,\
+	0.10625027,0.38888884,\
+	0.10707069,0.56845657,\
+	0.11501967,0.91942019,\
+	0.14769048,0.83062141,\
+	0.15237093,-0.79492683,\
+	0.15661700,-0.61168462,\
+	0.15863082,-0.43008689,\
+	0.15865438,-0.24633350,\
+	0.15886163,-0.06382544,\
+	0.15886894,0.11819997,\
+	0.15893191,0.29974616,\
+	0.15922296,0.48080963,\
+	0.15939237,0.66160531,\
+	0.18078187,-0.98352322,\
+	0.18395366,0.98293492,\
+	0.20086952,-0.88206044,\
+	0.20780058,0.75906403,\
+	0.20788703,-0.69983910,\
+	0.21124026,-0.33783320,\
+	0.21131709,0.39241733,\
+	0.21148582,0.57469689,\
+	0.21161899,-0.52224104,\
+	0.21163724,0.02750974,\
+	0.21176387,0.20951422,\
+	0.21182740,-0.15487168,\
+	0.22690143,0.87620648,\
+	0.25770896,-0.78444210,\
+	0.26150638,0.67168149,\
+	0.26320616,0.48755045,\
+	0.26351075,0.30183378,\
+	0.26386624,-0.42936315,\
+	0.26436076,-0.24601298,\
+	0.26450874,0.11893627,\
+	0.26467581,-0.06343296,\
+	0.26535904,-0.61453199,\
+	0.27578713,-0.96121874,\
+	0.29118450,0.95666692,\
+	0.29840708,0.77598632,\
+	0.30063918,-0.86564249,\
+	0.31388084,0.58661757,\
+	0.31403760,0.39611878,\
+	0.31666303,-0.52066428,\
+	0.31681891,-0.33738863,\
+	0.31689202,0.21090283,\
+	0.31719767,0.02776005,\
+	0.31779837,-0.15404597,\
+	0.32032475,-0.70802805,\
+	0.32677441,0.86283111,\
+	0.36031068,0.69223956,\
+	0.36331898,0.49302709,\
+	0.36692104,0.30426281,\
+	0.36950835,-0.61160154,\
+	0.36961888,-0.42913077,\
+	0.36964213,0.11912679,\
+	0.36985500,-0.24430894,\
+	0.37084162,-0.06308696,\
+	0.37388864,-0.92747360,\
+	0.37410706,-0.80603587,\
+	0.40144095,0.91588491,\
+	0.40595578,0.79572146,\
+	0.41014167,0.59223341,\
+	0.41518846,0.40003796,\
+	0.42118179,0.21073837,\
+	0.42154903,-0.70155621,\
+	0.42158212,-0.33447625,\
+	0.42277498,0.02774444,\
+	0.42294844,-0.52165201,\
+	0.42465107,-0.15303707,\
+	0.45576674,0.68708770,\
+	0.45708829,-0.88942133,\
+	0.46266258,0.49810708,\
+	0.46830638,0.30307674,\
+	0.46891581,-0.78503521,\
+	0.47332691,-0.42421072,\
+	0.47492414,0.11881178,\
+	0.47617996,-0.24250847,\
+	0.47709422,-0.61587202,\
+	0.47766585,-0.06283163,\
+	0.49284023,0.87011983,\
+	0.50070341,0.77148078,\
+	0.51105323,0.59594241,\
+	0.51322967,0.39629252,\
+	0.52436621,-0.51257293,\
+	0.52626232,0.20945175,\
+	0.52768053,-0.33226318,\
+	0.52845304,0.02779153,\
+	0.53277416,-0.15208460,\
+	0.53471317,-0.71163670,\
+	0.54436708,-0.83884712,\
+	0.55806447,0.48787901,\
+	0.56388700,0.69225656,\
+	0.57023517,0.30017942,\
+	0.57342995,-0.59754731,\
+	0.58024100,-0.42164527,\
+	0.58101739,0.11895848,\
+	0.58292803,-0.24113073,\
+	0.58517097,-0.06261579,\
+	0.58817579,0.80873311,\
+	0.60234813,0.57529736,\
+	0.61382860,0.39179583,\
+	0.61668934,-0.78720662,\
+	0.62397583,-0.67129702,\
+	0.63317442,-0.50922682,\
+	0.63361786,0.02834503,\
+	0.63367294,0.20893326,\
+	0.63423914,-0.33167241,\
+	0.64251884,-0.15112779,\
+	0.64930338,0.65326736,\
+	0.66034350,0.48247934,\
+	0.66617718,0.74579352,\
+	0.67300273,0.29907891,\
+	0.68761735,-0.72607326,\
+	0.68775963,-0.59169999,\
+	0.68789855,0.12079206,\
+	0.68873811,-0.23996840,\
+	0.68888730,-0.42215438,\
+	0.69311230,-0.06262365,\
+	0.70989539,0.56969251,\
+	0.71613187,0.39251294,\
+	0.73635629,0.02967200,\
+	0.73952765,-0.33215782,\
+	0.74439196,0.21110738,\
+	0.74611496,-0.50995249,\
+	0.74615780,0.66576913,\
+	0.74996331,-0.66147943,\
+	0.75497372,-0.15055590,\
+	0.76620501,0.48702606,\
+	0.77379915,0.30089103,\
+	0.79156344,-0.23872410,\
+	0.79434393,0.12621664,\
+	0.79753133,-0.42481233,\
+	0.80047290,-0.06419645,\
+	0.80880317,-0.58807944,\
+	0.81347979,0.58159318,\
+	0.81606483,0.40088956,\
+	0.83205774,0.03180994,\
+	0.84170474,-0.33621087,\
+	0.86017584,0.21863310,\
+	0.86248349,-0.50608521,\
+	0.86621517,0.31178474,\
+	0.87078230,0.49166878,\
+	0.87285522,-0.15209581,\
+	0.88570063,-0.24444376,\
+	0.89624049,0.13116998,\
+	0.90313047,-0.06293631,\
+	0.90908498,-0.41661073,\
+	0.91823427,0.39603766,\
+	0.91977079,0.03476784,\
+	0.94719721,-0.32065159,\
+	0.95669950,0.29107742,\
+	0.97660422,-0.21504466,\
+	0.98039504,0.19704204,\
+	0.99269568,-0.12064527,\
+	0.99556874,0.09403657,\
+	0.99985012,-0.01731283}
+
+
+
+#define DISC_7_FACE {\
+	130,146,121,\
+	17,5,3,\
+	197,209,221,\
+	243,221,232,\
+	232,221,209,\
+	226,218,207,\
+	67,66,82,\
+	132,121,146,\
+	122,110,132,\
+	132,110,121,\
+	3,1,11,\
+	11,17,3,\
+	54,67,53,\
+	66,67,54,\
+	294,279,295,\
+	295,279,276,\
+	279,294,270,\
+	272,251,262,\
+	357,349,355,\
+	236,230,250,\
+	50,44,33,\
+	63,44,50,\
+	277,296,287,\
+	226,236,247,\
+	166,184,195,\
+	155,184,166,\
+	93,104,114,\
+	135,114,125,\
+	125,114,104,\
+	76,95,96,\
+	106,115,127,\
+	115,106,94,\
+	117,106,127,\
+	95,106,117,\
+	196,207,218,\
+	126,114,135,\
+	127,115,134,\
+	115,126,134,\
+	162,174,151,\
+	53,67,62,\
+	72,52,62,\
+	237,259,249,\
+	136,148,158,\
+	122,132,145,\
+	130,121,109,\
+	109,123,130,\
+	100,123,109,\
+	0,2,10,\
+	37,47,31,\
+	31,47,45,\
+	45,47,64,\
+	64,57,45,\
+	75,57,64,\
+	75,95,76,\
+	76,57,75,\
+	28,14,20,\
+	361,360,354,\
+	321,327,335,\
+	357,359,348,\
+	348,349,357,\
+	348,335,349,\
+	315,296,306,\
+	315,306,325,\
+	41,50,33,\
+	329,339,345,\
+	242,222,220,\
+	176,198,182,\
+	182,163,176,\
+	161,182,173,\
+	163,182,161,\
+	53,30,43,\
+	43,54,53,\
+	24,9,16,\
+	42,22,30,\
+	42,30,53,\
+	53,62,42,\
+	42,62,52,\
+	307,294,295,\
+	299,294,308,\
+	308,307,318,\
+	294,307,308,\
+	283,299,289,\
+	283,294,299,\
+	283,270,294,\
+	225,237,249,\
+	217,207,195,\
+	230,236,217,\
+	226,207,217,\
+	217,236,226,\
+	250,230,239,\
+	239,229,251,\
+	251,272,260,\
+	260,272,281,\
+	281,269,260,\
+	260,239,251,\
+	260,269,250,\
+	250,239,260,\
+	268,247,257,\
+	257,247,236,\
+	250,269,257,\
+	257,236,250,\
+	314,330,316,\
+	244,232,223,\
+	84,73,93,\
+	84,94,74,\
+	83,73,63,\
+	83,92,104,\
+	83,104,93,\
+	93,73,83,\
+	55,44,63,\
+	63,73,55,\
+	246,227,235,\
+	235,227,218,\
+	235,218,226,\
+	226,247,235,\
+	181,204,194,\
+	155,166,144,\
+	135,125,144,\
+	80,92,71,\
+	63,50,71,\
+	71,83,63,\
+	92,83,71,\
+	101,110,122,\
+	101,92,80,\
+	131,144,125,\
+	122,145,131,\
+	131,145,155,\
+	155,144,131,\
+	111,125,104,\
+	104,92,111,\
+	111,131,125,\
+	122,131,111,\
+	111,101,122,\
+	92,101,111,\
+	164,140,152,\
+	151,174,152,\
+	138,117,127,\
+	138,162,151,\
+	129,140,118,\
+	129,138,151,\
+	117,138,129,\
+	151,152,129,\
+	129,152,140,\
+	107,96,95,\
+	95,117,107,\
+	118,96,107,\
+	107,129,118,\
+	117,129,107,\
+	197,164,175,\
+	175,209,197,\
+	164,152,175,\
+	175,152,174,\
+	218,227,208,\
+	208,196,218,\
+	195,207,185,\
+	207,196,185,\
+	196,165,185,\
+	185,166,195,\
+	143,126,135,\
+	143,165,153,\
+	153,134,143,\
+	143,134,126,\
+	93,114,105,\
+	114,126,105,\
+	105,84,93,\
+	105,126,115,\
+	115,94,105,\
+	94,84,105,\
+	78,62,67,\
+	78,88,72,\
+	72,62,78,\
+	13,2,4,\
+	13,10,2,\
+	18,9,24,\
+	18,6,9,\
+	61,52,72,\
+	61,39,52,\
+	49,39,61,\
+	150,161,173,\
+	81,61,72,\
+	72,88,81,\
+	88,102,81,\
+	124,148,136,\
+	238,252,231,\
+	238,225,249,\
+	249,259,271,\
+	136,158,147,\
+	147,123,136,\
+	130,123,147,\
+	68,77,58,\
+	58,48,68,\
+	155,145,167,\
+	167,184,155,\
+	181,194,167,\
+	167,194,184,\
+	112,123,100,\
+	136,123,112,\
+	112,124,136,\
+	102,124,112,\
+	89,101,80,\
+	110,101,89,\
+	87,68,79,\
+	77,68,87,\
+	79,100,87,\
+	100,109,87,\
+	21,14,28,\
+	28,37,21,\
+	21,37,31,\
+	86,64,74,\
+	86,75,64,\
+	74,94,86,\
+	86,94,106,\
+	86,106,95,\
+	95,75,86,\
+	8,20,14,\
+	33,44,27,\
+	27,17,33,\
+	351,359,361,\
+	351,348,359,\
+	361,354,351,\
+	351,354,342,\
+	321,335,332,\
+	335,348,332,\
+	310,301,292,\
+	281,272,292,\
+	292,301,281,\
+	353,346,356,\
+	344,346,334,\
+	358,356,344,\
+	344,356,346,\
+	288,306,296,\
+	288,277,268,\
+	296,277,288,\
+	298,306,288,\
+	334,346,341,\
+	341,325,334,\
+	341,346,353,\
+	341,353,347,\
+	50,41,60,\
+	80,71,60,\
+	60,71,50,\
+	309,329,322,\
+	322,300,309,\
+	289,299,309,\
+	309,300,289,\
+	173,182,188,\
+	188,179,173,\
+	211,201,188,\
+	188,201,179,\
+	200,182,198,\
+	198,220,200,\
+	200,220,222,\
+	200,188,182,\
+	200,222,211,\
+	211,188,200,\
+	163,161,141,\
+	82,66,85,\
+	116,98,119,\
+	22,42,32,\
+	24,16,32,\
+	32,16,22,\
+	32,39,24,\
+	32,42,52,\
+	52,39,32,\
+	331,308,318,\
+	212,201,224,\
+	224,201,211,\
+	258,253,270,\
+	270,283,258,\
+	245,242,263,\
+	245,222,242,\
+	237,225,213,\
+	231,252,240,\
+	251,229,240,\
+	262,251,240,\
+	240,252,262,\
+	349,335,343,\
+	343,335,327,\
+	343,355,349,\
+	343,352,355,\
+	333,341,347,\
+	315,325,333,\
+	325,341,333,\
+	340,333,347,\
+	330,314,324,\
+	315,333,324,\
+	324,340,330,\
+	333,340,324,\
+	287,296,305,\
+	305,296,315,\
+	315,324,305,\
+	305,324,314,\
+	234,244,223,\
+	234,227,246,\
+	243,232,254,\
+	232,244,254,\
+	254,265,243,\
+	46,37,28,\
+	44,55,36,\
+	36,27,44,\
+	20,27,36,\
+	55,46,36,\
+	28,20,36,\
+	36,46,28,\
+	246,235,256,\
+	268,277,256,\
+	256,247,268,\
+	256,235,247,\
+	206,217,195,\
+	230,217,206,\
+	195,184,206,\
+	184,194,206,\
+	142,138,127,\
+	162,138,142,\
+	127,134,142,\
+	142,134,153,\
+	210,199,223,\
+	223,232,210,\
+	210,232,209,\
+	174,162,183,\
+	219,208,227,\
+	227,234,219,\
+	223,199,219,\
+	219,234,223,\
+	196,208,186,\
+	153,165,186,\
+	186,165,196,\
+	135,144,154,\
+	154,143,135,\
+	165,143,154,\
+	154,144,166,\
+	166,185,154,\
+	154,185,165,\
+	82,85,97,\
+	97,85,98,\
+	98,116,97,\
+	34,39,49,\
+	34,18,24,\
+	24,39,34,\
+	6,18,12,\
+	4,6,12,\
+	12,13,4,\
+	202,213,191,\
+	137,150,160,\
+	179,172,160,\
+	173,179,160,\
+	160,150,173,\
+	61,81,70,\
+	49,61,70,\
+	113,102,88,\
+	113,124,102,\
+	291,271,280,\
+	289,300,280,\
+	280,300,291,\
+	280,271,259,\
+	261,238,249,\
+	249,271,261,\
+	252,238,261,\
+	322,327,312,\
+	312,300,322,\
+	312,327,321,\
+	321,302,312,\
+	291,300,312,\
+	312,302,291,\
+	156,146,130,\
+	130,147,156,\
+	59,68,48,\
+	49,70,59,\
+	79,68,59,\
+	59,70,79,\
+	157,167,145,\
+	157,132,146,\
+	157,145,132,\
+	181,167,157,\
+	90,112,100,\
+	90,70,81,\
+	90,81,102,\
+	102,112,90,\
+	90,100,79,\
+	79,70,90,\
+	99,89,77,\
+	77,87,99,\
+	99,87,109,\
+	110,89,99,\
+	121,110,99,\
+	99,109,121,\
+	0,10,7,\
+	10,19,7,\
+	7,1,0,\
+	7,11,1,\
+	7,19,11,\
+	25,41,33,\
+	11,19,25,\
+	33,17,25,\
+	17,11,25,\
+	23,19,10,\
+	23,13,29,\
+	10,13,23,\
+	20,8,15,\
+	15,27,20,\
+	15,8,5,\
+	5,17,15,\
+	17,27,15,\
+	321,332,313,\
+	313,302,321,\
+	293,302,313,\
+	338,351,342,\
+	348,351,338,\
+	338,332,348,\
+	281,301,290,\
+	290,269,281,\
+	298,290,311,\
+	311,290,301,\
+	310,292,303,\
+	293,313,303,\
+	284,272,262,\
+	284,292,272,\
+	293,303,284,\
+	284,303,292,\
+	342,354,350,\
+	350,354,360,\
+	350,360,358,\
+	358,344,350,\
+	268,257,278,\
+	278,288,268,\
+	298,288,278,\
+	278,257,269,\
+	278,290,298,\
+	269,290,278,\
+	51,60,41,\
+	139,141,161,\
+	161,150,139,\
+	119,141,139,\
+	139,116,119,\
+	308,331,319,\
+	299,308,319,\
+	319,309,299,\
+	329,309,319,\
+	339,329,319,\
+	319,331,339,\
+	241,224,253,\
+	253,258,241,\
+	212,224,241,\
+	259,237,248,\
+	248,241,258,\
+	253,224,233,\
+	233,224,211,\
+	211,222,233,\
+	222,245,233,\
+	279,270,264,\
+	270,253,264,\
+	264,276,279,\
+	253,233,264,\
+	264,233,245,\
+	263,276,264,\
+	264,245,263,\
+	214,238,231,\
+	225,238,214,\
+	191,213,203,\
+	203,213,225,\
+	203,178,191,\
+	193,178,203,\
+	225,214,203,\
+	203,214,193,\
+	215,229,204,\
+	231,240,215,\
+	215,240,229,\
+	345,352,337,\
+	352,343,337,\
+	337,329,345,\
+	322,329,337,\
+	337,327,322,\
+	337,343,327,\
+	287,275,267,\
+	267,277,287,\
+	246,256,267,\
+	267,256,277,\
+	244,234,255,\
+	255,267,275,\
+	255,234,246,\
+	246,267,255,\
+	297,275,287,\
+	287,305,297,\
+	297,305,314,\
+	297,316,304,\
+	297,314,316,\
+	265,254,266,\
+	266,255,275,\
+	266,254,244,\
+	244,255,266,\
+	65,84,74,\
+	73,84,65,\
+	65,55,73,\
+	65,46,55,\
+	47,37,56,\
+	37,46,56,\
+	46,65,56,\
+	56,65,74,\
+	74,64,56,\
+	64,47,56,\
+	216,239,230,\
+	230,206,216,\
+	229,239,216,\
+	204,229,216,\
+	216,194,204,\
+	216,206,194,\
+	199,210,187,\
+	187,183,199,\
+	174,183,187,\
+	187,175,174,\
+	209,175,187,\
+	187,210,209,\
+	171,183,162,\
+	153,186,171,\
+	171,142,153,\
+	162,142,171,\
+	91,67,82,\
+	91,78,67,\
+	82,97,91,\
+	97,108,91,\
+	128,97,116,\
+	128,108,97,\
+	137,108,128,\
+	128,150,137,\
+	128,139,150,\
+	116,139,128,\
+	40,34,49,\
+	49,59,40,\
+	40,59,48,\
+	40,48,29,\
+	18,34,26,\
+	26,12,18,\
+	34,40,26,\
+	26,40,29,\
+	29,13,26,\
+	13,12,26,\
+	191,178,170,\
+	170,178,158,\
+	170,158,148,\
+	148,159,170,\
+	172,159,149,\
+	137,160,149,\
+	149,160,172,\
+	212,202,189,\
+	189,201,212,\
+	179,201,189,\
+	189,172,179,\
+	120,108,137,\
+	137,149,120,\
+	293,284,274,\
+	274,284,262,\
+	262,252,274,\
+	252,261,274,\
+	282,271,291,\
+	282,261,271,\
+	282,274,261,\
+	293,274,282,\
+	282,302,293,\
+	291,302,282,\
+	192,204,181,\
+	192,215,204,\
+	146,156,169,\
+	169,157,146,\
+	181,157,169,\
+	169,192,181,\
+	323,313,332,\
+	332,338,323,\
+	310,303,323,\
+	323,303,313,\
+	298,311,317,\
+	317,306,298,\
+	317,325,306,\
+	334,325,317,\
+	328,344,334,\
+	334,317,328,\
+	328,317,311,\
+	310,323,326,\
+	326,338,342,\
+	326,323,338,\
+	60,51,69,\
+	80,60,69,\
+	58,77,69,\
+	69,51,58,\
+	69,89,80,\
+	77,89,69,\
+	19,23,35,\
+	35,51,41,\
+	41,25,35,\
+	35,25,19,\
+	273,248,258,\
+	273,283,289,\
+	273,258,283,\
+	259,248,273,\
+	289,280,273,\
+	273,280,259,\
+	237,213,228,\
+	228,248,237,\
+	228,202,212,\
+	213,202,228,\
+	212,241,228,\
+	241,248,228,\
+	168,178,193,\
+	168,156,147,\
+	168,147,158,\
+	158,178,168,\
+	286,297,304,\
+	275,297,286,\
+	286,266,275,\
+	199,183,190,\
+	183,171,190,\
+	190,219,199,\
+	208,219,190,\
+	190,186,208,\
+	190,171,186,\
+	177,189,202,\
+	177,170,159,\
+	177,159,172,\
+	172,189,177,\
+	177,202,191,\
+	191,170,177,\
+	113,120,133,\
+	133,120,149,\
+	148,124,133,\
+	124,113,133,\
+	133,159,148,\
+	133,149,159,\
+	103,91,108,\
+	108,120,103,\
+	88,78,103,\
+	78,91,103,\
+	103,113,88,\
+	103,120,113,\
+	215,192,205,\
+	205,214,231,\
+	231,215,205,\
+	193,214,205,\
+	192,169,180,\
+	193,205,180,\
+	180,205,192,\
+	180,168,193,\
+	180,169,156,\
+	156,168,180,\
+	336,350,344,\
+	344,328,336,\
+	342,350,336,\
+	336,326,342,\
+	320,328,311,\
+	320,311,301,\
+	320,336,328,\
+	326,336,320,\
+	320,301,310,\
+	310,326,320,\
+	38,35,23,\
+	29,48,38,\
+	38,23,29,\
+	38,48,58,\
+	58,51,38,\
+	51,35,38,\
+	265,266,285,\
+	266,286,285,\
+	285,286,304}
+
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 9174348..ef89dc8 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -36,21 +36,23 @@
 #include <cfloat>
 #include <omp.h>
 #include <vector>
+#include <cmath>
+
 #include "cudamacro.h" /* for time() */
 #include "globals.h"
-//#include "utils.h"
 
 #include <iostream>
 
 #include "cuwsort.cuh"
+#include "ptt.cuh"
+
+#include "utils.cu"
+#include "ptt.cu"
 
 #define MAX_NUM_DIR (128)
 
 #define NTHR_GEN (128)
 
-#define THR_X_BL (64)
-#define THR_X_SL (32)
-
 #define MAX_DIMS        (8)
 #define MAX_STR_LEN     (256)
 
@@ -75,101 +77,6 @@ __device__ const int fixedPerm[] = {
   96,   4, 139,  86, 121, 109};
 #endif
 
-
-template<int BDIM_X,
-         typename REAL_T,
-         typename REAL3_T>
-__device__ int trilinear_interp_d(const int dimx,
-                                  const int dimy,
-                                  const int dimz,
-                                  const int dimt,
-                                  const REAL_T *__restrict__ dataf,
-                                  const REAL3_T point,
-                                        REAL_T *__restrict__ __vox_data) {
-
-        const int tidx = threadIdx.x;
-        //const int tidy = threadIdx.y;
-#if 0        
-        const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
-        const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
-#endif
-        const REAL_T HALF = static_cast<REAL_T>(0.5);
-
-        // all thr compute the same here
-        if (point.x < -HALF || point.x+HALF >= dimx ||
-            point.y < -HALF || point.y+HALF >= dimy ||
-               point.z < -HALF || point.z+HALF >= dimz) {
-                return -1;
-        }
-
-        long long  coo[3][2];
-        REAL wgh[3][2]; // could use just one...
-
-        const REAL_T ONE  = static_cast<REAL_T>(1.0);
-
-        const REAL3_T fl = MAKE_REAL3(FLOOR(point.x),
-                                      FLOOR(point.y),
-                                      FLOOR(point.z));
-
-        wgh[0][1] = point.x - fl.x; 
-        wgh[0][0] = ONE-wgh[0][1]; 
-        coo[0][0] = MAX(0, fl.x);
-        coo[0][1] = MIN(dimx-1, coo[0][0]+1);
-
-        wgh[1][1] = point.y - fl.y; 
-        wgh[1][0] = ONE-wgh[1][1]; 
-        coo[1][0] = MAX(0, fl.y);
-        coo[1][1] = MIN(dimy-1, coo[1][0]+1);
-
-        wgh[2][1] = point.z - fl.z; 
-        wgh[2][0] = ONE-wgh[2][1]; 
-        coo[2][0] = MAX(0, fl.z);
-        coo[2][1] = MIN(dimz-1, coo[2][0]+1);
-
-        //#pragma unroll
-        for(int t = tidx; t < dimt; t += BDIM_X) {
-
-                REAL_T __tmp = 0;
-
-                #pragma unroll
-                for(int i = 0; i < 2; i++) {
-                        #pragma unroll
-                        for(int j = 0; j < 2; j++) {
-                                #pragma unroll
-                                for(int k = 0; k < 2; k++) {
-                                        __tmp += wgh[0][i]*wgh[1][j]*wgh[2][k]*
-                                                 dataf[coo[0][i]*dimy*dimz*dimt +
-                                                       coo[1][j]*dimz*dimt +
-                                                       coo[2][k]*dimt +
-                                                       t];
-                                        /*
-                                        if (tidx == 0 && threadIdx.y == 0 && t==0) {
-                                                printf("wgh[0][%d]: %f, wgh[1][%d]: %f, wgh[2][%d]: %f\n",
-                                                        i, wgh[0][i], j, wgh[1][j], k, wgh[2][k]);
-                                                printf("dataf[%d][%d][%d][%d]: %f\n", coo[0][i], coo[1][j], coo[2][k], t+tidx,
-                                                                dataf[coo[0][i]*dimy*dimz*dimt +
-                                                                coo[1][j]*dimz*dimt +
-                                                                coo[2][k]*dimt +
-                                                                t+tidx]);
-                                        }
-                                        */
-                                }
-                        }
-                }
-                __vox_data[t] = __tmp;
-        }
-#if 0
-        __syncwarp(WMASK);
-        if (tidx == 0 && threadIdx.y == 0) {
-                printf("point: %f, %f, %f\n", point.x, point.y, point.z);
-                for(int i = 0; i < dimt; i++) {
-                        printf("__vox_data[%d]: %f\n", i, __vox_data[i]);
-                }
-        }
-#endif
-        return 0;
-}
-
 template<int BDIM_X,
          typename VAL_T>
 __device__ void ndotp_d(const int N,
@@ -798,19 +705,6 @@ __device__ void maskPut(const LEN_T n,
 	return;
 }
 
-template<typename REAL_T>
-__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
-	if (!threadIdx.x && !threadIdx.y && !blockIdx.x) {
-		printf("%s:\n", name);
-
-		for(int j = 0; j < n; j++) {
-			if ((j%ncol)==0) printf("\n");
-			printf("%10.8f ", arr[j]);
-		}
-		printf("\n");
-	}
-}
-
 template<int BDIM_X,
          int BDIM_Y,
          bool IS_START,
@@ -827,7 +721,6 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                     const int dimz,
                                     const int dimt,
                                     const REAL3_T point,
-                                    const int samplm_nr,
                                     const REAL3_T *__restrict__ sphere_vertices,
                                     const int2 *__restrict__ sphere_edges,
                                     const int num_edges,
@@ -854,7 +747,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         }
 
         // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
-        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, pmf, point, __pmf_data_sh);
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, 0, pmf, point, __pmf_data_sh);
         if (rv != 0) {
                 return 0;
         }
@@ -863,13 +756,13 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         //     if pmf[i] > max_pmf:
         //         max_pmf = pmf[i]
         // absolute_pmf_threshold = pmf_threshold * max_pmf
-        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(samplm_nr, __pmf_data_sh, REAL_MIN);
+        const REAL_T absolpmf_thresh = PMF_THRESHOLD_P * max_d<BDIM_X>(dimt, __pmf_data_sh, REAL_MIN);
         __syncwarp(WMASK);
 
         // for i in range(_len):
         //     if pmf[i] < absolute_pmf_threshold:
         //         pmf[i] = 0.0
-        for(int i = tidx; i < samplm_nr; i += BDIM_X) {
+        for(int i = tidx; i < dimt; i += BDIM_X) {
                 if (__pmf_data_sh[i] < absolpmf_thresh) {
                         __pmf_data_sh[i] = 0.0;
                 }
@@ -883,7 +776,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                                  sphere_vertices,
                                                  sphere_edges,
                                                  num_edges,
-                                                 samplm_nr,
+                                                 dimt,
                                                  __shInd,
                                                  relative_peak_thres,
                                                  min_separation_angle);
@@ -892,7 +785,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 #ifdef DEBUG
                         __syncwarp(WMASK);
                         if (tidx == 0) {
-                                printArray("__pmf_data_sh initial", 8, samplm_nr, __pmf_data_sh);
+                                printArray("__pmf_data_sh initial", 8, dimt, __pmf_data_sh);
                                 printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
                                 printf("--->            dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
                                 printf("--->            point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
@@ -918,7 +811,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 //                 pmf[i] = 0
                 const REAL_T cos_similarity = COS(max_angle);
 
-                for(int i = tidx; i < samplm_nr; i += BDIM_X) {
+                for(int i = tidx; i < dimt; i += BDIM_X) {
                         const REAL_T dot = dir.x*sphere_vertices[i].x+
                                            dir.y*sphere_vertices[i].y+
                                            dir.z*sphere_vertices[i].z;
@@ -932,33 +825,18 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 #ifdef DEBUG
                         __syncwarp(WMASK);
                         if (tidx == 0) {
-                                printArray("__pmf_data_sh after filtering", 8, samplm_nr, __pmf_data_sh);
+                                printArray("__pmf_data_sh after filtering", 8, dimt, __pmf_data_sh);
                         }
                         __syncwarp(WMASK);
                 #endif
 
                 // cumsum(pmf, pmf, _len)
-                for (int j = 0; j < samplm_nr; j += BDIM_X) {
-                        if ((tidx == 0) && (j != 0)) {
-                                __pmf_data_sh[j] += __pmf_data_sh[j-1];
-                        }
-                        __syncwarp(WMASK);
-
-                        REAL_T __t_pmf = __pmf_data_sh[j+tidx];
-                        for (int i = 1; i < BDIM_X; i*=2) {
-                                __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X);
-                                if ((tidx >= i) && (j+tidx < samplm_nr)) {
-                                        __t_pmf += __tmp;
-                                }
-                        }
-                        __pmf_data_sh[j+tidx] = __t_pmf;
-                        __syncwarp(WMASK);
-                }
+                prefix_sum_sh_d<BDIM_X>(__pmf_data_sh, dimt);
 
                 #ifdef DEBUG
                         __syncwarp(WMASK);
                         if (tidx == 0) {
-                                printArray("__pmf_data_sh after cumsum", 8, samplm_nr, __pmf_data_sh);
+                                printArray("__pmf_data_sh after cumsum", 8, dimt, __pmf_data_sh);
                         }
                         __syncwarp(WMASK);
                 #endif
@@ -966,7 +844,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 // last_cdf = pmf[_len - 1]
                 // if last_cdf == 0:
                 //         return 1
-                REAL_T last_cdf = __pmf_data_sh[samplm_nr - 1];
+                REAL_T last_cdf = __pmf_data_sh[dimt - 1];
                 if (last_cdf == 0) {
                         return 0;
                 }
@@ -977,8 +855,8 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 }
                 REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
 
-                int indProb = samplm_nr - 1;
-                for (int ii = 0; ii < samplm_nr; ii+=BDIM_X) {
+                int indProb = dimt - 1;
+                for (int ii = 0; ii < dimt; ii+=BDIM_X) {
                         int __msk = __ballot_sync(WMASK, selected_cdf < __pmf_data_sh[ii+tidx]);
                         if (__msk != 0) {
                                 indProb = ii + __ffs(__msk) - 1;
@@ -991,7 +869,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                         if (tidx == 0) {
                                 printf("last_cdf %10.8f\n", last_cdf);
                                 printf("selected_cdf %10.8f\n", selected_cdf);
-                                printf("indProb %i out of %i\n", indProb, samplm_nr);
+                                printf("indProb %i out of %i\n", indProb, dimt);
                         }
                         __syncwarp(WMASK);
                 #endif
@@ -1026,7 +904,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                 printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dirs[0].x, dirs[0].y, dirs[0].z);
                                 printf("last_cdf %10.8f\n", last_cdf);
                                 printf("selected_cdf %10.8f\n", selected_cdf);
-                                printf("indProb %i out of %i\n", indProb, samplm_nr);
+                                printf("indProb %i out of %i\n", indProb, dimt);
                                 }
                                 if (sqrt(dirs[0].x*dirs[0].x + dirs[0].y*dirs[0].y + dirs[0].z*dirs[0].z) >= 1.1) {
                                         printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dirs[0].x, dirs[0].y, dirs[0].z);
@@ -1110,7 +988,7 @@ __device__ int get_direction_boot_d(
         #pragma unroll
         for(int i = 0; i < NATTEMPTS; i++) {
 
-                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, dataf, point, __vox_data_sh);
+                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, 0, dataf, point, __vox_data_sh);
 
 		const int nmsk = maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 
@@ -1296,7 +1174,7 @@ __device__ int check_point_d(const REAL_T tc_threshold,
 
         __shared__ REAL_T __shInterpOut[BDIM_Y];
 
-        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, 1, metric_map, point, __shInterpOut+tidy);
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, 1, 0, metric_map, point, __shInterpOut+tidy);
         __syncwarp(WMASK);
 #if 0
         if (threadIdx.y == 1 && threadIdx.x == 0) {
@@ -1389,11 +1267,24 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                                         direction,
                                                         dimx, dimy, dimz, dimt,
                                                         point,
-                                                        samplm_nr,
                                                         sphere_vertices,
                                                         sphere_edges,
                                                         num_edges,
                                                         __shDir);
+                } else if (model_type == PTT) {
+                        ndir = get_direction_ptt_d<BDIM_X,
+                                                   BDIM_Y,
+                                                   0>(
+                                                        st,
+                                                        dataf,
+                                                        max_angle,
+                                                        voxel_size,
+                                                        step_size,
+                                                        direction,
+                                                        dimx, dimy, dimz, dimt,
+                                                        point,
+                                                        sphere_vertices,
+                                                        __shDir);
                 } else {
                         // call get_direction_boot_d() with NATTEMPTS=5
                         ndir = get_direction_boot_d<BDIM_X,
@@ -1527,7 +1418,7 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
 	//}
 
         int ndir;
-        if (model_type == PROB) {
+        if ((model_type == PROB) || (model_type == PTT)) {
                 ndir = get_direction_prob_d<BDIM_X,
                                             BDIM_Y,
                                             1>(
@@ -1539,7 +1430,6 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
                                                 MAKE_REAL3(0,0,0),
                                                 dimx, dimy, dimz, dimt,
                                                 seed,
-                                                samplm_nr,
                                                 sphere_vertices,
                                                 sphere_edges,
                                                 num_edges,
@@ -1634,6 +1524,8 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
         const int lid = (tidy*BDIM_X + tidx) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
+        extern REAL_T __shared__ __sh[];
+
 	// const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
@@ -1663,7 +1555,6 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
         int slineOff = slineOutOff[slid];
 
         for(int i = 0; i < ndir; i++) {
-
                 REAL3_T first_step = shDir0[slid*samplm_nr + i];
 
 		REAL3_T *__restrict__ currSline = sline + slineOff*MAX_SLINE_LEN*2;
@@ -1676,6 +1567,75 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                         printf("calling trackerF from: (%f, %f, %f)\n", first_step.x, first_step.y, first_step.z);
                 }
 #endif
+
+                if (model_type == PTT) {
+                        bool init_norm_success;
+                        // Here we probabilistic find a good intial normal for this initial direction 
+                        init_norm_success = (bool) get_direction_ptt_d<BDIM_X,
+                                                                       BDIM_Y,
+                                                                       1>(
+                                                                                &st,
+                                                                                dataf,
+                                                                                max_angle,
+                                                                                MAKE_REAL3(1, 1, 1),
+                                                                                step_size,
+                                                                                MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+                                                                                dimx, dimy, dimz, dimt,
+                                                                                seed,
+                                                                                sphere_vertices,
+                                                                                shDir1 + slid*samplm_nr);
+
+                        __syncwarp(WMASK);
+                        if (!init_norm_success) {
+                                // try the other direction
+                                init_norm_success = (bool) get_direction_ptt_d<BDIM_X,
+                                                                               BDIM_Y,
+                                                                               1>(
+                                                                                        &st,
+                                                                                        dataf,
+                                                                                        max_angle,
+                                                                                        MAKE_REAL3(1, 1, 1),
+                                                                                        step_size,
+                                                                                        MAKE_REAL3(first_step.x, first_step.y, first_step.z),
+                                                                                        dimx, dimy, dimz, dimt,
+                                                                                        seed,
+                                                                                        sphere_vertices,
+                                                                                        shDir1 + slid*samplm_nr);
+                                __syncwarp(WMASK);
+                                if (!init_norm_success) { // This is rare
+                                        if (tidx == 0) {
+                                                slineLen[slineOff] = 1;
+                                                currSline[0] = seed;
+                                        }
+                                        slineOff += 1;
+                                        continue;
+                                } else {
+                                        if (tidx == 0) {
+                                                REAL_T *sh_frame_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
+                                                for (int ii = 0; ii < 9; ii++) {
+                                                        sh_frame_ptr[ii + 9*tidy] = -sh_frame_ptr[ii + 9*tidy]; // flip frame
+                                                }
+                                        }
+                                }
+                        }
+                        if (tidx == 0) {
+                                // We need to save the frames/k1/k2 for the forwards step, and reverse them for the backwards step
+                                REAL_T *sh_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
+                                REAL_T *sh_ptr_to_save = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT + 9 + 1);
+                
+                                for (int ii = 0; ii < 9; ii++) {
+                                        sh_ptr_to_save[ii + 9*tidy] = -sh_ptr[ii + 9*tidy]; // frame
+                                }
+
+                                sh_ptr += BDIM_Y*9;
+                                sh_ptr_to_save += BDIM_Y*9;
+
+                                sh_ptr_to_save[tidy] = sh_ptr[tidy]; // last val
+                        }
+                        __syncwarp(WMASK);
+                }
+
+
                 int stepsB;
                 const int tissue_classB = tracker_d<BDIM_X,
                                                     BDIM_Y>(&st,
@@ -1708,14 +1668,33 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                 //}
 
                 // reverse backward sline
-                for(int i = 0; i < stepsB/2; i += BDIM_X) {
-                        if (i+tidx < stepsB/2) {
-                                const REAL3_T __p = currSline[i+tidx];
-                                currSline[i+tidx] = currSline[stepsB-1 - (i+tidx)];
-                                currSline[stepsB-1 - (i+tidx)] = __p;
+                for(int j = 0; j < stepsB/2; j += BDIM_X) {
+                        if (j+tidx < stepsB/2) {
+                                const REAL3_T __p = currSline[j+tidx];
+                                currSline[j+tidx] = currSline[stepsB-1 - (j+tidx)];
+                                currSline[stepsB-1 - (j+tidx)] = __p;
                         }
                 }
 
+                if (model_type == PTT) {
+                        __syncwarp(WMASK);
+                        if (tidx == 0) {
+                                // Setup the forwards step using parameters saved from backwards step
+                                REAL_T *sh_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
+                                REAL_T *sh_ptr_saved = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT + 9 + 1);
+
+                                for (int ii = 0; ii < 9; ii++) {
+                                        sh_ptr[ii + 9*tidy] = sh_ptr_saved[ii + 9*tidy]; // frame
+                                }
+
+                                sh_ptr += BDIM_Y*9;
+                                sh_ptr_saved += BDIM_Y*9;
+
+                                sh_ptr[tidy] = sh_ptr_saved[tidy]; // last val
+                        }
+                        __syncwarp(WMASK);
+                }
+
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
                                                     BDIM_Y>(&st,
@@ -1806,6 +1785,10 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
   size_t shSizeGNS;
   if (model_type == PROB) {
     shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt;
+  } else if (model_type == PTT) {
+    shSizeGNS = MAX(
+      sizeof(REAL)*(THR_X_BL/THR_X_SL)*(DISC_VERT_CNT + DISC_FACE_CNT + 2*(9 + 1)),
+      sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt);
   } else {
     shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
                 sizeof(int)*samplm_nr;						        // for peak_directions_d	
@@ -1939,6 +1922,9 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
         // Shared memory requirements are smaller for probabilistic for main run
         // than for preliminary run
         shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; 
+    } else if (model_type == PTT) {
+        // Potentially smaller shared memory requirements for PTT for main run
+        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(DISC_VERT_CNT + DISC_FACE_CNT + 2*(9 + 1));
     }
 
     //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
diff --git a/cuslines/globals.h b/cuslines/globals.h
index ab5459b..ad85d7f 100644
--- a/cuslines/globals.h
+++ b/cuslines/globals.h
@@ -43,6 +43,7 @@
 #define REAL_MAX	(FLT_MAX)
 #define REAL_MIN	(-REAL_MAX)
 #define COS		__cosf
+#define SIN		__sinf
 #define FABS		fabsf
 #define SQRT		sqrtf
 #define RSQRT		rsqrtf
@@ -60,6 +61,7 @@
 #define REAL_MAX	(DBL_MAX)
 #define REAL_MIN	(-REAL_MAX)
 #define COS		cos
+#define SIN		sin
 #define FABS		fabs
 #define SQRT		sqrt
 #define RSQRT		rsqrt
@@ -70,6 +72,9 @@
 #define MAX_SLINE_LEN	(501)
 #define PMF_THRESHOLD_P	((REAL)0.1)
 
+#define THR_X_BL (64)
+#define THR_X_SL (32)
+
 #define MAX_SLINES_PER_SEED (10)
 
 #define MIN(x,y) (((x)<(y))?(x):(y))
@@ -87,6 +92,7 @@ enum ModelType {
   OPDT = 0,
   CSA = 1,
   PROB = 2,
+  PTT = 3,
 };
 
 #endif
diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
new file mode 100644
index 0000000..6fe5676
--- /dev/null
+++ b/cuslines/ptt.cu
@@ -0,0 +1,383 @@
+template<typename REAL_T>
+__device__ void norm3_d(REAL_T *num, int fail_ind) {
+    REAL_T scale = sqrt(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
+
+    if (scale != 0) {
+        num[0] /= scale;
+        num[1] /= scale;
+        num[2] /= scale;
+    } else {
+        num[fail_ind] = 1.0; // this can happen randomly during propogation, though is exceedingly rare
+    }
+}
+
+template<typename REAL_T>
+__device__ void crossnorm3_d(REAL_T *dest, const REAL_T *src1, const REAL_T *src2, int fail_ind) {
+    dest[0] = src1[1] * src2[2] - src1[2] * src2[1];
+    dest[1] = src1[2] * src2[0] - src1[0] * src2[2];
+    dest[2] = src1[0] * src2[1] - src1[1] * src2[0];
+
+    norm3_d(dest, fail_ind);
+}
+
+template<typename REAL_T, typename REAL3_T>
+__device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf,
+                            const int dimx, const int dimy, const int dimz, const int dimt,
+                            const REAL3_T *__restrict__ odf_sphere_vertices) {
+    const REAL_T HALF = static_cast<REAL_T>(0.5);
+
+    int closest_odf_idx = 0;
+    REAL_T __tmp = 0;
+    for (int ii = 0; ii < dimt; ii++) {
+        REAL_T cos_sim = FABS(
+            odf_sphere_vertices[ii].x * frame[0] \
+            + odf_sphere_vertices[ii].y * frame[1] \
+            + odf_sphere_vertices[ii].z * frame[2]);
+        if (cos_sim > __tmp) {
+            __tmp = cos_sim;
+            closest_odf_idx = ii;
+        }
+    }
+
+    const int rv = trilinear_interp_d<1>(dimx, dimy, dimz, dimt, closest_odf_idx,
+                                         pmf, pos, &__tmp);
+    if (rv != 0) {
+        return -1;
+    } else {
+        return __tmp;
+    }
+}
+
+template<typename REAL_T>
+__device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength,
+                                     REAL_T *propagator) {
+    if ((FABS(k1) < K_SMALL) && (FABS(k2) < K_SMALL)) {
+        propagator[0] = arclength;
+        propagator[1] = 0;
+        propagator[2] = 0;
+        propagator[3] = 1;
+        propagator[4] = 0;
+        propagator[5] = 0;
+        propagator[6] = 0;
+        propagator[7] = 0;
+        propagator[8] = 1;
+    } else {
+        if (FABS(k1) < K_SMALL) {
+            k1 = K_SMALL;
+        }
+        if (FABS(k2) < K_SMALL) {
+            k2 = K_SMALL;
+        }
+        REAL_T tmp_arclength = arclength * arclength / 2.0;
+        propagator[0] = arclength;
+        propagator[1] = k1 * tmp_arclength;
+        propagator[2] = k2 * tmp_arclength;
+        propagator[3] = (1 - k2 * k2 * tmp_arclength) - k1 * k1 * tmp_arclength;
+        propagator[4] = k1 * arclength;
+        propagator[5] = k2 * arclength;
+        propagator[6] = -k2 * arclength;
+        propagator[7] = -k1 * k2 * tmp_arclength;
+        propagator[8] = 1 - k2 * k2 * tmp_arclength;
+    }
+}
+
+template<bool IS_INIT, typename REAL_T>
+__device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) {
+    if (IS_INIT) {
+        for (int ii = 0; ii < 3; ii++) {
+            probing_frame[ii] =  frame[ii];
+        }
+        REAL_T __tmp[3];
+        for (int ii = 0; ii < 3; ii++) {
+            do {
+                __tmp[ii] = 2.0 * curand_uniform(st) - 1.0;
+            } while(!__tmp[ii]);
+        }
+
+        norm3_d(probing_frame, 0); // tangent
+        norm3_d(__tmp, 1); // norm
+        crossnorm3_d(probing_frame + 2*3, probing_frame, __tmp, 2); // binorm
+        for (int ii = 0; ii < 3; ii++) {
+            probing_frame[3 + ii] = __tmp[ii];
+        }
+    } else {
+        for (int ii = 0; ii < 9; ii++) {
+            probing_frame[ii] =  frame[ii];
+        }
+    }
+}
+
+template<typename REAL_T, typename REAL3_T>
+__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL3_T* pos, REAL3_T pos_scale) {
+    REAL_T __tmp[3];
+
+    pos->x += pos_scale.x*(propagator[0]*frame[0] + propagator[1]*frame[3] + propagator[2]*frame[6]);
+    pos->y += pos_scale.y*(propagator[0]*frame[1] + propagator[1]*frame[4] + propagator[2]*frame[7]);
+    pos->z += pos_scale.z*(propagator[0]*frame[2] + propagator[1]*frame[5] + propagator[2]*frame[8]);
+
+    for (int ii = 0; ii < 3; ii++) {
+        __tmp[ii] = propagator[3]*frame[ii] + propagator[4]*frame[3+ii] + propagator[5]*frame[6+ii];
+        frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii];
+    }
+
+    norm3_d(__tmp, 0); // normalize tangent
+    norm3_d(frame + 2*3, 2); // normalize binorm
+    crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent
+
+    for (int ii = 0; ii < 3; ii++) {
+        frame[ii] = __tmp[ii];
+    }
+}
+
+template<typename REAL_T, typename REAL3_T>
+__device__ REAL_T calculate_data_support_d(REAL_T support,
+                                           const REAL3_T pos, const REAL_T *__restrict__ pmf,
+                                           const int dimx, const int dimy, const int dimz, const int dimt,
+                                           const REAL3_T *__restrict__ odf_sphere_vertices,
+                                           const REAL3_T voxel_size,
+                                           REAL_T k1, REAL_T k2,
+                                           REAL_T* probing_frame,
+                                           REAL_T* last_val_probe) {
+    REAL_T probing_prop[9];
+    REAL3_T probing_pos;
+    REAL_T fod_amp;
+
+    prepare_propagator_d(k1, k2, PROBE_STEP, probing_prop);
+    probing_pos.x = pos.x; probing_pos.y = pos.y; probing_pos.z = pos.z;
+
+    for (int ii = 0; ii < PROBE_QUALITY; ii++) {
+        propogate_frame_d(probing_prop, probing_frame, &probing_pos, voxel_size);
+        fod_amp = interp4_d(probing_pos, probing_frame, pmf,
+                            dimx, dimy, dimz, dimt,
+                            odf_sphere_vertices);
+        if (fod_amp < PMF_THRESHOLD_P) {
+            if (ALLOW_WEAK_LINK) {
+                fod_amp = 0;
+            } else {
+                *last_val_probe = 0;
+                return 0;    
+            }
+        }
+        support += fod_amp;
+    }
+    *last_val_probe = fod_amp;
+
+    return support;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         bool IS_INIT,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int get_direction_ptt_d(
+    curandStatePhilox4_32_10_t *st,
+    const REAL_T *__restrict__ pmf,
+    const REAL_T max_angle,
+    const REAL3_T voxel_size,
+    const REAL_T step_size,
+    REAL3_T dir,
+    const int dimx, const int dimy, const int dimz, const int dimt,
+    REAL3_T pos,
+    const REAL3_T *__restrict__ odf_sphere_vertices,
+    REAL3_T *__restrict__ dirs) {
+    // Aydogan DB, Shi Y. Parallel Transport Tractography. IEEE Trans
+    // Med Imaging. 2021 Feb;40(2):635-647. doi: 10.1109/TMI.2020.3034038.
+    // Epub 2021 Feb 2. PMID: 33104507; PMCID: PMC7931442.
+    // https://github.com/nibrary/nibrary/blob/main/src/dMRI/tractography/algorithms/ptt
+    // Assumes probe count 1, data_support_exponent 1 for now
+    // Implemented with new CDF sampling strategy
+    // And using initial directions from voxel-wise peaks as in DIPY
+
+    const int tidx = threadIdx.x;
+    const int tidy = threadIdx.y;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+	extern REAL_T __shared__ __sh[];
+
+    REAL_T *__face_cdf_sh = reinterpret_cast<REAL_T *>(__sh);
+    REAL_T *__vert_cdf_sh = __face_cdf_sh + BDIM_Y*DISC_FACE_CNT; // These first two can be overwritten in other functions
+    REAL_T *__frame_sh = __vert_cdf_sh + BDIM_Y*DISC_VERT_CNT; // the rest must persist form call to call
+    REAL_T *__last_val_sh = __frame_sh + BDIM_Y*9;
+
+    __face_cdf_sh += tidy*DISC_FACE_CNT;
+    __vert_cdf_sh += tidy*DISC_VERT_CNT; 
+    __frame_sh += tidy*9;
+    __last_val_sh += tidy*1;
+
+    const REAL_T max_curvature = (step_size / 2.0) / (SIN(max_angle / 2));
+    REAL_T __tmp;
+
+    if (IS_INIT) {
+        if (tidx==0) {
+            __frame_sh[0] = dir.x;
+            __frame_sh[1] = dir.y;
+            __frame_sh[2] = dir.z;
+            *__last_val_sh = interp4_d(pos, __frame_sh, pmf,
+                                       dimx, dimy, dimz, dimt,
+                                       odf_sphere_vertices);
+        }
+    }
+    __syncwarp(WMASK);
+
+    // Calculate __vert_cdf_sh
+    REAL_T probing_frame[9];
+    REAL_T k1_probe, k2_probe;
+    bool support_found = 0;
+    int jj = 0;
+    do {
+        for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) {
+            k1_probe = DISC_VERT[ii*2] * max_curvature;
+            k2_probe = DISC_VERT[ii*2+1] * max_curvature;
+
+#if 0
+            printf("k1_probe: %f for id: %i\n", k1_probe, tidx);
+            printf("k2_probe: %f for id: %i\n", k2_probe, tidx);
+#endif
+
+            get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
+
+            REAL_T this_support = calculate_data_support_d(*__last_val_sh,
+                                                           pos, pmf, dimx, dimy, dimz, dimt,
+                                                           odf_sphere_vertices, 
+                                                           voxel_size,
+                                                           k1_probe, k2_probe,
+                                                           probing_frame,
+                                                           &__tmp);
+            if (this_support < NORM_MIN_SUPPORT) {
+                __vert_cdf_sh[ii] = 0;
+            } else {
+                __vert_cdf_sh[ii] = this_support;
+                support_found = 1;
+            }
+        }
+        if (IS_INIT) {
+            int __msk = __ballot_sync(WMASK, support_found);
+            support_found = (__msk != 0);
+        }
+    } while (IS_INIT && (!support_found) && (jj++ < TRIES_PER_REJECTION_SAMPLING));
+
+#if 0
+    __syncwarp(WMASK);
+    if (threadIdx.x == 0) {
+        printArrayAlways("VERT PDF", 8, DISC_VERT_CNT, __vert_cdf_sh);
+    }
+    __syncwarp(WMASK);
+#endif
+
+    // Initialize __face_cdf_sh
+    for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) { 
+        __face_cdf_sh[ii] = 0;
+    }
+    __syncwarp(WMASK);
+
+    // Move vert to face
+    for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) {
+        bool all_verts_valid = 1;
+        for (int jj = 0; jj < 3; jj++) {
+            REAL_T vert_val = __vert_cdf_sh[DISC_FACE[ii*3 + jj]];
+            if (vert_val == 0) {
+                all_verts_valid = IS_INIT;
+            }
+            __face_cdf_sh[ii] += vert_val;
+        }
+        if (!all_verts_valid) {
+            __face_cdf_sh[ii] = 0;
+        }
+    }
+    __syncwarp(WMASK);
+
+#if 0
+    __syncwarp(WMASK);
+    if (threadIdx.x == 0) {
+        printArrayAlways("Face PDF", 8, DISC_FACE_CNT, __face_cdf_sh);
+    }
+    __syncwarp(WMASK);
+#endif
+
+    // Prefix sum __face_cdf_sh and return 0 if all 0
+    prefix_sum_sh_d<BDIM_X>(__face_cdf_sh, DISC_FACE_CNT);
+    REAL_T last_cdf = __face_cdf_sh[DISC_FACE_CNT - 1];
+
+    if (last_cdf == 0) {
+        return 0;
+    }
+
+#if 0
+    __syncwarp(WMASK);
+    if (threadIdx.x == 0) {
+        printArrayAlways("Face CDF", 8, DISC_FACE_CNT, __face_cdf_sh);
+    }
+    __syncwarp(WMASK);
+#endif
+
+    // Sample random valid faces randomly
+    REAL_T r1, r2;
+    for (int ii = tidx; ii < TRIES_PER_REJECTION_SAMPLING; ii += BDIM_X) {
+        r1 = curand_uniform(st);
+        r2 = curand_uniform(st);
+		if (r1 + r2 > 1) {
+			r1 = 1 - r1;
+			r2 = 1 - r2;
+		}
+
+        __tmp = curand_uniform(st) * last_cdf;
+		int jj;
+		for (jj = 0; jj < DISC_FACE_CNT; jj++) {
+			if (__face_cdf_sh[jj] >= __tmp)
+				break;
+		}
+ 
+        REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
+        REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
+        REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
+
+        REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1];
+        REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1];
+        REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1];
+
+        k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0);
+        k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0);
+
+        get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
+
+        REAL_T this_support = calculate_data_support_d(*__last_val_sh,
+                                                       pos, pmf, dimx, dimy, dimz, dimt,
+                                                       odf_sphere_vertices, 
+                                                       voxel_size,
+                                                       k1_probe, k2_probe,
+                                                       probing_frame,
+                                                       &__tmp);
+
+        int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
+        if (__msk != 0) {
+            int winning_lane = __ffs(__msk) - 1; // Often 0. occasionally more
+
+            __syncwarp(WMASK);
+            if (tidx == winning_lane) {
+                dirs[0] = MAKE_REAL3(
+                    probing_frame[0],
+                    probing_frame[1],
+                    probing_frame[2]);
+
+                if (!IS_INIT) {
+                    REAL_T __prop[9];
+                    prepare_propagator_d(k1_probe, k2_probe, step_size, __prop);
+                    propogate_frame_d(__prop, probing_frame, &pos, MAKE_REAL3(0, 0, 0));
+                }
+
+                for (int jj = 0; jj < 9; jj++) {
+                    __frame_sh[jj] = probing_frame[jj];
+                }
+                *__last_val_sh = __tmp;
+            }
+            __syncwarp(WMASK);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
new file mode 100644
index 0000000..754e383
--- /dev/null
+++ b/cuslines/ptt.cuh
@@ -0,0 +1,40 @@
+#ifndef __PTT_CUH__
+#define __PTT_CUH__
+
+#include "disc.h"
+#include "globals.h"
+
+#define PROBE_LENGTH 0.25
+#define PROBE_QUALITY 4 // can be 4-7
+#define ALLOW_WEAK_LINK 1
+#define TRIES_PER_REJECTION_SAMPLING 1024
+#define DEFAULT_PTT_MINDATASUPPORT 0.05
+#define K_SMALL 0.0001
+
+
+#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT / PROBE_QUALITY)
+#define PROBE_STEP (PROBE_LENGTH / (PROBE_QUALITY - 1))
+
+#if PROBE_QUALITY == 4
+#define DISC_VERT_CNT DISC_4_VERT_CNT
+#define DISC_FACE_CNT DISC_4_FACE_CNT
+__constant__ REAL DISC_VERT[] = DISC_4_VERT;
+__constant__ int DISC_FACE[] = DISC_4_FACE;
+#elif PROBE_QUALITY == 5
+#define DISC_VERT_CNT DISC_5_VERT_CNT
+#define DISC_FACE_CNT DISC_5_FACE_CNT
+__constant__ REAL DISC_VERT[] = DISC_5_VERT;
+__constant__ int DISC_FACE[] = DISC_5_FACE;
+#elif PROBE_QUALITY == 6
+#define DISC_VERT_CNT DISC_6_VERT_CNT
+#define DISC_FACE_CNT DISC_6_FACE_CNT
+__constant__ REAL DISC_VERT[] = DISC_6_VERT;
+__constant__ int DISC_FACE[] = DISC_6_FACE;
+#elif PROBE_QUALITY == 7
+#define DISC_VERT_CNT DISC_7_VERT_CNT
+#define DISC_FACE_CNT DISC_7_FACE_CNT
+__constant__ REAL DISC_VERT[] = DISC_7_VERT;
+__constant__ int DISC_FACE[] = DISC_7_FACE;
+#endif
+
+#endif
\ No newline at end of file
diff --git a/cuslines/utils.cu b/cuslines/utils.cu
new file mode 100644
index 0000000..5589fc3
--- /dev/null
+++ b/cuslines/utils.cu
@@ -0,0 +1,144 @@
+
+template<int BDIM_X, typename REAL_T>
+__device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+    for (int j = 0; j < __len; j += BDIM_X) {
+        if ((tidx == 0) && (j != 0)) {
+            num_sh[j] += num_sh[j-1];
+        }
+        __syncwarp(WMASK);
+
+        REAL_T __t_pmf;
+        if (j+tidx < __len) {
+            __t_pmf = num_sh[j+tidx];
+        }
+        for (int i = 1; i < BDIM_X; i*=2) {
+            REAL_T __tmp = __shfl_up_sync(WMASK, __t_pmf, i, BDIM_X);
+            if ((tidx >= i) && (j+tidx < __len)) {
+                    __t_pmf += __tmp;
+            }
+        }
+        if (j+tidx < __len) {
+            num_sh[j+tidx] = __t_pmf;
+        }
+        __syncwarp(WMASK);
+    }
+}
+
+template<typename REAL_T>
+__device__ void printArrayAlways(const char *name, int ncol, int n, REAL_T *arr) {
+        printf("%s:\n", name);
+
+        for(int j = 0; j < n; j++) {
+                if ((j%ncol)==0) printf("\n");
+                printf("%10.8f ", arr[j]);
+        }
+        printf("\n");
+}
+
+template<typename REAL_T>
+__device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
+	if (!threadIdx.x && !threadIdx.y && !blockIdx.x) {
+		printArrayAlways(name, ncol, n, arr);
+	}
+}
+
+template<int BDIM_X,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ int trilinear_interp_d(const int dimx,
+                                  const int dimy,
+                                  const int dimz,
+                                  const int dimt,
+                                  int dimt_start_idx,
+                                  const REAL_T *__restrict__ dataf,
+                                  const REAL3_T point,
+                                  REAL_T *__restrict__ __vox_data) {
+        const REAL_T HALF = static_cast<REAL_T>(0.5);
+
+        // all thr compute the same here
+        if (point.x < -HALF || point.x+HALF >= dimx ||
+            point.y < -HALF || point.y+HALF >= dimy ||
+               point.z < -HALF || point.z+HALF >= dimz) {
+                return -1;
+        }
+
+        long long  coo[3][2];
+        REAL_T wgh[3][2]; // could use just one...
+
+        const REAL_T ONE  = static_cast<REAL_T>(1.0);
+
+        const REAL3_T fl = MAKE_REAL3(FLOOR(point.x),
+                                      FLOOR(point.y),
+                                      FLOOR(point.z));
+
+        wgh[0][1] = point.x - fl.x; 
+        wgh[0][0] = ONE-wgh[0][1]; 
+        coo[0][0] = MAX(0, fl.x);
+        coo[0][1] = MIN(dimx-1, coo[0][0]+1);
+
+        wgh[1][1] = point.y - fl.y; 
+        wgh[1][0] = ONE-wgh[1][1]; 
+        coo[1][0] = MAX(0, fl.y);
+        coo[1][1] = MIN(dimy-1, coo[1][0]+1);
+
+        wgh[2][1] = point.z - fl.z; 
+        wgh[2][0] = ONE-wgh[2][1]; 
+        coo[2][0] = MAX(0, fl.z);
+        coo[2][1] = MIN(dimz-1, coo[2][0]+1);
+
+        if (BDIM_X == 1) { // Whether this is being multithreaded
+                REAL_T __tmp = 0;
+                #pragma unroll
+                for(int i = 0; i < 2; i++) {
+                        #pragma unroll
+                        for(int j = 0; j < 2; j++) {
+                                #pragma unroll
+                                for(int k = 0; k < 2; k++) {
+                                        __tmp += wgh[0][i]*wgh[1][j]*wgh[2][k]*
+                                                dataf[coo[0][i]*dimy*dimz*dimt +
+                                                coo[1][j]*dimz*dimt +
+                                                coo[2][k]*dimt +
+                                                dimt_start_idx];
+                                }
+                        }
+                }
+                __vox_data[0] = __tmp;
+        } else {
+                for(int t = dimt_start_idx + threadIdx.x; t < dimt; t += BDIM_X) {
+
+                        REAL_T __tmp = 0;
+
+                        #pragma unroll
+                        for(int i = 0; i < 2; i++) {
+                                #pragma unroll
+                                for(int j = 0; j < 2; j++) {
+                                        #pragma unroll
+                                        for(int k = 0; k < 2; k++) {
+                                                __tmp += wgh[0][i]*wgh[1][j]*wgh[2][k]*
+                                                        dataf[coo[0][i]*dimy*dimz*dimt +
+                                                        coo[1][j]*dimz*dimt +
+                                                        coo[2][k]*dimt +
+                                                        t];
+                                        }
+                                }
+                        }
+                        __vox_data[t] = __tmp;
+                }
+        }
+
+        /*
+        __syncwarp(WMASK);
+        if (tidx == 0 && threadIdx.y == 0) {
+                printf("point: %f, %f, %f\n", point.x, point.y, point.z);
+                for(int i = 0; i < dimt; i++) {
+                        printf("__vox_data[%d]: %f\n", i, __vox_data[i]);
+                }
+        }
+        */
+        return 0;
+}

From 3e16e95f1f19d95cdcab9113fbcfea5104aebbfb Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Wed, 17 Jul 2024 11:26:38 -0700
Subject: [PATCH 14/35] clean up makefile diffs

---
 cuslines/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuslines/Makefile b/cuslines/Makefile
index 67c9cbb..7845ad9 100644
--- a/cuslines/Makefile
+++ b/cuslines/Makefile
@@ -26,7 +26,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-# CUDA_HOME=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda/12.2
 CUDA_HOME=/usr/local/cuda
 CUDACC=$(CUDA_HOME)/bin/nvcc
 CXX=g++
@@ -44,7 +43,7 @@ LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
 all: cuslines
 
 cuslines: generate_streamlines_cuda.o cuslines.o
-	$(LD) generate_streamlines_cuda.o cuslines.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)
+	$(LD) -shared cuslines.o generate_streamlines_cuda.o cuslines`python3-config --extension-suffix` $(LDFLAGS)
 
 %.o : %.cu
 	$(CUDACC) $(CUDACFLAGS) $< -o $@

From e50b9aa0eeb8a5a49fce95f861f079445206be65 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Wed, 17 Jul 2024 12:26:06 -0700
Subject: [PATCH 15/35] fix makefile one last time

---
 cuslines/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuslines/Makefile b/cuslines/Makefile
index 7845ad9..4fc427c 100644
--- a/cuslines/Makefile
+++ b/cuslines/Makefile
@@ -43,7 +43,7 @@ LDFLAGS= -shared -fopenmp -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
 all: cuslines
 
 cuslines: generate_streamlines_cuda.o cuslines.o
-	$(LD) -shared cuslines.o generate_streamlines_cuda.o cuslines`python3-config --extension-suffix` $(LDFLAGS)
+	$(LD) cuslines.o generate_streamlines_cuda.o -o cuslines`python3-config --extension-suffix` $(LDFLAGS)
 
 %.o : %.cu
 	$(CUDACC) $(CUDACFLAGS) $< -o $@

From 2c230c8699f2ec1c33bf57884b538a498229468a Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Wed, 17 Jul 2024 16:35:34 -0700
Subject: [PATCH 16/35] more bfs/improvements, still buggy

---
 cuslines/generate_streamlines_cuda.cu |  30 +++++--
 cuslines/ptt.cu                       | 109 +++++++++++++++-----------
 cuslines/ptt.cuh                      |  17 ++--
 3 files changed, 95 insertions(+), 61 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index ef89dc8..745b672 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -1252,8 +1252,15 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
         }
         __syncwarp(WMASK);
 
+        int step_frac;
+        if (model_type == PTT) {
+                step_frac = STEP_FRAC; 
+        } else {
+                step_frac = 1; // STEP_FRAC could be useful in other models
+        }
+
         int i;
-        for(i = 1; i < MAX_SLINE_LEN; i++) {
+        for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
                 int ndir;
                 if (model_type == PROB) {
                         ndir = get_direction_prob_d<BDIM_X,
@@ -1330,12 +1337,12 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                 //point.x += (direction.x / voxel_size.x) * STEP_SIZE_P;
                 //point.y += (direction.y / voxel_size.y) * STEP_SIZE_P;
                 //point.z += (direction.z / voxel_size.z) * STEP_SIZE_P;
-                point.x += (direction.x / voxel_size.x) * step_size;
-                point.y += (direction.y / voxel_size.y) * step_size;
-                point.z += (direction.z / voxel_size.z) * step_size;
+                point.x += (direction.x / voxel_size.x) * (step_size / step_frac);
+                point.y += (direction.y / voxel_size.y) * (step_size / step_frac);
+                point.z += (direction.z / voxel_size.z) * (step_size / step_frac);
 
-                if (tidx == 0) {
-                        streamline[i] = point;
+                if ((tidx == 0) && ((i % step_frac) == 0)){
+                        streamline[i/step_frac] = point;
 #if 0
                         if (threadIdx.y == 1) {
                                 printf("streamline[%d]: %f, %f, %f\n", i, point.x, point.y, point.z);
@@ -1352,7 +1359,13 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                         break;
                 }
         }
-        nsteps[0] = i;
+        nsteps[0] = i/step_frac;
+        if (((i % step_frac) != 0) && i < step_frac*(MAX_SLINE_LEN - 1)){
+                nsteps[0]++;
+                if (tidx == 0) {
+                        streamline[nsteps[0]] = point;
+                }
+        }
 
         return tissue_class;
 }
@@ -1619,7 +1632,7 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                                 }
                         }
                         if (tidx == 0) {
-                                // We need to save the frames/k1/k2 for the forwards step, and reverse them for the backwards step
+                                // We need to save the frames for the forwards step, and reverse them for the backwards step
                                 REAL_T *sh_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
                                 REAL_T *sh_ptr_to_save = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT + 9 + 1);
                 
@@ -2123,6 +2136,7 @@ void write_trk(const char *fname,
         float slineData[1 + 3*(2*MAX_SLINE_LEN)];
 #endif
         for(int i = 0; i < nsline; i++) {
+                printf("len %i\n", slineLen[i]);
                 reinterpret_cast<int *>(slineData)[0] = slineLen[i];
                 for(int j = 0; j < slineLen[i]; j++) {
                         slineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]);
diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index 6fe5676..5d2436e 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -1,6 +1,6 @@
 template<typename REAL_T>
 __device__ void norm3_d(REAL_T *num, int fail_ind) {
-    REAL_T scale = sqrt(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
+    REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
 
     if (scale != 0) {
         num[0] /= scale;
@@ -68,16 +68,20 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength,
         if (FABS(k2) < K_SMALL) {
             k2 = K_SMALL;
         }
-        REAL_T tmp_arclength = arclength * arclength / 2.0;
-        propagator[0] = arclength;
-        propagator[1] = k1 * tmp_arclength;
-        propagator[2] = k2 * tmp_arclength;
-        propagator[3] = (1 - k2 * k2 * tmp_arclength) - k1 * k1 * tmp_arclength;
-        propagator[4] = k1 * arclength;
-        propagator[5] = k2 * arclength;
-        propagator[6] = -k2 * arclength;
-        propagator[7] = -k1 * k2 * tmp_arclength;
-        propagator[8] = 1 - k2 * k2 * tmp_arclength;
+        REAL_T k     = SQRT(k1*k1+k2*k2);
+        REAL_T sinkt = SIN(k*arclength);
+        REAL_T coskt = COS(k*arclength);
+        REAL_T kk    = 1/(k*k);
+
+        propagator[0] = sinkt/k;
+        propagator[1] = k1*(1-coskt)*kk;
+        propagator[2] = k2*(1-coskt)*kk;
+        propagator[3] = coskt;
+        propagator[4] = k1*sinkt/k;
+        propagator[5] = k2*sinkt/k;
+        propagator[6] = -propagator[5];
+        propagator[7] = k1*k2*(coskt-1)*kk;
+        propagator[8] = (k1*k1+k2*k2*coskt)*kk;
     }
 }
 
@@ -107,26 +111,31 @@ __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_1
     }
 }
 
-template<typename REAL_T, typename REAL3_T>
-__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL3_T* pos, REAL3_T pos_scale) {
+template<typename REAL_T>
+__device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* direc) {
     REAL_T __tmp[3];
 
-    pos->x += pos_scale.x*(propagator[0]*frame[0] + propagator[1]*frame[3] + propagator[2]*frame[6]);
-    pos->y += pos_scale.y*(propagator[0]*frame[1] + propagator[1]*frame[4] + propagator[2]*frame[7]);
-    pos->z += pos_scale.z*(propagator[0]*frame[2] + propagator[1]*frame[5] + propagator[2]*frame[8]);
-
     for (int ii = 0; ii < 3; ii++) {
-        __tmp[ii] = propagator[3]*frame[ii] + propagator[4]*frame[3+ii] + propagator[5]*frame[6+ii];
+        direc[ii]       = propagator[0]*frame[ii] + propagator[1]*frame[3+ii] + propagator[2]*frame[6+ii];
+        __tmp[ii]       = propagator[3]*frame[ii] + propagator[4]*frame[3+ii] + propagator[5]*frame[6+ii];
         frame[2*3 + ii] = propagator[6]*frame[ii] + propagator[7]*frame[3+ii] + propagator[8]*frame[6+ii];
     }
 
+#if 1
+    norm3_d(__tmp, 0); // normalize tangent
+    crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calc normal
+    crossnorm3_d(frame + 2*3, __tmp, frame + 3, 2); // calculate binorm from tangent, norm
+#else
     norm3_d(__tmp, 0); // normalize tangent
     norm3_d(frame + 2*3, 2); // normalize binorm
     crossnorm3_d(frame + 3, frame + 2*3, __tmp, 1); // calculate normal from binorm, tangent
+#endif
 
     for (int ii = 0; ii < 3; ii++) {
         frame[ii] = __tmp[ii];
     }
+
+    norm3_d(direc, 0);
 }
 
 template<typename REAL_T, typename REAL3_T>
@@ -139,14 +148,22 @@ __device__ REAL_T calculate_data_support_d(REAL_T support,
                                            REAL_T* probing_frame,
                                            REAL_T* last_val_probe) {
     REAL_T probing_prop[9];
+    REAL_T direc[3];
     REAL3_T probing_pos;
     REAL_T fod_amp;
 
     prepare_propagator_d(k1, k2, PROBE_STEP, probing_prop);
-    probing_pos.x = pos.x; probing_pos.y = pos.y; probing_pos.z = pos.z;
+    probing_pos.x = pos.x;
+    probing_pos.y = pos.y;
+    probing_pos.z = pos.z;
 
     for (int ii = 0; ii < PROBE_QUALITY; ii++) {
-        propogate_frame_d(probing_prop, probing_frame, &probing_pos, voxel_size);
+        propogate_frame_d(probing_prop, probing_frame, direc);
+
+        probing_pos.x += direc[0] * voxel_size.x;
+        probing_pos.y += direc[1] * voxel_size.y;
+        probing_pos.z += direc[2] * voxel_size.z;
+
         fod_amp = interp4_d(probing_pos, probing_frame, pmf,
                             dimx, dimy, dimz, dimt,
                             odf_sphere_vertices);
@@ -160,6 +177,7 @@ __device__ REAL_T calculate_data_support_d(REAL_T support,
         }
         support += fod_amp;
     }
+
     *last_val_probe = fod_amp;
 
     return support;
@@ -198,16 +216,16 @@ __device__ int get_direction_ptt_d(
 	extern REAL_T __shared__ __sh[];
 
     REAL_T *__face_cdf_sh = reinterpret_cast<REAL_T *>(__sh);
-    REAL_T *__vert_cdf_sh = __face_cdf_sh + BDIM_Y*DISC_FACE_CNT; // These first two can be overwritten in other functions
-    REAL_T *__frame_sh = __vert_cdf_sh + BDIM_Y*DISC_VERT_CNT; // the rest must persist form call to call
+    REAL_T *__vert_pdf_sh = __face_cdf_sh + BDIM_Y*DISC_FACE_CNT; // These first two can be overwritten in other functions
+    REAL_T *__frame_sh = __vert_pdf_sh + BDIM_Y*DISC_VERT_CNT; // the rest must persist form call to call
     REAL_T *__last_val_sh = __frame_sh + BDIM_Y*9;
 
     __face_cdf_sh += tidy*DISC_FACE_CNT;
-    __vert_cdf_sh += tidy*DISC_VERT_CNT; 
+    __vert_pdf_sh += tidy*DISC_VERT_CNT; 
     __frame_sh += tidy*9;
     __last_val_sh += tidy*1;
 
-    const REAL_T max_curvature = (step_size / 2.0) / (SIN(max_angle / 2));
+    const REAL_T max_curvature = (step_size / 2) / (SIN(max_angle / 2));
     REAL_T __tmp;
 
     if (IS_INIT) {
@@ -222,7 +240,7 @@ __device__ int get_direction_ptt_d(
     }
     __syncwarp(WMASK);
 
-    // Calculate __vert_cdf_sh
+    // Calculate __vert_pdf_sh
     REAL_T probing_frame[9];
     REAL_T k1_probe, k2_probe;
     bool support_found = 0;
@@ -232,11 +250,6 @@ __device__ int get_direction_ptt_d(
             k1_probe = DISC_VERT[ii*2] * max_curvature;
             k2_probe = DISC_VERT[ii*2+1] * max_curvature;
 
-#if 0
-            printf("k1_probe: %f for id: %i\n", k1_probe, tidx);
-            printf("k2_probe: %f for id: %i\n", k2_probe, tidx);
-#endif
-
             get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
 
             REAL_T this_support = calculate_data_support_d(*__last_val_sh,
@@ -246,10 +259,17 @@ __device__ int get_direction_ptt_d(
                                                            k1_probe, k2_probe,
                                                            probing_frame,
                                                            &__tmp);
+
+#if 0
+            if (threadIdx.y == 1 && ii == 0) { 
+                printf("    k1_probe: %f, k2_probe %f, support %f for id: %i\n", k1_probe, k2_probe, this_support, tidx);
+            }
+#endif
+
             if (this_support < NORM_MIN_SUPPORT) {
-                __vert_cdf_sh[ii] = 0;
+                __vert_pdf_sh[ii] = 0;
             } else {
-                __vert_cdf_sh[ii] = this_support;
+                __vert_pdf_sh[ii] = this_support;
                 support_found = 1;
             }
         }
@@ -261,8 +281,8 @@ __device__ int get_direction_ptt_d(
 
 #if 0
     __syncwarp(WMASK);
-    if (threadIdx.x == 0) {
-        printArrayAlways("VERT PDF", 8, DISC_VERT_CNT, __vert_cdf_sh);
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
+        printArrayAlways("VERT PDF", 8, DISC_VERT_CNT, __vert_pdf_sh);
     }
     __syncwarp(WMASK);
 #endif
@@ -277,7 +297,7 @@ __device__ int get_direction_ptt_d(
     for (int ii = tidx; ii < DISC_FACE_CNT; ii+=BDIM_X) {
         bool all_verts_valid = 1;
         for (int jj = 0; jj < 3; jj++) {
-            REAL_T vert_val = __vert_cdf_sh[DISC_FACE[ii*3 + jj]];
+            REAL_T vert_val = __vert_pdf_sh[DISC_FACE[ii*3 + jj]];
             if (vert_val == 0) {
                 all_verts_valid = IS_INIT;
             }
@@ -291,7 +311,7 @@ __device__ int get_direction_ptt_d(
 
 #if 0
     __syncwarp(WMASK);
-    if (threadIdx.x == 0) {
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
         printArrayAlways("Face PDF", 8, DISC_FACE_CNT, __face_cdf_sh);
     }
     __syncwarp(WMASK);
@@ -307,7 +327,7 @@ __device__ int get_direction_ptt_d(
 
 #if 0
     __syncwarp(WMASK);
-    if (threadIdx.x == 0) {
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
         printArrayAlways("Face CDF", 8, DISC_FACE_CNT, __face_cdf_sh);
     }
     __syncwarp(WMASK);
@@ -357,15 +377,14 @@ __device__ int get_direction_ptt_d(
 
             __syncwarp(WMASK);
             if (tidx == winning_lane) {
-                dirs[0] = MAKE_REAL3(
-                    probing_frame[0],
-                    probing_frame[1],
-                    probing_frame[2]);
-
-                if (!IS_INIT) {
+                if (IS_INIT) {
+                    dirs[0] = dir;
+                } else {
                     REAL_T __prop[9];
-                    prepare_propagator_d(k1_probe, k2_probe, step_size, __prop);
-                    propogate_frame_d(__prop, probing_frame, &pos, MAKE_REAL3(0, 0, 0));
+                    REAL_T __dir[3];
+                    prepare_propagator_d(k1_probe, k2_probe, step_size*STEP_FRAC, __prop);
+                    propogate_frame_d(__prop, probing_frame, __dir);
+                    dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]};
                 }
 
                 for (int jj = 0; jj < 9; jj++) {
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index 754e383..b9e7279 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -4,6 +4,7 @@
 #include "disc.h"
 #include "globals.h"
 
+#define STEP_FRAC 1 // divides output step size (usually 0.5) into this many steps
 #define PROBE_LENGTH 0.25
 #define PROBE_QUALITY 4 // can be 4-7
 #define ALLOW_WEAK_LINK 1
@@ -18,23 +19,23 @@
 #if PROBE_QUALITY == 4
 #define DISC_VERT_CNT DISC_4_VERT_CNT
 #define DISC_FACE_CNT DISC_4_FACE_CNT
-__constant__ REAL DISC_VERT[] = DISC_4_VERT;
-__constant__ int DISC_FACE[] = DISC_4_FACE;
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_4_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_4_FACE;
 #elif PROBE_QUALITY == 5
 #define DISC_VERT_CNT DISC_5_VERT_CNT
 #define DISC_FACE_CNT DISC_5_FACE_CNT
-__constant__ REAL DISC_VERT[] = DISC_5_VERT;
-__constant__ int DISC_FACE[] = DISC_5_FACE;
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_5_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_5_FACE;
 #elif PROBE_QUALITY == 6
 #define DISC_VERT_CNT DISC_6_VERT_CNT
 #define DISC_FACE_CNT DISC_6_FACE_CNT
-__constant__ REAL DISC_VERT[] = DISC_6_VERT;
-__constant__ int DISC_FACE[] = DISC_6_FACE;
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_6_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_6_FACE;
 #elif PROBE_QUALITY == 7
 #define DISC_VERT_CNT DISC_7_VERT_CNT
 #define DISC_FACE_CNT DISC_7_FACE_CNT
-__constant__ REAL DISC_VERT[] = DISC_7_VERT;
-__constant__ int DISC_FACE[] = DISC_7_FACE;
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_7_VERT; // TODO: check if removing __constant__ helps or hurts
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_7_FACE;
 #endif
 
 #endif
\ No newline at end of file

From c4f2befde7c4df11535c1c0acdcf55d3d54d4330 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Thu, 18 Jul 2024 16:12:10 -0700
Subject: [PATCH 17/35] working ptt

---
 cuslines/generate_streamlines_cuda.cu |  4 +-
 cuslines/ptt.cu                       | 76 ++++++++++++++++++---------
 cuslines/ptt.cuh                      |  3 +-
 3 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 745b672..c2711d6 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -1616,6 +1616,9 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                                                                                         shDir1 + slid*samplm_nr);
                                 __syncwarp(WMASK);
                                 if (!init_norm_success) { // This is rare
+#if 0
+                                        printf("PTT init seed failed");
+#endif
                                         if (tidx == 0) {
                                                 slineLen[slineOff] = 1;
                                                 currSline[0] = seed;
@@ -2136,7 +2139,6 @@ void write_trk(const char *fname,
         float slineData[1 + 3*(2*MAX_SLINE_LEN)];
 #endif
         for(int i = 0; i < nsline; i++) {
-                printf("len %i\n", slineLen[i]);
                 reinterpret_cast<int *>(slineData)[0] = slineLen[i];
                 for(int j = 0; j < slineLen[i]; j++) {
                         slineData[1+3*j+0] = (float)((sline[i*2*MAX_SLINE_LEN + j].x+0.5)*VOXEL_SIZE[0]);
diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index 5d2436e..2d25c60 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -1,6 +1,6 @@
 template<typename REAL_T>
 __device__ void norm3_d(REAL_T *num, int fail_ind) {
-    REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
+    const REAL_T scale = SQRT(num[0] * num[0] + num[1] * num[1] + num[2] * num[2]);
 
     if (scale != 0) {
         num[0] /= scale;
@@ -41,6 +41,11 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T
 
     const int rv = trilinear_interp_d<1>(dimx, dimy, dimz, dimt, closest_odf_idx,
                                          pmf, pos, &__tmp);
+
+#if 0
+    printf("inerpolated %f at %f, %f, %f, %i\n", __tmp, pos.x, pos.y, pos.z, closest_odf_idx);
+#endif
+
     if (rv != 0) {
         return -1;
     } else {
@@ -68,10 +73,10 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength,
         if (FABS(k2) < K_SMALL) {
             k2 = K_SMALL;
         }
-        REAL_T k     = SQRT(k1*k1+k2*k2);
-        REAL_T sinkt = SIN(k*arclength);
-        REAL_T coskt = COS(k*arclength);
-        REAL_T kk    = 1/(k*k);
+        const REAL_T k     = SQRT(k1*k1+k2*k2);
+        const REAL_T sinkt = SIN(k*arclength);
+        const REAL_T coskt = COS(k*arclength);
+        const REAL_T kk    = 1/(k*k);
 
         propagator[0] = sinkt/k;
         propagator[1] = k1*(1-coskt)*kk;
@@ -225,7 +230,8 @@ __device__ int get_direction_ptt_d(
     __frame_sh += tidy*9;
     __last_val_sh += tidy*1;
 
-    const REAL_T max_curvature = (step_size / 2) / (SIN(max_angle / 2));
+    const REAL_T max_curvature = SIN(max_angle / 2) / (step_size * 2); // bigger numbers means wiggle more
+
     REAL_T __tmp;
 
     if (IS_INIT) {
@@ -274,7 +280,7 @@ __device__ int get_direction_ptt_d(
             }
         }
         if (IS_INIT) {
-            int __msk = __ballot_sync(WMASK, support_found);
+            const int __msk = __ballot_sync(WMASK, support_found);
             support_found = (__msk != 0);
         }
     } while (IS_INIT && (!support_found) && (jj++ < TRIES_PER_REJECTION_SAMPLING));
@@ -335,7 +341,7 @@ __device__ int get_direction_ptt_d(
 
     // Sample random valid faces randomly
     REAL_T r1, r2;
-    for (int ii = tidx; ii < TRIES_PER_REJECTION_SAMPLING; ii += BDIM_X) {
+    for (int ii = 0; ii < TRIES_PER_REJECTION_SAMPLING / BDIM_X; ii++) {
         r1 = curand_uniform(st);
         r2 = curand_uniform(st);
 		if (r1 + r2 > 1) {
@@ -350,39 +356,57 @@ __device__ int get_direction_ptt_d(
 				break;
 		}
  
-        REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
-        REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
-        REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
+        const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
+        const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
+        const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
 
-        REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1];
-        REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1];
-        REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1];
+        const REAL_T vy0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2 + 1];
+        const REAL_T vy1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2 + 1];
+        const REAL_T vy2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2 + 1];
 
         k1_probe = vx0 + r1 * (vx1 - vx0) + r2 * (vx2 - vx0);
         k2_probe = vy0 + r1 * (vy1 - vy0) + r2 * (vy2 - vy0);
 
         get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
 
-        REAL_T this_support = calculate_data_support_d(*__last_val_sh,
-                                                       pos, pmf, dimx, dimy, dimz, dimt,
-                                                       odf_sphere_vertices, 
-                                                       voxel_size,
-                                                       k1_probe, k2_probe,
-                                                       probing_frame,
-                                                       &__tmp);
+        const REAL_T this_support = calculate_data_support_d(*__last_val_sh,
+                                                             pos, pmf, dimx, dimy, dimz, dimt,
+                                                             odf_sphere_vertices, 
+                                                             voxel_size,
+                                                             k1_probe, k2_probe,
+                                                             probing_frame,
+                                                             &__tmp);
+
+
+        __syncwarp(WMASK);
+        int winning_lane = -1; // -1 indicates nobody won
+        if(PURE_PROBABILISTIC) { // find the first lane with above min support
+            const int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
+            if (__msk != 0) {
+                winning_lane = __ffs(__msk) - 1; // Often 0. occasionally more
+            }
+        } else { // find the best lane with above min support (typically of 32 lanes)
+            REAL_T max_support = this_support;
 
-        int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
-        if (__msk != 0) {
-            int winning_lane = __ffs(__msk) - 1; // Often 0. occasionally more
+            #pragma unroll
+            for(int j = BDIM_X/2; j; j /= 2) {
+                const REAL_T other_support = __shfl_xor_sync(WMASK, max_support, j, BDIM_X);
+                max_support = MAX(max_support, other_support);
+            }
 
-            __syncwarp(WMASK);
+            if (max_support >= NORM_MIN_SUPPORT) {
+                const int __msk = __ballot_sync(WMASK, this_support == max_support);
+                winning_lane = __ffs(__msk) - 1;
+            }
+        }
+        if (winning_lane != -1) {
             if (tidx == winning_lane) {
                 if (IS_INIT) {
                     dirs[0] = dir;
                 } else {
                     REAL_T __prop[9];
                     REAL_T __dir[3];
-                    prepare_propagator_d(k1_probe, k2_probe, step_size*STEP_FRAC, __prop);
+                    prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop);
                     propogate_frame_d(__prop, probing_frame, __dir);
                     dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]};
                 }
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index b9e7279..0beab08 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -4,13 +4,14 @@
 #include "disc.h"
 #include "globals.h"
 
-#define STEP_FRAC 1 // divides output step size (usually 0.5) into this many steps
+#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many steps
 #define PROBE_LENGTH 0.25
 #define PROBE_QUALITY 4 // can be 4-7
 #define ALLOW_WEAK_LINK 1
 #define TRIES_PER_REJECTION_SAMPLING 1024
 #define DEFAULT_PTT_MINDATASUPPORT 0.05
 #define K_SMALL 0.0001
+#define PURE_PROBABILISTIC 0
 
 
 #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT / PROBE_QUALITY)

From b7ee0d3a33d8b1632ce8abc1181207a8dd823710 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Thu, 18 Jul 2024 17:01:51 -0700
Subject: [PATCH 18/35] bf constants

---
 cuslines/ptt.cu  | 2 +-
 cuslines/ptt.cuh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index 2d25c60..8a92206 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -230,7 +230,7 @@ __device__ int get_direction_ptt_d(
     __frame_sh += tidy*9;
     __last_val_sh += tidy*1;
 
-    const REAL_T max_curvature = SIN(max_angle / 2) / (step_size * 2); // bigger numbers means wiggle more
+    const REAL_T max_curvature = (2 * SIN(max_angle / 2)) / step_size; // bigger numbers means wiggle more
 
     REAL_T __tmp;
 
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index 0beab08..81af493 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -14,7 +14,7 @@
 #define PURE_PROBABILISTIC 0
 
 
-#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT / PROBE_QUALITY)
+#define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY)
 #define PROBE_STEP (PROBE_LENGTH / (PROBE_QUALITY - 1))
 
 #if PROBE_QUALITY == 4

From 280ebe7c1645fdefea0db95dcff5e435ddd0a07c Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Fri, 19 Jul 2024 14:58:44 -0700
Subject: [PATCH 19/35] add discs 2, 3; make it possible to vary probabilistic
 vs det; other random BF, UI changes, cleanups

---
 cuslines/disc.h                       | 165 ++++++++++++++++++++++++++
 cuslines/generate_streamlines_cuda.cu |  13 --
 cuslines/globals.h                    |   1 +
 cuslines/ptt.cu                       |  92 +++++++-------
 cuslines/ptt.cuh                      |  31 +++--
 5 files changed, 229 insertions(+), 73 deletions(-)

diff --git a/cuslines/disc.h b/cuslines/disc.h
index f5c3589..0ae36f1 100644
--- a/cuslines/disc.h
+++ b/cuslines/disc.h
@@ -17,6 +17,171 @@ Redistribution and use in source and binary forms, with or without modification,
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#define DISC_2_VERT_CNT 24
+#define DISC_2_FACE_CNT 31
+
+#define DISC_2_VERT {\
+	-0.99680788,-0.07983759,\
+	-0.94276539,0.33345677,\
+	-0.87928469,-0.47629658,\
+	-0.72856617,0.68497542,\
+	-0.60006556,-0.79995082,\
+	-0.54129995,-0.02761342,\
+	-0.39271207,0.37117272,\
+	-0.39217391,0.91989110,\
+	-0.36362884,-0.40757367,\
+	-0.22391316,-0.97460910,\
+	-0.00130022,0.53966106,\
+	0.00000000,0.00000000,\
+	0.00973999,0.99995257,\
+	0.01606516,-0.54289908,\
+	0.21342395,-0.97695968,\
+	0.38192071,-0.38666136,\
+	0.38897094,0.37442837,\
+	0.40696681,0.91344295,\
+	0.54387161,-0.01477123,\
+	0.59119367,-0.80652963,\
+	0.73955688,0.67309406,\
+	0.87601150,-0.48229022,\
+	0.94617928,0.32364298,\
+	0.99585368,-0.09096944}
+
+
+
+#define DISC_2_FACE {\
+	9,8,4,\
+	11,16,10,\
+	5,8,11,\
+	5,1,0,\
+	18,16,11,\
+	11,15,18,\
+	13,8,9,\
+	11,8,13,\
+	13,15,11,\
+	22,18,23,\
+	22,20,16,\
+	16,18,22,\
+	16,20,17,\
+	12,10,17,\
+	17,10,16,\
+	15,19,21,\
+	23,18,21,\
+	21,18,15,\
+	2,4,8,\
+	2,5,0,\
+	8,5,2,\
+	7,10,12,\
+	6,7,3,\
+	10,7,6,\
+	3,1,6,\
+	1,5,6,\
+	11,10,6,\
+	6,5,11,\
+	14,19,15,\
+	15,13,14,\
+	14,13,9}
+
+
+
+#define DISC_3_VERT_CNT 36
+#define DISC_3_FACE_CNT 52
+
+#define DISC_3_VERT {\
+	-0.98798409,-0.15455565,\
+	-0.98026530,0.19768646,\
+	-0.87061458,-0.49196570,\
+	-0.85315536,0.52165691,\
+	-0.67948751,0.12870519,\
+	-0.65830249,-0.75275350,\
+	-0.60977645,0.79257345,\
+	-0.60599745,-0.25746218,\
+	-0.49175185,0.45085081,\
+	-0.39584766,-0.56449807,\
+	-0.37151031,0.02599657,\
+	-0.34538749,-0.93846017,\
+	-0.31409968,0.94939001,\
+	-0.19774331,-0.30335822,\
+	-0.18708240,0.31479263,\
+	-0.14013436,0.65112487,\
+	-0.02230445,-0.65649640,\
+	-0.01247874,-0.99992214,\
+	0,0,\
+	0.03699045,0.99931562,\
+	0.15587647,0.33306130,\
+	0.17739302,-0.31129535,\
+	0.23950456,0.64808985,\
+	0.31593561,-0.94878063,\
+	0.34839477,-0.59393230,\
+	0.35674244,0.02011329,\
+	0.38082583,0.92464679,\
+	0.52353496,0.39304489,\
+	0.57766607,-0.30046041,\
+	0.63711661,-0.77076743,\
+	0.66791137,0.74424082,\
+	0.68671421,0.06646131,\
+	0.85301727,-0.52188269,\
+	0.88617706,0.46334676,\
+	0.97866100,-0.20548151,\
+	0.98951863,0.14440528}
+
+
+
+#define DISC_3_FACE {\
+	27,30,22,\
+	9,2,5,\
+	28,32,34,\
+	22,30,26,\
+	26,19,22,\
+	30,27,33,\
+	31,34,35,\
+	28,34,31,\
+	35,33,31,\
+	31,33,27,\
+	25,31,27,\
+	28,31,25,\
+	10,14,8,\
+	10,13,18,\
+	18,14,10,\
+	15,19,12,\
+	22,19,15,\
+	8,14,15,\
+	11,9,5,\
+	13,9,16,\
+	16,11,17,\
+	9,11,16,\
+	17,23,16,\
+	23,24,16,\
+	29,24,23,\
+	29,32,28,\
+	28,24,29,\
+	6,3,8,\
+	6,15,12,\
+	8,15,6,\
+	20,27,22,\
+	20,25,27,\
+	18,25,20,\
+	20,14,18,\
+	22,15,20,\
+	20,15,14,\
+	21,24,28,\
+	28,25,21,\
+	21,25,18,\
+	18,13,21,\
+	13,16,21,\
+	21,16,24,\
+	4,10,8,\
+	8,3,4,\
+	4,3,1,\
+	4,1,0,\
+	7,9,13,\
+	13,10,7,\
+	2,9,7,\
+	10,4,7,\
+	0,2,7,\
+	7,4,0}
+
+
+
 #define DISC_4_VERT_CNT 62
 #define DISC_4_FACE_CNT 97
 
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index c2711d6..22c9180 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -1285,7 +1285,6 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                                         st,
                                                         dataf,
                                                         max_angle,
-                                                        voxel_size,
                                                         step_size,
                                                         direction,
                                                         dimx, dimy, dimz, dimt,
@@ -1590,7 +1589,6 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                                                                                 &st,
                                                                                 dataf,
                                                                                 max_angle,
-                                                                                MAKE_REAL3(1, 1, 1),
                                                                                 step_size,
                                                                                 MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
                                                                                 dimx, dimy, dimz, dimt,
@@ -1607,7 +1605,6 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                                                                                         &st,
                                                                                         dataf,
                                                                                         max_angle,
-                                                                                        MAKE_REAL3(1, 1, 1),
                                                                                         step_size,
                                                                                         MAKE_REAL3(first_step.x, first_step.y, first_step.z),
                                                                                         dimx, dimy, dimz, dimt,
@@ -1642,11 +1639,6 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                                 for (int ii = 0; ii < 9; ii++) {
                                         sh_ptr_to_save[ii + 9*tidy] = -sh_ptr[ii + 9*tidy]; // frame
                                 }
-
-                                sh_ptr += BDIM_Y*9;
-                                sh_ptr_to_save += BDIM_Y*9;
-
-                                sh_ptr_to_save[tidy] = sh_ptr[tidy]; // last val
                         }
                         __syncwarp(WMASK);
                 }
@@ -1702,11 +1694,6 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                                 for (int ii = 0; ii < 9; ii++) {
                                         sh_ptr[ii + 9*tidy] = sh_ptr_saved[ii + 9*tidy]; // frame
                                 }
-
-                                sh_ptr += BDIM_Y*9;
-                                sh_ptr_saved += BDIM_Y*9;
-
-                                sh_ptr[tidy] = sh_ptr_saved[tidy]; // last val
                         }
                         __syncwarp(WMASK);
                 }
diff --git a/cuslines/globals.h b/cuslines/globals.h
index ad85d7f..25c2f29 100644
--- a/cuslines/globals.h
+++ b/cuslines/globals.h
@@ -79,6 +79,7 @@
 
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define MAX(x,y) (((x)>(y))?(x):(y))
+#define POW2(n) (1 << (n))
 
 #define DIV_UP(a,b) (((a)+((b)-1))/(b))
 
diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index 8a92206..d7de854 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -139,25 +139,22 @@ __device__ void propogate_frame_d(REAL_T* propagator, REAL_T* frame, REAL_T* dir
     for (int ii = 0; ii < 3; ii++) {
         frame[ii] = __tmp[ii];
     }
-
-    norm3_d(direc, 0);
 }
 
 template<typename REAL_T, typename REAL3_T>
 __device__ REAL_T calculate_data_support_d(REAL_T support,
                                            const REAL3_T pos, const REAL_T *__restrict__ pmf,
                                            const int dimx, const int dimy, const int dimz, const int dimt,
+                                           const REAL_T probe_step_size,
                                            const REAL3_T *__restrict__ odf_sphere_vertices,
-                                           const REAL3_T voxel_size,
                                            REAL_T k1, REAL_T k2,
-                                           REAL_T* probing_frame,
-                                           REAL_T* last_val_probe) {
+                                           REAL_T* probing_frame) {
     REAL_T probing_prop[9];
     REAL_T direc[3];
     REAL3_T probing_pos;
     REAL_T fod_amp;
 
-    prepare_propagator_d(k1, k2, PROBE_STEP, probing_prop);
+    prepare_propagator_d(k1, k2, probe_step_size, probing_prop);
     probing_pos.x = pos.x;
     probing_pos.y = pos.y;
     probing_pos.z = pos.z;
@@ -165,9 +162,9 @@ __device__ REAL_T calculate_data_support_d(REAL_T support,
     for (int ii = 0; ii < PROBE_QUALITY; ii++) {
         propogate_frame_d(probing_prop, probing_frame, direc);
 
-        probing_pos.x += direc[0] * voxel_size.x;
-        probing_pos.y += direc[1] * voxel_size.y;
-        probing_pos.z += direc[2] * voxel_size.z;
+        probing_pos.x += direc[0];
+        probing_pos.y += direc[1];
+        probing_pos.z += direc[2];
 
         fod_amp = interp4_d(probing_pos, probing_frame, pmf,
                             dimx, dimy, dimz, dimt,
@@ -176,15 +173,11 @@ __device__ REAL_T calculate_data_support_d(REAL_T support,
             if (ALLOW_WEAK_LINK) {
                 fod_amp = 0;
             } else {
-                *last_val_probe = 0;
                 return 0;    
             }
         }
         support += fod_amp;
     }
-
-    *last_val_probe = fod_amp;
-
     return support;
 }
 
@@ -197,7 +190,6 @@ __device__ int get_direction_ptt_d(
     curandStatePhilox4_32_10_t *st,
     const REAL_T *__restrict__ pmf,
     const REAL_T max_angle,
-    const REAL3_T voxel_size,
     const REAL_T step_size,
     REAL3_T dir,
     const int dimx, const int dimy, const int dimz, const int dimt,
@@ -223,14 +215,15 @@ __device__ int get_direction_ptt_d(
     REAL_T *__face_cdf_sh = reinterpret_cast<REAL_T *>(__sh);
     REAL_T *__vert_pdf_sh = __face_cdf_sh + BDIM_Y*DISC_FACE_CNT; // These first two can be overwritten in other functions
     REAL_T *__frame_sh = __vert_pdf_sh + BDIM_Y*DISC_VERT_CNT; // the rest must persist form call to call
-    REAL_T *__last_val_sh = __frame_sh + BDIM_Y*9;
+    REAL_T *__first_val_sh = __frame_sh + BDIM_Y*9;
 
     __face_cdf_sh += tidy*DISC_FACE_CNT;
     __vert_pdf_sh += tidy*DISC_VERT_CNT; 
     __frame_sh += tidy*9;
-    __last_val_sh += tidy*1;
+    __first_val_sh += tidy*1;
 
     const REAL_T max_curvature = (2 * SIN(max_angle / 2)) / step_size; // bigger numbers means wiggle more
+    const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1));
 
     REAL_T __tmp;
 
@@ -239,11 +232,13 @@ __device__ int get_direction_ptt_d(
             __frame_sh[0] = dir.x;
             __frame_sh[1] = dir.y;
             __frame_sh[2] = dir.z;
-            *__last_val_sh = interp4_d(pos, __frame_sh, pmf,
-                                       dimx, dimy, dimz, dimt,
-                                       odf_sphere_vertices);
         }
     }
+    if (tidx==0) {
+        *__first_val_sh = interp4_d(pos, __frame_sh, pmf,
+                                    dimx, dimy, dimz, dimt,
+                                    odf_sphere_vertices);
+    }
     __syncwarp(WMASK);
 
     // Calculate __vert_pdf_sh
@@ -258,13 +253,13 @@ __device__ int get_direction_ptt_d(
 
             get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
 
-            REAL_T this_support = calculate_data_support_d(*__last_val_sh,
-                                                           pos, pmf, dimx, dimy, dimz, dimt,
-                                                           odf_sphere_vertices, 
-                                                           voxel_size,
-                                                           k1_probe, k2_probe,
-                                                           probing_frame,
-                                                           &__tmp);
+            const REAL_T this_support = calculate_data_support_d(
+                *__first_val_sh,
+                pos, pmf, dimx, dimy, dimz, dimt,
+                probe_step_size,
+                odf_sphere_vertices,
+                k1_probe, k2_probe,
+                probing_frame);
 
 #if 0
             if (threadIdx.y == 1 && ii == 0) { 
@@ -283,7 +278,7 @@ __device__ int get_direction_ptt_d(
             const int __msk = __ballot_sync(WMASK, support_found);
             support_found = (__msk != 0);
         }
-    } while (IS_INIT && (!support_found) && (jj++ < TRIES_PER_REJECTION_SAMPLING));
+    } while (IS_INIT && (!support_found) && (jj++ < INIT_FRAME_TRIES));
 
 #if 0
     __syncwarp(WMASK);
@@ -305,7 +300,7 @@ __device__ int get_direction_ptt_d(
         for (int jj = 0; jj < 3; jj++) {
             REAL_T vert_val = __vert_pdf_sh[DISC_FACE[ii*3 + jj]];
             if (vert_val == 0) {
-                all_verts_valid = IS_INIT;
+                all_verts_valid = IS_INIT; // On init, even go with faces that are not fully supported
             }
             __face_cdf_sh[ii] += vert_val;
         }
@@ -355,7 +350,7 @@ __device__ int get_direction_ptt_d(
 			if (__face_cdf_sh[jj] >= __tmp)
 				break;
 		}
- 
+
         const REAL_T vx0 = max_curvature * DISC_VERT[DISC_FACE[jj*3]*2];
         const REAL_T vx1 = max_curvature * DISC_VERT[DISC_FACE[jj*3+1]*2];
         const REAL_T vx2 = max_curvature * DISC_VERT[DISC_FACE[jj*3+2]*2];
@@ -369,38 +364,35 @@ __device__ int get_direction_ptt_d(
 
         get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
 
-        const REAL_T this_support = calculate_data_support_d(*__last_val_sh,
+        const REAL_T this_support = calculate_data_support_d(*__first_val_sh,
                                                              pos, pmf, dimx, dimy, dimz, dimt,
-                                                             odf_sphere_vertices, 
-                                                             voxel_size,
+                                                             probe_step_size,
+                                                             odf_sphere_vertices,
                                                              k1_probe, k2_probe,
-                                                             probing_frame,
-                                                             &__tmp);
+                                                             probing_frame);
 
 
         __syncwarp(WMASK);
         int winning_lane = -1; // -1 indicates nobody won
-        if(PURE_PROBABILISTIC) { // find the first lane with above min support
-            const int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
-            if (__msk != 0) {
-                winning_lane = __ffs(__msk) - 1; // Often 0. occasionally more
-            }
-        } else { // find the best lane with above min support (typically of 32 lanes)
-            REAL_T max_support = this_support;
-
+        int __msk = __ballot_sync(WMASK, this_support >= NORM_MIN_SUPPORT);
+        if (__msk != 0) {
+            REAL_T group_max_support = this_support;
             #pragma unroll
-            for(int j = BDIM_X/2; j; j /= 2) {
-                const REAL_T other_support = __shfl_xor_sync(WMASK, max_support, j, BDIM_X);
-                max_support = MAX(max_support, other_support);
+            for(int j = 1; j < PROBABILISTIC_GROUP_SZ; j *= 2) {
+                __tmp = __shfl_xor_sync(WMASK, group_max_support, j, BDIM_X);
+                group_max_support = MAX(group_max_support, __tmp);
             }
 
-            if (max_support >= NORM_MIN_SUPPORT) {
-                const int __msk = __ballot_sync(WMASK, this_support == max_support);
-                winning_lane = __ffs(__msk) - 1;
-            }
+            __msk &= __ballot_sync(WMASK, this_support == group_max_support);
+            winning_lane = __ffs(__msk) - 1;
         }
         if (winning_lane != -1) {
             if (tidx == winning_lane) {
+#if 0
+                if (threadIdx.y == 1) {
+                    printf("winning k1 %f, k2 %f, cdf %f, cdf_idx %i", k1_probe, k2_probe, __tmp, jj);
+                }
+#endif
                 if (IS_INIT) {
                     dirs[0] = dir;
                 } else {
@@ -408,13 +400,13 @@ __device__ int get_direction_ptt_d(
                     REAL_T __dir[3];
                     prepare_propagator_d(k1_probe, k2_probe, step_size/STEP_FRAC, __prop);
                     propogate_frame_d(__prop, probing_frame, __dir);
+                    norm3_d(__dir, 0); // this will be scaled by the generic stepping code
                     dirs[0] = (REAL3_T) {__dir[0], __dir[1], __dir[2]};
                 }
 
                 for (int jj = 0; jj < 9; jj++) {
                     __frame_sh[jj] = probing_frame[jj];
                 }
-                *__last_val_sh = __tmp;
             }
             __syncwarp(WMASK);
             return 1;
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index 81af493..3457d77 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -4,35 +4,46 @@
 #include "disc.h"
 #include "globals.h"
 
-#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many steps
-#define PROBE_LENGTH 0.25
-#define PROBE_QUALITY 4 // can be 4-7
+#define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps
+#define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length
+#define PROBE_QUALITY 4
+#define SAMPLING_QUALITY 4 // can be 2-7, should be at least 4 to take advantage of GPUs
+#define PROBABILISTIC_BIAS 1 // 1, 2 look good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
 #define ALLOW_WEAK_LINK 1
+#define INIT_FRAME_TRIES 16
 #define TRIES_PER_REJECTION_SAMPLING 1024
 #define DEFAULT_PTT_MINDATASUPPORT 0.05
 #define K_SMALL 0.0001
-#define PURE_PROBABILISTIC 0
-
 
 #define NORM_MIN_SUPPORT (DEFAULT_PTT_MINDATASUPPORT * PROBE_QUALITY)
-#define PROBE_STEP (PROBE_LENGTH / (PROBE_QUALITY - 1))
+#define PROBABILISTIC_GROUP_SZ POW2(PROBABILISTIC_BIAS)
 
-#if PROBE_QUALITY == 4
+#if SAMPLING_QUALITY == 2
+#define DISC_VERT_CNT DISC_2_VERT_CNT
+#define DISC_FACE_CNT DISC_2_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_2_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_2_FACE;
+#elif SAMPLING_QUALITY == 3
+#define DISC_VERT_CNT DISC_3_VERT_CNT
+#define DISC_FACE_CNT DISC_3_FACE_CNT
+__device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_3_VERT;
+__device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_3_FACE;
+#elif SAMPLING_QUALITY == 4
 #define DISC_VERT_CNT DISC_4_VERT_CNT
 #define DISC_FACE_CNT DISC_4_FACE_CNT
 __device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_4_VERT;
 __device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_4_FACE;
-#elif PROBE_QUALITY == 5
+#elif SAMPLING_QUALITY == 5
 #define DISC_VERT_CNT DISC_5_VERT_CNT
 #define DISC_FACE_CNT DISC_5_FACE_CNT
 __device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_5_VERT;
 __device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_5_FACE;
-#elif PROBE_QUALITY == 6
+#elif SAMPLING_QUALITY == 6
 #define DISC_VERT_CNT DISC_6_VERT_CNT
 #define DISC_FACE_CNT DISC_6_FACE_CNT
 __device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_6_VERT;
 __device__ __constant__ int DISC_FACE[DISC_FACE_CNT*3] = DISC_6_FACE;
-#elif PROBE_QUALITY == 7
+#elif SAMPLING_QUALITY == 7
 #define DISC_VERT_CNT DISC_7_VERT_CNT
 #define DISC_FACE_CNT DISC_7_FACE_CNT
 __device__ __constant__ REAL DISC_VERT[DISC_VERT_CNT*2] = DISC_7_VERT; // TODO: check if removing __constant__ helps or hurts

From f39f07b120a08b4bb9d7181b8517a7300c16ce1a Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 22 Jul 2024 11:40:56 -0700
Subject: [PATCH 20/35] correct the max angle

---
 cuslines/ptt.cu  | 10 +++-------
 cuslines/ptt.cuh |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index d7de854..e3fe7b1 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -169,12 +169,8 @@ __device__ REAL_T calculate_data_support_d(REAL_T support,
         fod_amp = interp4_d(probing_pos, probing_frame, pmf,
                             dimx, dimy, dimz, dimt,
                             odf_sphere_vertices);
-        if (fod_amp < PMF_THRESHOLD_P) {
-            if (ALLOW_WEAK_LINK) {
-                fod_amp = 0;
-            } else {
-                return 0;    
-            }
+        if (!ALLOW_WEAK_LINK && (fod_amp < PMF_THRESHOLD_P)) {
+            return 0;
         }
         support += fod_amp;
     }
@@ -222,7 +218,7 @@ __device__ int get_direction_ptt_d(
     __frame_sh += tidy*9;
     __first_val_sh += tidy*1;
 
-    const REAL_T max_curvature = (2 * SIN(max_angle / 2)) / step_size; // bigger numbers means wiggle more
+    const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more
     const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1));
 
     REAL_T __tmp;
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index 3457d77..ae9c5ea 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -8,7 +8,7 @@
 #define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length
 #define PROBE_QUALITY 4
 #define SAMPLING_QUALITY 4 // can be 2-7, should be at least 4 to take advantage of GPUs
-#define PROBABILISTIC_BIAS 1 // 1, 2 look good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
+#define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
 #define ALLOW_WEAK_LINK 1
 #define INIT_FRAME_TRIES 16
 #define TRIES_PER_REJECTION_SAMPLING 1024

From 08cbfaec17286538162059b7b3984c95b09beb51 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 22 Jul 2024 12:32:49 -0700
Subject: [PATCH 21/35] merge these into one file to not have to replicate
 changes

---
 run_dipy_cpu.py                               | 169 ------------
 run_dipy_cpu_hardi.py                         | 182 -------------
 run_dipy_gpu.py                               | 244 ------------------
 ...ipy_gpu_hardi.py => run_gpu_streamlines.py | 181 +++++++------
 4 files changed, 102 insertions(+), 674 deletions(-)
 delete mode 100644 run_dipy_cpu.py
 delete mode 100644 run_dipy_cpu_hardi.py
 delete mode 100644 run_dipy_gpu.py
 rename run_dipy_gpu_hardi.py => run_gpu_streamlines.py (57%)

diff --git a/run_dipy_cpu.py b/run_dipy_cpu.py
deleted file mode 100644
index 67cdc3b..0000000
--- a/run_dipy_cpu.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-import numpy.linalg as npl
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel, CsaOdfModel
-from dipy.tracking.local_tracking import LocalTracking
-from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-#import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
-parser.add_argument("bvals", help="path to the bvals")
-parser.add_argument("bvecs", help="path to the bvecs")
-parser.add_argument("mask_nifti", help="path to the mask file")
-parser.add_argument("roi_nifti", help="path to the ROI file")
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
-parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
-parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
-parser.add_argument("--min_separation-angle",type=float,default=45,help="min separation angle (in degrees)")
-parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
-parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
-
-args = parser.parse_args()
-
-img = get_img(args.nifti_file)
-voxel_order = "".join(aff2axcodes(img.affine))
-gtab = get_gtab(args.bvals, args.bvecs)
-roi = get_img(args.roi_nifti)
-mask = get_img(args.mask_nifti)
-data = img.get_data()
-roi_data = roi.get_data()
-mask = mask.get_data()
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-#seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
-seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
-
-# Setup model
-if args.model == "opdt":
-  print("Running OPDT model...")
-  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-else:
-  print("Running CSAODF model...")
-  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=img_affine, step_size=args.step_size)
-  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
-  streamlines = [s for s in streamline_generator]
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-    ts = time.time()
-    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-    sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-    save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
deleted file mode 100644
index 2e2b0b6..0000000
--- a/run_dipy_cpu_hardi.py
+++ /dev/null
@@ -1,182 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-import numpy.linalg as npl
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel, CsaOdfModel
-from dipy.tracking.local_tracking import LocalTracking
-from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
-from dipy.data import get_fnames
-from dipy.data import read_stanford_pve_maps
-from dipy.tracking.streamline import transform_streamlines
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-#import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
-parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
-parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
-parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
-parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
-parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
-
-args = parser.parse_args()
-
-# Get Stanford HARDI data
-hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
-csf, gm, wm = read_stanford_pve_maps()
-wm_data = wm.get_fdata()
-
-img = get_img(hardi_nifti_fname)
-voxel_order = "".join(aff2axcodes(img.affine))
-img_affine = img.affine
-
-gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
-#roi = get_img(hardi_nifti_fname)
-
-data = img.get_fdata()
-roi_data = (wm_data > 0.5)
-mask = roi_data
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-#seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
-seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
-
-# Setup model
-if args.model == "opdt":
-  print("Running OPDT model...")
-  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-else:
-  print("Running CSAODF model...")
-  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  #streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=img_affine, step_size=args.step_size)
-  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
-  streamlines = [s for s in streamline_generator]
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-    ts = time.time()
-    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-    #save_tractogram(fname, streamlines)
-    #sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-    #print(seed_mask)
-    #print(streamlines)
-
-    #inv_affine = npl.inv(img_affine)
-    #streamlines = transform_streamlines(streamlines,inv_affine)
-    sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-    save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu.py b/run_dipy_gpu.py
deleted file mode 100644
index 26dbab4..0000000
--- a/run_dipy_gpu.py
+++ /dev/null
@@ -1,244 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os
-import argparse
-import random
-import time
-from tqdm import trange, tqdm
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-#from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel, CsaOdfModel
-#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
-from dipy.reconst import shm
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines.cuslines as cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
-parser.add_argument("bvals", help="path to the bvals")
-parser.add_argument("bvecs", help="path to the bvecs")
-parser.add_argument("mask_nifti", help="path to the mask file")
-parser.add_argument("--fa_numpy", default=None, help="path to the FA numpy file")
-parser.add_argument("--roi_nifti", default=None, help="path to the ROI file")
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-parser.add_argument("--sh-order",type=int,default=4,help="sh_order")
-parser.add_argument("--fa-threshold",type=float,default=0.1,help="FA threshold")
-parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
-parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
-parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
-parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csaodf'], help="model to use")
-args = parser.parse_args()
-
-img = get_img(args.nifti_file)
-voxel_order = "".join(aff2axcodes(img.affine))
-gtab = get_gtab(args.bvals, args.bvecs)
-mask = get_img(args.mask_nifti)
-data = img.get_fdata()
-
-# resample mask if necessary
-if mask.shape != data.shape:
-    from dipy.align.imaffine import AffineMap
-    identity = np.eye(4)
-    affine_map = AffineMap(identity,
-                           img.shape[:3], img.affine,
-                           mask.shape[:3], mask.affine)
-    mask = affine_map.transform(mask.get_fdata())
-    #mask = np.round(mask)
-else:
-    mask = mask.get_fdata()
-
-# load or compute and save FA file
-if (args.fa_numpy is not None) and os.path.isfile(args.fa_numpy):
-    FA = np.load(args.fa_numpy, allow_pickle=True)
-else:
-    # Fit
-    tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-    print('Fitting Tensor')
-    tenfit = tenmodel.fit(data, mask)
-    print('Computing anisotropy measures (FA,MD,RGB)')
-    FA = tenfit.fa
-    FA[np.isnan(FA)] = 0
-
-    if args.fa_numpy is not None:
-        np.save(args.fa_numpy, FA)
-
-# Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, args.fa_threshold)
-metric_map = np.asarray(FA, 'float64')
-
-# resample roi if necessary
-if args.roi_nifti is not None:
-    roi = get_img(args.roi_nifti)
-    roi_data = roi.get_fdata()
-else:
-    roi_data = np.zeros(data.shape[:3])
-    roi_data[ FA > 0.1 ] = 1.
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-
-# Setup model
-if args.model == "opdt":
-  print("Running OPDT model...")
-  model = OpdtModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-  fit_matrix = model._fit_matrix
-  delta_b, delta_q = fit_matrix
-else:
-  print("Running CSAODF model...")
-  model = CsaOdfModel(gtab, sh_order=args.sh_order, smooth=args.sm_lambda, min_signal=args.min_signal)
-  fit_matrix = model._fit_matrix
-  # Unlike OPDT, CSA has a single matrix used for fit_matrix. Populating delta_b and delta_q with necessary values for
-  # now.
-  delta_b = fit_matrix
-  delta_q = fit_matrix
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
-b0s_mask = gtab.b0s_mask
-dwi_mask = ~b0s_mask
-
-
-# setup sampling matrix
-sphere = small_sphere
-theta = sphere.theta
-phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
-
-## from BootPmfGen __init__
-# setup H and R matrices
-# TODO: figure out how to get H, R matrices from direction getter object
-x, y, z = model.gtab.gradients[dwi_mask].T
-r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
-H = shm.hat(B)
-R = shm.lcr_matrix(H)
-
-# create floating point copy of data
-dataf = np.asarray(data, dtype=float)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
-
-gpu_tracker = cuslines.GPUTracker(cuslines.ModelType.OPDT if args.model == "opdt" else cuslines.ModelType.CSAODF,
-                                  args.max_angle * np.pi/180,
-                                  args.min_signal,
-                                  args.fa_threshold,
-                                  args.step_size,
-                                  args.relative_peak_threshold,
-                                  args.min_separation_angle * np.pi/180,
-                                  dataf, H, R, delta_b, delta_q,
-                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
-                                  sphere.vertices, sphere.edges.astype(np.int32),
-                                  ngpus=args.ngpus, rng_seed=0)
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-
-# debug start val to fast forward
-with tqdm(total=seed_mask.shape[0]) as pbar:
-    for idx in range(int(nchunks)):
-        # Main streamline computation
-        ts = time.time()
-        streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
-        te = time.time()
-        streamline_time += (te-ts)
-        pbar.update(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0])
-        #print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-        #                                                                  seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-        #                                                                  te-ts))
-
-        # Save tracklines file
-        if args.output_prefix:
-            if args.use_fast_write:
-                prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-                ts = time.time()
-                gpu_tracker.dump_streamlines(prefix, voxel_order, roi_data.shape, img.header.get_zooms(), img.affine)
-                te = time.time()
-                print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
-            else:
-                fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-                ts = time.time()
-                #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-                sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-                save_tractogram(sft, fname)
-                te = time.time()
-                print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-            io_time += (te-ts)
-
-pbar.close()
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu_hardi.py b/run_gpu_streamlines.py
similarity index 57%
rename from run_dipy_gpu_hardi.py
rename to run_gpu_streamlines.py
index b60cb9a..2e5be19 100644
--- a/run_dipy_gpu_hardi.py
+++ b/run_gpu_streamlines.py
@@ -32,6 +32,7 @@
 import time
 
 import numpy as np
+import numpy.linalg as npl
 
 import dipy.reconst.dti as dti
 from dipy.io import read_bvals_bvecs
@@ -40,10 +41,11 @@
 from dipy.tracking import utils
 from dipy.core.gradients import gradient_table, unique_bvals_magnitude
 from dipy.data import small_sphere
-#from dipy.direction import BootDirectionGetter
+from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter)
 from dipy.reconst.shm import OpdtModel, CsaOdfModel
 from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst
-#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
+from dipy.tracking.local_tracking import LocalTracking
+from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
 from dipy.reconst import shm
 from dipy.data import get_fnames
 from dipy.data import read_stanford_pve_maps
@@ -72,11 +74,17 @@ def get_img(ep2_seq):
 
 print("parsing arguments")
 parser = argparse.ArgumentParser()
+parser.add_argument("nifti_file", nargs='?', default='hardi', help="path to the DWI nifti file")
+parser.add_argument("bvals", nargs='?', default='hardi', help="path to the bvals")
+parser.add_argument("bvecs", nargs='?', default='hardi', help="path to the bvecs")
+parser.add_argument("mask_nifti", nargs='?', default='hardi', help="path to the mask file")
+parser.add_argument("roi_nifti", nargs='?', default='hardi', help="path to the ROI file")
+parser.add_argument("--device", type=str, default ='gpu', choices=['cpu', 'gpu'], help="Whether to use cpu or gpu")
 parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU.")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
+parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU")
+parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total")
+parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu")
+parser.add_argument("--use-fast-write", action='store_true', help="use fast file write (only if using gpu)")
 parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
 parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
 parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
@@ -85,28 +93,37 @@ def get_img(ep2_seq):
 parser.add_argument("--relative-peak-threshold",type=float,default=0.25,help="relative peak threshold")
 parser.add_argument("--min-separation-angle",type=float,default=45,help="min separation angle (in degrees)")
 parser.add_argument("--sm-lambda",type=float,default=0.006,help="smoothing lambda")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
 parser.add_argument("--model", type=str, default="opdt", choices=['opdt', 'csa', 'csd'], help="model to use")
-parser.add_argument("--dg", type=str, default="boot", choices=['boot', 'prob'], help="direction getting scheme to use")
-args = parser.parse_args()
-
-# Get Stanford HARDI data
-hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
+parser.add_argument("--dg", type=str, default="boot", choices=['boot', 'prob', 'ptt'], help="direction getting scheme to use")
 
-print(hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname)
+args = parser.parse_args()
 
-csf, gm, wm = read_stanford_pve_maps()
-wm_data = wm.get_fdata()
+if 'hardi' in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]:
+  if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]):
+    raise ValueError("If any of the arguments is 'hardi', all must be 'hardi'")
+  # Get Stanford HARDI data
+  hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
+  csf, gm, wm = read_stanford_pve_maps()
+  wm_data = wm.get_fdata()
 
-img = get_img(hardi_nifti_fname)
-voxel_order = "".join(aff2axcodes(img.affine))
+  img = get_img(hardi_nifti_fname)
+  voxel_order = "".join(aff2axcodes(img.affine))
+  img_affine = img.affine
 
-gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
-#roi = get_img(hardi_nifti_fname)
+  gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
 
-data = img.get_fdata()
-roi_data = (wm_data > 0.5)
-mask = roi_data
+  data = img.get_fdata()
+  roi_data = (wm_data > 0.5)
+  mask = roi_data
+else:
+  img = get_img(args.nifti_file)
+  voxel_order = "".join(aff2axcodes(img.affine))
+  gtab = get_gtab(args.bvals, args.bvecs)
+  roi = get_img(args.roi_nifti)
+  mask = get_img(args.mask_nifti)
+  data = img.get_data()
+  roi_data = roi.get_data()
+  mask = mask.get_data()
 
 tenmodel = dti.TensorModel(gtab, fit_method='WLS')
 print('Fitting Tensor')
@@ -116,11 +133,10 @@ def get_img(ep2_seq):
 FA[np.isnan(FA)] = 0
 
 # Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, args.fa_threshold)
+tissue_classifier = ThresholdStoppingCriterion(FA, args.fa_threshold)
 metric_map = np.asarray(FA, 'float64')
 
 # Create seeds for ROI
-#seed_mask = utils.seeds_from_mask(roi_data, density=sampling_density, affine=img_affine)
 # seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
 # seed_mask = seed_mask[0:args.nseeds]
 seed_mask = np.asarray(utils.random_seeds_from_mask(
@@ -145,7 +161,6 @@ def get_img(ep2_seq):
   delta_b = fit_matrix
   delta_q = fit_matrix
 else:
-  model_type = cuslines.ModelType.CSD
   print("Running CSD model...")
   unique_bvals = unique_bvals_magnitude(gtab.bvals)
   if len(unique_bvals[unique_bvals > 0]) > 1:
@@ -167,59 +182,75 @@ def get_img(ep2_seq):
   delta_b = model._X
   delta_q = model.B_reg
 
-if args.dg == "prob":
-  model_type = cuslines.ModelType.PROB
+if args.dg != "boot":
+  if args.dg == "prob":
+    model_type = cuslines.ModelType.PROB
+    dg = ProbabilisticDirectionGetter
+  else:
+    model_type = cuslines.ModelType.PTT
+    dg = PTTDirectionGetter
   fit = model.fit(data, mask=(metric_map >= args.fa_threshold))
   data = fit.odf(small_sphere).clip(min=0)
-  print(data.shape)
-
-# Setup direction getter args
-b0s_mask = gtab.b0s_mask
-dwi_mask = ~b0s_mask
-
-
-# setup sampling matrix
-sphere = small_sphere
-theta = sphere.theta
-phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
+else:
+  dg = BootDirectionGetter
 
-## from BootPmfGen __init__
-# setup H and R matrices
-# TODO: figure out how to get H, R matrices from direction getter object
-x, y, z = model.gtab.gradients[dwi_mask].T
-r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
-H = shm.hat(B)
-R = shm.lcr_matrix(H)
+global_chunk_size = args.chunk_size
 
-# create floating point copy of data
-dataf = np.asarray(data, dtype=float)
+# Setup direction getter args
+if args.device == "cpu":
+  if args.dg != "boot":
+    dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=small_sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+  else:
+    dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+else:
+  # Setup direction getter args
+  b0s_mask = gtab.b0s_mask
+  dwi_mask = ~b0s_mask
+
+  # setup sampling matrix
+  sphere = small_sphere
+  theta = sphere.theta
+  phi = sphere.phi
+  sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
+
+  ## from BootPmfGen __init__
+  # setup H and R matrices
+  # TODO: figure out how to get H, R matrices from direction getter object
+  x, y, z = model.gtab.gradients[dwi_mask].T
+  r, theta, phi = shm.cart2sphere(x, y, z)
+  B, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)
+  H = shm.hat(B)
+  R = shm.lcr_matrix(H)
+
+  # create floating point copy of data
+  dataf = np.asarray(data, dtype=np.float64)
+
+  gpu_tracker = cuslines.GPUTracker(model_type,
+                                    args.max_angle * np.pi/180,
+                                    args.min_signal,
+                                    args.fa_threshold,
+                                    args.step_size,
+                                    args.relative_peak_threshold,
+                                    args.min_separation_angle * np.pi/180,
+                                    dataf.astype(np.float64), H.astype(np.float64), R.astype(np.float64), delta_b.astype(np.float64), delta_q.astype(np.float64),
+                                    b0s_mask.astype(np.int32), metric_map.astype(np.float64), sampling_matrix.astype(np.float64),
+                                    sphere.vertices.astype(np.float64), sphere.edges.astype(np.int32),
+                                    ngpus=args.ngpus, rng_seed=0)
 
 print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
 nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
 
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=args.step_size)
-
-gpu_tracker = cuslines.GPUTracker(model_type,
-                                  args.max_angle * np.pi/180,
-                                  args.min_signal,
-                                  args.fa_threshold,
-                                  args.step_size,
-                                  args.relative_peak_threshold,
-                                  args.min_separation_angle * np.pi/180,
-                                  dataf, H, R, delta_b, delta_q,
-                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
-                                  sphere.vertices, sphere.edges.astype(np.int32),
-                                  ngpus=args.ngpus, rng_seed=0)
 t1 = time.time()
 streamline_time = 0
 io_time = 0
 for idx in range(int(nchunks)):
   # Main streamline computation
   ts = time.time()
-  streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
+  if args.device == "cpu":
+    streamline_generator = LocalTracking(dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=args.step_size)
+    streamlines = [s for s in streamline_generator]
+  else:
+    streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
   te = time.time()
   streamline_time += (te-ts)
   print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
@@ -228,24 +259,16 @@ def get_img(ep2_seq):
 
   # Save tracklines file
   if args.output_prefix:
-    #print(seed_mask)
-    #print(streamlines)
-    if args.use_fast_write:
-      prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-      ts = time.time()
-      #gpu_tracker.dump_streamlines(prefix, voxel_order, roi.shape, roi.header.get_zooms(), img.affine)
-      gpu_tracker.dump_streamlines(prefix, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
-      te = time.time()
-      print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
+    ts = time.time()
+    if (args.device == "gpu") and args.use_fast_write:
+      fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
+      gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
     else:
       fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-      ts = time.time()
-      #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-      #save_tractogram(fname, streamlines)
-      sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
+      sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
       save_tractogram(sft, fname)
-      te = time.time()
-      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+    te = time.time()
+    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
     io_time += (te-ts)
 
 t2 = time.time()

From 9fe8c5fec13488e2250a9022dd0fe81008c90a1e Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 22 Jul 2024 13:53:07 -0700
Subject: [PATCH 22/35] better naming

---
 cuslines/ptt.cu | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index e3fe7b1..6d1a25f 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -24,32 +24,30 @@ template<typename REAL_T, typename REAL3_T>
 __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T *__restrict__ pmf,
                             const int dimx, const int dimy, const int dimz, const int dimt,
                             const REAL3_T *__restrict__ odf_sphere_vertices) {
-    const REAL_T HALF = static_cast<REAL_T>(0.5);
-
     int closest_odf_idx = 0;
-    REAL_T __tmp = 0;
+    REAL_T __max_cos = 0;
     for (int ii = 0; ii < dimt; ii++) {
         REAL_T cos_sim = FABS(
             odf_sphere_vertices[ii].x * frame[0] \
             + odf_sphere_vertices[ii].y * frame[1] \
             + odf_sphere_vertices[ii].z * frame[2]);
-        if (cos_sim > __tmp) {
-            __tmp = cos_sim;
+        if (cos_sim > __max_cos) {
+            __max_cos = cos_sim;
             closest_odf_idx = ii;
         }
     }
 
     const int rv = trilinear_interp_d<1>(dimx, dimy, dimz, dimt, closest_odf_idx,
-                                         pmf, pos, &__tmp);
+                                         pmf, pos, &__max_cos);
 
 #if 0
-    printf("inerpolated %f at %f, %f, %f, %i\n", __tmp, pos.x, pos.y, pos.z, closest_odf_idx);
+    printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx);
 #endif
 
     if (rv != 0) {
         return -1;
     } else {
-        return __tmp;
+        return __max_cos;
     }
 }
 
@@ -159,7 +157,7 @@ __device__ REAL_T calculate_data_support_d(REAL_T support,
     probing_pos.y = pos.y;
     probing_pos.z = pos.z;
 
-    for (int ii = 0; ii < PROBE_QUALITY; ii++) {
+    for (int ii = 0; ii < PROBE_QUALITY; ii++) { // we spend about 2/3 of our time in this loop when doing PTT
         propogate_frame_d(probing_prop, probing_frame, direc);
 
         probing_pos.x += direc[0];

From 8022a543fafd1223f86fafded00b8807a29e97ea Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Fri, 26 Jul 2024 10:00:44 -0700
Subject: [PATCH 23/35] in process of refactoring algorithms into different
 kernels

---
 cuslines/generate_streamlines_cuda.cu | 417 ++++++++++++++++----------
 1 file changed, 255 insertions(+), 162 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 22c9180..f5828a3 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -254,16 +254,15 @@ __device__ void fit_csa(const int delta_nr,
         __syncwarp(WMASK);
 }
 
-template<int BDIM_X, typename REAL_T>
-__device__ void fit_model_coef(const ModelType model_type,
-                               const int delta_nr, // delta_nr is number of ODF directions
+template<int BDIM_X, ModelType MODEL_T, typename REAL_T>
+__device__ void fit_model_coef(const int delta_nr, // delta_nr is number of ODF directions
                                const int hr_side, // hr_side is number of data directions
                                const REAL_T *__restrict__ delta_q,
                                const REAL_T *__restrict__ delta_b, // these are fit matrices the model can use, different for each model
                                const REAL_T *__restrict__ __msk_data_sh, // __msk_data_sh is the part of the data currently being operated on by this block
                                REAL_T *__restrict__ __h_sh, // these last two are modifications to the coefficients that will be returned
                                REAL_T *__restrict__ __r_sh) {
-        switch(model_type) {
+        switch(MODEL_T) {
                 case OPDT:
                         fit_opdt<BDIM_X>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
                         break;
@@ -918,11 +917,11 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
 template<int BDIM_X,
          int BDIM_Y,
          int NATTEMPTS,
+         ModelType MODEL_T,
          typename REAL_T,
          typename REAL3_T>
 __device__ int get_direction_boot_d(
                                 curandStatePhilox4_32_10_t *st,
-                                const ModelType model_type,
                                 const REAL_T max_angle,
                                 const REAL_T min_signal,
                                 const REAL_T relative_peak_thres,
@@ -1069,7 +1068,7 @@ __device__ int get_direction_boot_d(
 			maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 			__syncwarp(WMASK);
 
-                        fit_model_coef<BDIM_X>(model_type, delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
+                        fit_model_coef<BDIM_X, MODEL_T>(delta_nr, hr_side, delta_q, delta_b, __msk_data_sh, __h_sh, __r_sh);
 
                         // __r_sh[tidy] <- python 'coef'
 
@@ -1190,10 +1189,10 @@ __device__ int check_point_d(const REAL_T tc_threshold,
 
 template<int BDIM_X,
          int BDIM_Y,
+         ModelType MODEL_T,
          typename REAL_T,
          typename REAL3_T>
 __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
-			 const ModelType model_type,
 			 const REAL_T max_angle,
 			 const REAL_T min_signal,
 			 const REAL_T tc_threshold,
@@ -1253,7 +1252,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
         __syncwarp(WMASK);
 
         int step_frac;
-        if (model_type == PTT) {
+        if (MODEL_T == PTT) {
                 step_frac = STEP_FRAC; 
         } else {
                 step_frac = 1; // STEP_FRAC could be useful in other models
@@ -1262,7 +1261,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
         int i;
         for(i = 1; i < MAX_SLINE_LEN*step_frac; i++) {
                 int ndir;
-                if (model_type == PROB) {
+                if (MODEL_T == PROB) {
                         ndir = get_direction_prob_d<BDIM_X,
                                                     BDIM_Y,
                                                     0>(
@@ -1278,7 +1277,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                                         sphere_edges,
                                                         num_edges,
                                                         __shDir);
-                } else if (model_type == PTT) {
+                } else if (MODEL_T == PTT) {
                         ndir = get_direction_ptt_d<BDIM_X,
                                                    BDIM_Y,
                                                    0>(
@@ -1295,9 +1294,9 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                         // call get_direction_boot_d() with NATTEMPTS=5
                         ndir = get_direction_boot_d<BDIM_X,
                                                     BDIM_Y,
-                                                    5>(
+                                                    5,
+                                                    MODEL_T>(
                                                         st,
-                                                        model_type,
                                                         max_angle,
                                                         min_signal,
                                                         relative_peak_thres,
@@ -1373,13 +1372,13 @@ template<int BDIM_X,
          int BDIM_Y,
          typename REAL_T,
          typename REAL3_T>
-__global__ void getNumStreamlines_k(const ModelType model_type,
+__global__ void getNumStreamlinesBoot_k(
+                                    const ModelType model_type,
                                     const REAL_T max_angle,
-				    const REAL_T min_signal,
-				    const REAL_T relative_peak_thres,
-				    const REAL_T min_separation_angle,
-				    const long long rndSeed,
-                                    const int rndOffset,
+				                    const REAL_T min_signal,
+				                    const REAL_T relative_peak_thres,
+				                    const REAL_T min_separation_angle,
+				                    const long long rndSeed,
                                     const int nseed,
                                     const REAL3_T *__restrict__ seeds,
                                     const int dimx,
@@ -1389,17 +1388,17 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
                                     const REAL_T *__restrict__ dataf,
                                     const REAL_T *__restrict__ H,
                                     const REAL_T *__restrict__ R,
-				    const int delta_nr,
+				                    const int delta_nr,
                                     const REAL_T *__restrict__ delta_b,
                                     const REAL_T *__restrict__ delta_q,
                                     const int  *__restrict__ b0s_mask, // change to int
-				    const int samplm_nr,
+				                    const int samplm_nr,
                                     const REAL_T *__restrict__ sampling_matrix,
                                     const REAL3_T *__restrict__ sphere_vertices,
                                     const int2 *__restrict__ sphere_edges,
                                     const int num_edges,
-                                          REAL3_T *__restrict__ shDir0,
-                                          int *slineOutOff) {
+                                    REAL3_T *__restrict__ shDir0,
+                                    int *slineOutOff) {
 
         const int tidx = threadIdx.x;
         const int slid = blockIdx.x*blockDim.y + threadIdx.y;
@@ -1430,28 +1429,41 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
 	//}
 
         int ndir;
-        if ((model_type == PROB) || (model_type == PTT)) {
-                ndir = get_direction_prob_d<BDIM_X,
+        switch(model_type) {
+            case OPDT:
+                ndir = get_direction_boot_d<BDIM_X,
                                             BDIM_Y,
-                                            1>(
+                                            1,
+                                            OPDT>(
                                                 &st,
-                                                dataf,
                                                 max_angle,
+                                                min_signal,
                                                 relative_peak_thres,
                                                 min_separation_angle,
                                                 MAKE_REAL3(0,0,0),
-                                                dimx, dimy, dimz, dimt,
+                                                dimx, dimy, dimz, dimt, dataf,
+                                                b0s_mask /* !dwi_mask */,
                                                 seed,
+                                                H, R,
+                                                // model unused
+                                                // max_angle, pmf_threshold from global defines
+                                                // b0s_mask already passed
+                                                // min_signal from global defines
+                                                delta_nr,
+                                                delta_b, delta_q, // fit_matrix
+                                                samplm_nr,
+                                                sampling_matrix,
                                                 sphere_vertices,
                                                 sphere_edges,
                                                 num_edges,
                                                 __shDir);
-        } else {
+                break;
+            case CSA:
                 ndir = get_direction_boot_d<BDIM_X,
                                             BDIM_Y,
-                                            1>(
+                                            1,
+                                            CSA>(
                                                 &st,
-                                                model_type,
                                                 max_angle,
                                                 min_signal,
                                                 relative_peak_thres,
@@ -1473,17 +1485,69 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
                                                 sphere_edges,
                                                 num_edges,
                                                 __shDir);
+                break;
+            default:
+                printf("FATAL: Invalid Model Type.\n");
+                break;
         }
+
         if (tidx == 0) {
                 slineOutOff[slid] = ndir;
+        }
 
-		//if (!tidx && !threadIdx.y && !blockIdx.x) {
-		//	printf("ndir: %d\n", ndir);
-		//	for(int i = 0; i < ndir; i++) {
-		//		printf("%f %f %f\n", __shDir[i].x, __shDir[i].y, __shDir[i].z);	
-		//	}
-		//}
+        return;
+}
+
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
+				                        const REAL_T relative_peak_thres,
+				                        const REAL_T min_separation_angle,
+				                        const long long rndSeed,
+                                        const int nseed,
+                                        const REAL3_T *__restrict__ seeds,
+                                        const int dimx,
+                                        const int dimy,
+                                        const int dimz,
+                                        const int dimt,
+                                        const REAL_T *__restrict__ dataf,
+                                        const REAL3_T *__restrict__ sphere_vertices,
+                                        const int2 *__restrict__ sphere_edges,
+                                        const int num_edges,
+                                        REAL3_T *__restrict__ shDir0,
+                                        int *slineOutOff) {
+
+        const int tidx = threadIdx.x;
+        const int slid = blockIdx.x*blockDim.y + threadIdx.y;
+        const size_t gid = blockIdx.x * blockDim.y * blockDim.x + blockDim.x * threadIdx.y + threadIdx.x;
 
+        if (slid >= nseed) {
+                return;
+        }
+
+        REAL3_T *__restrict__ __shDir = shDir0+slid*dimt;
+        curandStatePhilox4_32_10_t st;
+        curand_init(rndSeed, gid, 0, &st);
+
+        int ndir = get_direction_prob_d<BDIM_X,
+                                        BDIM_Y,
+                                        1>(
+                                                &st,
+                                                dataf,
+                                                max_angle,
+                                                relative_peak_thres,
+                                                min_separation_angle,
+                                                MAKE_REAL3(0,0,0),
+                                                dimx, dimy, dimz, dimt,
+                                                seeds[slid],
+                                                sphere_vertices,
+                                                sphere_edges,
+                                                num_edges,
+                                                __shDir);
+        if (tidx == 0) {
+                slineOutOff[slid] = ndir;
         }
 
         return;
@@ -1491,9 +1555,10 @@ __global__ void getNumStreamlines_k(const ModelType model_type,
 
 template<int BDIM_X,
          int BDIM_Y,
+         ModelType MODEL_T,
          typename REAL_T,
          typename REAL3_T>
-__global__ void genStreamlinesMerge_k(const ModelType model_type,
+__global__ void genStreamlinesMerge_k(
 				      const REAL_T max_angle,
 				      const REAL_T min_signal,
 				      const REAL_T tc_threshold,
@@ -1580,7 +1645,7 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                 }
 #endif
 
-                if (model_type == PTT) {
+                if (MODEL_T == PTT) {
                         bool init_norm_success;
                         // Here we probabilistic find a good intial normal for this initial direction 
                         init_norm_success = (bool) get_direction_ptt_d<BDIM_X,
@@ -1646,31 +1711,31 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
 
                 int stepsB;
                 const int tissue_classB = tracker_d<BDIM_X,
-                                                    BDIM_Y>(&st,
-							    model_type,
-						            max_angle,
-						            min_signal,
-							    tc_threshold,
-							    step_size,
-							    relative_peak_thres,
-							    min_separation_angle,
-                                                            seed,
-                                                            MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
-                                                            MAKE_REAL3(1, 1, 1),
-                                                            dimx, dimy, dimz, dimt, dataf,
-                                                            b0s_mask,
-                                                            H, R,
-                                                            metric_map,
-							    delta_nr,
-                                                            delta_b, delta_q, //fit_matrix
-							    samplm_nr,
-                                                            sampling_matrix,
-                                                            sphere_vertices,
-                                                            sphere_edges,
-                                                            num_edges,
-                                                            shDir1 + slid*samplm_nr,
-                                                            &stepsB,
-                                                            currSline);
+                                                    BDIM_Y,
+                                                    MODEL_T>(&st,
+		                        				             max_angle,
+			                        			             min_signal,
+			                            				     tc_threshold,
+	                            						     step_size,
+	                            						     relative_peak_thres,
+	                            						     min_separation_angle,
+                                                             seed,
+                                                             MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+                                                             MAKE_REAL3(1, 1, 1),
+                                                             dimx, dimy, dimz, dimt, dataf,
+                                                             b0s_mask,
+                                                             H, R,
+                                                             metric_map,
+							                                 delta_nr,
+                                                             delta_b, delta_q, //fit_matrix
+		                            					     samplm_nr,
+                                                             sampling_matrix,
+                                                             sphere_vertices,
+                                                             sphere_edges,
+                                                             num_edges,
+                                                             shDir1 + slid*samplm_nr,
+                                                             &stepsB,
+                                                             currSline);
                 //if (tidx == 0) {
                 //        slineLenB[slineOff] = stepsB;
                 //}
@@ -1684,7 +1749,7 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
                         }
                 }
 
-                if (model_type == PTT) {
+                if (MODEL_T == PTT) {
                         __syncwarp(WMASK);
                         if (tidx == 0) {
                                 // Setup the forwards step using parameters saved from backwards step
@@ -1700,31 +1765,31 @@ __global__ void genStreamlinesMerge_k(const ModelType model_type,
 
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
-                                                    BDIM_Y>(&st,
-							    model_type,
-						            max_angle,
-						            min_signal,
-							    tc_threshold,
-							    step_size,
-							    relative_peak_thres,
-							    min_separation_angle,
-                                                            seed,
-                                                            first_step,
-                                                            MAKE_REAL3(1, 1, 1),
-                                                            dimx, dimy, dimz, dimt, dataf,
-                                                            b0s_mask,
-                                                            H, R,
-                                                            metric_map,
-							    delta_nr,
-                                                            delta_b, delta_q, //fit_matrix
-							    samplm_nr,
-                                                            sampling_matrix,
-                                                            sphere_vertices,
-                                                            sphere_edges,
-                                                            num_edges,
-                                                            shDir1 + slid*samplm_nr,
-                                                            &stepsF,
-                                                            currSline + stepsB-1);
+                                                    BDIM_Y,
+                                                    MODEL_T>(&st,
+     	                    					             max_angle,
+				                        		             min_signal,
+		                            					     tc_threshold,
+	                            						     step_size,
+			                            				     relative_peak_thres,
+				                            			     min_separation_angle,
+                                                             seed,
+                                                             first_step,
+                                                             MAKE_REAL3(1, 1, 1),
+                                                             dimx, dimy, dimz, dimt, dataf,
+                                                             b0s_mask,
+                                                             H, R,
+                                                             metric_map,
+			                            				     delta_nr,
+                                                             delta_b, delta_q, //fit_matrix
+			                            				     samplm_nr,
+                                                             sampling_matrix,
+                                                             sphere_vertices,
+                                                             sphere_edges,
+                                                             num_edges,
+                                                             shDir1 + slid*samplm_nr,
+                                                             &stepsF,
+                                                             currSline + stepsB-1);
                 if (tidx == 0) {
                         slineLen[slineOff] = stepsB-1+stepsF;
                 }
@@ -1807,35 +1872,57 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
     dim3 grid(DIV_UP(nseeds_gpu, THR_X_BL/THR_X_SL));
 
     // Precompute number of streamlines before allocating memory
-    getNumStreamlines_k<THR_X_SL,
-                        THR_X_BL/THR_X_SL>
-                        <<<grid, block, shSizeGNS>>>(model_type,
-                                                     max_angle,
-						     min_signal,
-						     relative_peak_thresh,
-						     min_separation_angle,
-						     rng_seed,
-						     rng_offset + n*nseeds_per_gpu,
-						     nseeds_gpu,
-						     reinterpret_cast<const REAL3 *>(seeds_d[n]),
-						     dimx,
-						     dimy,
-						     dimz,
-						     dimt,
-						     dataf_d[n],
-						     H_d[n],
-						     R_d[n],
-						     delta_nr,
-						     delta_b_d[n],
-						     delta_q_d[n],
-						     b0s_mask_d[n],
-						     samplm_nr,
-						     sampling_matrix_d[n],
-						     reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
-						     reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-						     nedges,
-						     shDirTemp0_d[n],
-						     slinesOffs_d[n]);
+    if (!((model_type == PTT) || (model_type == PROB))) {
+        getNumStreamlinesBoot_k<THR_X_SL,
+                                THR_X_BL/THR_X_SL>
+                                <<<grid, block, shSizeGNS>>>(
+                                        model_type,
+                                        max_angle,
+                                        min_signal,
+                                        relative_peak_thresh,
+                                        min_separation_angle,
+                                        rng_seed,
+                                        nseeds_gpu,
+                                        reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                                        dimx,
+                                        dimy,
+                                        dimz,
+                                        dimt,
+                                        dataf_d[n],
+                                        H_d[n],
+                                        R_d[n],
+                                        delta_nr,
+                                        delta_b_d[n],
+                                        delta_q_d[n],
+                                        b0s_mask_d[n],
+                                        samplm_nr,
+                                        sampling_matrix_d[n],
+                                        reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
+                                        reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                                        nedges,
+                                        shDirTemp0_d[n],
+                                        slinesOffs_d[n]);
+    } else {
+        getNumStreamlinesProb_k<THR_X_SL,
+                                THR_X_BL/THR_X_SL>
+                                <<<grid, block, shSizeGNS>>>(
+                                        max_angle,
+                                        relative_peak_thresh,
+                                        min_separation_angle,
+                                        rng_seed,
+                                        nseeds_gpu,
+                                        reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                                        dimx,
+                                        dimy,
+                                        dimz,
+                                        dimt,
+                                        dataf_d[n],
+                                        reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
+                                        reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                                        nedges,
+                                        shDirTemp0_d[n],
+                                        slinesOffs_d[n]);
+    }
   }
 
   std::vector<int> slinesOffs_h;
@@ -1921,52 +2008,58 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
     std::cerr << "Generating " << nSlines_h[n] << " streamlines (from " << nseeds_gpu << " seeds)" << std::endl; 
 #endif
 
-    if (model_type == PROB) {
-        // Shared memory requirements are smaller for probabilistic for main run
-        // than for preliminary run
-        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt; 
-    } else if (model_type == PTT) {
-        // Potentially smaller shared memory requirements for PTT for main run
-        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(DISC_VERT_CNT + DISC_FACE_CNT + 2*(9 + 1));
+    //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
+    switch(model_type) {
+        case OPDT:
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, OPDT> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        case CSA:
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, CSA> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        case PROB:
+            // Shared memory requirements are smaller for probabilistic for main run
+            // than for preliminary run
+            shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt;
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PROB> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        case PTT:
+            // Potentially smaller shared memory requirements for PTT for main run
+            shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(DISC_VERT_CNT + DISC_FACE_CNT + 2*(9 + 1));
+            genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PTT> <<<grid, block, shSizeGNS, streams[n]>>>(
+                max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
+                rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
+                dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
+                b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
+                reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+            break;
+
+        default:
+            printf("FATAL: Invalid Model Type.\n");
+            break;
     }
 
-    //fprintf(stderr, "Launching kernel with %u blocks of size (%u, %u)\n", grid.x, block.x, block.y);
-    genStreamlinesMerge_k<THR_X_SL,
-                          THR_X_BL/THR_X_SL>
-                          <<<grid, block, shSizeGNS, streams[n]>>>(model_type,
-								   max_angle,
-								   min_signal,
-								   tc_threshold,
-								   step_size,
-								   relative_peak_thresh,
-								   min_separation_angle,
-								   rng_seed,
-								   rng_offset + n*nseeds_per_gpu,
-								   nseeds_gpu,
-								   reinterpret_cast<const REAL3 *>(seeds_d[n]),
-								   dimx,
-								   dimy,
-								   dimz,
-								   dimt,
-								   dataf_d[n],
-								   H_d[n],
-								   R_d[n],
-								   delta_nr,
-								   delta_b_d[n],
-								   delta_q_d[n],
-								   b0s_mask_d[n],
-								   metric_map_d[n],
-								   samplm_nr,
-								   sampling_matrix_d[n],
-								   reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]),
-								   reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-								   nedges,
-								   slinesOffs_d[n],
-								   shDirTemp0_d[n],
-								   shDirTemp1_d[n],
-								   slineSeed_d[n],
-								   slineLen_d[n],
-								   sline_d[n]);
     CHECK_ERROR("genStreamlinesMerge_k");
   }
 

From 49ebc587a3d479988e2a73f9c7ad40fa8adda47f Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Fri, 26 Jul 2024 12:00:27 -0700
Subject: [PATCH 24/35] some minor changes

---
 cuslines/generate_streamlines_cuda.cu | 71 ++++++++++++---------------
 cuslines/ptt.cuh                      |  2 +-
 2 files changed, 32 insertions(+), 41 deletions(-)

diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index f5828a3..5f66218 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -1226,9 +1226,9 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                          const REAL3_T *__restrict__ sphere_vertices,
                          const int2 *__restrict__ sphere_edges,
                          const int num_edges,
-                               REAL3_T *__restrict__ __shDir,
-                               int *__restrict__ nsteps,
-                               REAL3_T *__restrict__ streamline) {
+                         REAL3_T *__restrict__ __shDir,
+                         int *__restrict__ nsteps,
+                         REAL3_T *__restrict__ streamline) {
 
         const int tidx = threadIdx.x;
         //const int tidy = threadIdx.y;
@@ -1375,10 +1375,10 @@ template<int BDIM_X,
 __global__ void getNumStreamlinesBoot_k(
                                     const ModelType model_type,
                                     const REAL_T max_angle,
-				                    const REAL_T min_signal,
-				                    const REAL_T relative_peak_thres,
-				                    const REAL_T min_separation_angle,
-				                    const long long rndSeed,
+		                    const REAL_T min_signal,
+		                    const REAL_T relative_peak_thres,
+	                            const REAL_T min_separation_angle,
+		                    const long long rndSeed,
                                     const int nseed,
                                     const REAL3_T *__restrict__ seeds,
                                     const int dimx,
@@ -1388,11 +1388,11 @@ __global__ void getNumStreamlinesBoot_k(
                                     const REAL_T *__restrict__ dataf,
                                     const REAL_T *__restrict__ H,
                                     const REAL_T *__restrict__ R,
-				                    const int delta_nr,
+		                    const int delta_nr,
                                     const REAL_T *__restrict__ delta_b,
                                     const REAL_T *__restrict__ delta_q,
                                     const int  *__restrict__ b0s_mask, // change to int
-				                    const int samplm_nr,
+		                    const int samplm_nr,
                                     const REAL_T *__restrict__ sampling_matrix,
                                     const REAL3_T *__restrict__ sphere_vertices,
                                     const int2 *__restrict__ sphere_edges,
@@ -1503,9 +1503,9 @@ template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
 __global__ void getNumStreamlinesProb_k(const REAL_T max_angle,
-				                        const REAL_T relative_peak_thres,
-				                        const REAL_T min_separation_angle,
-				                        const long long rndSeed,
+				        const REAL_T relative_peak_thres,
+				        const REAL_T min_separation_angle,
+				        const long long rndSeed,
                                         const int nseed,
                                         const REAL3_T *__restrict__ seeds,
                                         const int dimx,
@@ -1713,12 +1713,12 @@ __global__ void genStreamlinesMerge_k(
                 const int tissue_classB = tracker_d<BDIM_X,
                                                     BDIM_Y,
                                                     MODEL_T>(&st,
-		                        				             max_angle,
-			                        			             min_signal,
-			                            				     tc_threshold,
-	                            						     step_size,
-	                            						     relative_peak_thres,
-	                            						     min_separation_angle,
+		                		             max_angle,
+			        		             min_signal,
+			        			     tc_threshold,
+	                        			     step_size,
+	                        			     relative_peak_thres,
+	                        			     min_separation_angle,
                                                              seed,
                                                              MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
                                                              MAKE_REAL3(1, 1, 1),
@@ -1726,9 +1726,9 @@ __global__ void genStreamlinesMerge_k(
                                                              b0s_mask,
                                                              H, R,
                                                              metric_map,
-							                                 delta_nr,
+			                                     delta_nr,
                                                              delta_b, delta_q, //fit_matrix
-		                            					     samplm_nr,
+		                			     samplm_nr,
                                                              sampling_matrix,
                                                              sphere_vertices,
                                                              sphere_edges,
@@ -1767,12 +1767,12 @@ __global__ void genStreamlinesMerge_k(
                 const int tissue_classF = tracker_d<BDIM_X,
                                                     BDIM_Y,
                                                     MODEL_T>(&st,
-     	                    					             max_angle,
-				                        		             min_signal,
-		                            					     tc_threshold,
-	                            						     step_size,
-			                            				     relative_peak_thres,
-				                            			     min_separation_angle,
+     	                    			             max_angle,
+			        		             min_signal,
+		        				     tc_threshold,
+	                				     step_size,
+			    				     relative_peak_thres,
+			            			     min_separation_angle,
                                                              seed,
                                                              first_step,
                                                              MAKE_REAL3(1, 1, 1),
@@ -1780,9 +1780,9 @@ __global__ void genStreamlinesMerge_k(
                                                              b0s_mask,
                                                              H, R,
                                                              metric_map,
-			                            				     delta_nr,
+			        			     delta_nr,
                                                              delta_b, delta_q, //fit_matrix
-			                            				     samplm_nr,
+			        			     samplm_nr,
                                                              sampling_matrix,
                                                              sphere_vertices,
                                                              sphere_edges,
@@ -1851,18 +1851,6 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
   int n32dimt = ((dimt+31)/32)*32;
 
   size_t shSizeGNS;
-  if (model_type == PROB) {
-    shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt;
-  } else if (model_type == PTT) {
-    shSizeGNS = MAX(
-      sizeof(REAL)*(THR_X_BL/THR_X_SL)*(DISC_VERT_CNT + DISC_FACE_CNT + 2*(9 + 1)),
-      sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt);
-  } else {
-    shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
-                sizeof(int)*samplm_nr;						        // for peak_directions_d	
-  }
-  //printf("shSizeGNS: %zu\n", shSizeGNS);
-
   //#pragma omp parallel for
   for (int n = 0; n < ngpus; ++n) {
     CHECK_CUDA(cudaSetDevice(n));
@@ -1873,6 +1861,8 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
 
     // Precompute number of streamlines before allocating memory
     if (!((model_type == PTT) || (model_type == PROB))) {
+        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(2*n32dimt + 2*MAX(n32dimt, samplm_nr)) + // for get_direction_boot_d
+                    sizeof(int)*samplm_nr;						      // for peak_directions_d	
         getNumStreamlinesBoot_k<THR_X_SL,
                                 THR_X_BL/THR_X_SL>
                                 <<<grid, block, shSizeGNS>>>(
@@ -1903,6 +1893,7 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
                                         shDirTemp0_d[n],
                                         slinesOffs_d[n]);
     } else {
+        shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*n32dimt + sizeof(int)*(THR_X_BL/THR_X_SL)*n32dimt;
         getNumStreamlinesProb_k<THR_X_SL,
                                 THR_X_BL/THR_X_SL>
                                 <<<grid, block, shSizeGNS>>>(
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index ae9c5ea..b45cc9b 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -7,7 +7,7 @@
 #define STEP_FRAC 20 // divides output step size (usually 0.5) into this many internal steps
 #define PROBE_FRAC 2 // divides output step size (usually 0.5) to find probe length
 #define PROBE_QUALITY 4
-#define SAMPLING_QUALITY 4 // can be 2-7, should be at least 4 to take advantage of GPUs
+#define SAMPLING_QUALITY 4 // can be 2-7
 #define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
 #define ALLOW_WEAK_LINK 1
 #define INIT_FRAME_TRIES 16

From 8e641008b13f2969e7772882ccc3ad4c2380d25a Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 30 Jul 2024 15:55:50 -0700
Subject: [PATCH 25/35] fix bug in prob; use statically declared shared memory
 where possible in ptt; refactor some code to be more readable

---
 cuslines/Makefile                     |   4 +-
 cuslines/generate_streamlines_cuda.cu | 253 +++++++++++++-------------
 cuslines/ptt.cu                       |  91 +++++++--
 cuslines/utils.cu                     |  73 ++++----
 4 files changed, 247 insertions(+), 174 deletions(-)

diff --git a/cuslines/Makefile b/cuslines/Makefile
index 4fc427c..1061a16 100644
--- a/cuslines/Makefile
+++ b/cuslines/Makefile
@@ -27,13 +27,13 @@
 
 
 CUDA_HOME=/usr/local/cuda
-CUDACC=$(CUDA_HOME)/bin/nvcc
+CUDACC=$(CUDA_HOME)/bin/nvcc # -G -g -dopt=on
 CXX=g++
 LD=g++
 
 CXXFLAGS= -c -O3 -std=c++11 -fopenmp -fPIC `python3 -m pybind11 --includes` -I$(CUDA_HOME)/include
 
-SMS ?= 60 70
+SMS ?= 70
 CUDA_ARCH = $(foreach SM,$(SMS),-gencode arch=compute_$(SM),code=sm_$(SM))
 LASTSM := $(lastword $(sort $(SMS)))
 CUDA_ARCH += -gencode arch=compute_$(LASTSM),code=compute_$(LASTSM)
diff --git a/cuslines/generate_streamlines_cuda.cu b/cuslines/generate_streamlines_cuda.cu
index 5f66218..374c7c1 100644
--- a/cuslines/generate_streamlines_cuda.cu
+++ b/cuslines/generate_streamlines_cuda.cu
@@ -732,21 +732,13 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
 
 	const int n32dimt = ((dimt+31)/32)*32;
 
-	extern REAL_T __shared__ __sh[];
-
-	REAL_T *__pmf_data_sh = reinterpret_cast<REAL_T *>(__sh);
-        int *__shInd;
-        if (IS_START) {
-	        __shInd = reinterpret_cast<int *>(__pmf_data_sh + BDIM_Y*n32dimt);
-        }
-
-	__pmf_data_sh += tidy*n32dimt;
-        if (IS_START) {
-	        __shInd += tidy*n32dimt;
-        }
+	extern __shared__ REAL_T __sh[];
+        REAL_T *__pmf_data_sh = __sh + tidy*n32dimt;
 
         // pmf = self.pmf_gen.get_pmf_c(&point[0], pmf)
-        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, 0, pmf, point, __pmf_data_sh);
+        __syncwarp(WMASK);
+        const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, pmf, point, __pmf_data_sh);
+        __syncwarp(WMASK);
         if (rv != 0) {
                 return 0;
         }
@@ -761,6 +753,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         // for i in range(_len):
         //     if pmf[i] < absolute_pmf_threshold:
         //         pmf[i] = 0.0
+        #pragma unroll
         for(int i = tidx; i < dimt; i += BDIM_X) {
                 if (__pmf_data_sh[i] < absolpmf_thresh) {
                         __pmf_data_sh[i] = 0.0;
@@ -769,6 +762,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
         __syncwarp(WMASK);
 
         if (IS_START) {
+                int *__shInd = reinterpret_cast<int *>(__sh + BDIM_Y*n32dimt) + tidy*n32dimt;
                 return peak_directions_d<BDIM_X,
                                          BDIM_Y>(__pmf_data_sh,
                                                  dirs,
@@ -788,9 +782,19 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                                 printf("absolpmf_thresh %10.8f\n", absolpmf_thresh);
                                 printf("--->            dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
                                 printf("--->            point %10.8f, %10.8f, %10.8f\n", point.x, point.y, point.z);
-                                if (sqrt(dir.x*dir.x + dir.y*dir.y + dir.z*dir.z) >= 1.05){
-                                        printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dir.x, dir.y, dir.z);
-                                }
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 15) {
+                                printf("absolpmf_thresh %10.8f l15\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f l15\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f l15\n", point.x, point.y, point.z);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 31) {
+                                printArray("__pmf_data_sh initial l31", 8, dimt, __pmf_data_sh);
+                                printf("absolpmf_thresh %10.8f l31\n", absolpmf_thresh);
+                                printf("--->            dir %10.8f, %10.8f, %10.8f l31\n", dir.x, dir.y, dir.z);
+                                printf("--->            point %10.8f, %10.8f, %10.8f l31\n", point.x, point.y, point.z);
                         }
                         __syncwarp(WMASK);
                 #endif
@@ -810,6 +814,7 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                 //                 pmf[i] = 0
                 const REAL_T cos_similarity = COS(max_angle);
 
+                #pragma unroll
                 for(int i = tidx; i < dimt; i += BDIM_X) {
                         const REAL_T dot = dir.x*sphere_vertices[i].x+
                                            dir.y*sphere_vertices[i].y+
@@ -853,15 +858,35 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                         __tmp = curand_uniform(st) * last_cdf;
                 }
                 REAL_T selected_cdf = __shfl_sync(WMASK, __tmp, 0, BDIM_X);
-
+// Both these implementations work
+#if 1
+                int low = 0;
+                int high = dimt - 1;
+                while ((high - low) >= BDIM_X) {
+                        const int mid = (low + high) / 2;
+                        if (__pmf_data_sh[mid] < selected_cdf) {
+                                low = mid;
+                        } else {
+                                high = mid;
+                        }
+                }
+                const bool __ballot = (low+tidx <= high) ? (selected_cdf < __pmf_data_sh[low+tidx]) : 0;
+                const int __msk = __ballot_sync(WMASK, __ballot);
+                const int indProb = low + __ffs(__msk) - 1;
+#else
                 int indProb = dimt - 1;
                 for (int ii = 0; ii < dimt; ii+=BDIM_X) {
-                        int __msk = __ballot_sync(WMASK, selected_cdf < __pmf_data_sh[ii+tidx]);
+                        int __is_greater = 0;
+                        if (ii+tidx < dimt) {
+                                __is_greater = selected_cdf < __pmf_data_sh[ii+tidx];
+                        }
+                        const int __msk = __ballot_sync(WMASK, __is_greater);
                         if (__msk != 0) {
                                 indProb = ii + __ffs(__msk) - 1;
                                 break;
                         }
                 }
+#endif
 
                 #ifdef DEBUG
                         __syncwarp(WMASK);
@@ -887,28 +912,37 @@ __device__ int get_direction_prob_d(curandStatePhilox4_32_10_t *st,
                         if ((dir.x * sphere_vertices[indProb].x +
                              dir.y * sphere_vertices[indProb].y +
                              dir.z * sphere_vertices[indProb].z) > 0) {
-                                dirs[0] = MAKE_REAL3(sphere_vertices[indProb].x,
-                                                     sphere_vertices[indProb].y,
-                                                     sphere_vertices[indProb].z);
+                                *dirs = MAKE_REAL3(sphere_vertices[indProb].x,
+                                                   sphere_vertices[indProb].y,
+                                                   sphere_vertices[indProb].z);
                         } else {
-                                dirs[0] = MAKE_REAL3(-sphere_vertices[indProb].x,
-                                                     -sphere_vertices[indProb].y,
-                                                     -sphere_vertices[indProb].z);
+                                *dirs = MAKE_REAL3(-sphere_vertices[indProb].x,
+                                                   -sphere_vertices[indProb].y,
+                                                   -sphere_vertices[indProb].z);
                         }
+                        // printf("direction addr write %p, slid %i\n", dirs, blockIdx.x*blockDim.y+threadIdx.y);
                 }
 
                 #ifdef DEBUG
+                        __syncwarp(WMASK);
                         if (tidx == 0) {
-                                if ((dirs[0].x == dir.x) && (dirs[0].y == dir.y) && (dirs[0].z == dir.z)) {
-                                printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dirs[0].x, dirs[0].y, dirs[0].z);
                                 printf("last_cdf %10.8f\n", last_cdf);
                                 printf("selected_cdf %10.8f\n", selected_cdf);
                                 printf("indProb %i out of %i\n", indProb, dimt);
-                                }
-                                if (sqrt(dirs[0].x*dirs[0].x + dirs[0].y*dirs[0].y + dirs[0].z*dirs[0].z) >= 1.1) {
-                                        printf("ERROR dir %10.8f, %10.8f, %10.8f\n", dirs[0].x, dirs[0].y, dirs[0].z);
-                                }
                         }
+                        __syncwarp(WMASK);
+                        if (tidx == 15) {
+                                printf("last_cdf %10.8f l15\n", last_cdf);
+                                printf("selected_cdf %10.8f l15\n", selected_cdf);
+                                printf("indProb %i out of %i l15\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
+                        if (tidx == 31) {
+                                printf("last_cdf %10.8f l31\n", last_cdf);
+                                printf("selected_cdf %10.8f l31\n", selected_cdf);
+                                printf("indProb %i out of %i l31\n", indProb, dimt);
+                        }
+                        __syncwarp(WMASK);
                 #endif
                 return 1;
         }
@@ -987,7 +1021,7 @@ __device__ int get_direction_boot_d(
         #pragma unroll
         for(int i = 0; i < NATTEMPTS; i++) {
 
-                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, 0, dataf, point, __vox_data_sh);
+                const int rv = trilinear_interp_d<BDIM_X>(dimx, dimy, dimz, dimt, -1, dataf, point, __vox_data_sh);
 
 		const int nmsk = maskGet<BDIM_X>(dimt, b0s_mask, __vox_data_sh, __msk_data_sh);
 
@@ -1201,6 +1235,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 			 const REAL_T min_separation_angle,
                          REAL3_T seed,
                          REAL3_T first_step,
+                         REAL_T* ptt_frame,
                          REAL3_T voxel_size,
                          const int dimx,
                          const int dimy,
@@ -1226,12 +1261,11 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                          const REAL3_T *__restrict__ sphere_vertices,
                          const int2 *__restrict__ sphere_edges,
                          const int num_edges,
-                         REAL3_T *__restrict__ __shDir,
                          int *__restrict__ nsteps,
                          REAL3_T *__restrict__ streamline) {
 
         const int tidx = threadIdx.x;
-        //const int tidy = threadIdx.y;
+        const int tidy = threadIdx.y;
 
         const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
@@ -1240,6 +1274,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 
         REAL3_T point = seed;
         REAL3_T direction = first_step;
+        __shared__ REAL3_T __sh_new_dir[BDIM_Y];
 
         if (tidx == 0) {
                 streamline[0] = point;
@@ -1276,7 +1311,7 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                                         sphere_vertices,
                                                         sphere_edges,
                                                         num_edges,
-                                                        __shDir);
+                                                        __sh_new_dir + tidy);
                 } else if (MODEL_T == PTT) {
                         ndir = get_direction_ptt_d<BDIM_X,
                                                    BDIM_Y,
@@ -1286,10 +1321,11 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                                         max_angle,
                                                         step_size,
                                                         direction,
+                                                        ptt_frame,
                                                         dimx, dimy, dimz, dimt,
                                                         point,
                                                         sphere_vertices,
-                                                        __shDir);
+                                                        __sh_new_dir + tidy);
                 } else {
                         // call get_direction_boot_d() with NATTEMPTS=5
                         ndir = get_direction_boot_d<BDIM_X,
@@ -1317,10 +1353,10 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
                                                         sphere_vertices,
                                                         sphere_edges,
                                                         num_edges,
-                                                        __shDir);
+                                                        __sh_new_dir + tidy);
                 }
                 __syncwarp(WMASK);
-                direction = __shDir[0];
+                direction = __sh_new_dir[tidy];
                 __syncwarp(WMASK);
 
                 if (ndir == 0) {
@@ -1351,6 +1387,37 @@ __device__ int tracker_d(curandStatePhilox4_32_10_t *st,
 
                 tissue_class = check_point_d<BDIM_X, BDIM_Y>(tc_threshold, point, dimx, dimy, dimz, metric_map);
 
+#if 0
+                __syncwarp(WMASK);
+                if (tidx == 0) {
+                        printf("step_size %f\n", step_size);
+                        printf("direction %f, %f, %f\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f\n", point.x, point.y, point.z);
+                        printf("tc %i\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+                if (tidx == 15) {
+                        printf("step_size %f l15\n", step_size);
+                        printf("direction %f, %f, %f l15\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i l15\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f l15\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f l15\n", point.x, point.y, point.z);
+                        printf("tc %i l15\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+                if (tidx == 31) {
+                        printf("step_size %f l31\n", step_size);
+                        printf("direction %f, %f, %f l31\n", direction.x, direction.y, direction.z);
+                        printf("direction addr read %p, slid %i l31\n", __shDir, blockIdx.x*blockDim.y+threadIdx.y);
+                        printf("voxel_size %f, %f, %f l31\n", voxel_size.x, voxel_size.y, voxel_size.z);
+                        printf("point %f, %f, %f l31\n", point.x, point.y, point.z);
+                        printf("tc %i l31\n", tissue_class);
+                }
+                __syncwarp(WMASK);
+#endif
+
                 if (tissue_class == ENDPOINT ||
                     tissue_class == INVALIDPOINT ||
                     tissue_class == OUTSIDEIMAGE) {
@@ -1588,7 +1655,6 @@ __global__ void genStreamlinesMerge_k(
                                       const int num_edges,
                                       const int    *__restrict__ slineOutOff,
                                             REAL3_T *__restrict__ shDir0,
-                                            REAL3_T *__restrict__ shDir1,
                                             int     *__restrict__ slineSeed,
                                             int     *__restrict__ slineLen,
                                             REAL3_T *__restrict__ sline) {
@@ -1601,8 +1667,8 @@ __global__ void genStreamlinesMerge_k(
         const int lid = (tidy*BDIM_X + tidx) % 32;
         const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-        extern REAL_T __shared__ __sh[];
-
+        __shared__ REAL_T frame_sh[((MODEL_T == PTT) ? BDIM_Y*18 : 1)]; // Only used by PTT, TODO: way to remove this in other cases
+        REAL_T* __ptt_frame = frame_sh + tidy*18;
 	// const int hr_side = dimt-1;
 
         curandStatePhilox4_32_10_t st;
@@ -1646,66 +1712,25 @@ __global__ void genStreamlinesMerge_k(
 #endif
 
                 if (MODEL_T == PTT) {
-                        bool init_norm_success;
-                        // Here we probabilistic find a good intial normal for this initial direction 
-                        init_norm_success = (bool) get_direction_ptt_d<BDIM_X,
-                                                                       BDIM_Y,
-                                                                       1>(
-                                                                                &st,
-                                                                                dataf,
-                                                                                max_angle,
-                                                                                step_size,
-                                                                                MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
-                                                                                dimx, dimy, dimz, dimt,
-                                                                                seed,
-                                                                                sphere_vertices,
-                                                                                shDir1 + slid*samplm_nr);
-
-                        __syncwarp(WMASK);
-                        if (!init_norm_success) {
-                                // try the other direction
-                                init_norm_success = (bool) get_direction_ptt_d<BDIM_X,
-                                                                               BDIM_Y,
-                                                                               1>(
-                                                                                        &st,
-                                                                                        dataf,
-                                                                                        max_angle,
-                                                                                        step_size,
-                                                                                        MAKE_REAL3(first_step.x, first_step.y, first_step.z),
-                                                                                        dimx, dimy, dimz, dimt,
-                                                                                        seed,
-                                                                                        sphere_vertices,
-                                                                                        shDir1 + slid*samplm_nr);
-                                __syncwarp(WMASK);
-                                if (!init_norm_success) { // This is rare
-#if 0
-                                        printf("PTT init seed failed");
-#endif
-                                        if (tidx == 0) {
-                                                slineLen[slineOff] = 1;
-                                                currSline[0] = seed;
-                                        }
-                                        slineOff += 1;
-                                        continue;
-                                } else {
-                                        if (tidx == 0) {
-                                                REAL_T *sh_frame_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
-                                                for (int ii = 0; ii < 9; ii++) {
-                                                        sh_frame_ptr[ii + 9*tidy] = -sh_frame_ptr[ii + 9*tidy]; // flip frame
-                                                }
-                                        }
-                                }
-                        }
-                        if (tidx == 0) {
-                                // We need to save the frames for the forwards step, and reverse them for the backwards step
-                                REAL_T *sh_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
-                                REAL_T *sh_ptr_to_save = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT + 9 + 1);
-                
-                                for (int ii = 0; ii < 9; ii++) {
-                                        sh_ptr_to_save[ii + 9*tidy] = -sh_ptr[ii + 9*tidy]; // frame
+                        if (!init_frame_ptt_d<BDIM_X, BDIM_Y>(
+                                &st,
+                                dataf,
+                                max_angle,
+                                step_size,
+                                first_step,
+                                dimx, dimy, dimz, dimt,
+                                seed,
+                                sphere_vertices,
+                                __ptt_frame
+                        )) { // this fails rarely
+                                if (tidx == 0) {
+                                        slineLen[slineOff] = 1;
+                                        currSline[0] = seed;
                                 }
+                                __syncwarp(WMASK);
+                                slineOff += 1;
+                                continue;
                         }
-                        __syncwarp(WMASK);
                 }
 
 
@@ -1721,6 +1746,7 @@ __global__ void genStreamlinesMerge_k(
 	                        			     min_separation_angle,
                                                              seed,
                                                              MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+                                                             __ptt_frame,
                                                              MAKE_REAL3(1, 1, 1),
                                                              dimx, dimy, dimz, dimt, dataf,
                                                              b0s_mask,
@@ -1733,7 +1759,6 @@ __global__ void genStreamlinesMerge_k(
                                                              sphere_vertices,
                                                              sphere_edges,
                                                              num_edges,
-                                                             shDir1 + slid*samplm_nr,
                                                              &stepsB,
                                                              currSline);
                 //if (tidx == 0) {
@@ -1749,20 +1774,6 @@ __global__ void genStreamlinesMerge_k(
                         }
                 }
 
-                if (MODEL_T == PTT) {
-                        __syncwarp(WMASK);
-                        if (tidx == 0) {
-                                // Setup the forwards step using parameters saved from backwards step
-                                REAL_T *sh_ptr = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT);
-                                REAL_T *sh_ptr_saved = reinterpret_cast<REAL_T *>(__sh) + BDIM_Y*(DISC_VERT_CNT + DISC_FACE_CNT + 9 + 1);
-
-                                for (int ii = 0; ii < 9; ii++) {
-                                        sh_ptr[ii + 9*tidy] = sh_ptr_saved[ii + 9*tidy]; // frame
-                                }
-                        }
-                        __syncwarp(WMASK);
-                }
-
                 int stepsF;
                 const int tissue_classF = tracker_d<BDIM_X,
                                                     BDIM_Y,
@@ -1775,6 +1786,7 @@ __global__ void genStreamlinesMerge_k(
 			            			     min_separation_angle,
                                                              seed,
                                                              first_step,
+                                                             __ptt_frame + 9,
                                                              MAKE_REAL3(1, 1, 1),
                                                              dimx, dimy, dimz, dimt, dataf,
                                                              b0s_mask,
@@ -1787,7 +1799,6 @@ __global__ void genStreamlinesMerge_k(
                                                              sphere_vertices,
                                                              sphere_edges,
                                                              num_edges,
-                                                             shDir1 + slid*samplm_nr,
                                                              &stepsF,
                                                              currSline + stepsB-1);
                 if (tidx == 0) {
@@ -1834,7 +1845,6 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
 
   std::vector<int*> slinesOffs_d(ngpus, nullptr);
   std::vector<REAL3*> shDirTemp0_d(ngpus, nullptr);
-  std::vector<REAL3*> shDirTemp1_d(ngpus, nullptr);
 
   //#pragma omp parallel for
   for (int n = 0; n < ngpus; ++n) {
@@ -1845,7 +1855,6 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
 
     CHECK_CUDA(cudaMalloc(&slinesOffs_d[n], sizeof(*slinesOffs_d[n])*(nseeds_gpu+1)));
     CHECK_CUDA(cudaMalloc(&shDirTemp0_d[n], sizeof(*shDirTemp0_d[n])*samplm_nr*grid.x*block.y));
-    CHECK_CUDA(cudaMalloc(&shDirTemp1_d[n], sizeof(*shDirTemp1_d[n])*samplm_nr*grid.x*block.y));
   }
 
   int n32dimt = ((dimt+31)/32)*32;
@@ -2008,7 +2017,7 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
                 dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
                 b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
                 reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
             break;
 
         case CSA:
@@ -2018,7 +2027,7 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
                 dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
                 b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
                 reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
             break;
 
         case PROB:
@@ -2031,19 +2040,18 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
                 dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
                 b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
                 reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
             break;
 
         case PTT:
-            // Potentially smaller shared memory requirements for PTT for main run
-            shSizeGNS = sizeof(REAL)*(THR_X_BL/THR_X_SL)*(DISC_VERT_CNT + DISC_FACE_CNT + 2*(9 + 1));
+            shSizeGNS = 0; // PTT uses exclusively static shared memory
             genStreamlinesMerge_k<THR_X_SL, THR_X_BL/THR_X_SL, PTT> <<<grid, block, shSizeGNS, streams[n]>>>(
                 max_angle, min_signal, tc_threshold, step_size, relative_peak_thresh, min_separation_angle,
                 rng_seed, rng_offset + n*nseeds_per_gpu, nseeds_gpu, reinterpret_cast<const REAL3 *>(seeds_d[n]),
                 dimx, dimy, dimz, dimt, dataf_d[n], H_d[n], R_d[n], delta_nr, delta_b_d[n], delta_q_d[n],
                 b0s_mask_d[n], metric_map_d[n], samplm_nr, sampling_matrix_d[n],
                 reinterpret_cast<const REAL3 *>(sphere_vertices_d[n]), reinterpret_cast<const int2 *>(sphere_edges_d[n]),
-                nedges, slinesOffs_d[n], shDirTemp0_d[n], shDirTemp1_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
+                nedges, slinesOffs_d[n], shDirTemp0_d[n], slineSeed_d[n], slineLen_d[n], sline_d[n]);
             break;
 
         default:
@@ -2078,7 +2086,6 @@ void generate_streamlines_cuda_mgpu(const ModelType model_type, const REAL max_a
     CHECK_CUDA(cudaFree(slineSeed_d[n]));
     CHECK_CUDA(cudaFree(slinesOffs_d[n]));
     CHECK_CUDA(cudaFree(shDirTemp0_d[n]));
-    CHECK_CUDA(cudaFree(shDirTemp1_d[n]));
     CHECK_CUDA(cudaFree(slineLen_d[n]));
     CHECK_CUDA(cudaFree(sline_d[n]));
   }
diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index 6d1a25f..1894579 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -37,8 +37,7 @@ __device__ REAL_T interp4_d(const REAL3_T pos, const REAL_T* frame, const REAL_T
         }
     }
 
-    const int rv = trilinear_interp_d<1>(dimx, dimy, dimz, dimt, closest_odf_idx,
-                                         pmf, pos, &__max_cos);
+    const int rv = trilinear_interp_d<THR_X_SL>(dimx, dimy, dimz, dimt, closest_odf_idx, pmf, pos, &__max_cos);
 
 #if 0
     printf("inerpolated %f at %f, %f, %f, %i\n", __max_cos, pos.x, pos.y, pos.z, closest_odf_idx);
@@ -186,6 +185,7 @@ __device__ int get_direction_ptt_d(
     const REAL_T max_angle,
     const REAL_T step_size,
     REAL3_T dir,
+    REAL_T *__frame_sh,
     const int dimx, const int dimy, const int dimz, const int dimt,
     REAL3_T pos,
     const REAL3_T *__restrict__ odf_sphere_vertices,
@@ -204,23 +204,20 @@ __device__ int get_direction_ptt_d(
     const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
     const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
-	extern REAL_T __shared__ __sh[];
+	REAL_T __shared__ face_cdf_sh[BDIM_Y*DISC_FACE_CNT];
+    REAL_T __shared__ vert_pdf_sh[BDIM_Y*DISC_VERT_CNT];
+    REAL_T __shared__ first_val_sh[BDIM_Y];
 
-    REAL_T *__face_cdf_sh = reinterpret_cast<REAL_T *>(__sh);
-    REAL_T *__vert_pdf_sh = __face_cdf_sh + BDIM_Y*DISC_FACE_CNT; // These first two can be overwritten in other functions
-    REAL_T *__frame_sh = __vert_pdf_sh + BDIM_Y*DISC_VERT_CNT; // the rest must persist form call to call
-    REAL_T *__first_val_sh = __frame_sh + BDIM_Y*9;
-
-    __face_cdf_sh += tidy*DISC_FACE_CNT;
-    __vert_pdf_sh += tidy*DISC_VERT_CNT; 
-    __frame_sh += tidy*9;
-    __first_val_sh += tidy*1;
+    REAL_T *__face_cdf_sh = face_cdf_sh + tidy*DISC_FACE_CNT;
+    REAL_T *__vert_pdf_sh = vert_pdf_sh + tidy*DISC_VERT_CNT;
+    REAL_T *__first_val_sh = first_val_sh + tidy;
 
     const REAL_T max_curvature = SIN(max_angle / 2) / step_size; // bigger numbers means wiggle more
     const REAL_T probe_step_size = ((step_size / 2) / (PROBE_QUALITY - 1));
 
     REAL_T __tmp;
 
+    __syncwarp(WMASK);
     if (IS_INIT) {
         if (tidx==0) {
             __frame_sh[0] = dir.x;
@@ -410,3 +407,73 @@ __device__ int get_direction_ptt_d(
 }
 
 
+template<int BDIM_X,
+         int BDIM_Y,
+         typename REAL_T,
+         typename REAL3_T>
+__device__ bool init_frame_ptt_d(
+    curandStatePhilox4_32_10_t *st,
+    const REAL_T *__restrict__ pmf,
+    const REAL_T max_angle,
+    const REAL_T step_size,
+    REAL3_T first_step,
+    const int dimx, const int dimy, const int dimz, const int dimt,
+    REAL3_T seed,
+    const REAL3_T *__restrict__ sphere_vertices,
+    REAL_T* __frame) {
+    const int tidx = threadIdx.x;
+
+    const int lid = (threadIdx.y*BDIM_X + tidx) % 32;
+    const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
+
+    bool init_norm_success;
+    REAL3_T tmp;
+
+    // Here we probabilistic find a good intial normal for this initial direction 
+    init_norm_success = (bool) get_direction_ptt_d<BDIM_X, BDIM_Y, 1>(
+        st,
+        pmf,
+        max_angle,
+        step_size,
+        MAKE_REAL3(-first_step.x, -first_step.y, -first_step.z),
+        __frame,
+        dimx, dimy, dimz, dimt,
+        seed,
+        sphere_vertices,
+        &tmp);
+    __syncwarp(WMASK);
+
+    if (!init_norm_success) {
+        // try the other direction
+        init_norm_success = (bool) get_direction_ptt_d<BDIM_X, BDIM_Y, 1>(
+            st,
+            pmf,
+            max_angle,
+            step_size,
+            MAKE_REAL3(first_step.x, first_step.y, first_step.z),
+            __frame,
+            dimx, dimy, dimz, dimt,
+            seed,
+            sphere_vertices,
+            &tmp);
+        __syncwarp(WMASK);
+
+        if (!init_norm_success) { // This is rare
+            return false;
+        } else {
+            if (tidx == 0) {
+                for (int ii = 0; ii < 9; ii++) {
+                    __frame[ii] = -__frame[ii];
+                }
+            }
+            __syncwarp(WMASK);
+        }
+    }
+    if (tidx == 0) {
+        for (int ii = 0; ii < 9; ii++) {
+            __frame[9+ii] = -__frame[ii]; // save flipped frame for second run
+        }
+    }
+    __syncwarp(WMASK);
+    return true;
+}
diff --git a/cuslines/utils.cu b/cuslines/utils.cu
index 5589fc3..c7fe47f 100644
--- a/cuslines/utils.cu
+++ b/cuslines/utils.cu
@@ -6,6 +6,16 @@ __device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) {
     const int lid = (threadIdx.y*BDIM_X + threadIdx.x) % 32;
     const unsigned int WMASK = ((1ull << BDIM_X)-1) << (lid & (~(BDIM_X-1)));
 
+#if 0
+    // for debugging
+    __syncwarp(WMASK);
+    if (tidx == 0) {
+        for (int j = 1; j < __len; j++) {
+            num_sh[j] += num_sh[j-1];
+        }
+    }
+    __syncwarp(WMASK);
+#else
     for (int j = 0; j < __len; j += BDIM_X) {
         if ((tidx == 0) && (j != 0)) {
             num_sh[j] += num_sh[j-1];
@@ -27,6 +37,7 @@ __device__ void prefix_sum_sh_d(REAL_T *num_sh, int __len) {
         }
         __syncwarp(WMASK);
     }
+#endif
 }
 
 template<typename REAL_T>
@@ -47,6 +58,26 @@ __device__ void printArray(const char *name, int ncol, int n, REAL_T *arr) {
 	}
 }
 
+template<typename REAL_T>
+__device__ REAL_T interpolation_helper_d(const REAL_T* dataf, const REAL_T wgh[3][2], const long long coo[3][2], int dimy, int dimz, int dimt, int t) {
+    REAL_T __tmp = 0;
+    #pragma unroll
+    for (int i = 0; i < 2; i++) {
+        #pragma unroll
+        for (int j = 0; j < 2; j++) {
+            #pragma unroll
+            for (int k = 0; k < 2; k++) {
+                __tmp += wgh[0][i] * wgh[1][j] * wgh[2][k] *
+                         dataf[coo[0][i] * dimy * dimz * dimt +
+                               coo[1][j] * dimz * dimt +
+                               coo[2][k] * dimt +
+                               t];
+            }
+        }
+    }
+    return __tmp;
+}
+
 template<int BDIM_X,
          typename REAL_T,
          typename REAL3_T>
@@ -54,7 +85,7 @@ __device__ int trilinear_interp_d(const int dimx,
                                   const int dimy,
                                   const int dimz,
                                   const int dimt,
-                                  int dimt_start_idx,
+                                  int dimt_idx, // If -1, get all
                                   const REAL_T *__restrict__ dataf,
                                   const REAL3_T point,
                                   REAL_T *__restrict__ __vox_data) {
@@ -91,44 +122,12 @@ __device__ int trilinear_interp_d(const int dimx,
         coo[2][0] = MAX(0, fl.z);
         coo[2][1] = MIN(dimz-1, coo[2][0]+1);
 
-        if (BDIM_X == 1) { // Whether this is being multithreaded
-                REAL_T __tmp = 0;
-                #pragma unroll
-                for(int i = 0; i < 2; i++) {
-                        #pragma unroll
-                        for(int j = 0; j < 2; j++) {
-                                #pragma unroll
-                                for(int k = 0; k < 2; k++) {
-                                        __tmp += wgh[0][i]*wgh[1][j]*wgh[2][k]*
-                                                dataf[coo[0][i]*dimy*dimz*dimt +
-                                                coo[1][j]*dimz*dimt +
-                                                coo[2][k]*dimt +
-                                                dimt_start_idx];
-                                }
-                        }
+        if (dimt_idx == -1) {
+                for (int t = threadIdx.x; t < dimt; t += BDIM_X) {
+                        __vox_data[t] = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, t);
                 }
-                __vox_data[0] = __tmp;
         } else {
-                for(int t = dimt_start_idx + threadIdx.x; t < dimt; t += BDIM_X) {
-
-                        REAL_T __tmp = 0;
-
-                        #pragma unroll
-                        for(int i = 0; i < 2; i++) {
-                                #pragma unroll
-                                for(int j = 0; j < 2; j++) {
-                                        #pragma unroll
-                                        for(int k = 0; k < 2; k++) {
-                                                __tmp += wgh[0][i]*wgh[1][j]*wgh[2][k]*
-                                                        dataf[coo[0][i]*dimy*dimz*dimt +
-                                                        coo[1][j]*dimz*dimt +
-                                                        coo[2][k]*dimt +
-                                                        t];
-                                        }
-                                }
-                        }
-                        __vox_data[t] = __tmp;
-                }
+                *__vox_data = interpolation_helper_d(dataf, wgh, coo, dimy, dimz, dimt, dimt_idx);
         }
 
         /*

From 274a98cf146b0871013956fce55fb5565799e4f1 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 19 Aug 2024 10:55:00 -0700
Subject: [PATCH 26/35] fix ptt frame init

---
 cuslines/ptt.cu  | 79 +++++++++++++++++++++++-------------------------
 cuslines/ptt.cuh |  1 -
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/cuslines/ptt.cu b/cuslines/ptt.cu
index 1894579..b36e747 100644
--- a/cuslines/ptt.cu
+++ b/cuslines/ptt.cu
@@ -90,22 +90,22 @@ __device__ void prepare_propagator_d(REAL_T k1, REAL_T k2, REAL_T arclength,
 template<bool IS_INIT, typename REAL_T>
 __device__ void get_probing_frame_d(const REAL_T* frame, curandStatePhilox4_32_10_t *st, REAL_T* probing_frame) {
     if (IS_INIT) {
-        for (int ii = 0; ii < 3; ii++) {
-            probing_frame[ii] =  frame[ii];
+        for (int ii = 0; ii < 3; ii++) { // tangent
+            probing_frame[ii] = frame[ii];
         }
-        REAL_T __tmp[3];
-        for (int ii = 0; ii < 3; ii++) {
-            do {
-                __tmp[ii] = 2.0 * curand_uniform(st) - 1.0;
-            } while(!__tmp[ii]);
+        if ((probing_frame[0] != 0) && (probing_frame[1] != 0)) { // norm
+            probing_frame[3] = -probing_frame[1];
+            probing_frame[4] = probing_frame[0];
+            probing_frame[5] = 0;
+        } else {
+            probing_frame[3] = 0;
+            probing_frame[4] = -probing_frame[2];
+            probing_frame[5] = 0;
         }
 
         norm3_d(probing_frame, 0); // tangent
-        norm3_d(__tmp, 1); // norm
-        crossnorm3_d(probing_frame + 2*3, probing_frame, __tmp, 2); // binorm
-        for (int ii = 0; ii < 3; ii++) {
-            probing_frame[3 + ii] = __tmp[ii];
-        }
+        norm3_d(probing_frame + 3, 1); // norm
+        crossnorm3_d(probing_frame + 2*3, probing_frame, probing_frame + 3, 2); // binorm
     } else {
         for (int ii = 0; ii < 9; ii++) {
             probing_frame[ii] =  frame[ii];
@@ -236,40 +236,37 @@ __device__ int get_direction_ptt_d(
     REAL_T probing_frame[9];
     REAL_T k1_probe, k2_probe;
     bool support_found = 0;
-    int jj = 0;
-    do {
-        for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) {
-            k1_probe = DISC_VERT[ii*2] * max_curvature;
-            k2_probe = DISC_VERT[ii*2+1] * max_curvature;
-
-            get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
-
-            const REAL_T this_support = calculate_data_support_d(
-                *__first_val_sh,
-                pos, pmf, dimx, dimy, dimz, dimt,
-                probe_step_size,
-                odf_sphere_vertices,
-                k1_probe, k2_probe,
-                probing_frame);
+    for (int ii = tidx; ii < DISC_VERT_CNT; ii += BDIM_X) {
+        k1_probe = DISC_VERT[ii*2] * max_curvature;
+        k2_probe = DISC_VERT[ii*2+1] * max_curvature;
+
+        get_probing_frame_d<IS_INIT>(__frame_sh, st, probing_frame);
+
+        const REAL_T this_support = calculate_data_support_d(
+            *__first_val_sh,
+            pos, pmf, dimx, dimy, dimz, dimt,
+            probe_step_size,
+            odf_sphere_vertices,
+            k1_probe, k2_probe,
+            probing_frame);
 
 #if 0
-            if (threadIdx.y == 1 && ii == 0) { 
-                printf("    k1_probe: %f, k2_probe %f, support %f for id: %i\n", k1_probe, k2_probe, this_support, tidx);
-            }
+        if (threadIdx.y == 1 && ii == 0) { 
+            printf("    k1_probe: %f, k2_probe %f, support %f for id: %i\n", k1_probe, k2_probe, this_support, tidx);
+        }
 #endif
 
-            if (this_support < NORM_MIN_SUPPORT) {
-                __vert_pdf_sh[ii] = 0;
-            } else {
-                __vert_pdf_sh[ii] = this_support;
-                support_found = 1;
-            }
-        }
-        if (IS_INIT) {
-            const int __msk = __ballot_sync(WMASK, support_found);
-            support_found = (__msk != 0);
+        if (this_support < NORM_MIN_SUPPORT) {
+            __vert_pdf_sh[ii] = 0;
+        } else {
+            __vert_pdf_sh[ii] = this_support;
+            support_found = 1;
         }
-    } while (IS_INIT && (!support_found) && (jj++ < INIT_FRAME_TRIES));
+    }
+    const int __msk = __ballot_sync(WMASK, support_found);
+    if (__msk == 0) {
+        return 0;
+    }
 
 #if 0
     __syncwarp(WMASK);
diff --git a/cuslines/ptt.cuh b/cuslines/ptt.cuh
index b45cc9b..d8986b5 100644
--- a/cuslines/ptt.cuh
+++ b/cuslines/ptt.cuh
@@ -10,7 +10,6 @@
 #define SAMPLING_QUALITY 4 // can be 2-7
 #define PROBABILISTIC_BIAS 1 // 1 looks good. can be 0-log_2(N_WARPS) (typically 0-5). 0 is fully probabilistic, 4 is close to deterministic.
 #define ALLOW_WEAK_LINK 1
-#define INIT_FRAME_TRIES 16
 #define TRIES_PER_REJECTION_SAMPLING 1024
 #define DEFAULT_PTT_MINDATASUPPORT 0.05
 #define K_SMALL 0.0001

From b588fbb62cc1e99e2b7ca9c717847684101da8eb Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 19 Aug 2024 11:17:38 -0700
Subject: [PATCH 27/35] no point in using small_sphere anymore

---
 run_gpu_streamlines.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py
index 2e5be19..978da42 100644
--- a/run_gpu_streamlines.py
+++ b/run_gpu_streamlines.py
@@ -40,7 +40,7 @@
 from dipy.io.streamline import save_tractogram
 from dipy.tracking import utils
 from dipy.core.gradients import gradient_table, unique_bvals_magnitude
-from dipy.data import small_sphere
+from dipy.data import default_sphere
 from dipy.direction import (BootDirectionGetter, ProbabilisticDirectionGetter, PTTDirectionGetter)
 from dipy.reconst.shm import OpdtModel, CsaOdfModel
 from dipy.reconst.csdeconv import ConstrainedSphericalDeconvModel, auto_response_ssst
@@ -145,6 +145,7 @@ def get_img(ep2_seq):
   affine=np.eye(4)))
 
 # Setup model
+sphere = default_sphere
 if args.model == "opdt":
   model_type = cuslines.ModelType.OPDT
   print("Running OPDT model...")
@@ -190,7 +191,7 @@ def get_img(ep2_seq):
     model_type = cuslines.ModelType.PTT
     dg = PTTDirectionGetter
   fit = model.fit(data, mask=(metric_map >= args.fa_threshold))
-  data = fit.odf(small_sphere).clip(min=0)
+  data = fit.odf(sphere).clip(min=0)
 else:
   dg = BootDirectionGetter
 
@@ -199,16 +200,15 @@ def get_img(ep2_seq):
 # Setup direction getter args
 if args.device == "cpu":
   if args.dg != "boot":
-    dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=small_sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+    dg = dg.from_pmf(data, max_angle=args.max_angle, sphere=sphere, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
   else:
-    dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=small_sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
+    dg = BootDirectionGetter.from_data(data, model, max_angle=args.max_angle, sphere=sphere, sh_order=args.sh_order, relative_peak_threshold=args.relative_peak_threshold, min_separation_angle=args.min_separation_angle)
 else:
   # Setup direction getter args
   b0s_mask = gtab.b0s_mask
   dwi_mask = ~b0s_mask
 
   # setup sampling matrix
-  sphere = small_sphere
   theta = sphere.theta
   phi = sphere.phi
   sampling_matrix, _, _ = shm.real_sym_sh_basis(args.sh_order, theta, phi)

From 277cf28447eea5e33fe202941994105f3dc0184a Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 26 Aug 2024 09:26:43 -0700
Subject: [PATCH 28/35] update cmakelists

---
 CMakeLists.txt | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 229ea23..7af5dd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,4 @@ set_target_properties(cuslines PROPERTIES CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAP
 # install
 install(TARGETS cuslines cuslines_kernels-shared LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
 install(FILES "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_gpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
-install(FILES "${CMAKE_SOURCE_DIR}/run_dipy_cpu_hardi.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
+install(FILES "${CMAKE_SOURCE_DIR}/run_gpu_streamlines.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")

From 67592806951d5dbb48d08637dcab18d25d796c5b Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Wed, 28 Aug 2024 12:47:05 -0700
Subject: [PATCH 29/35] merge build updates

---
 CMakeLists.txt        |  70 ++++---------
 README.md             |   6 +-
 cuslines/__init__.py  |   1 -
 pyproject.toml        |  14 +++
 run_dipy_cpu.py       | 151 ++++++++++++++++++++++++++++
 run_dipy_cpu_hardi.py | 158 +++++++++++++++++++++++++++++
 run_dipy_gpu.py       | 228 ++++++++++++++++++++++++++++++++++++++++++
 run_dipy_gpu_hardi.py | 205 +++++++++++++++++++++++++++++++++++++
 8 files changed, 781 insertions(+), 52 deletions(-)
 delete mode 100644 cuslines/__init__.py
 create mode 100644 pyproject.toml
 create mode 100644 run_dipy_cpu.py
 create mode 100644 run_dipy_cpu_hardi.py
 create mode 100644 run_dipy_gpu.py
 create mode 100644 run_dipy_gpu_hardi.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7af5dd4..c946eab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,45 +1,39 @@
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.18)
 
-#project definition
-cmake_policy(SET CMP0104 NEW)
 project(cuslines LANGUAGES CUDA CXX VERSION 1.0)
 
-#include the external project stuff
-include(ExternalProject)
-
-#global settings
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+# Build settings
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -Wall -Werror=reorder")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3")
 
-#determine type of build
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release")
 endif()
 
-#now, add the right flags
 if (CMAKE_BUILD_TYPE STREQUAL "Debug" )
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
 else()
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
 endif()
 
-#decide whether to use CUDA or not
-#find_package(CUDAToolkit REQUIRED)
-if(NOT CUDA_COMPUTE_CAPABILITY)
-  set(CUDA_COMPUTE_CAPABILITY 70 80)
+# CUDA
+find_package(CUDAToolkit REQUIRED)
+
+# Set default CUDA compute capabilities if unset
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 70 80 90)
 endif()
 
-#Find OpenMP
+# OpenMP
 find_package(OpenMP)
 if(OPENMP_FOUND)
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     
-    #check which compiler we have to determine openmp runtime
+    # Set OMP runtime based on compiler
     if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
       set(OMP_RUNTIME "INTEL")
     elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -50,40 +44,18 @@ if(OPENMP_FOUND)
     message(STATUS "OpenMP runtime used: ${OMP_RUNTIME}")
 endif()
 
-# add object lib
-add_library(cuslines_kernels-object OBJECT
-  "${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu"
-)
-target_include_directories( cuslines_kernels-object PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" )
-target_link_libraries(cuslines_kernels-object PUBLIC cudart)
-set_target_properties(cuslines_kernels-object PROPERTIES POSITION_INDEPENDENT_CODE TRUE
-                                                         CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
-
-##############################################
-# PyBind11/Python build
-##############################################
+# Build library and pybind11 module
 add_subdirectory(external/pybind11)
-
-FIND_PACKAGE( PythonLibs )
-INCLUDE_DIRECTORIES( ${PYTHON_INCLUDE_PATH} )
-
-add_library(cuslines_kernels-shared STATIC $<TARGET_OBJECTS:cuslines_kernels-object>)
-set_target_properties(cuslines_kernels-shared PROPERTIES OUTPUT_NAME cuslines_kernels 
-                                                         POSITION_INDEPENDENT_CODE TRUE
-                                                         CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
+add_library(cuslines_kernels)
+target_sources(cuslines_kernels
+	       PRIVATE
+	       ${CMAKE_SOURCE_DIR}/cuslines/generate_streamlines_cuda.cu)
+set_target_properties(cuslines_kernels PROPERTIES OUTPUT_NAME cuslines_kernels
+                                                  POSITION_INDEPENDENT_CODE TRUE)
 
 pybind11_add_module(cuslines ${CMAKE_SOURCE_DIR}/cuslines/cuslines.cpp)
-target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
-target_link_libraries(cuslines PRIVATE cuslines_kernels-shared cudart)
-set_target_properties(cuslines PROPERTIES CUDA_ARCHITECTURES "${CUDA_COMPUTE_CAPABILITY}")
-
-# custom target for install
-#add_custom_target(mvcmd_py ALL
-#  COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" "${CMAKE_INSTALL_PREFIX}/cuslines/__init__.py"
-#  DEPENDS "${CMAKE_SOURCE_DIR}/cuslines/__init__.py"
-#)
+target_include_directories(cuslines PUBLIC "${CMAKE_SOURCE_DIR}/cuslines" "${CUDAToolkit_INCLUDE_DIRS}")
+target_link_libraries(cuslines PRIVATE cuslines_kernels CUDA::cudart_static)
 
-# install
-install(TARGETS cuslines cuslines_kernels-shared LIBRARY DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/cuslines/__init__.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/cuslines")
-install(FILES "${CMAKE_SOURCE_DIR}/run_gpu_streamlines.py" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
+# Install
+install(TARGETS cuslines cuslines_kernels LIBRARY DESTINATION .)
diff --git a/README.md b/README.md
index 4768c84..f4c1202 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,12 @@
 ## Installation
 When cloning the repo, please use "git clone --recursive" to pull all the requirements.
 
-To install, simply run `./build_cuslines.sh`. This will place the cuslines library and files within a new subdirectory `./install` as well as copy the example script.
+To install, simply run `pip install .` in the top-level repository directory.
 
 ## Running the examples
-First, navigate into the `./builds` subdirectory. There are two example scripts using the HARDI dataset, `run_dipy_cpu_hardi.py` and `run_dipy_gpu_hardi.py`, which run on CPU and GPU respectively.
+This repository contains several example usage scripts.
+
+There are two example scripts using the HARDI dataset, `run_dipy_cpu_hardi.py` and `run_dipy_gpu_hardi.py`, which run on CPU and GPU respectively.
 
 To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
 ```
diff --git a/cuslines/__init__.py b/cuslines/__init__.py
deleted file mode 100644
index f3d06a8..0000000
--- a/cuslines/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-import cuslines
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..d6d2422
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,14 @@
+[build-system]
+requires = ["scikit-build-core", "pybind11"]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "cuslines"
+version = "1.0.0"
+description="GPU-accelerated tractography package"
+readme = "README.md"
+requires-python = ">=3.7"
+
+[tool.scikit-build]
+cmake.build-type = "Release"
+cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=70;80;90"]
diff --git a/run_dipy_cpu.py b/run_dipy_cpu.py
new file mode 100644
index 0000000..cc839a2
--- /dev/null
+++ b/run_dipy_cpu.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import random
+import time
+
+import numpy as np
+
+import dipy.reconst.dti as dti
+from dipy.io import read_bvals_bvecs
+from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
+from dipy.io.streamline import save_tractogram
+from dipy.tracking import utils
+from dipy.core.gradients import gradient_table
+from dipy.data import small_sphere
+from dipy.direction import BootDirectionGetter
+from dipy.reconst.shm import OpdtModel
+from dipy.tracking.local_tracking import LocalTracking
+from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
+from dipy.reconst import shm
+
+import nibabel as nib
+from nibabel.orientations import aff2axcodes
+
+# Import custom module
+import cuslines
+
+t0 = time.time()
+
+# set seed to get deterministic streamlines
+np.random.seed(0)
+random.seed(0)
+
+#Get Gradient values
+def get_gtab(fbval, fbvec):
+    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
+    gtab = gradient_table(bvals, bvecs)
+    return gtab
+
+def get_img(ep2_seq):
+    img = nib.load(ep2_seq)
+    return img
+
+print("parsing arguments")
+parser = argparse.ArgumentParser()
+parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
+parser.add_argument("bvals", help="path to the bvals")
+parser.add_argument("bvecs", help="path to the bvecs")
+parser.add_argument("mask_nifti", help="path to the mask file")
+parser.add_argument("roi_nifti", help="path to the ROI file")
+parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
+parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
+args = parser.parse_args()
+
+img = get_img(args.nifti_file)
+voxel_order = "".join(aff2axcodes(img.affine))
+gtab = get_gtab(args.bvals, args.bvecs)
+roi = get_img(args.roi_nifti)
+mask = get_img(args.mask_nifti)
+data = img.get_data()
+roi_data = roi.get_data()
+mask = mask.get_data()
+
+tenmodel = dti.TensorModel(gtab, fit_method='WLS')
+print('Fitting Tensor')
+tenfit = tenmodel.fit(data, mask)
+print('Computing anisotropy measures (FA,MD,RGB)')
+FA = tenfit.fa
+FA[np.isnan(FA)] = 0
+
+# Setup tissue_classifier args
+tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
+metric_map = np.asarray(FA, 'float64')
+
+# Create seeds for ROI
+seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
+
+# Setup model
+print('slowadcodf')
+sh_order = 6
+model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+
+# Setup direction getter args
+print('Bootstrap direction getter')
+boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+
+print('streamline gen')
+global_chunk_size = args.chunk_size
+nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
+
+streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
+
+t1 = time.time()
+streamline_time = 0
+io_time = 0
+for idx in range(int(nchunks)):
+  # Main streamline computation
+  ts = time.time()
+  streamlines = [s for s in streamline_generator]
+  te = time.time()
+  streamline_time += (te-ts)
+  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
+                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
+                                                                    te-ts))
+
+  # Save tracklines file
+  if args.output_prefix:
+    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
+    ts = time.time()
+    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
+    sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
+    save_tractogram(sft, fname)
+    te = time.time()
+    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+    io_time += (te-ts)
+
+t2 = time.time()
+
+print("Completed processing {} seeds.".format(seed_mask.shape[0]))
+print("Initialization time: {} sec".format(t1-t0))
+print("Streamline generation total time: {} sec".format(t2-t1))
+print("\tStreamline processing: {} sec".format(streamline_time))
+if args.output_prefix:
+  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
new file mode 100644
index 0000000..c8aaea1
--- /dev/null
+++ b/run_dipy_cpu_hardi.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import random
+import time
+
+import numpy as np
+
+import dipy.reconst.dti as dti
+from dipy.io import read_bvals_bvecs
+from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
+from dipy.io.streamline import save_tractogram
+from dipy.tracking import utils
+from dipy.core.gradients import gradient_table
+from dipy.data import small_sphere
+from dipy.direction import BootDirectionGetter
+from dipy.reconst.shm import OpdtModel
+from dipy.tracking.local_tracking import LocalTracking
+from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
+from dipy.reconst import shm
+from dipy.data import get_fnames
+from dipy.data import read_stanford_pve_maps
+
+import nibabel as nib
+from nibabel.orientations import aff2axcodes
+
+# Import custom module
+import cuslines
+
+t0 = time.time()
+
+# set seed to get deterministic streamlines
+np.random.seed(0)
+random.seed(0)
+
+#Get Gradient values
+def get_gtab(fbval, fbvec):
+    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
+    gtab = gradient_table(bvals, bvecs)
+    return gtab
+
+def get_img(ep2_seq):
+    img = nib.load(ep2_seq)
+    return img
+
+print("parsing arguments")
+parser = argparse.ArgumentParser()
+parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
+parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
+parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
+args = parser.parse_args()
+
+# Get Stanford HARDI data
+hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
+csf, gm, wm = read_stanford_pve_maps()
+wm_data = wm.get_fdata()
+
+img = get_img(hardi_nifti_fname)
+voxel_order = "".join(aff2axcodes(img.affine))
+
+gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
+#roi = get_img(hardi_nifti_fname)
+
+data = img.get_fdata()
+roi_data = (wm_data > 0.5)
+mask = roi_data
+
+tenmodel = dti.TensorModel(gtab, fit_method='WLS')
+print('Fitting Tensor')
+tenfit = tenmodel.fit(data, mask)
+print('Computing anisotropy measures (FA,MD,RGB)')
+FA = tenfit.fa
+FA[np.isnan(FA)] = 0
+
+# Setup tissue_classifier args
+tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
+metric_map = np.asarray(FA, 'float64')
+
+# Create seeds for ROI
+seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
+seed_mask = seed_mask[0:args.nseeds]
+
+# Setup model
+print('slowadcodf')
+sh_order = 6
+model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+
+# Setup direction getter args
+print('Bootstrap direction getter')
+boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+
+
+print('streamline gen')
+global_chunk_size = args.chunk_size
+nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
+
+t1 = time.time()
+streamline_time = 0
+io_time = 0
+for idx in range(int(nchunks)):
+  # Main streamline computation
+  ts = time.time()
+  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=.5)
+  streamlines = [s for s in streamline_generator]
+  te = time.time()
+  streamline_time += (te-ts)
+  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
+                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
+                                                                    te-ts))
+
+  # Save tracklines file
+  if args.output_prefix:
+    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
+    ts = time.time()
+    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
+    #save_tractogram(fname, streamlines)
+    sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
+    save_tractogram(sft, fname)
+    te = time.time()
+    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+
+    io_time += (te-ts)
+
+t2 = time.time()
+
+print("Completed processing {} seeds.".format(seed_mask.shape[0]))
+print("Initialization time: {} sec".format(t1-t0))
+print("Streamline generation total time: {} sec".format(t2-t1))
+print("\tStreamline processing: {} sec".format(streamline_time))
+if args.output_prefix:
+  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu.py b/run_dipy_gpu.py
new file mode 100644
index 0000000..c7b543e
--- /dev/null
+++ b/run_dipy_gpu.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import argparse
+import random
+import time
+from tqdm import trange, tqdm
+
+import numpy as np
+
+import dipy.reconst.dti as dti
+from dipy.io import read_bvals_bvecs
+from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
+from dipy.io.streamline import save_tractogram
+from dipy.tracking import utils
+from dipy.core.gradients import gradient_table
+from dipy.data import small_sphere
+#from dipy.direction import BootDirectionGetter
+from dipy.reconst.shm import OpdtModel
+#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
+from dipy.reconst import shm
+
+import nibabel as nib
+from nibabel.orientations import aff2axcodes
+
+# Import custom module
+import cuslines
+
+t0 = time.time()
+
+# set seed to get deterministic streamlines
+np.random.seed(0)
+random.seed(0)
+
+#Get Gradient values
+def get_gtab(fbval, fbvec):
+    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
+    gtab = gradient_table(bvals, bvecs)
+    return gtab
+
+def get_img(ep2_seq):
+    img = nib.load(ep2_seq)
+    return img
+
+print("parsing arguments")
+parser = argparse.ArgumentParser()
+parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
+parser.add_argument("bvals", help="path to the bvals")
+parser.add_argument("bvecs", help="path to the bvecs")
+parser.add_argument("mask_nifti", help="path to the mask file")
+parser.add_argument("--fa_numpy", default=None, help="path to the FA numpy file")
+parser.add_argument("--roi_nifti", default=None, help="path to the ROI file")
+parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
+parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
+parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
+parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
+parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
+parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
+parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
+parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
+parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
+args = parser.parse_args()
+
+img = get_img(args.nifti_file)
+voxel_order = "".join(aff2axcodes(img.affine))
+gtab = get_gtab(args.bvals, args.bvecs)
+mask = get_img(args.mask_nifti)
+data = img.get_fdata()
+
+# resample mask if necessary
+if mask.shape != data.shape:
+    from dipy.align.imaffine import AffineMap
+    identity = np.eye(4)
+    affine_map = AffineMap(identity,
+                           img.shape[:3], img.affine,
+                           mask.shape[:3], mask.affine)
+    mask = affine_map.transform(mask.get_fdata())
+    #mask = np.round(mask)
+else:
+    mask = mask.get_fdata()
+
+# load or compute and save FA file
+if (args.fa_numpy is not None) and os.path.isfile(args.fa_numpy):
+    FA = np.load(args.fa_numpy, allow_pickle=True)
+else:
+    # Fit
+    tenmodel = dti.TensorModel(gtab, fit_method='WLS')
+    print('Fitting Tensor')
+    tenfit = tenmodel.fit(data, mask)
+    print('Computing anisotropy measures (FA,MD,RGB)')
+    FA = tenfit.fa
+    FA[np.isnan(FA)] = 0
+
+    if args.fa_numpy is not None:
+        np.save(args.fa_numpy, FA)
+
+# Setup tissue_classifier args
+#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
+metric_map = np.asarray(FA, 'float64')
+
+# resample roi if necessary
+if args.roi_nifti is not None:
+    roi = get_img(args.roi_nifti)
+    roi_data = roi.get_fdata()
+else:
+    roi_data = np.zeros(data.shape[:3])
+    roi_data[ FA > 0.1 ] = 1.
+
+# Create seeds for ROI
+seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
+
+# Setup model
+print('slowadcodf')
+sh_order = 6
+model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+
+# Setup direction getter args
+print('Bootstrap direction getter')
+#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+b0s_mask = gtab.b0s_mask
+dwi_mask = ~b0s_mask
+
+# get fit_matrix from model
+fit_matrix = model._fit_matrix
+delta_b, delta_q = fit_matrix
+
+# setup sampling matrix
+sphere = small_sphere
+theta = sphere.theta
+phi = sphere.phi
+sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+
+## from BootPmfGen __init__
+# setup H and R matrices
+# TODO: figure out how to get H, R matrices from direction getter object
+x, y, z = model.gtab.gradients[dwi_mask].T
+r, theta, phi = shm.cart2sphere(x, y, z)
+B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+H = shm.hat(B)
+R = shm.lcr_matrix(H)
+
+# create floating point copy of data
+dataf = np.asarray(data, dtype=float)
+
+print('streamline gen')
+global_chunk_size = args.chunk_size * args.ngpus
+nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
+
+#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
+gpu_tracker = cuslines.GPUTracker(args.max_angle,
+                                  args.min_signal,
+                                  args.tc_threshold,
+                                  args.step_size,
+                                  dataf, H, R, delta_b, delta_q,
+                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
+                                  sphere.vertices, sphere.edges.astype(np.int32),
+                                  ngpus=args.ngpus, rng_seed=0)
+t1 = time.time()
+streamline_time = 0
+io_time = 0
+
+# debug start val to fast forward
+with tqdm(total=seed_mask.shape[0]) as pbar:
+    for idx in range(int(nchunks)):
+        # Main streamline computation
+        ts = time.time()
+        streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
+        te = time.time()
+        streamline_time += (te-ts)
+        pbar.update(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0])
+        #print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
+        #                                                                  seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
+        #                                                                  te-ts))
+
+        # Save tracklines file
+        if args.output_prefix:
+            if args.use_fast_write:
+                prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
+                ts = time.time()
+                gpu_tracker.dump_streamlines(prefix, voxel_order, roi_data.shape, img.header.get_zooms(), img.affine)
+                te = time.time()
+                print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
+            else:
+                fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
+                ts = time.time()
+                #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
+                sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
+                save_tractogram(sft, fname)
+                te = time.time()
+                print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+            io_time += (te-ts)
+
+pbar.close()
+t2 = time.time()
+
+print("Completed processing {} seeds.".format(seed_mask.shape[0]))
+print("Initialization time: {} sec".format(t1-t0))
+print("Streamline generation total time: {} sec".format(t2-t1))
+print("\tStreamline processing: {} sec".format(streamline_time))
+if args.output_prefix:
+  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
new file mode 100644
index 0000000..f72daa4
--- /dev/null
+++ b/run_dipy_gpu_hardi.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import random
+import time
+
+import numpy as np
+
+import dipy.reconst.dti as dti
+from dipy.io import read_bvals_bvecs
+from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
+from dipy.io.streamline import save_tractogram
+from dipy.tracking import utils
+from dipy.core.gradients import gradient_table
+from dipy.data import small_sphere
+#from dipy.direction import BootDirectionGetter
+from dipy.reconst.shm import OpdtModel
+#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
+from dipy.reconst import shm
+from dipy.data import get_fnames
+from dipy.data import read_stanford_pve_maps
+
+import nibabel as nib
+from nibabel.orientations import aff2axcodes
+
+# Import custom module
+import cuslines
+
+t0 = time.time()
+
+# set seed to get deterministic streamlines
+np.random.seed(0)
+random.seed(0)
+
+#Get Gradient values
+def get_gtab(fbval, fbvec):
+    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
+    gtab = gradient_table(bvals, bvecs)
+    return gtab
+
+def get_img(ep2_seq):
+    img = nib.load(ep2_seq)
+    return img
+
+print("parsing arguments")
+parser = argparse.ArgumentParser()
+parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
+parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
+parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
+parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
+parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
+parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
+parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
+parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
+parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
+args = parser.parse_args()
+
+# Get Stanford HARDI data
+hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
+
+print(hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname)
+
+csf, gm, wm = read_stanford_pve_maps()
+wm_data = wm.get_fdata()
+
+img = get_img(hardi_nifti_fname)
+voxel_order = "".join(aff2axcodes(img.affine))
+
+gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
+#roi = get_img(hardi_nifti_fname)
+
+data = img.get_fdata()
+roi_data = (wm_data > 0.5)
+mask = roi_data
+
+tenmodel = dti.TensorModel(gtab, fit_method='WLS')
+print('Fitting Tensor')
+tenfit = tenmodel.fit(data, mask)
+print('Computing anisotropy measures (FA,MD,RGB)')
+FA = tenfit.fa
+FA[np.isnan(FA)] = 0
+
+# Setup tissue_classifier args
+#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
+metric_map = np.asarray(FA, 'float64')
+
+# Create seeds for ROI
+seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
+seed_mask = seed_mask[0:args.nseeds]
+
+# Setup model
+print('slowadcodf')
+sh_order = 6
+model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
+
+# Setup direction getter args
+print('Bootstrap direction getter')
+#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
+b0s_mask = gtab.b0s_mask
+dwi_mask = ~b0s_mask
+
+# get fit_matrix from model
+fit_matrix = model._fit_matrix
+delta_b, delta_q = fit_matrix
+
+# setup sampling matrix
+sphere = small_sphere
+theta = sphere.theta
+phi = sphere.phi
+sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+
+## from BootPmfGen __init__
+# setup H and R matrices
+# TODO: figure out how to get H, R matrices from direction getter object
+x, y, z = model.gtab.gradients[dwi_mask].T
+r, theta, phi = shm.cart2sphere(x, y, z)
+B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
+H = shm.hat(B)
+R = shm.lcr_matrix(H)
+
+# create floating point copy of data
+dataf = np.asarray(data, dtype=float)
+
+print('streamline gen')
+global_chunk_size = args.chunk_size * args.ngpus
+nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
+
+#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
+
+gpu_tracker = cuslines.GPUTracker(args.max_angle,
+                                  args.min_signal,
+                                  args.tc_threshold,
+                                  args.step_size,
+                                  dataf, H, R, delta_b, delta_q,
+                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
+                                  sphere.vertices, sphere.edges.astype(np.int32),
+                                  ngpus=args.ngpus, rng_seed=0)
+t1 = time.time()
+streamline_time = 0
+io_time = 0
+for idx in range(int(nchunks)):
+  # Main streamline computation
+  ts = time.time()
+  streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
+  te = time.time()
+  streamline_time += (te-ts)
+  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
+                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
+                                                                    te-ts))
+
+  # Save tracklines file
+  if args.output_prefix:
+    if args.use_fast_write:
+      prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
+      ts = time.time()
+      #gpu_tracker.dump_streamlines(prefix, voxel_order, roi.shape, roi.header.get_zooms(), img.affine)
+      gpu_tracker.dump_streamlines(prefix, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
+      te = time.time()
+      print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
+    else:
+      fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
+      ts = time.time()
+      #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
+      #save_tractogram(fname, streamlines)
+      sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
+      save_tractogram(sft, fname)
+      te = time.time()
+      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+    io_time += (te-ts)
+
+t2 = time.time()
+
+print("Completed processing {} seeds.".format(seed_mask.shape[0]))
+print("Initialization time: {} sec".format(t1-t0))
+print("Streamline generation total time: {} sec".format(t2-t1))
+print("\tStreamline processing: {} sec".format(streamline_time))
+if args.output_prefix:
+  print("\tFile writing: {} sec".format(io_time))

From 4cc00f47d5bd18a2e3831498dd9ad77a812cba0f Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Fri, 30 Aug 2024 12:07:38 -0700
Subject: [PATCH 30/35] update README

---
 README.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f4c1202..37661cc 100644
--- a/README.md
+++ b/README.md
@@ -8,11 +8,11 @@ To install, simply run `pip install .` in the top-level repository directory.
 ## Running the examples
 This repository contains several example usage scripts.
 
-There are two example scripts using the HARDI dataset, `run_dipy_cpu_hardi.py` and `run_dipy_gpu_hardi.py`, which run on CPU and GPU respectively.
+The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.
 
 To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
 ```
-$ python run_dipy_cpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000
+$ python run_gpu_streamlines.py --device=cpu --output-prefix small --nseeds 1000
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -30,7 +30,7 @@ Streamline generation total time: 6.9404990673065186 sec
 
 To run the same case on a single GPU, this is the command and example output:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 1000 --ngpus 1
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 1000 --ngpus 1
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -48,11 +48,13 @@ Streamline generation total time: 0.3834989070892334 sec
 Destroy GPUTracker...
 ```
 
+Note that if you experience memory errors, you can adjust the `--chunk-size` flag.
+
 To run on more seeds, we suggest enabling the `--use-fast-write` flag in the GPU script to not get bottlenecked by writing files. Here is a comparison running on 500K seeds on 1 GPU with and without this flag:
 
 Without `--use-fast-write`:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -80,7 +82,7 @@ Destroy GPUTracker...
 
 With `--use-fast-write`:
 ```
-$ python run_dipy_gpu_hardi.py --chunk-size 100000 --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
+$ python run_gpu_streamlines.py --output-prefix small --nseeds 500000 --ngpus 1 --use-fast-write
 parsing arguments
 Fitting Tensor
 Computing anisotropy measures (FA,MD,RGB)
@@ -120,11 +122,10 @@ $ docker pull docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest
 4. Run the code, mounting the current directory into the container for easy result retrieval:
 ```
 $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
- python run_dipy_gpu_hardi.py --chunk-size 100000 --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
+ python run_gpu_streamlines.py --ngpus 1 --output-prefix output/hardi_gpu_full --use-fast-write
 ```
 5. The code produces a number of independent track files (one per processed "chunk"), but we have provided a merge script to combine them into a single trk file. To merge files, run:
 ```
 $ docker run --gpus=all -v ${PWD}:/opt/exec/output:rw -it docker.pkg.github.com/dipy/gpustreamlines/gpustreamlines:latest \
  ./merge_trk.sh -o output/hardi_tracks.trk output/hardi_gpu_full*
-```
-
+```
\ No newline at end of file

From 0c46961500c709cd9f86620438da19f18fe86337 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Fri, 30 Aug 2024 12:41:47 -0700
Subject: [PATCH 31/35] these were brought in from merge; remove them, update
 new run file

---
 run_dipy_cpu.py        | 151 ---------------------------
 run_dipy_cpu_hardi.py  | 158 ----------------------------
 run_dipy_gpu.py        | 228 -----------------------------------------
 run_dipy_gpu_hardi.py  | 205 ------------------------------------
 run_gpu_streamlines.py |   2 +-
 5 files changed, 1 insertion(+), 743 deletions(-)
 delete mode 100644 run_dipy_cpu.py
 delete mode 100644 run_dipy_cpu_hardi.py
 delete mode 100644 run_dipy_gpu.py
 delete mode 100644 run_dipy_gpu_hardi.py

diff --git a/run_dipy_cpu.py b/run_dipy_cpu.py
deleted file mode 100644
index cc839a2..0000000
--- a/run_dipy_cpu.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-from dipy.tracking.local_tracking import LocalTracking
-from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
-parser.add_argument("bvals", help="path to the bvals")
-parser.add_argument("bvecs", help="path to the bvecs")
-parser.add_argument("mask_nifti", help="path to the mask file")
-parser.add_argument("roi_nifti", help="path to the ROI file")
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-args = parser.parse_args()
-
-img = get_img(args.nifti_file)
-voxel_order = "".join(aff2axcodes(img.affine))
-gtab = get_gtab(args.bvals, args.bvecs)
-roi = get_img(args.roi_nifti)
-mask = get_img(args.mask_nifti)
-data = img.get_data()
-roi_data = roi.get_data()
-mask = mask.get_data()
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  streamlines = [s for s in streamline_generator]
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-    ts = time.time()
-    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-    sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-    save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_cpu_hardi.py b/run_dipy_cpu_hardi.py
deleted file mode 100644
index c8aaea1..0000000
--- a/run_dipy_cpu_hardi.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-from dipy.tracking.local_tracking import LocalTracking
-from dipy.tracking.stopping_criterion import ThresholdStoppingCriterion
-from dipy.reconst import shm
-from dipy.data import get_fnames
-from dipy.data import read_stanford_pve_maps
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-args = parser.parse_args()
-
-# Get Stanford HARDI data
-hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
-csf, gm, wm = read_stanford_pve_maps()
-wm_data = wm.get_fdata()
-
-img = get_img(hardi_nifti_fname)
-voxel_order = "".join(aff2axcodes(img.affine))
-
-gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
-#roi = get_img(hardi_nifti_fname)
-
-data = img.get_fdata()
-roi_data = (wm_data > 0.5)
-mask = roi_data
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-tissue_classifier = ThresholdStoppingCriterion(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-
-
-print('streamline gen')
-global_chunk_size = args.chunk_size
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size], affine=np.eye(4), step_size=.5)
-  streamlines = [s for s in streamline_generator]
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-    ts = time.time()
-    #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-    #save_tractogram(fname, streamlines)
-    sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-    save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu.py b/run_dipy_gpu.py
deleted file mode 100644
index c7b543e..0000000
--- a/run_dipy_gpu.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import os
-import argparse
-import random
-import time
-from tqdm import trange, tqdm
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-#from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
-from dipy.reconst import shm
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("nifti_file", help="path to the ep2multiband sequence nifti file")
-parser.add_argument("bvals", help="path to the bvals")
-parser.add_argument("bvecs", help="path to the bvecs")
-parser.add_argument("mask_nifti", help="path to the mask file")
-parser.add_argument("--fa_numpy", default=None, help="path to the FA numpy file")
-parser.add_argument("--roi_nifti", default=None, help="path to the ROI file")
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-parser.add_argument("--sampling-density", type=int, default=3, help="sampling density for seed generation")
-parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-args = parser.parse_args()
-
-img = get_img(args.nifti_file)
-voxel_order = "".join(aff2axcodes(img.affine))
-gtab = get_gtab(args.bvals, args.bvecs)
-mask = get_img(args.mask_nifti)
-data = img.get_fdata()
-
-# resample mask if necessary
-if mask.shape != data.shape:
-    from dipy.align.imaffine import AffineMap
-    identity = np.eye(4)
-    affine_map = AffineMap(identity,
-                           img.shape[:3], img.affine,
-                           mask.shape[:3], mask.affine)
-    mask = affine_map.transform(mask.get_fdata())
-    #mask = np.round(mask)
-else:
-    mask = mask.get_fdata()
-
-# load or compute and save FA file
-if (args.fa_numpy is not None) and os.path.isfile(args.fa_numpy):
-    FA = np.load(args.fa_numpy, allow_pickle=True)
-else:
-    # Fit
-    tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-    print('Fitting Tensor')
-    tenfit = tenmodel.fit(data, mask)
-    print('Computing anisotropy measures (FA,MD,RGB)')
-    FA = tenfit.fa
-    FA[np.isnan(FA)] = 0
-
-    if args.fa_numpy is not None:
-        np.save(args.fa_numpy, FA)
-
-# Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# resample roi if necessary
-if args.roi_nifti is not None:
-    roi = get_img(args.roi_nifti)
-    roi_data = roi.get_fdata()
-else:
-    roi_data = np.zeros(data.shape[:3])
-    roi_data[ FA > 0.1 ] = 1.
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=args.sampling_density, affine=np.eye(4))
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-b0s_mask = gtab.b0s_mask
-dwi_mask = ~b0s_mask
-
-# get fit_matrix from model
-fit_matrix = model._fit_matrix
-delta_b, delta_q = fit_matrix
-
-# setup sampling matrix
-sphere = small_sphere
-theta = sphere.theta
-phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-
-## from BootPmfGen __init__
-# setup H and R matrices
-# TODO: figure out how to get H, R matrices from direction getter object
-x, y, z = model.gtab.gradients[dwi_mask].T
-r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-H = shm.hat(B)
-R = shm.lcr_matrix(H)
-
-# create floating point copy of data
-dataf = np.asarray(data, dtype=float)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-gpu_tracker = cuslines.GPUTracker(args.max_angle,
-                                  args.min_signal,
-                                  args.tc_threshold,
-                                  args.step_size,
-                                  dataf, H, R, delta_b, delta_q,
-                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
-                                  sphere.vertices, sphere.edges.astype(np.int32),
-                                  ngpus=args.ngpus, rng_seed=0)
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-
-# debug start val to fast forward
-with tqdm(total=seed_mask.shape[0]) as pbar:
-    for idx in range(int(nchunks)):
-        # Main streamline computation
-        ts = time.time()
-        streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
-        te = time.time()
-        streamline_time += (te-ts)
-        pbar.update(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0])
-        #print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-        #                                                                  seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-        #                                                                  te-ts))
-
-        # Save tracklines file
-        if args.output_prefix:
-            if args.use_fast_write:
-                prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-                ts = time.time()
-                gpu_tracker.dump_streamlines(prefix, voxel_order, roi_data.shape, img.header.get_zooms(), img.affine)
-                te = time.time()
-                print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
-            else:
-                fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-                ts = time.time()
-                #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-                sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
-                save_tractogram(sft, fname)
-                te = time.time()
-                print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-            io_time += (te-ts)
-
-pbar.close()
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_dipy_gpu_hardi.py b/run_dipy_gpu_hardi.py
deleted file mode 100644
index f72daa4..0000000
--- a/run_dipy_gpu_hardi.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-#    contributors may be used to endorse or promote products derived from
-#    this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-import random
-import time
-
-import numpy as np
-
-import dipy.reconst.dti as dti
-from dipy.io import read_bvals_bvecs
-from dipy.io.stateful_tractogram import Origin, Space, StatefulTractogram
-from dipy.io.streamline import save_tractogram
-from dipy.tracking import utils
-from dipy.core.gradients import gradient_table
-from dipy.data import small_sphere
-#from dipy.direction import BootDirectionGetter
-from dipy.reconst.shm import OpdtModel
-#from dipy.tracking.local import LocalTracking, ThresholdTissueClassifier
-from dipy.reconst import shm
-from dipy.data import get_fnames
-from dipy.data import read_stanford_pve_maps
-
-import nibabel as nib
-from nibabel.orientations import aff2axcodes
-
-# Import custom module
-import cuslines
-
-t0 = time.time()
-
-# set seed to get deterministic streamlines
-np.random.seed(0)
-random.seed(0)
-
-#Get Gradient values
-def get_gtab(fbval, fbvec):
-    bvals, bvecs = read_bvals_bvecs(fbval, fbvec)
-    gtab = gradient_table(bvals, bvecs)
-    return gtab
-
-def get_img(ep2_seq):
-    img = nib.load(ep2_seq)
-    return img
-
-print("parsing arguments")
-parser = argparse.ArgumentParser()
-parser.add_argument("--output-prefix", type=str, default ='', help="path to the output file")
-parser.add_argument("--chunk-size", type=int, required=True, help="how many seeds to process per sweep, per GPU")
-parser.add_argument("--nseeds", type=int, default=None, help="how many seeds to process in total")
-parser.add_argument("--ngpus", type=int, required=True, help="number of GPUs to use")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write")
-parser.add_argument("--max-angle", type=float, default=1.0471975511965976, help="default: 60 deg (in rad)")
-parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
-parser.add_argument("--tc-threshold", type=float, default=0.1, help="default: 0.1")
-parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
-args = parser.parse_args()
-
-# Get Stanford HARDI data
-hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname = get_fnames('stanford_hardi')
-
-print(hardi_nifti_fname, hardi_bval_fname, hardi_bvec_fname)
-
-csf, gm, wm = read_stanford_pve_maps()
-wm_data = wm.get_fdata()
-
-img = get_img(hardi_nifti_fname)
-voxel_order = "".join(aff2axcodes(img.affine))
-
-gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
-#roi = get_img(hardi_nifti_fname)
-
-data = img.get_fdata()
-roi_data = (wm_data > 0.5)
-mask = roi_data
-
-tenmodel = dti.TensorModel(gtab, fit_method='WLS')
-print('Fitting Tensor')
-tenfit = tenmodel.fit(data, mask)
-print('Computing anisotropy measures (FA,MD,RGB)')
-FA = tenfit.fa
-FA[np.isnan(FA)] = 0
-
-# Setup tissue_classifier args
-#tissue_classifier = ThresholdTissueClassifier(FA, 0.1)
-metric_map = np.asarray(FA, 'float64')
-
-# Create seeds for ROI
-seed_mask = utils.seeds_from_mask(roi_data, density=3, affine=np.eye(4))
-seed_mask = seed_mask[0:args.nseeds]
-
-# Setup model
-print('slowadcodf')
-sh_order = 6
-model = OpdtModel(gtab, sh_order=sh_order, min_signal=1)
-
-# Setup direction getter args
-print('Bootstrap direction getter')
-#boot_dg = BootDirectionGetter.from_data(data, model, max_angle=60., sphere=small_sphere)
-b0s_mask = gtab.b0s_mask
-dwi_mask = ~b0s_mask
-
-# get fit_matrix from model
-fit_matrix = model._fit_matrix
-delta_b, delta_q = fit_matrix
-
-# setup sampling matrix
-sphere = small_sphere
-theta = sphere.theta
-phi = sphere.phi
-sampling_matrix, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-
-## from BootPmfGen __init__
-# setup H and R matrices
-# TODO: figure out how to get H, R matrices from direction getter object
-x, y, z = model.gtab.gradients[dwi_mask].T
-r, theta, phi = shm.cart2sphere(x, y, z)
-B, _, _ = shm.real_sym_sh_basis(sh_order, theta, phi)
-H = shm.hat(B)
-R = shm.lcr_matrix(H)
-
-# create floating point copy of data
-dataf = np.asarray(data, dtype=float)
-
-print('streamline gen')
-global_chunk_size = args.chunk_size * args.ngpus
-nchunks = (seed_mask.shape[0] + global_chunk_size - 1) // global_chunk_size
-
-#streamline_generator = LocalTracking(boot_dg, tissue_classifier, seed_mask, affine=np.eye(4), step_size=.5)
-
-gpu_tracker = cuslines.GPUTracker(args.max_angle,
-                                  args.min_signal,
-                                  args.tc_threshold,
-                                  args.step_size,
-                                  dataf, H, R, delta_b, delta_q,
-                                  b0s_mask.astype(np.int32), metric_map, sampling_matrix,
-                                  sphere.vertices, sphere.edges.astype(np.int32),
-                                  ngpus=args.ngpus, rng_seed=0)
-t1 = time.time()
-streamline_time = 0
-io_time = 0
-for idx in range(int(nchunks)):
-  # Main streamline computation
-  ts = time.time()
-  streamlines = gpu_tracker.generate_streamlines(seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size])
-  te = time.time()
-  streamline_time += (te-ts)
-  print("Generated {} streamlines from {} seeds, time: {} s".format(len(streamlines),
-                                                                    seed_mask[idx*global_chunk_size:(idx+1)*global_chunk_size].shape[0],
-                                                                    te-ts))
-
-  # Save tracklines file
-  if args.output_prefix:
-    if args.use_fast_write:
-      prefix = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-      ts = time.time()
-      #gpu_tracker.dump_streamlines(prefix, voxel_order, roi.shape, roi.header.get_zooms(), img.affine)
-      gpu_tracker.dump_streamlines(prefix, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
-      te = time.time()
-      print("Saved streamlines to {}_*.trk, time {} s".format(prefix, te-ts))
-    else:
-      fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
-      ts = time.time()
-      #save_tractogram(fname, streamlines, img.affine, vox_size=roi.header.get_zooms(), shape=roi_data.shape)
-      #save_tractogram(fname, streamlines)
-      sft = StatefulTractogram(streamlines, hardi_nifti_fname, Space.VOX)
-      save_tractogram(sft, fname)
-      te = time.time()
-      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
-    io_time += (te-ts)
-
-t2 = time.time()
-
-print("Completed processing {} seeds.".format(seed_mask.shape[0]))
-print("Initialization time: {} sec".format(t1-t0))
-print("Streamline generation total time: {} sec".format(t2-t1))
-print("\tStreamline processing: {} sec".format(streamline_time))
-if args.output_prefix:
-  print("\tFile writing: {} sec".format(io_time))
diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py
index 978da42..962d415 100644
--- a/run_gpu_streamlines.py
+++ b/run_gpu_streamlines.py
@@ -54,7 +54,7 @@
 from nibabel.orientations import aff2axcodes
 
 # Import custom module
-import cuslines.cuslines as cuslines
+import cuslines
 
 t0 = time.time()
 

From c799371c7c06abfa5cc704c3515da158e07e6c9d Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Fri, 30 Aug 2024 12:43:41 -0700
Subject: [PATCH 32/35] these must all be fdata

---
 run_gpu_streamlines.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py
index 962d415..7c8982c 100644
--- a/run_gpu_streamlines.py
+++ b/run_gpu_streamlines.py
@@ -121,9 +121,9 @@ def get_img(ep2_seq):
   gtab = get_gtab(args.bvals, args.bvecs)
   roi = get_img(args.roi_nifti)
   mask = get_img(args.mask_nifti)
-  data = img.get_data()
-  roi_data = roi.get_data()
-  mask = mask.get_data()
+  data = img.get_fdata()
+  roi_data = roi.get_fdata()
+  mask = mask.get_fdata()
 
 tenmodel = dti.TensorModel(gtab, fit_method='WLS')
 print('Fitting Tensor')

From ba3660250964542495b803882b6c2936b98d2f73 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 3 Sep 2024 11:38:00 -0700
Subject: [PATCH 33/35] pip install for native architectures

---
 CMakeLists.txt | 2 +-
 pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c946eab..2a694b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.24)
 
 project(cuslines LANGUAGES CUDA CXX VERSION 1.0)
 
diff --git a/pyproject.toml b/pyproject.toml
index d6d2422..3291739 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,4 +11,4 @@ requires-python = ">=3.7"
 
 [tool.scikit-build]
 cmake.build-type = "Release"
-cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=70;80;90"]
+cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=native"] 

From f017462322a7546e0f1f28c51ee51b04e6451c4c Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Tue, 3 Sep 2024 11:55:18 -0700
Subject: [PATCH 34/35] explicit cmake arch list

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3291739..b34f9a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,4 +11,4 @@ requires-python = ">=3.7"
 
 [tool.scikit-build]
 cmake.build-type = "Release"
-cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=native"] 
+cmake.args = ["-DCMAKE_CUDA_ARCHITECTURES=52-real;60-real;61-real;70-real;72-real;75-real;80-real;86-real;87-real;90"] 

From 4bef2e0934bbfd9f855ccb221df93911cce86240 Mon Sep 17 00:00:00 2001
From: 36000 <jk6.28@outlook.com>
Date: Mon, 9 Sep 2024 13:42:14 -0700
Subject: [PATCH 35/35] add trx support

---
 run_gpu_streamlines.py | 78 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 8 deletions(-)

diff --git a/run_gpu_streamlines.py b/run_gpu_streamlines.py
index 7c8982c..e627978 100644
--- a/run_gpu_streamlines.py
+++ b/run_gpu_streamlines.py
@@ -30,6 +30,7 @@
 import argparse
 import random
 import time
+import zipfile
 
 import numpy as np
 import numpy.linalg as npl
@@ -53,6 +54,8 @@
 import nibabel as nib
 from nibabel.orientations import aff2axcodes
 
+from trx.trx_file_memmap import TrxFile, zip_from_folder
+
 # Import custom module
 import cuslines
 
@@ -84,7 +87,7 @@ def get_img(ep2_seq):
 parser.add_argument("--chunk-size", type=int, default=100000, help="how many seeds to process per sweep, per GPU")
 parser.add_argument("--nseeds", type=int, default=100000, help="how many seeds to process in total")
 parser.add_argument("--ngpus", type=int, default=1, help="number of GPUs to use if using gpu")
-parser.add_argument("--use-fast-write", action='store_true', help="use fast file write (only if using gpu)")
+parser.add_argument("--write-method", type=str, default="fast", help="Can be trx, fast, or standard")
 parser.add_argument("--max-angle", type=float, default=60, help="max angle (in degrees)")
 parser.add_argument("--min-signal", type=float, default=1.0, help="default: 1.0")
 parser.add_argument("--step-size", type=float, default=0.5, help="default: 0.5")
@@ -98,6 +101,12 @@ def get_img(ep2_seq):
 
 args = parser.parse_args()
 
+if args.device == "cpu" and args.write_method != "standard":
+  print("WARNING: only standard write method is implemented for cpu testing.")
+  write_method = "standard"
+else:
+  write_method = args.write_method
+
 if 'hardi' in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]:
   if not all(arg == 'hardi' for arg in [args.nifti_file, args.bvals, args.bvecs, args.mask_nifti, args.roi_nifti]):
     raise ValueError("If any of the arguments is 'hardi', all must be 'hardi'")
@@ -108,7 +117,6 @@ def get_img(ep2_seq):
 
   img = get_img(hardi_nifti_fname)
   voxel_order = "".join(aff2axcodes(img.affine))
-  img_affine = img.affine
 
   gtab = get_gtab(hardi_bval_fname, hardi_bvec_fname)
 
@@ -243,6 +251,21 @@ def get_img(ep2_seq):
 t1 = time.time()
 streamline_time = 0
 io_time = 0
+
+if args.output_prefix and write_method == "trx":
+  # Will resize by a factor of 2 if these are exceeded
+  sl_len_guess = 100
+  sl_per_seed_guess = 3
+  n_sls_guess = sl_per_seed_guess*len(seed_mask)
+
+  # trx files use memory mapping
+  trx_file = TrxFile(
+    reference=hardi_nifti_fname,
+    nb_streamlines=n_sls_guess,
+    nb_vertices=n_sls_guess*sl_len_guess)
+  offsets_idx = 0
+  sls_data_idx = 0
+
 for idx in range(int(nchunks)):
   # Main streamline computation
   ts = time.time()
@@ -260,17 +283,56 @@ def get_img(ep2_seq):
   # Save tracklines file
   if args.output_prefix:
     ts = time.time()
-    if (args.device == "gpu") and args.use_fast_write:
-      fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
-      gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
-    else:
+    if write_method == "standard":
       fname = "{}.{}_{}.trk".format(args.output_prefix, idx+1, nchunks)
       sft = StatefulTractogram(streamlines, args.nifti_file, Space.VOX)
       save_tractogram(sft, fname)
-    te = time.time()
-    print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+      te = time.time()
+      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+    elif write_method == "trx":
+      tractogram = nib.streamlines.Tractogram(streamlines, affine_to_rasmm=img.affine)
+      tractogram.to_world()
+      sls = tractogram.streamlines
+
+      new_offsets_idx = offsets_idx + len(sls._offsets)
+      new_sls_data_idx = sls_data_idx + len(sls._data)
+
+      if new_offsets_idx > trx_file.header["NB_STREAMLINES"]\
+          or new_sls_data_idx > trx_file.header["NB_VERTICES"]:
+        print("TRX resizing...")
+        trx_file.resize(nb_streamlines=new_offsets_idx*2, nb_vertices=new_sls_data_idx*2)
+
+      # TRX uses memmaps here
+      trx_file.streamlines._data[sls_data_idx:new_sls_data_idx] = sls._data
+      trx_file.streamlines._offsets[offsets_idx:new_offsets_idx] = offsets_idx + sls._offsets
+      trx_file.streamlines._lengths[offsets_idx:new_offsets_idx] = sls._lengths
+
+      offsets_idx = new_offsets_idx
+      sls_data_idx = new_sls_data_idx
+      
+      te = time.time()
+      print("Streamlines to TRX format, time {} s".format(te-ts))
+    else:
+      fname = "{}.{}_{}".format(args.output_prefix, idx+1, nchunks)
+      gpu_tracker.dump_streamlines(fname, voxel_order, wm.shape, wm.header.get_zooms(), img.affine)
+      te = time.time()
+      print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+
     io_time += (te-ts)
 
+if args.output_prefix and write_method == "trx":
+  ts = time.time()
+  fname = "{}.trx".format(args.output_prefix)
+  trx_file.resize()
+  zip_from_folder(
+    trx_file._uncompressed_folder_handle.name,
+    fname,
+    zipfile.ZIP_STORED)
+  trx_file.close()
+  te = time.time()
+  print("Saved streamlines to {}, time {} s".format(fname, te-ts))
+  io_time += (te-ts)
+
 t2 = time.time()
 
 print("Completed processing {} seeds.".format(seed_mask.shape[0]))