From 1ddb31b364cbad4282f5fa18ed17a1c121b322b7 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 13 Mar 2020 18:33:03 -0600 Subject: [PATCH 001/190] adding cuda to cython compilation --- setup.py | 276 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 233 insertions(+), 43 deletions(-) diff --git a/setup.py b/setup.py index b17c6ec5..45cd02b7 100644 --- a/setup.py +++ b/setup.py @@ -3,52 +3,242 @@ from Cython.Build import cythonize import numpy import amico +import os +from os.path import join as pjoin amico_version = amico.__version__.split('.') amico_version = [int(version_val) for version_val in amico_version] if amico_version[0] == 1 and amico_version[1] < 1: raise RuntimeError( 'COMMIT requires AMICO v1.1.0 or above. Current AMICO version is %s' % amico.__version__ ) -# Cython extension to create the sparse data structure from a tractogram -# for the computation of matrix-vector multiplications -ext1 = Extension( - name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', -) - -ext2 = Extension( - name='commit.core', - sources=['commit/core.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', -) - -ext3 = Extension( - name='commit.proximals', - sources=['commit/proximals.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', -) - -setup( - name='commit', - version='1.3.0', - description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', - author='Alessandro Daducci', - author_email='alessandro.daducci@gmail.com', - url='https://github.com/daducci/COMMIT', - cmdclass = {'build_ext':build_ext}, - ext_modules = [ ext1, ext2, ext3 ], - packages=['commit','commit.operator'], - package_data={ - 'commit.operator':["*.*"], # needed by pyximport to compile at runtime - }, -) + +# taken from npcuda +def find_in_path(name, path): + """Find a file in a search path""" + + # Adapted fom http://code.activestate.com/recipes/52224 + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + +def locate_cuda(): + """Locate the CUDA environment on the system + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + Starts by looking for the CUDAHOME env variable. If not found, + everything is based on finding 'nvcc' in the PATH. + """ + + # First check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # Otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + return None + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = {'home': home, 'nvcc': nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64')} + for k, v in iter(cudaconfig.items()): + if not os.path.exists(v): + return None + + return cudaconfig + +def customize_compiler_for_nvcc(self): + """Inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. 
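A minimal sketch of the dispatch this hook implements (editorial illustration, not part of the patch; pick_postargs is a hypothetical helper, and the flag lists are the ones the Extension definitions below actually use):

import os

# The Extensions below pass extra_compile_args as a dict keyed by compiler,
# instead of the usual flat list; the patched _compile picks one entry per file.
EXTRA_COMPILE_ARGS = {
    'gcc':  ['-w'],
    'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c',
             '--compiler-options', "'-fPIC'"],
}

def pick_postargs(src, extra_postargs):
    # .cu sources are handed to nvcc with the 'nvcc' flags,
    # every other source keeps the default compiler and the 'gcc' flags.
    key = 'nvcc' if os.path.splitext(src)[1] == '.cu' else 'gcc'
    return extra_postargs[key]

assert pick_postargs('commit/gpumanager.cu', EXTRA_COMPILE_ARGS) == EXTRA_COMPILE_ARGS['nvcc']
assert pick_postargs('commit/core.cpp', EXTRA_COMPILE_ARGS) == EXTRA_COMPILE_ARGS['gcc']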
Note, it's kindof like a wierd functional + subclassing going on. + """ + + # Tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # Save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # Now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. + def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 + # translated from the extra_compile_args in the Extension class + print('\n--------nvcc aqui--------') + print(type(extra_postargs)) + print(extra_postargs) + print('--------------------\n') + postargs = extra_postargs['nvcc'] + else: + print('\n--------gcc aqui--------') + print(type(extra_postargs)) + print(extra_postargs) + print('--------------------\n') + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # Reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # Inject our redefined _compile method into the class + self._compile = _compile + +# Try to locate CUDA +CUDA = locate_cuda() + +if CUDA != None: + print('Installing CUDA Version') + + # Run the customize_compiler + class custom_build_ext(build_ext): + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + # Obtain the numpy include directory. This logic works across numpy versions. + try: + numpy_include = numpy.get_include() + except AttributeError: + numpy_include = numpy.get_numpy_include() + + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + ext1 = Extension( + name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + ext2 = Extension( + name='commit.core', + sources=['commit/core.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + ext3 = Extension( + name='commit.proximals', + sources=['commit/proximals.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + ext = Extension( + name='commit.gpumanager', + sources = ['commit/gpumanager.cu'], + library_dirs = [CUDA['lib64']], + libraries = ['cudart'], + language = 'c++', + runtime_library_dirs = [CUDA['lib64']], + # This syntax is specific to this build system + # we're only going to use certain compiler args with nvcc + # and not with gcc the implementation of this trick is in + # customize_compiler() + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + include_dirs = [numpy_include, CUDA['include']] + ) + + setup( + name='commit', + version='1.4.0', 
+ description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', + author='Alessandro Daducci', + author_email='alessandro.daducci@gmail.com', + url='https://github.com/daducci/COMMIT', + cmdclass = {'build_ext':custom_build_ext}, + ext_modules = [ ext1, ext2, ext3, ext ], + packages=['commit','commit.operator'], + package_data={ + 'commit.operator':["*.*"], # needed by pyximport to compile at runtime + }, + ) +else: + print('Installing CPU version') + + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + ext1 = Extension( + name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + ext2 = Extension( + name='commit.core', + sources=['commit/core.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + ext3 = Extension( + name='commit.proximals', + sources=['commit/proximals.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + setup( + name='commit', + version='1.4.0', + description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', + author='Alessandro Daducci', + author_email='alessandro.daducci@gmail.com', + url='https://github.com/daducci/COMMIT', + cmdclass = {'build_ext':build_ext}, + ext_modules = [ ext1, ext2, ext3 ], + packages=['commit','commit.operator'], + package_data={ + 'commit.operator':["*.*"], # needed by pyximport to compile at runtime + }, + ) From d1d594b37d54ac0b68bbf2c9f1bc232e23c0151a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 13 Mar 2020 18:35:42 -0600 Subject: [PATCH 002/190] adding gpu manager with CudaLinearOperator class --- commit/gpumanager.cu | 195 ++++++++++++++++++++++++++++++++++++++++++ commit/gpumanager.cuh | 102 ++++++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 commit/gpumanager.cu create mode 100644 commit/gpumanager.cuh diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu new file mode 100644 index 00000000..33b1fd9e --- /dev/null +++ b/commit/gpumanager.cu @@ -0,0 +1,195 @@ +#include "gpumanager.cuh" + +bool cudaCheck(cudaError_t cudaStatus){ + return cudaStatus == cudaSuccess; +} + +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ + + // fill arrays with zeros + memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + + // count compartments per block + for(int i = 0; i < NUM_COMPARTMENTS; i++) + compartmentsPerBlock[data[i]]++; + + // calculate offset per block + offsetPerBlock[0] = 0; + for(int i = 1; i < NUM_BLOCKS; i++) + offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; +} + +/* +__dual__ segment::segment() {} + +__dual__ segment::~segment() {} +//*/ + +CudaLinearOperator::CudaLinearOperator( + uint32_t* voxelIC, + uint32_t* fiberIC, + uint16_t* orienIC, + float* lengthIC, + float* lutIC, + + uint32_t* voxelEC, + uint16_t* orienEC, + float* lutEC, + + float* lutISO, + + int nsegments, + int nvoxels, + int nfibers, + int npeaks, + int norientations, + int nsamples, + int ndiameters, + int nzeppelins, + int nballs) +{ + int nrows = nvoxels * nsamples; + int ncols = nfibers*ndiameters + npeaks*nzeppelins + 
nvoxels*nballs; + int size_lutic = ndiameters*norientations*nsamples; + int size_lutec = nzeppelins*norientations*nsamples; + int size_lutiso = nballs*nsamples; + bool status; + + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + // copy constant values to GPU + printf("\t* constant global values ... "); + status = true; + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + + // alloc memory in GPU for vectors x and y + printf("\t* memory for vectors x and y ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->x), ncols*sizeof(float64_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->y), nrows*sizeof(float64_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + // alloc GPU memory for segments + printf("\t* memory for LUT (IC part) ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->lutIC), size_lutic*sizeof(float32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* copying LUT in GPU (IC part) ... "); + status = true; + status = status && cudaCheck( cudaMemcpy(this->lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* allocating memory for LUT in GPU (EC part) ... "); + status = cudaCheck( cudaMalloc((void**)&(this->lutEC), size_lutec*sizeof(float32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* copying LUT in GPU (EC part) ... "); + status = cudaCheck( cudaMemcpy(this->lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* allocating memory for LUT in GPU (ISO part) ... "); + status = cudaCheck( cudaMalloc((void**)&(this->lutISO), size_lutiso*sizeof(float32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* copying LUT in GPU (ISO part) ... "); + status = cudaCheck( cudaMemcpy(this->lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* preprocessing data for GPU ... 
"); + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + printf("\n"); + + printf("\t* fiber segments memory allocation ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->voxelIC), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->fiberIC), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->orienIC), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->lengthIC), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockIC), nvoxels*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockIC), nvoxels*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* transfering fiber segments ... "); + status = true; + status = status && cudaCheck( cudaMemcpy(this->voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + // ---------------------------------------- EC DATA ---------------------------------------- // + printf("\t* allocating memory for operator A in GPU (EC part) ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->voxelEC), npeaks*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->orienEC), npeaks*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockEC), nvoxels*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockEC), nvoxels*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* preprocessing EC data for GPU ... "); + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + printf("\n"); + + printf("\t* copying operator A to GPU (EC part) ... 
"); + status = true; + status = status && cudaCheck( cudaMemcpy(this->voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + free(segmentsPerBlock); + free(offsetPerBlock); +} + +CudaLinearOperator::~CudaLinearOperator(){ + cudaFree(voxelIC); + cudaFree(fiberIC); + cudaFree(orienIC); + cudaFree(lengthIC); + cudaFree(lutIC); + cudaFree(segmentsPerBlockIC); + cudaFree(offsetPerBlockIC); + + cudaFree(voxelEC); + cudaFree(orienEC); + cudaFree(lutEC); + cudaFree(segmentsPerBlockEC); + cudaFree(offsetPerBlockEC); + + cudaFree(lutISO); + + cudaFree(x); + cudaFree(y); +} \ No newline at end of file diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh new file mode 100644 index 00000000..a4dd187a --- /dev/null +++ b/commit/gpumanager.cuh @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +//#define __dual__ __host__ __device__ + +using namespace std; + +typedef unsigned int uint32_t; +typedef unsigned short int uint16_t; +typedef float float32_t; +typedef double float64_t; + +bool cudaCheck(cudaError_t cudaStatus); +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +/*class segment_t { + public: + + // pointer to the GPU memory where the array is stored + uint32_t voxelID; + uint32_t fiberID; + uint16_t orienID; + float length; + + __dual__ segment(); + __dual__ ~segment(); +};//*/ + +// constant values in GPU +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + +class CudaLinearOperator { + + // pointers to IC data in GPU memory + uint32_t* voxelIC; + uint32_t* fiberIC; + uint16_t* orienIC; + float32_t* lengthIC; + + // auxiliar arrays for GPU + uint32_t* segmentsPerBlockIC; + uint32_t* offsetPerBlockIC; + uint32_t* segmentsPerBlockEC; + uint32_t* offsetPerBlockEC; + + // pointers to EC data in GPU memory + uint32_t* voxelEC; + uint16_t* orienEC; + + // pointers to LUTs in GPU memory + float32_t* lutIC; + float32_t* lutEC; + float32_t* lutISO; + + // pointers to vector x and y + float64_t* x; + float64_t* y; + + public: + CudaLinearOperator( + uint32_t* voxelIC, + uint32_t* fiberIC, + uint16_t* orienIC, + float* lengthIC, + float* lutIC, + + uint32_t* voxelEC, + uint16_t* orienEC, + float* lutEC, + + float* lutISO, + + int nsegments, + int nvoxels, + int nfibers, + int npeaks, + int norientations, + int nsamples, + int ndiameters, + int nzeppelins, + int nballs); + + ~CudaLinearOperator(); +}; \ No newline at end of file From 77816f3882384fffa6d2d1909acbfa75bee0c10a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 13 Mar 2020 20:09:33 -0600 Subject: [PATCH 003/190] testing CudaLinearOperator on core.pyx file --- commit/core.pyx 
| 5 + commit/cudaoperator.pyx | 203 ++++++++++++++++++++++++++++++++++++++++ setup.py | 4 +- 3 files changed, 210 insertions(+), 2 deletions(-) create mode 100644 commit/cudaoperator.pyx diff --git a/commit/core.pyx b/commit/core.pyx index 58063114..2126a9cb 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -71,6 +71,7 @@ cdef class Evaluation : cdef public A cdef public x cdef public CONFIG + cdef public gpu_A def __init__( self, study_path, subject ) : """Setup the data structures with default values. @@ -90,6 +91,7 @@ cdef class Evaluation : self.THREADS = None # set by "set_threads" method self.A = None # set by "build_operator" method self.x = None # set by "fit" method + self.gpu_A = None # store all the parameters of an evaluation with COMMIT self.CONFIG = {} @@ -649,6 +651,9 @@ cdef class Evaluation : else : reload( sys.modules['commit.operator.operator'] ) self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + + import commit.cudaoperator + self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx new file mode 100644 index 00000000..7aed0933 --- /dev/null +++ b/commit/cudaoperator.pyx @@ -0,0 +1,203 @@ +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np + +cdef extern from "gpumanager.cuh": + cdef cppclass C_CudaLinearOperator "CudaLinearOperator": + C_CudaLinearOperator( + np.uint32_t*, + np.uint32_t*, + np.uint16_t*, + np.float32_t*, + np.float32_t*, + + np.uint32_t*, + np.uint16_t*, + np.float32_t*, + + np.float32_t*, + + int, + int, + int, + int, + int, + int, + int, + int, + int) + +cdef class CudaLinearOperator : + """This class is a wrapper to the C code for performing marix-vector multiplications + with the COMMIT linear operator A. The multiplications are done using C code + that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
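A usage sketch (editorial, assuming DICTIONARY, KERNELS and THREADS have already been built by an Evaluation, exactly as build_operator() does in core.pyx above; at this point in the series dot() only prints a placeholder, the GPU kernels are wired in by the later patches):

import numpy as np
import commit.cudaoperator

# DICTIONARY, KERNELS and THREADS are assumed to come from a commit.Evaluation
# that has already loaded the dictionary, the kernels and the threading tables.
A = commit.cudaoperator.CudaLinearOperator( DICTIONARY, KERNELS, THREADS )

x = np.ones( A.shape[1], dtype=np.float64 )   # one weight per column
y = A.dot( x )                                # forward product A*x
g = A.T.dot( y )                              # adjoint product A'*y
print( A.shape, A.T.shape )                   # (n1, n2) and (n2, n1)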
+ """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + cdef C_CudaLinearOperator* A + + + def __init__( self, DICTIONARY, KERNELS, THREADS ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # get C pointers to arrays in THREADS + cdef unsigned int [::1] ICthreads = THREADS['IC'] + self.ICthreads = &ICthreads[0] + cdef unsigned int [::1] ECthreads = THREADS['EC'] + self.ECthreads = &ECthreads[0] + cdef unsigned int [::1] ISOthreads = THREADS['ISO'] + self.ISOthreads = &ISOthreads[0] + + cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] + self.ICthreadsT = &ICthreadsT[0] + cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] + self.ECthreadsT = &ECthreadsT[0] + cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] + self.ISOthreadsT = &ISOthreadsT[0] + + self.A = new C_CudaLinearOperator( + &ICv[0], + &ICf[0], + &ICo[0], + &ICl[0], + &wmrSFP[0,0,0], + + &ECv[0], + &ECo[0], + &wmhSFP[0,0,0], + + &isoSFP[0,0], + + self.n, + self.nV, + self.nF, + self.nE, + self.ndirs, + self.nS, + self.nR, + self.nT, + self.nI + ) + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + + @property + def shape( self 
) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + raise RuntimeError( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + print('MULTIPLICO Ax') + else : + # INVERSE PRODUCT A'*y + print('MULTIPLICO A\'y') + + return v_out diff --git a/setup.py b/setup.py index 45cd02b7..d0d95e13 100644 --- a/setup.py +++ b/setup.py @@ -162,8 +162,8 @@ def build_extensions(self): ) ext = Extension( - name='commit.gpumanager', - sources = ['commit/gpumanager.cu'], + name='commit.cudaoperator', + sources = ['commit/gpumanager.cu', 'commit/cudaoperator.pyx'], library_dirs = [CUDA['lib64']], libraries = ['cudart'], language = 'c++', From 91ebe72be3decee4b1bd26c1cf190a0821c011d7 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 11:43:41 -0600 Subject: [PATCH 004/190] Adding kernels for Ax multiplication in GPU --- commit/cudaoperator.pyx | 2 + commit/gpumanager.cu | 175 +++++++++++++++++++++++++++++++++++++++- commit/gpumanager.cuh | 17 +++- setup.py | 2 +- 4 files changed, 189 insertions(+), 7 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 7aed0933..6fe3b389 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -196,8 +196,10 @@ cdef class CudaLinearOperator : if not self.adjoint : # DIRECT PRODUCT A*x print('MULTIPLICO Ax') + self.A.multiplyByX(&v_in[0], &v_out[0]) else : # INVERSE PRODUCT A'*y print('MULTIPLICO A\'y') + self.A.multiplyByY(&v_in[0], &v_out[0]) return v_out diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 33b1fd9e..595df3c3 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -49,8 +49,10 @@ CudaLinearOperator::CudaLinearOperator( int nzeppelins, int nballs) { - int nrows = nvoxels * nsamples; - int ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + this->nvoxels = nvoxels; + this->nfibers = nfibers; + this->nrows = nvoxels * nsamples; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; int size_lutic = ndiameters*norientations*nsamples; int size_lutec = nzeppelins*norientations*nsamples; int size_lutiso = nballs*nsamples; @@ -192,4 +194,173 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaFree(x); cudaFree(y); +} + +__global__ void multiply_Ax_ICpart( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y){ + + __shared__ float64_t shmem[1024]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + uint32_t gid = threadIdx.x / 512; + uint32_t sid = threadIdx.x - 512*gid; + + shmem[tid] = 0.0; + + if(sid >= num_samples) return; + + uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; + uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; + + //segment_t* segment = segments + offset; + 
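For reference, the contribution one CUDA block computes here can be written out in plain Python (editorial sketch; the 1024-thread block is actually split into two groups of 512 that each take half of the voxel's segments, a detail omitted below, and the argument names mirror the kernel parameters):

def ic_block_contribution(bid, sid, voxelIDs, fiberIDs, orienIDs, lengths,
                          segmentsPerBlock, offsetPerBlock, lut,
                          x, nS, nO, nD, nF):
    # Block `bid` owns one voxel, thread `sid` one of its nS output samples.
    start = offsetPerBlock[bid]
    total = 0.0
    for s in range(start, start + segmentsPerBlock[bid]):
        offset_lut = orienIDs[s] * nS + sid
        aux = 0.0
        for j in range(nD):                   # sum over fiber diameters
            aux += lut[offset_lut + j * nO * nS] * x[fiberIDs[s] + j * nF]
        total += aux * lengths[s]
    return total                              # accumulated into y[voxelIDs[start]*nS + sid]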
uint32_t* voxel = voxelIDs + offset; + uint32_t* fiber = fiberIDs + offset; + uint16_t* orien = orienIDs + offset; + float32_t* length = lengths + offset; + + float64_t sum = 0.0; + + for(int i = 0; i < nsegments; i++){ + int offset_lut = (*orien)*NUM_SAMPLES + sid; + + float64_t aux = 0.0; + for(int j = 0; j < NUM_DIAMETERS; j++){ + aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; + //aux += tex1Dfetch(tex_lutIC, offset_lut + j*num_orientations*num_samples) * x[(*fiber) + j*num_fibers]; + } + + sum += aux * (*length); + + fiber++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < NUM_SAMPLES) + y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; +} + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = segmentsPerBlock[bid]; + + //compartmentEC_t* excomp = excomps + offset; + uint32_t* voxel = voxelIDs + offset; + uint16_t* orien = orienIDs + offset; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; + + float64_t sum = 0.0; + for(int i = 0; i < nsegments; i++){ + uint32_t offset_lut = (*orientation)*num_samples + tid; + + for(int j = 0; j < NUM_ZEPPELINS; j++) + //sum += (double)(lut[lut_offset + j*num_orientations*num_samples])*x[target + j*num_excomps + i]; + sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; + + orientation++; + } + + y[(*voxel)*num_samples + tid] += sum; +} + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + float64_t sum = 0.0; + for(int j = 0; j < NUM_BALLS; j++) + sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + //sum += (double)(tex1Dfetch(tex_lutISO, j*num_samples + tid))*x[target + j*num_voxels]; + + + y[bid*NUM_SAMPLES + tid] += sum; +} + +void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ + + // Copy vector x to the GPU + cudaMemcpy(this->x, x, ncols*sizeof(double), cudaMemcpyHostToDevice); + + // Multiply IC part in the GPU + multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orientIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, this->x, this->y); + + //cudaCheckKernel(); + + // Multiply EC part in the GPU + multiply_Ax_ECpart<<>>(voxelEC, orientEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, this->x, this->y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + multiply_Ax_ISOpart<<>>(lutISO, this->x, this->y); + + //cudaCheckKernel(); + + // Copy back result to CPU + cudaMemcpy(y, this->y, nrows*sizeof(double), cudaMemcpyDeviceToHost); +} + +void CudaLinearOperator::multiplyByY(float64_t* y, float64_t* x){ + + // Copy vector y to the GPU + //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); + //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); + //cudaCheck( cudaMemcpy(gpu_y, y, NUM_ROWS*sizeof(double), cudaMemcpyHostToDevice) ); + + // Multiply IC part in the GPU + //multiply_Aty_ICpart<<>>(gpu_voxelICt, gpu_fiberICt, gpu_orientICt, gpu_lengthICt, gpu_segmentsPerBlockICt, gpu_offsetPerBlockICt, gpu_lutIC, gpu_x, 
gpu_y); + + //cudaCheckKernel();//*/ + + // Multiply EC part in the GPU + //multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orientEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + //multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + + //cudaCheckKernel();//*/ + + // Copy back result to CPU + //cudaCheck( cudaMemcpy(x, gpu_x, NUM_COLS*sizeof(double), cudaMemcpyDeviceToHost) ); + + /*printf("\n\n VECTOR X EC PART:\n"); + for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) + printf("%lf ", x[i]); + printf("\n\n");//*/ } \ No newline at end of file diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh index a4dd187a..7971c7b8 100644 --- a/commit/gpumanager.cuh +++ b/commit/gpumanager.cuh @@ -74,19 +74,25 @@ class CudaLinearOperator { float64_t* x; float64_t* y; + // dimensions of the operator + int nrows; + int ncols; + int nvoxels; + int nfibers; + public: CudaLinearOperator( uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, - float* lengthIC, - float* lutIC, + float32_t* lengthIC, + float32_t* lutIC, uint32_t* voxelEC, uint16_t* orienEC, - float* lutEC, + float32_t* lutEC, - float* lutISO, + float32_t* lutISO, int nsegments, int nvoxels, @@ -99,4 +105,7 @@ class CudaLinearOperator { int nballs); ~CudaLinearOperator(); + + void multiplyByX(float64_t* x, float64_t* y); + void multiplyByY(float64_t* y, float64_t* x); }; \ No newline at end of file diff --git a/setup.py b/setup.py index d0d95e13..df8fd35e 100644 --- a/setup.py +++ b/setup.py @@ -230,7 +230,7 @@ def build_extensions(self): setup( name='commit', - version='1.4.0', + version='1.3.0', description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', author='Alessandro Daducci', author_email='alessandro.daducci@gmail.com', From ad4c7bc03fc9944820d84012be3a6a430ef9cec0 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 12:07:14 -0600 Subject: [PATCH 005/190] Fixing compilation error with variable names --- commit/gpumanager.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 595df3c3..ebadeb77 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -216,7 +216,7 @@ __global__ void multiply_Ax_ICpart( shmem[tid] = 0.0; - if(sid >= num_samples) return; + if(sid >= NUM_SAMPLES) return; uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; @@ -277,16 +277,16 @@ __global__ void multiply_Ax_ECpart( float64_t sum = 0.0; for(int i = 0; i < nsegments; i++){ - uint32_t offset_lut = (*orientation)*num_samples + tid; + uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; for(int j = 0; j < NUM_ZEPPELINS; j++) - //sum += (double)(lut[lut_offset + j*num_orientations*num_samples])*x[target + j*num_excomps + i]; - sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; + sum += (double)(lut[lut_offset + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + //sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; - orientation++; + orien++; } - y[(*voxel)*num_samples + tid] += sum; + y[(*voxel)*NUM_SAMPLES + tid] += sum; } __global__ void multiply_Ax_ISOpart( @@ -316,12 +316,12 @@ void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ cudaMemcpy(this->x, x, 
ncols*sizeof(double), cudaMemcpyHostToDevice); // Multiply IC part in the GPU - multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orientIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, this->x, this->y); + multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, this->x, this->y); //cudaCheckKernel(); // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(voxelEC, orientEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, this->x, this->y); + multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, this->x, this->y); //cudaCheckKernel(); From 0438fcd0587b2cfbd38493848765d110f6eb57ec Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 12:08:54 -0600 Subject: [PATCH 006/190] Fixing compilation error with variable names --- commit/gpumanager.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index ebadeb77..8917e63f 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -280,7 +280,7 @@ __global__ void multiply_Ax_ECpart( uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; for(int j = 0; j < NUM_ZEPPELINS; j++) - sum += (double)(lut[lut_offset + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; //sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; orien++; From e1ba2873cb73e5527e25cf4bce70e79023d92724 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 12:34:01 -0600 Subject: [PATCH 007/190] Fixing compilation error with variable names --- commit/cudaoperator.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 6fe3b389..2f90f466 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,6 +30,9 @@ cdef extern from "gpumanager.cuh": int, int) + void multiplyByX(np.float64_t*, np.float64_t*) + void multiplyByY(np.float64_t*, np.float64_t*) + cdef class CudaLinearOperator : """This class is a wrapper to the C code for performing marix-vector multiplications with the COMMIT linear operator A. 
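For orientation, the operator dimensions that all of these kernels assume follow directly from the counts stored by this wrapper: n1 = nV*nS rows (voxels times samples) and n2 = nR*nF + nT*nE + nI*nV columns (IC, then EC, then ISO compartments), which is the same nrows/ncols bookkeeping done in gpumanager.cu. A toy check (editorial sketch; the numbers are hypothetical):

# Hypothetical counts, only to illustrate the row/column bookkeeping.
nF, nR = 10000, 1     # fibers (IC) and fiber radii / diameters
nE, nT = 5000, 1      # EC segments (peaks) and tortuosity values (zeppelins)
nV, nI = 2000, 1      # voxels and isotropic compartments (balls)
nS = 100              # samples per voxel

n1 = nV * nS                          # rows    (NUM_ROWS in gpumanager.cu)
n2 = nR * nF + nT * nE + nI * nV      # columns (NUM_COLS)
print( n1, n2 )                       # 200000 17000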
The multiplications are done using C code From 173d9530b4cbac468d0fd211667377c1af519887 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 17:30:28 -0600 Subject: [PATCH 008/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 20 ++++ commit/gpumanager.cu | 219 ++++++++++++++++++++++++++++++++++++++-- commit/gpumanager.cuh | 9 ++ 3 files changed, 242 insertions(+), 6 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 2f90f466..3dceb8c3 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,6 +30,7 @@ cdef extern from "gpumanager.cuh": int, int) + void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) void multiplyByX(np.float64_t*, np.float64_t*) void multiplyByY(np.float64_t*, np.float64_t*) @@ -133,6 +134,16 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] self.ISOthreadsT = &ISOthreadsT[0] + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + self.A = new C_CudaLinearOperator( &ICv[0], &ICf[0], @@ -157,6 +168,15 @@ cdef class CudaLinearOperator : self.nI ) + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx + + self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0]) + @property def T( self ) : """Transpose of the explicit matrix.""" diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 8917e63f..cfa2aaae 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -20,6 +20,44 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; } +void CudaLinearOperator::setTransponseData( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths) +{ + bool status; + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + printf("\t* extra memory for operator A' ... 
"); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->voxelICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->fiberICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->orienICt), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->lengthICt), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockICt) , nfibers*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* transfering memory for operator A' ... "); + status = true; + status = status && cudaCheck( cudaMemcpy(this->voxelICt, voxel, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->fiberICt, fiber, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->orienICt, orien, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->lengthICt, length, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + free(fibersPerBlock); + free(offsetPerBlock); +} + /* __dual__ segment::segment() {} @@ -192,6 +230,13 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaFree(lutISO); + cudaFree(voxelICt); + cudaFree(fiberICt); + cudaFree(orienICt); + cudaFree(lengthICt); + cudaFree(fibersPerBlockICt); + cudaFree(offsetPerBlockICt); + cudaFree(x); cudaFree(y); } @@ -310,6 +355,168 @@ __global__ void multiply_Ax_ISOpart( y[bid*NUM_SAMPLES + tid] += sum; } +__global__ void multiply_Aty_ICpart( + uint32_t* voxelICt, + uint32_t* fiberICt, + uint16_t* orienICt, + float32_t* lengthICt, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + /*if(bid == 0 && tid == 0){ + for(int i = 0; i < 10; i++){ + printf("%d %d %d %f\n", voxelICt[i], fiberICt[i], orientICt[i], lengthICt[i]); + } + } + else if(bid != 0) return; + //__syncthreads();//*/ + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = offset + compartmentsPerBlock[bid]; + + //segment_t* segment = segments + offset; + uint32_t* voxel = voxelICt + offset; + uint32_t* fiber = fiberICt + offset; + uint16_t* orien = orienICt + offset; + float32_t* length = lengthICt + offset; + //uint fiber = segment->fiber; + + for(int j = 0; j < NUM_DIAMETERS; j++){ + int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + float64_t sum = 0.0; + //segment = segments + offset; + voxel = voxelICt + offset; + orient = orienICt + offset; + length = lengthICt + offset; + for(int i = offset; i < nsegments; i++){ + sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orient)*num_samples) )* y[(*voxel)*num_samples + tid]; + 
//segment++; + voxel++; + //fiber++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + //if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); + + if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + + __syncthreads(); + } +} + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t ncompartments = segmentsPerBlock[bid] + offset; + + //compartmentEC_t* peak = peaks + offset; + uint32_t* voxel = voxelEC + offset; + uint16_t* orien = orienEC + offset; + + for(int j = 0; j < NUM_ZEPPELINS; j++){ + uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + //peak = peaks + offset; + voxel = voxelEC + offset; + orien = orienEC + offset; + for(int i = offset; i < ncompartments; i++){ + //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orient)*num_samples + offset_lut) )* y[(*voxel)*num_samples + tid]; + shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + __syncthreads(); + + //if(bid == 0){ + //printf("%lf\n", lut[(peak->orientation)*num_samples + lut_offset] * y[(peak->voxel)*num_samples + tid]); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); + + if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; + //} + + //peak++; + voxel++; + orien++; + __syncthreads(); + } + } +} //*/ + +__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ + __shared__ double shmem[512]; + + uint bid = blockIdx.x; + uint tid = threadIdx.x; + uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + for(int j = 0; j < NUM_BALLS; j++){ + shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*num_samples + tid) )* y[bid*num_samples + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + 
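The cascade of halving steps just above is the shared-memory tree reduction used by all three adjoint kernels (the IC and EC variants earlier follow the same pattern); stripped of thread indexing it amounts to the following (editorial Python sketch):

def tree_reduce(shmem):
    # Same pattern as the if(tid < 256) ... if(tid < 4) cascade above:
    # at each step the first `width` slots absorb their partners `width`
    # positions away, until only the first few slots remain to be summed.
    width = len(shmem) // 2          # 256 for a 512-entry buffer
    while width >= 4:
        for tid in range(width):
            shmem[tid] += shmem[tid + width]
        width //= 2
    return shmem[0] + shmem[1] + shmem[2] + shmem[3]

print(tree_reduce([1.0] * 512))      # 512.0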
+ if(tid == 0) + x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + } +}//*/ + void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ // Copy vector x to the GPU @@ -334,30 +541,30 @@ void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ cudaMemcpy(y, this->y, nrows*sizeof(double), cudaMemcpyDeviceToHost); } -void CudaLinearOperator::multiplyByY(float64_t* y, float64_t* x){ +void CudaLinearOperator::multiplyByY(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - //cudaCheck( cudaMemcpy(gpu_y, y, NUM_ROWS*sizeof(double), cudaMemcpyHostToDevice) ); + cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); // Multiply IC part in the GPU - //multiply_Aty_ICpart<<>>(gpu_voxelICt, gpu_fiberICt, gpu_orientICt, gpu_lengthICt, gpu_segmentsPerBlockICt, gpu_offsetPerBlockICt, gpu_lutIC, gpu_x, gpu_y); + multiply_Aty_ICpart<<>>(voxelICt, fiberICt, orienICt, lengthICt, fibersPerBlockICt, offsetPerBlockICt, lutIC, x, y); //cudaCheckKernel();//*/ // Multiply EC part in the GPU - //multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orientEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); //cudaCheckKernel(); // Multiply ISO part in the GPU - //multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + multiply_Aty_ISOpart<<>>(lutISO, x, y); //cudaCheckKernel();//*/ // Copy back result to CPU - //cudaCheck( cudaMemcpy(x, gpu_x, NUM_COLS*sizeof(double), cudaMemcpyDeviceToHost) ); + cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); /*printf("\n\n VECTOR X EC PART:\n"); for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh index 7971c7b8..6062621e 100644 --- a/commit/gpumanager.cuh +++ b/commit/gpumanager.cuh @@ -55,6 +55,14 @@ class CudaLinearOperator { uint16_t* orienIC; float32_t* lengthIC; + // pointers to IC data (transpose) in GPU memory + uint32_t* voxelICt; + uint32_t* fiberICt; + uint16_t* orienICt; + float32_t* lengthICt; + uint32_t* fibersPerBlockICt; + uint32_t* offsetPerBlockICt; + // auxiliar arrays for GPU uint32_t* segmentsPerBlockIC; uint32_t* offsetPerBlockIC; @@ -106,6 +114,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void multiplyByX(float64_t* x, float64_t* y); void multiplyByY(float64_t* y, float64_t* x); }; \ No newline at end of file From 1ef189ca5be6821ed4fcb198d9a87b8bd33a4fc0 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 18:27:10 -0600 Subject: [PATCH 009/190] Adding kernels for operation A'y in GPU --- commit/gpumanager.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index cfa2aaae..c7dc5a50 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -34,23 +34,23 @@ void CudaLinearOperator::setTransponseData( printf("\t* extra memory for operator A' ... 
"); status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->voxelICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->fiberICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->orienICt), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->lengthICt), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockICt) , nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(voxelICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(fiberICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(orienICt), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(lengthICt), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(offsetPerBlockICt), nfibers*sizeof(uint32_t)) ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); printf("\t* transfering memory for operator A' ... "); status = true; - status = status && cudaCheck( cudaMemcpy(this->voxelICt, voxel, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->fiberICt, fiber, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->orienICt, orien, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->lengthICt, length, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(voxelICt, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(fiberICt, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(orienICt, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(lengthICt, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); From 5ac15339dd86d6f055957073fcacab9b00258356 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:16:47 -0600 Subject: [PATCH 010/190] Adding kernels for operation A'y in GPU --- commit/gpumanager.cu | 76 +++++++++++++++++++++---------------------- commit/gpumanager.cuh | 2 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index c7dc5a50..662bc731 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -20,44 +20,6 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar offsetPerBlock[i] = offsetPerBlock[i-1] 
+ compartmentsPerBlock[i-1]; } -void CudaLinearOperator::setTransponseData( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths) -{ - bool status; - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - printf("\t* extra memory for operator A' ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(voxelICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(fiberICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(orienICt), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(lengthICt), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(offsetPerBlockICt), nfibers*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* transfering memory for operator A' ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(voxelICt, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(fiberICt, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(orienICt, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(lengthICt, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - free(fibersPerBlock); - free(offsetPerBlock); -} - /* __dual__ segment::segment() {} @@ -241,6 +203,44 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaFree(y); } +static void CudaLinearOperator::setTransponseData( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths) +{ + bool status; + uint32_t* fibersPerBlock = (uint32_t*) malloc(this->nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(this->nfibers*sizeof(uint32_t)); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + printf("\t* extra memory for operator A' ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(voxelICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(fiberICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(orienICt), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(lengthICt), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(offsetPerBlockICt), nfibers*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* transfering memory for operator A' ... 
"); + status = true; + status = status && cudaCheck( cudaMemcpy(voxelICt, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(fiberICt, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(orienICt, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(lengthICt, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + free(fibersPerBlock); + free(offsetPerBlock); +} + __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, uint32_t* fiberIDs, diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh index 6062621e..8615cc29 100644 --- a/commit/gpumanager.cuh +++ b/commit/gpumanager.cuh @@ -114,7 +114,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); + static void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void multiplyByX(float64_t* x, float64_t* y); void multiplyByY(float64_t* y, float64_t* x); }; \ No newline at end of file From d9d686819289a26ab8bd30e905740cfdd906ec88 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:18:27 -0600 Subject: [PATCH 011/190] Adding kernels for operation A'y in GPU --- commit/gpumanager.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 662bc731..0d3e7bd2 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -203,15 +203,15 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaFree(y); } -static void CudaLinearOperator::setTransponseData( +void CudaLinearOperator::setTransposeData( uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths) { bool status; - uint32_t* fibersPerBlock = (uint32_t*) malloc(this->nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(this->nfibers*sizeof(uint32_t)); + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); From c9927eea1d25b36ecb9607aa3e2f769438803eb7 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:19:14 -0600 Subject: [PATCH 012/190] Adding kernels for operation A'y in GPU --- commit/gpumanager.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh index 8615cc29..6062621e 100644 --- a/commit/gpumanager.cuh +++ b/commit/gpumanager.cuh @@ -114,7 +114,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); - static void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void multiplyByX(float64_t* x, float64_t* y); void multiplyByY(float64_t* y, float64_t* x); }; \ No newline at end of file From f2f489ef201849dd04ac6af9ae41378ea0ac5b9f Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: 
Tue, 24 Mar 2020 19:21:21 -0600 Subject: [PATCH 013/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 2 +- commit/gpumanager.cu | 3 ++- commit/gpumanager.cuh | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 3dceb8c3..e1ca4d45 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -175,7 +175,7 @@ cdef class CudaLinearOperator : self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] del idx - self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0]) + self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0], self.n) @property def T( self ) : diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 0d3e7bd2..10fb6e2f 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -207,7 +207,8 @@ void CudaLinearOperator::setTransposeData( uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, - float32_t* lengths) + float32_t* lengths, + int nsegments) { bool status; uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh index 6062621e..1d20cc3d 100644 --- a/commit/gpumanager.cuh +++ b/commit/gpumanager.cuh @@ -114,7 +114,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, int nsegments); void multiplyByX(float64_t* x, float64_t* y); void multiplyByY(float64_t* y, float64_t* x); }; \ No newline at end of file From e8da361c0cf935a1a0feb804fa5e8c0cffd7daf3 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:23:17 -0600 Subject: [PATCH 014/190] Adding kernels for operation A'y in GPU --- commit/gpumanager.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 10fb6e2f..596226b4 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -400,7 +400,7 @@ __global__ void multiply_Aty_ICpart( float64_t sum = 0.0; //segment = segments + offset; voxel = voxelICt + offset; - orient = orienICt + offset; + orien = orienICt + offset; length = lengthICt + offset; for(int i = offset; i < nsegments; i++){ sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; From 491ce82fe267a4f9561332053c2c38968a8cb93f Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:24:39 -0600 Subject: [PATCH 015/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index e1ca4d45..ddeb0246 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,7 +30,7 @@ cdef extern from "gpumanager.cuh": int, int) - void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, int) void multiplyByX(np.float64_t*, np.float64_t*) void multiplyByY(np.float64_t*, np.float64_t*) From 71dc3d509e27eb58cfbefe49046ccaa06c725c3a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:53:31 -0600 Subject: [PATCH 016/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index ddeb0246..902316cc 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -173,10 +173,20 @@ cdef class CudaLinearOperator : self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0], self.n) + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx + @property def T( self ) : """Transpose of the explicit matrix.""" From 4ffcd26154a5c917fa10f266dd4dbfe4efafcd5b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 19:58:00 -0600 Subject: [PATCH 017/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 42 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 902316cc..bb25d721 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -134,15 +134,15 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] self.ISOthreadsT = &ISOthreadsT[0] - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['v'])] ) + DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] + DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] + DICTIONARY['IC']['fiber'] = DICTIONARY['IC']['fiber'][ idx ] + DICTIONARY['IC']['len'] = DICTIONARY['IC']['len'][ idx ] - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) + DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] + DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] self.A = new C_CudaLinearOperator( &ICv[0], @@ -168,23 +168,23 @@ cdef class CudaLinearOperator : self.nI ) - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['fiber'])] ) + DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] + 
DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] + DICTIONARY['IC']['fiber'] = DICTIONARY['IC']['fiber'][ idx ] + DICTIONARY['IC']['len'] = DICTIONARY['IC']['len'][ idx ] self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0], self.n) - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['v'])] ) + DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] + DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] + DICTIONARY['IC']['fiber'] = DICTIONARY['IC']['fiber'][ idx ] + DICTIONARY['IC']['len'] = DICTIONARY['IC']['len'][ idx ] - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) + DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] + DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] del idx @property From a2f1fa1f5a177a41223d129b81416e9f89e53c23 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 20:18:15 -0600 Subject: [PATCH 018/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index bb25d721..0bbabb43 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -134,7 +134,7 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] self.ISOthreadsT = &ISOthreadsT[0] - idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['v'])] ) + """idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['v'])] ) DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] DICTIONARY['IC']['fiber'] = DICTIONARY['IC']['fiber'][ idx ] @@ -142,7 +142,7 @@ cdef class CudaLinearOperator : idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] - DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] + DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ]""" self.A = new C_CudaLinearOperator( &ICv[0], @@ -168,6 +168,7 @@ cdef class CudaLinearOperator : self.nI ) + """ idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['fiber'])] ) DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] @@ -184,7 +185,7 @@ cdef class CudaLinearOperator : idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] - DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] + DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ]""" del idx @property From 80adafe04dabafe7c1fd858e86c3a71fd2379274 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 20:19:48 -0600 Subject: [PATCH 019/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 0bbabb43..2fc06f4e 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -185,8 +185,9 @@ cdef class CudaLinearOperator : idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] - DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ]""" + DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] del idx + """ @property def T( self ) : From e6964ea6b0e86e4f7d694d8a832c9c24ef5832db Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 20:22:25 -0600 Subject: [PATCH 020/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 2fc06f4e..bdad5d8d 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -134,15 +134,17 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] self.ISOthreadsT = &ISOthreadsT[0] - """idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['v'])] ) - DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] - DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] - DICTIONARY['IC']['fiber'] = DICTIONARY['IC']['fiber'][ idx ] - DICTIONARY['IC']['len'] = DICTIONARY['IC']['len'][ idx ] + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx - idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) - DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] - DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ]""" + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx self.A = new C_CudaLinearOperator( &ICv[0], From da54fc296ea234e22b3e6dff59ecf662f5a0d8f2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 24 Mar 2020 20:33:23 -0600 Subject: [PATCH 021/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index bdad5d8d..94afaf9b 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -134,6 +134,7 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] self.ISOthreadsT = &ISOthreadsT[0] + """ idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] @@ -145,6 +146,7 @@ cdef class CudaLinearOperator : self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] del idx + """ self.A = new C_CudaLinearOperator( &ICv[0], From 6e5974d6435add9b99834dc7a5ff73dc5638fc3f Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 22:52:17 -0600 Subject: [PATCH 022/190] Adding kernels for operation A'y in GPU --- commit/core.pyx | 13 
+++++++++++++ commit/cudaoperator.pyx | 14 +------------- commit/gpumanager.cu | 6 ++++++ 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 2126a9cb..e6fadb04 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -393,12 +393,25 @@ cdef class Evaluation : self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size # reorder the segments based on the "v" field + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx + + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx + """ idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] del idx + """ # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length # NB: it works in conjunction with the normalization of the kernels diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 94afaf9b..96847174 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -134,19 +134,7 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] self.ISOthreadsT = &ISOthreadsT[0] - """ - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx - """ + #sort here self.A = new C_CudaLinearOperator( &ICv[0], diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 596226b4..b0afc581 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -201,6 +201,12 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaFree(x); cudaFree(y); + + printf("\t* reseting GPU ... "); + status = true; + status = status && cudaCheck( cudaDeviceReset() ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); } void CudaLinearOperator::setTransposeData( From c121e13bd37027779c5fb4bd7a1adbb91a0d6723 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 22:54:02 -0600 Subject: [PATCH 023/190] Adding kernels for operation A'y in GPU --- commit/gpumanager.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index b0afc581..9fc41c4f 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -203,7 +203,7 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaFree(y); printf("\t* reseting GPU ... 
"); - status = true; + bool status = true; status = status && cudaCheck( cudaDeviceReset() ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); From 4835b3f9191185b456f78be867866f44873bf080 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 23:00:30 -0600 Subject: [PATCH 024/190] Adding kernels for operation A'y in GPU --- commit/core.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index e6fadb04..5012838e 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -399,11 +399,6 @@ cdef class Evaluation : self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] del idx - - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx """ idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] @@ -437,10 +432,14 @@ cdef class Evaluation : self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size # reorder the segments based on the "v" field - idx = np.argsort( self.DICTIONARY['EC']['v'], kind='mergesort' ) + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] del idx + """idx = np.argsort( self.DICTIONARY['EC']['v'], kind='mergesort' ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx """ print( ' [ %d segments ]' % self.DICTIONARY['EC']['nE'] ) From 528b9b38d1705357d0b3d8042db0ce9998f764b3 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 23:07:23 -0600 Subject: [PATCH 025/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 96847174..20ba8cef 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -160,7 +160,7 @@ cdef class CudaLinearOperator : self.nI ) - """ + idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['fiber'])] ) DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] @@ -179,7 +179,7 @@ cdef class CudaLinearOperator : DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] del idx - """ + #""" @property def T( self ) : From be50c269540305543ca5dd46fa68609c33a9aeda Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 23:41:54 -0600 Subject: [PATCH 026/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 42 ++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 20ba8cef..c3198e71 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -161,23 +161,39 @@ cdef class CudaLinearOperator : ) - idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['fiber'])] ) - DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] - DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] - DICTIONARY['IC']['fiber'] = 
DICTIONARY['IC']['fiber'][ idx ] - DICTIONARY['IC']['len'] = DICTIONARY['IC']['len'][ idx ] + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + ICf = self.DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + ICl = self.DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + ICv = self.DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + ICo = self.DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + ECv = self.DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + ECo = self.DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + ISOv = self.DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0], self.n) - idx = np.lexsort( [np.array(DICTIONARY['IC']['o']), np.array(DICTIONARY['IC']['v'])] ) - DICTIONARY['IC']['v'] = DICTIONARY['IC']['v'][ idx ] - DICTIONARY['IC']['o'] = DICTIONARY['IC']['o'][ idx ] - DICTIONARY['IC']['fiber'] = DICTIONARY['IC']['fiber'][ idx ] - DICTIONARY['IC']['len'] = DICTIONARY['IC']['len'][ idx ] + idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx - idx = np.lexsort( [np.array(DICTIONARY['EC']['o']), np.array(DICTIONARY['EC']['v'])] ) - DICTIONARY['EC']['v'] = DICTIONARY['EC']['v'][ idx ] - DICTIONARY['EC']['o'] = DICTIONARY['EC']['o'][ idx ] + idx = np.argsort( self.DICTIONARY['EC']['v'], kind='mergesort' ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] del idx #""" From 24eb5d060a1526bf770386f347b325943f502b56 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 23:50:30 -0600 Subject: [PATCH 027/190] Adding kernels for operation A'y in GPU --- commit/core.pyx | 6 +++--- commit/cudaoperator.pyx | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 5012838e..b6d13a07 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -482,6 +482,9 @@ cdef class Evaluation : self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] + import commit.cudaoperator + self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + print( ' [ OK ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) @@ -663,9 +666,6 @@ cdef class Evaluation : else : reload( sys.modules['commit.operator.operator'] ) self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - - import commit.cudaoperator - self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index c3198e71..acdad0be 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -167,6 +167,7 @@ cdef class CudaLinearOperator : self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = 
self.DICTIONARY['IC']['len'][ idx ] + """ ICf = self.DICTIONARY['IC']['fiber'] self.ICf = &ICf[0] ICl = self.DICTIONARY['IC']['len'] @@ -180,7 +181,7 @@ cdef class CudaLinearOperator : ECo = self.DICTIONARY['EC']['o'] self.ECo = &ECo[0] ISOv = self.DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] + self.ISOv = &ISOv[0]""" self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0], self.n) From 2f37df8a361999d8677ad9bc0003e6aed3d1d722 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 26 Mar 2020 23:53:55 -0600 Subject: [PATCH 028/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index acdad0be..bd66229f 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -119,7 +119,7 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] - # get C pointers to arrays in THREADS + """# get C pointers to arrays in THREADS cdef unsigned int [::1] ICthreads = THREADS['IC'] self.ICthreads = &ICthreads[0] cdef unsigned int [::1] ECthreads = THREADS['EC'] @@ -132,7 +132,7 @@ cdef class CudaLinearOperator : cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] self.ECthreadsT = &ECthreadsT[0] cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] - self.ISOthreadsT = &ISOthreadsT[0] + self.ISOthreadsT = &ISOthreadsT[0] """ #sort here From 8184b955376e2d2b9de86f2e3c375c15849baa9e Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 00:08:09 -0600 Subject: [PATCH 029/190] Adding kernels for operation A'y in GPU --- commit/cudaoperator.pyx | 6 +++--- commit/gpumanager.cu | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index bd66229f..4637a0f2 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -167,7 +167,7 @@ cdef class CudaLinearOperator : self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - """ + ICf = self.DICTIONARY['IC']['fiber'] self.ICf = &ICf[0] ICl = self.DICTIONARY['IC']['len'] @@ -181,9 +181,9 @@ cdef class CudaLinearOperator : ECo = self.DICTIONARY['EC']['o'] self.ECo = &ECo[0] ISOv = self.DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0]""" + self.ISOv = &ISOv[0] - self.A.setTransposeData(&ICv[0], &ICf[0], &ICo[0], &ICl[0], self.n) + self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0], self.n) idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] diff --git a/commit/gpumanager.cu b/commit/gpumanager.cu index 9fc41c4f..83d9713d 100644 --- a/commit/gpumanager.cu +++ b/commit/gpumanager.cu @@ -406,7 +406,7 @@ __global__ void multiply_Aty_ICpart( float64_t sum = 0.0; //segment = segments + offset; voxel = voxelICt + offset; - orien = orienICt + offset; + orien = orienICt + offset; length = lengthICt + offset; for(int i = offset; i < nsegments; i++){ sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; From c00957ca6c1cdf1dee8723d22c95a6406a57e5a5 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 15:07:03 -0600 Subject: [PATCH 030/190] Enabling nthreads=0 in set_threads() function --- commit/core.pyx | 57 +++++++++++++++--------------- commit/operator/operator.pyxbld | 2 +- commit/operator/operator_noLUT.c 
| 4 +-- commit/operator/operator_withLUT.c | 4 +-- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index b6d13a07..21d51d2c 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -490,31 +490,32 @@ cdef class Evaluation : print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - def set_threads( self, n = None ) : + def set_threads( self, nthreads = None ) : """Set the number of threads to use for the matrix-vector operations with A and A'. Parameters ---------- - n : integer - Number of threads to use (default : number of CPUs in the system) + nthreads : integer + Number of threads to use (nthreads = None ---> all the CPU threads available in the system + nthreads = 0 ---> enable CUDA GPU acceleration) """ - if n is None : + if nthreads is None : # Set to the number of CPUs in the system try : import multiprocessing - n = multiprocessing.cpu_count() + nthreads = multiprocessing.cpu_count() except : - n = 1 + nthreads = 1 - if n < 1 or n > 255 : - raise RuntimeError( 'Number of threads must be between 1 and 255' ) + if nthreads < 0 or nthreads > 255 : + raise RuntimeError( 'Number of threads must be between 0 and 255' ) if self.DICTIONARY is None : raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) if self.KERNELS is None : raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) self.THREADS = {} - self.THREADS['n'] = n + self.THREADS['n'] = nthreads cdef : long [:] C @@ -523,7 +524,7 @@ cdef class Evaluation : tic = time.time() print( '\n-> Distributing workload to different threads:' ) - print( '\t* number of threads : %d' % n ) + print( '\t* number of threads : %d' % nthreads ) # Distribute load for the computation of A*x product print( '\t* A operator...', end="" ) @@ -531,8 +532,8 @@ cdef class Evaluation : if self.DICTIONARY['IC']['n'] > 0 : self.THREADS['IC'] = np.zeros( n+1, dtype=np.uint32 ) - if n > 1 : - N = np.floor( self.DICTIONARY['IC']['n']/n ) + if nthreads > 1 : + N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) t = 1 tot = 0 C = np.bincount( self.DICTIONARY['IC']['v'] ) @@ -542,7 +543,7 @@ cdef class Evaluation : self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot t += 1 tot = 0 - self.THREADS['IC'][n] = self.DICTIONARY['IC']['n'] + self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : @@ -553,9 +554,9 @@ cdef class Evaluation : if self.DICTIONARY['EC']['nE'] > 0 : self.THREADS['EC'] = np.zeros( n+1, dtype=np.uint32 ) - for i in xrange(n) : + for i in xrange(nthreads) : self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['EC'][n] = self.DICTIONARY['EC']['nE'] + self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : @@ -566,9 +567,9 @@ cdef class Evaluation : if self.DICTIONARY['nV'] > 0 : self.THREADS['ISO'] = np.zeros( n+1, dtype=np.uint32 ) - for i in xrange(n) : + for i in xrange(nthreads) : self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['ISO'][n] = self.DICTIONARY['nV'] + self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] # check if some threads are not assigned any segment if np.count_nonzero( 
np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : @@ -584,19 +585,19 @@ cdef class Evaluation : sys.stdout.flush() if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], n-1, dtype=np.uint8 ) - if n > 1 : + self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) + if nthreads > 1 : idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) C = np.bincount( self.DICTIONARY['IC']['fiber'] ) t = tot = i1 = i2 = 0 - N = np.floor(self.DICTIONARY['IC']['n']/n) + N = np.floor(self.DICTIONARY['IC']['n']/nthreads) for c in C : i2 += c tot += c if tot >= N : self.THREADS['ICt'][ i1:i2 ] = t t += 1 - if t==n-1 : + if t==nthreads-1 : break i1 = i2 tot = c @@ -606,11 +607,11 @@ cdef class Evaluation : self.THREADS['ICt'] = None if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['ECt'] = np.zeros( n+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['EC']['nE']/n ) - for i in xrange(1,n) : + self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) + for i in xrange(1,nthreads) : self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N - self.THREADS['ECt'][n] = self.DICTIONARY['EC']['nE'] + self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : @@ -621,10 +622,10 @@ cdef class Evaluation : if self.DICTIONARY['nV'] > 0 : self.THREADS['ISOt'] = np.zeros( n+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['nV']/n ) - for i in xrange(1,n) : + N = np.floor( self.DICTIONARY['nV']/nthreads ) + for i in xrange(1,nthreads) : self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N - self.THREADS['ISOt'][n] = self.DICTIONARY['nV'] + self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld index c2cf5f5b..6e5b9f12 100755 --- a/commit/operator/operator.pyxbld +++ b/commit/operator/operator.pyxbld @@ -9,7 +9,7 @@ from commit.operator import config def make_ext(modname, pyxfilename): - if ( config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255 ): + if ( config.nTHREADS is None or config.nTHREADS < 0 or config.nTHREADS > 255 ): raise RuntimeError( 'config.nTHREADS must be between 1 and 255' ) if ( config.nIC is None or config.nIC < 0 or config.nIC > 20 ): raise RuntimeError( 'config.nIC must be in the range [0..20]' ) diff --git a/commit/operator/operator_noLUT.c b/commit/operator/operator_noLUT.c index 0046f237..d8b6706b 100644 --- a/commit/operator/operator_noLUT.c +++ b/commit/operator/operator_noLUT.c @@ -3,8 +3,8 @@ // number of THREADS #ifdef nTHREADS - #if (nTHREADS<1 || nTHREADS>255) - #error "nTHREADS" must be in the range 1..255 + #if (nTHREADS<0 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 #endif #else #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" diff --git a/commit/operator/operator_withLUT.c b/commit/operator/operator_withLUT.c index 9c959f57..1b6fd1ae 100644 --- a/commit/operator/operator_withLUT.c +++ b/commit/operator/operator_withLUT.c @@ -3,8 +3,8 @@ // number of THREADS #ifdef nTHREADS - #if (nTHREADS<1 || nTHREADS>255) - #error "nTHREADS" must be in the range 1..255 + #if (nTHREADS<0 || nTHREADS>255) + #error "nTHREADS" must be in the range 
0..255 #endif #else #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" From 4a61912765bcf2012e33b7e9843cc50eba5547b9 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 15:21:54 -0600 Subject: [PATCH 031/190] Enabling nthreads=0 in set_threads() function --- commit/core.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 21d51d2c..cc26a26f 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -531,7 +531,7 @@ cdef class Evaluation : sys.stdout.flush() if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['IC'] = np.zeros( n+1, dtype=np.uint32 ) + self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) if nthreads > 1 : N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) t = 1 @@ -553,7 +553,7 @@ cdef class Evaluation : self.THREADS['IC'] = None if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['EC'] = np.zeros( n+1, dtype=np.uint32 ) + self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) for i in xrange(nthreads) : self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] @@ -566,7 +566,7 @@ cdef class Evaluation : self.THREADS['EC'] = None if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISO'] = np.zeros( n+1, dtype=np.uint32 ) + self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) for i in xrange(nthreads) : self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] From 4f772c08bffc28464593258b6365ce95ac7097f4 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 15:41:29 -0600 Subject: [PATCH 032/190] Enabling nthreads=0 in set_threads() function --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index cc26a26f..76ee8f78 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -621,7 +621,7 @@ cdef class Evaluation : self.THREADS['ECt'] = None if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( n+1, dtype=np.uint32 ) + self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) N = np.floor( self.DICTIONARY['nV']/nthreads ) for i in xrange(1,nthreads) : self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N From b3a1a8e6b7827c01367a733e607eb07065573ff5 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 19:13:44 -0600 Subject: [PATCH 033/190] Modfying some variable names --- commit/gpumanager.cuh | 120 ------------------ .../{gpumanager.cu => operator_withCUDA.cu} | 112 +++++++++++++++- setup.py | 30 ++--- 3 files changed, 119 insertions(+), 143 deletions(-) delete mode 100644 commit/gpumanager.cuh rename commit/{gpumanager.cu => operator_withCUDA.cu} (88%) diff --git a/commit/gpumanager.cuh b/commit/gpumanager.cuh deleted file mode 100644 index 1d20cc3d..00000000 --- a/commit/gpumanager.cuh +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -//#define __dual__ __host__ __device__ - -using namespace std; - -typedef unsigned int uint32_t; -typedef unsigned short int uint16_t; -typedef float float32_t; -typedef double float64_t; - -bool cudaCheck(cudaError_t cudaStatus); -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); - -/*class 
segment_t { - public: - - // pointer to the GPU memory where the array is stored - uint32_t voxelID; - uint32_t fiberID; - uint16_t orienID; - float length; - - __dual__ segment(); - __dual__ ~segment(); -};//*/ - -// constant values in GPU -__constant__ int NUM_VOXELS; -__constant__ int NUM_FIBERS; -__constant__ int NUM_PEAKS; -__constant__ int NUM_ORIENTATIONS; -__constant__ int NUM_SAMPLES; -__constant__ int NUM_DIAMETERS; -__constant__ int NUM_ZEPPELINS; -__constant__ int NUM_BALLS; -__constant__ int NUM_ROWS; -__constant__ int NUM_COLS; -__constant__ int SIZE_LUTIC; -__constant__ int SIZE_LUTEC; -__constant__ int SIZE_LUTISO; - -class CudaLinearOperator { - - // pointers to IC data in GPU memory - uint32_t* voxelIC; - uint32_t* fiberIC; - uint16_t* orienIC; - float32_t* lengthIC; - - // pointers to IC data (transpose) in GPU memory - uint32_t* voxelICt; - uint32_t* fiberICt; - uint16_t* orienICt; - float32_t* lengthICt; - uint32_t* fibersPerBlockICt; - uint32_t* offsetPerBlockICt; - - // auxiliar arrays for GPU - uint32_t* segmentsPerBlockIC; - uint32_t* offsetPerBlockIC; - uint32_t* segmentsPerBlockEC; - uint32_t* offsetPerBlockEC; - - // pointers to EC data in GPU memory - uint32_t* voxelEC; - uint16_t* orienEC; - - // pointers to LUTs in GPU memory - float32_t* lutIC; - float32_t* lutEC; - float32_t* lutISO; - - // pointers to vector x and y - float64_t* x; - float64_t* y; - - // dimensions of the operator - int nrows; - int ncols; - int nvoxels; - int nfibers; - - public: - CudaLinearOperator( - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float32_t* lengthIC, - float32_t* lutIC, - - uint32_t* voxelEC, - uint16_t* orienEC, - float32_t* lutEC, - - float32_t* lutISO, - - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int ndiameters, - int nzeppelins, - int nballs); - - ~CudaLinearOperator(); - - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, int nsegments); - void multiplyByX(float64_t* x, float64_t* y); - void multiplyByY(float64_t* y, float64_t* x); -}; \ No newline at end of file diff --git a/commit/gpumanager.cu b/commit/operator_withCUDA.cu similarity index 88% rename from commit/gpumanager.cu rename to commit/operator_withCUDA.cu index 83d9713d..cb9491fd 100644 --- a/commit/gpumanager.cu +++ b/commit/operator_withCUDA.cu @@ -1,4 +1,108 @@ -#include "gpumanager.cuh" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef unsigned int uint32_t; +typedef unsigned short int uint16_t; +typedef float float32_t; +typedef double float64_t; + +bool cudaCheck(cudaError_t cudaStatus); +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +// constant values in GPU +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + +class CudaLinearOperator { + + // pointers to IC data in GPU memory + uint32_t* voxelIC; + uint32_t* fiberIC; + uint16_t* orienIC; + float32_t* lengthIC; + + // pointers to IC data (transpose) in GPU memory + uint32_t* voxelICt; + uint32_t* fiberICt; + uint16_t* 
orienICt; + float32_t* lengthICt; + uint32_t* fibersPerBlockICt; + uint32_t* offsetPerBlockICt; + + // auxiliar arrays for GPU + uint32_t* segmentsPerBlockIC; + uint32_t* offsetPerBlockIC; + uint32_t* segmentsPerBlockEC; + uint32_t* offsetPerBlockEC; + + // pointers to EC data in GPU memory + uint32_t* voxelEC; + uint16_t* orienEC; + + // pointers to LUTs in GPU memory + float32_t* lutIC; + float32_t* lutEC; + float32_t* lutISO; + + // pointers to vector x and y + float64_t* x; + float64_t* y; + + // dimensions of the operator + int nrows; + int ncols; + int nvoxels; + int nfibers; + + public: + CudaLinearOperator( + uint32_t* voxelIC, + uint32_t* fiberIC, + uint16_t* orienIC, + float32_t* lengthIC, + float32_t* lutIC, + + uint32_t* voxelEC, + uint16_t* orienEC, + float32_t* lutEC, + + float32_t* lutISO, + + int nsegments, + int nvoxels, + int nfibers, + int npeaks, + int norientations, + int nsamples, + int ndiameters, + int nzeppelins, + int nballs); + + ~CudaLinearOperator(); + + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, int nsegments); + void multiplyByX(float64_t* x, float64_t* y); + void multiplyByY(float64_t* y, float64_t* x); +}; bool cudaCheck(cudaError_t cudaStatus){ return cudaStatus == cudaSuccess; @@ -20,12 +124,6 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; } -/* -__dual__ segment::segment() {} - -__dual__ segment::~segment() {} -//*/ - CudaLinearOperator::CudaLinearOperator( uint32_t* voxelIC, uint32_t* fiberIC, diff --git a/setup.py b/setup.py index df8fd35e..817e5e65 100644 --- a/setup.py +++ b/setup.py @@ -96,27 +96,25 @@ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Inject our redefined _compile method into the class self._compile = _compile +# Obtain the numpy include directory. This logic works across numpy versions. +try: + numpy_include = numpy.get_include() +except AttributeError: + numpy_include = numpy.get_numpy_include() + # Try to locate CUDA CUDA = locate_cuda() if CUDA != None: - print('Installing CUDA Version') - # Run the customize_compiler - class custom_build_ext(build_ext): + class cuda_build_ext(build_ext): def build_extensions(self): customize_compiler_for_nvcc(self.compiler) build_ext.build_extensions(self) - # Obtain the numpy include directory. This logic works across numpy versions. 
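# A note on the np.lexsort reordering used in the cudaoperator.pyx/core.pyx
# patches above: np.lexsort treats the *last* array in the key sequence as the
# primary key, so lexsort([o, v]) orders segments by voxel 'v' first and by
# orientation 'o' within each voxel, while lexsort([o, fiber]) groups them by
# fiber for the transpose operator. A minimal, self-contained check of this
# behaviour (illustrative only, not part of the patches):
import numpy as np

v = np.array([1, 0, 1, 0], dtype=np.uint32)   # voxel index per segment
o = np.array([3, 2, 1, 0], dtype=np.uint16)   # orientation index per segment
idx = np.lexsort([o, v])                      # primary key: v, secondary: o
assert (v[idx] == [0, 0, 1, 1]).all()
assert (o[idx] == [0, 2, 1, 3]).all()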
- try: - numpy_include = numpy.get_include() - except AttributeError: - numpy_include = numpy.get_numpy_include() - # Cython extension to create the sparse data structure from a tractogram # for the computation of matrix-vector multiplications - ext1 = Extension( + trk2dictionary_ext = Extension( name='commit.trk2dictionary', sources=['commit/trk2dictionary/trk2dictionary.pyx'], include_dirs=[numpy.get_include()], @@ -131,7 +129,7 @@ def build_extensions(self): language='c++', ) - ext2 = Extension( + core_ext = Extension( name='commit.core', sources=['commit/core.pyx'], include_dirs=[numpy.get_include()], @@ -146,7 +144,7 @@ def build_extensions(self): language='c++', ) - ext3 = Extension( + proximals_ext = Extension( name='commit.proximals', sources=['commit/proximals.pyx'], include_dirs=[numpy.get_include()], @@ -161,9 +159,9 @@ def build_extensions(self): language='c++', ) - ext = Extension( + cudaoperator_ext = Extension( name='commit.cudaoperator', - sources = ['commit/gpumanager.cu', 'commit/cudaoperator.pyx'], + sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], library_dirs = [CUDA['lib64']], libraries = ['cudart'], language = 'c++', @@ -189,8 +187,8 @@ def build_extensions(self): author='Alessandro Daducci', author_email='alessandro.daducci@gmail.com', url='https://github.com/daducci/COMMIT', - cmdclass = {'build_ext':custom_build_ext}, - ext_modules = [ ext1, ext2, ext3, ext ], + cmdclass = {'build_ext':cuda_build_ext}, + ext_modules = [ trk2dictionary_ext, core_ext, proximals_ext, cudaoperator_ext ], packages=['commit','commit.operator'], package_data={ 'commit.operator':["*.*"], # needed by pyximport to compile at runtime From 02aa561f16b7a08afa620fe99e8cdd3262749cda Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 19:19:53 -0600 Subject: [PATCH 034/190] Modfying some variable names --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 4637a0f2..ef19b38f 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -5,7 +5,7 @@ import cython import numpy as np cimport numpy as np -cdef extern from "gpumanager.cuh": +cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": C_CudaLinearOperator( np.uint32_t*, From 071464db39166b897a39161865611da1d6df1a03 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 27 Mar 2020 19:22:41 -0600 Subject: [PATCH 035/190] Modifying some variable names --- commit/cudaoperator.pyx | 2 +- setup.py | 484 ++++++++++++++++++++-------------------- 2 files changed, 243 insertions(+), 243 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index ef19b38f..8471709d 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -5,7 +5,7 @@ import cython import numpy as np cimport numpy as np -cdef extern from "operator_withCUDA.cuh": +cdef extern from "operator_withCUDA.cu": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": C_CudaLinearOperator( np.uint32_t*, diff --git a/setup.py b/setup.py index 817e5e65..ea11a4e9 100644 --- a/setup.py +++ b/setup.py @@ -1,242 +1,242 @@ -from distutils.core import setup, Extension -from Cython.Distutils import build_ext -from Cython.Build import cythonize -import numpy -import amico -import os -from os.path import join as pjoin - -amico_version = amico.__version__.split('.') -amico_version = [int(version_val) for version_val in amico_version] -if amico_version[0] == 1 and 
amico_version[1] < 1: - raise RuntimeError( 'COMMIT requires AMICO v1.1.0 or above. Current AMICO version is %s' % amico.__version__ ) - - -# taken from npcuda -def find_in_path(name, path): - """Find a file in a search path""" - - # Adapted fom http://code.activestate.com/recipes/52224 - for dir in path.split(os.pathsep): - binpath = pjoin(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - -def locate_cuda(): - """Locate the CUDA environment on the system - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - Starts by looking for the CUDAHOME env variable. If not found, - everything is based on finding 'nvcc' in the PATH. - """ - - # First check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = pjoin(home, 'bin', 'nvcc') - else: - # Otherwise, search the PATH for NVCC - nvcc = find_in_path('nvcc', os.environ['PATH']) - if nvcc is None: - return None - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home': home, 'nvcc': nvcc, - 'include': pjoin(home, 'include'), - 'lib64': pjoin(home, 'lib64')} - for k, v in iter(cudaconfig.items()): - if not os.path.exists(v): - return None - - return cudaconfig - -def customize_compiler_for_nvcc(self): - """Inject deep into distutils to customize how the dispatch - to gcc/nvcc works. - If you subclass UnixCCompiler, it's not trivial to get your subclass - injected in, and still have the right customizations (i.e. - distutils.sysconfig.customize_compiler) run on it. So instead of going - the OO route, I have this. Note, it's kindof like a wierd functional - subclassing going on. - """ - - # Tell the compiler it can processes .cu - self.src_extensions.append('.cu') - - # Save references to the default compiler_so and _comple methods - default_compiler_so = self.compiler_so - super = self._compile - - # Now redefine the _compile method. This gets executed for each - # object but distutils doesn't have the ability to change compilers - # based on source extension: we add it. - def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): - if os.path.splitext(src)[1] == '.cu': - # use the cuda for .cu files - self.set_executable('compiler_so', CUDA['nvcc']) - # use only a subset of the extra_postargs, which are 1-1 - # translated from the extra_compile_args in the Extension class - print('\n--------nvcc aqui--------') - print(type(extra_postargs)) - print(extra_postargs) - print('--------------------\n') - postargs = extra_postargs['nvcc'] - else: - print('\n--------gcc aqui--------') - print(type(extra_postargs)) - print(extra_postargs) - print('--------------------\n') - postargs = extra_postargs['gcc'] - - super(obj, src, ext, cc_args, postargs, pp_opts) - # Reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - # Inject our redefined _compile method into the class - self._compile = _compile - -# Obtain the numpy include directory. This logic works across numpy versions. 
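# The set_threads() changes in the core.pyx patches above introduce the
# convention: nthreads=None -> use all CPU threads available in the system,
# nthreads=0 -> use the CUDA GPU code path, nthreads in 1..255 -> that many
# CPU threads. A small standalone sketch of that resolution logic (the name
# 'resolve_nthreads' is illustrative and does not exist in the code base):
import multiprocessing

def resolve_nthreads(nthreads=None):
    # None means "all CPU threads available in the system"
    if nthreads is None:
        try:
            nthreads = multiprocessing.cpu_count()
        except Exception:
            nthreads = 1
    # 0 selects the CUDA GPU path; the value must fit in a uint8
    if nthreads < 0 or nthreads > 255:
        raise RuntimeError('Number of threads must be between 0 and 255')
    return nthreads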
-try: - numpy_include = numpy.get_include() -except AttributeError: - numpy_include = numpy.get_numpy_include() - -# Try to locate CUDA -CUDA = locate_cuda() - -if CUDA != None: - # Run the customize_compiler - class cuda_build_ext(build_ext): - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) - - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - trk2dictionary_ext = Extension( - name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - extra_link_args=[], - language='c++', - ) - - core_ext = Extension( - name='commit.core', - sources=['commit/core.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - extra_link_args=[], - language='c++', - ) - - proximals_ext = Extension( - name='commit.proximals', - sources=['commit/proximals.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - extra_link_args=[], - language='c++', - ) - - cudaoperator_ext = Extension( - name='commit.cudaoperator', - sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], - library_dirs = [CUDA['lib64']], - libraries = ['cudart'], - language = 'c++', - runtime_library_dirs = [CUDA['lib64']], - # This syntax is specific to this build system - # we're only going to use certain compiler args with nvcc - # and not with gcc the implementation of this trick is in - # customize_compiler() - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - include_dirs = [numpy_include, CUDA['include']] - ) - - setup( - name='commit', - version='1.4.0', - description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', - author='Alessandro Daducci', - author_email='alessandro.daducci@gmail.com', - url='https://github.com/daducci/COMMIT', - cmdclass = {'build_ext':cuda_build_ext}, - ext_modules = [ trk2dictionary_ext, core_ext, proximals_ext, cudaoperator_ext ], - packages=['commit','commit.operator'], - package_data={ - 'commit.operator':["*.*"], # needed by pyximport to compile at runtime - }, - ) -else: - print('Installing CPU version') - - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - ext1 = Extension( - name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', - ) - - ext2 = Extension( - name='commit.core', - sources=['commit/core.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', - ) - - ext3 = Extension( - name='commit.proximals', - sources=['commit/proximals.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', - ) - - setup( - name='commit', - version='1.3.0', - description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', - 
author='Alessandro Daducci',
-    author_email='alessandro.daducci@gmail.com',
-    url='https://github.com/daducci/COMMIT',
-    cmdclass = {'build_ext':build_ext},
-    ext_modules = [ ext1, ext2, ext3 ],
-    packages=['commit','commit.operator'],
-    package_data={
-        'commit.operator':["*.*"], # needed by pyximport to compile at runtime
-    },
-)
+from distutils.core import setup, Extension
+from Cython.Distutils import build_ext
+from Cython.Build import cythonize
+import numpy
+import amico
+import os
+from os.path import join as pjoin
+
+amico_version = amico.__version__.split('.')
+amico_version = [int(version_val) for version_val in amico_version]
+if amico_version[0] == 1 and amico_version[1] < 1:
+    raise RuntimeError( 'COMMIT requires AMICO v1.1.0 or above. Current AMICO version is %s' % amico.__version__ )
+
+
+# taken from npcuda
+def find_in_path(name, path):
+    """Find a file in a search path"""
+
+    # Adapted from http://code.activestate.com/recipes/52224
+    for dir in path.split(os.pathsep):
+        binpath = pjoin(dir, name)
+        if os.path.exists(binpath):
+            return os.path.abspath(binpath)
+    return None
+
+def locate_cuda():
+    """Locate the CUDA environment on the system
+    Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
+    and values giving the absolute path to each directory.
+    Starts by looking for the CUDAHOME env variable. If not found,
+    everything is based on finding 'nvcc' in the PATH.
+    """
+
+    # First check if the CUDAHOME env variable is in use
+    if 'CUDAHOME' in os.environ:
+        home = os.environ['CUDAHOME']
+        nvcc = pjoin(home, 'bin', 'nvcc')
+    else:
+        # Otherwise, search the PATH for NVCC
+        nvcc = find_in_path('nvcc', os.environ['PATH'])
+        if nvcc is None:
+            return None
+        home = os.path.dirname(os.path.dirname(nvcc))
+
+    cudaconfig = {'home': home, 'nvcc': nvcc,
+                  'include': pjoin(home, 'include'),
+                  'lib64': pjoin(home, 'lib64')}
+    for k, v in iter(cudaconfig.items()):
+        if not os.path.exists(v):
+            return None
+
+    return cudaconfig
+
+def customize_compiler_for_nvcc(self):
+    """Inject deep into distutils to customize how the dispatch
+    to gcc/nvcc works.
+    If you subclass UnixCCompiler, it's not trivial to get your subclass
+    injected in, and still have the right customizations (i.e.
+    distutils.sysconfig.customize_compiler) run on it. So instead of going
+    the OO route, I have this. Note, it's kind of like a weird functional
+    subclassing going on.
+    """
+
+    # Tell the compiler it can process .cu files
+    self.src_extensions.append('.cu')
+
+    # Save references to the default compiler_so and _compile methods
+    default_compiler_so = self.compiler_so
+    super = self._compile
+
+    # Now redefine the _compile method. This gets executed for each
+    # object but distutils doesn't have the ability to change compilers
+    # based on source extension: we add it.
+ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 + # translated from the extra_compile_args in the Extension class + print('\n--------nvcc aqui--------') + print(type(extra_postargs)) + print(extra_postargs) + print('--------------------\n') + postargs = extra_postargs['nvcc'] + else: + print('\n--------gcc aqui--------') + print(type(extra_postargs)) + print(extra_postargs) + print('--------------------\n') + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # Reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # Inject our redefined _compile method into the class + self._compile = _compile + +# Obtain the numpy include directory. This logic works across numpy versions. +try: + numpy_include = numpy.get_include() +except AttributeError: + numpy_include = numpy.get_numpy_include() + +# Try to locate CUDA +CUDA = locate_cuda() + +if CUDA != None: + # Run the customize_compiler + class cuda_build_ext(build_ext): + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + trk2dictionary_ext = Extension( + name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + core_ext = Extension( + name='commit.core', + sources=['commit/core.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + proximals_ext = Extension( + name='commit.proximals', + sources=['commit/proximals.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + cudaoperator_ext = Extension( + name='commit.cudaoperator', + sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], + library_dirs = [CUDA['lib64']], + libraries = ['cudart'], + language = 'c++', + runtime_library_dirs = [CUDA['lib64']], + # This syntax is specific to this build system + # we're only going to use certain compiler args with nvcc + # and not with gcc the implementation of this trick is in + # customize_compiler() + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + include_dirs = [numpy_include, CUDA['include']] + ) + + setup( + name='commit', + version='1.4.0', + description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', + author='Alessandro Daducci', + author_email='alessandro.daducci@gmail.com', + url='https://github.com/daducci/COMMIT', + cmdclass = {'build_ext':cuda_build_ext}, + ext_modules = [ trk2dictionary_ext, core_ext, proximals_ext, cudaoperator_ext ], + packages=['commit','commit.operator'], + package_data={ + 'commit.operator':["*.*"], # 
needed by pyximport to compile at runtime + }, + ) +else: + print('Installing CPU version') + + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + ext1 = Extension( + name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + ext2 = Extension( + name='commit.core', + sources=['commit/core.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + ext3 = Extension( + name='commit.proximals', + sources=['commit/proximals.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + setup( + name='commit', + version='1.3.0', + description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', + author='Alessandro Daducci', + author_email='alessandro.daducci@gmail.com', + url='https://github.com/daducci/COMMIT', + cmdclass = {'build_ext':build_ext}, + ext_modules = [ ext1, ext2, ext3 ], + packages=['commit','commit.operator'], + package_data={ + 'commit.operator':["*.*"], # needed by pyximport to compile at runtime + }, + ) From 43a14392e593c248b12c426ee7a061bd28d2af58 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 00:10:46 -0600 Subject: [PATCH 036/190] Modifying some variable names --- commit/operator_withCUDA.cu | 285 +++++++++++++++++++++++++++++++++--- 1 file changed, 266 insertions(+), 19 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index cb9491fd..3b0f9873 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -78,14 +78,14 @@ class CudaLinearOperator { uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, - float32_t* lengthIC, - float32_t* lutIC, + float* lengthIC, + float* lutIC, uint32_t* voxelEC, uint16_t* orienEC, - float32_t* lutEC, + float* lutEC, - float32_t* lutISO, + float* lutISO, int nsegments, int nvoxels, @@ -95,13 +95,260 @@ class CudaLinearOperator { int nsamples, int ndiameters, int nzeppelins, - int nballs); + int nballs) + { + this->nvoxels = nvoxels; + this->nfibers = nfibers; + this->nrows = nvoxels * nsamples; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + int size_lutic = ndiameters*norientations*nsamples; + int size_lutec = nzeppelins*norientations*nsamples; + int size_lutiso = nballs*nsamples; + bool status; - ~CudaLinearOperator(); + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + // copy constant values to GPU + printf("\t* constant global values ... 
"); + status = true; + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); + status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + + // alloc memory in GPU for vectors x and y + printf("\t* memory for vectors x and y ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->x), ncols*sizeof(float64_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->y), nrows*sizeof(float64_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + // alloc GPU memory for segments + printf("\t* memory for LUT (IC part) ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->lutIC), size_lutic*sizeof(float32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* copying LUT in GPU (IC part) ... "); + status = true; + status = status && cudaCheck( cudaMemcpy(this->lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* allocating memory for LUT in GPU (EC part) ... "); + status = cudaCheck( cudaMalloc((void**)&(this->lutEC), size_lutec*sizeof(float32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* copying LUT in GPU (EC part) ... "); + status = cudaCheck( cudaMemcpy(this->lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* allocating memory for LUT in GPU (ISO part) ... "); + status = cudaCheck( cudaMalloc((void**)&(this->lutISO), size_lutiso*sizeof(float32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* copying LUT in GPU (ISO part) ... "); + status = cudaCheck( cudaMemcpy(this->lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* preprocessing data for GPU ... "); + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + printf("\n"); + + printf("\t* fiber segments memory allocation ... 
"); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->voxelIC), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->fiberIC), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->orienIC), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->lengthIC), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockIC), nvoxels*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockIC), nvoxels*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* transfering fiber segments ... "); + status = true; + status = status && cudaCheck( cudaMemcpy(this->voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + // ---------------------------------------- EC DATA ---------------------------------------- // + printf("\t* allocating memory for operator A in GPU (EC part) ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(this->voxelEC), npeaks*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->orienEC), npeaks*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockEC), nvoxels*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockEC), nvoxels*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* preprocessing EC data for GPU ... "); + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + printf("\n"); + + printf("\t* copying operator A to GPU (EC part) ... 
"); + status = true; + status = status && cudaCheck( cudaMemcpy(this->voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + free(segmentsPerBlock); + free(offsetPerBlock); + } + + ~CudaLinearOperator(){ + cudaFree(voxelIC); + cudaFree(fiberIC); + cudaFree(orienIC); + cudaFree(lengthIC); + cudaFree(lutIC); + cudaFree(segmentsPerBlockIC); + cudaFree(offsetPerBlockIC); + + cudaFree(voxelEC); + cudaFree(orienEC); + cudaFree(lutEC); + cudaFree(segmentsPerBlockEC); + cudaFree(offsetPerBlockEC); + + cudaFree(lutISO); + + cudaFree(voxelICt); + cudaFree(fiberICt); + cudaFree(orienICt); + cudaFree(lengthICt); + cudaFree(fibersPerBlockICt); + cudaFree(offsetPerBlockICt); + + cudaFree(x); + cudaFree(y); + + printf("\t* reseting GPU ... "); + bool status = true; + status = status && cudaCheck( cudaDeviceReset() ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + } - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, int nsegments); - void multiplyByX(float64_t* x, float64_t* y); - void multiplyByY(float64_t* y, float64_t* x); + void setTransposeData( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + int nsegments) + { + bool status; + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + printf("\t* extra memory for operator A' ... "); + status = true; + status = status && cudaCheck( cudaMalloc((void**)&(voxelICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(fiberICt), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(orienICt), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(lengthICt), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(offsetPerBlockICt), nfibers*sizeof(uint32_t)) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + printf("\t* transfering memory for operator A' ... 
"); + status = true; + status = status && cudaCheck( cudaMemcpy(voxelICt, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(fiberICt, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(orienICt, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(lengthICt, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n"); + + free(fibersPerBlock); + free(offsetPerBlock); + } + + void multiplyByX(float64_t* x, float64_t* y){ + // Copy vector x to the GPU + cudaMemcpy(this->x, x, ncols*sizeof(double), cudaMemcpyHostToDevice); + + // Multiply IC part in the GPU + multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, this->x, this->y); + + //cudaCheckKernel(); + + // Multiply EC part in the GPU + multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, this->x, this->y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + multiply_Ax_ISOpart<<>>(lutISO, this->x, this->y); + + //cudaCheckKernel(); + + // Copy back result to CPU + cudaMemcpy(y, this->y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + } + + void multiplyByY(float64_t* v_in, float64_t* v_out){ + + // Copy vector y to the GPU + //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); + //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); + cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); + + // Multiply IC part in the GPU + multiply_Aty_ICpart<<>>(voxelICt, fiberICt, orienICt, lengthICt, fibersPerBlockICt, offsetPerBlockICt, lutIC, x, y); + + //cudaCheckKernel();//*/ + + // Multiply EC part in the GPU + multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + multiply_Aty_ISOpart<<>>(lutISO, x, y); + + //cudaCheckKernel();//*/ + + // Copy back result to CPU + cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); + + /*printf("\n\n VECTOR X EC PART:\n"); + for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) + printf("%lf ", x[i]); + printf("\n\n");//*/ + } }; bool cudaCheck(cudaError_t cudaStatus){ @@ -124,7 +371,7 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; } -CudaLinearOperator::CudaLinearOperator( +/*CudaLinearOperator::CudaLinearOperator( uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, @@ -271,9 +518,9 @@ CudaLinearOperator::CudaLinearOperator( free(segmentsPerBlock); free(offsetPerBlock); -} +}*/ -CudaLinearOperator::~CudaLinearOperator(){ +/*CudaLinearOperator::~CudaLinearOperator(){ cudaFree(voxelIC); cudaFree(fiberIC); cudaFree(orienIC); @@ -305,9 +552,9 @@ CudaLinearOperator::~CudaLinearOperator(){ status = status && cudaCheck( cudaDeviceReset() ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); -} +}*/ -void CudaLinearOperator::setTransposeData( +/*void CudaLinearOperator::setTransposeData( uint32_t* 
voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, @@ -344,7 +591,7 @@ void CudaLinearOperator::setTransposeData( free(fibersPerBlock); free(offsetPerBlock); -} +}*/ __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, @@ -622,7 +869,7 @@ __global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ } }//*/ -void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ +/*void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ // Copy vector x to the GPU cudaMemcpy(this->x, x, ncols*sizeof(double), cudaMemcpyHostToDevice); @@ -644,9 +891,9 @@ void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ // Copy back result to CPU cudaMemcpy(y, this->y, nrows*sizeof(double), cudaMemcpyDeviceToHost); -} +}*/ -void CudaLinearOperator::multiplyByY(float64_t* v_in, float64_t* v_out){ +/*void CudaLinearOperator::multiplyByY(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); @@ -675,4 +922,4 @@ void CudaLinearOperator::multiplyByY(float64_t* v_in, float64_t* v_out){ for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) printf("%lf ", x[i]); printf("\n\n");//*/ -} \ No newline at end of file +}*/ \ No newline at end of file From 437e553165399abe06e7a5e3a10ff3f8702175d8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 00:17:47 -0600 Subject: [PATCH 037/190] Modifying some variable names --- commit/operator_withCUDA.cu | 58 +++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 3b0f9873..94d2c2d7 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -17,6 +17,56 @@ typedef double float64_t; bool cudaCheck(cudaError_t cudaStatus); void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); +__global__ void multiply_Ax_ICpart( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ICpart( + uint32_t* voxelICt, + uint32_t* fiberICt, + uint16_t* orienICt, + float32_t* lengthICt, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ISOpart( + float* lut, + double* x, + double* y); + // constant values in GPU __constant__ int NUM_VOXELS; __constant__ int NUM_FIBERS; @@ -903,7 +953,7 @@ __global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(voxelICt, fiberICt, orienICt, lengthICt, fibersPerBlockICt, offsetPerBlockICt, lutIC, x, y); - //cudaCheckKernel();//*/ + //cudaCheckKernel(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); @@ -913,13 +963,9 @@ __global__ void 
multiply_Aty_ISOpart(float* lut, double* x, double* y){ // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(lutISO, x, y); - //cudaCheckKernel();//*/ + //cudaCheckKernel(); // Copy back result to CPU cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); - /*printf("\n\n VECTOR X EC PART:\n"); - for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) - printf("%lf ", x[i]); - printf("\n\n");//*/ }*/ \ No newline at end of file From 0e41caf4d43dd50594bfca90e37cc31097d3aaaa Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 02:29:52 -0600 Subject: [PATCH 038/190] Revert "Enabling nthreads=0 in set_threads() function" This reverts commit 4f772c08bffc28464593258b6365ce95ac7097f4. --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 76ee8f78..cc26a26f 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -621,7 +621,7 @@ cdef class Evaluation : self.THREADS['ECt'] = None if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + self.THREADS['ISOt'] = np.zeros( n+1, dtype=np.uint32 ) N = np.floor( self.DICTIONARY['nV']/nthreads ) for i in xrange(1,nthreads) : self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N From c64337a2e3796eb9319ed91fe78e2490c1f4c4a8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 11:21:36 -0600 Subject: [PATCH 039/190] Reverting wrong file merging and modifying some variable names --- commit/operator_withCUDA.cu | 571 ++++++----------------------------- commit/operator_withCUDA.cuh | 157 ++++++++++ 2 files changed, 245 insertions(+), 483 deletions(-) create mode 100644 commit/operator_withCUDA.cuh diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 94d2c2d7..5c5df373 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -1,439 +1,19 @@ -#include -#include -#include -#include -#include -#include -#include -#include +#include "operator_withCUDA.cuh" -using namespace std; - -typedef unsigned int uint32_t; -typedef unsigned short int uint16_t; -typedef float float32_t; -typedef double float64_t; - -bool cudaCheck(cudaError_t cudaStatus); -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); - -__global__ void multiply_Ax_ICpart( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ICpart( - uint32_t* voxelICt, - uint32_t* fiberICt, - uint16_t* orienICt, - float32_t* lengthICt, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ISOpart( - float* lut, - double* x, - double* y); - -// constant values in GPU -__constant__ int NUM_VOXELS; -__constant__ int NUM_FIBERS; -__constant__ int NUM_PEAKS; 
-__constant__ int NUM_ORIENTATIONS; -__constant__ int NUM_SAMPLES; -__constant__ int NUM_DIAMETERS; -__constant__ int NUM_ZEPPELINS; -__constant__ int NUM_BALLS; -__constant__ int NUM_ROWS; -__constant__ int NUM_COLS; -__constant__ int SIZE_LUTIC; -__constant__ int SIZE_LUTEC; -__constant__ int SIZE_LUTISO; - -class CudaLinearOperator { - - // pointers to IC data in GPU memory - uint32_t* voxelIC; - uint32_t* fiberIC; - uint16_t* orienIC; - float32_t* lengthIC; - - // pointers to IC data (transpose) in GPU memory - uint32_t* voxelICt; - uint32_t* fiberICt; - uint16_t* orienICt; - float32_t* lengthICt; - uint32_t* fibersPerBlockICt; - uint32_t* offsetPerBlockICt; - - // auxiliar arrays for GPU - uint32_t* segmentsPerBlockIC; - uint32_t* offsetPerBlockIC; - uint32_t* segmentsPerBlockEC; - uint32_t* offsetPerBlockEC; - - // pointers to EC data in GPU memory - uint32_t* voxelEC; - uint16_t* orienEC; - - // pointers to LUTs in GPU memory - float32_t* lutIC; - float32_t* lutEC; - float32_t* lutISO; - - // pointers to vector x and y - float64_t* x; - float64_t* y; - - // dimensions of the operator - int nrows; - int ncols; - int nvoxels; - int nfibers; - - public: - CudaLinearOperator( - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float* lengthIC, - float* lutIC, - - uint32_t* voxelEC, - uint16_t* orienEC, - float* lutEC, - - float* lutISO, - - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int ndiameters, - int nzeppelins, - int nballs) - { - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; - int size_lutic = ndiameters*norientations*nsamples; - int size_lutec = nzeppelins*norientations*nsamples; - int size_lutiso = nballs*nsamples; - bool status; - - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - // copy constant values to GPU - printf("\t* constant global values ... "); - status = true; - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); - status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - - // alloc memory in GPU for vectors x and y - printf("\t* memory for vectors x and y ... 
"); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->x), ncols*sizeof(float64_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->y), nrows*sizeof(float64_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - // alloc GPU memory for segments - printf("\t* memory for LUT (IC part) ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->lutIC), size_lutic*sizeof(float32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* copying LUT in GPU (IC part) ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(this->lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* allocating memory for LUT in GPU (EC part) ... "); - status = cudaCheck( cudaMalloc((void**)&(this->lutEC), size_lutec*sizeof(float32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* copying LUT in GPU (EC part) ... "); - status = cudaCheck( cudaMemcpy(this->lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* allocating memory for LUT in GPU (ISO part) ... "); - status = cudaCheck( cudaMalloc((void**)&(this->lutISO), size_lutiso*sizeof(float32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* copying LUT in GPU (ISO part) ... "); - status = cudaCheck( cudaMemcpy(this->lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* preprocessing data for GPU ... "); - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - printf("\n"); - - printf("\t* fiber segments memory allocation ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->voxelIC), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->fiberIC), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->orienIC), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->lengthIC), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockIC), nvoxels*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockIC), nvoxels*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* transfering fiber segments ... 
"); - status = true; - status = status && cudaCheck( cudaMemcpy(this->voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - // ---------------------------------------- EC DATA ---------------------------------------- // - printf("\t* allocating memory for operator A in GPU (EC part) ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->voxelEC), npeaks*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->orienEC), npeaks*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockEC), nvoxels*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockEC), nvoxels*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* preprocessing EC data for GPU ... "); - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - printf("\n"); - - printf("\t* copying operator A to GPU (EC part) ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(this->voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - free(segmentsPerBlock); - free(offsetPerBlock); - } - - ~CudaLinearOperator(){ - cudaFree(voxelIC); - cudaFree(fiberIC); - cudaFree(orienIC); - cudaFree(lengthIC); - cudaFree(lutIC); - cudaFree(segmentsPerBlockIC); - cudaFree(offsetPerBlockIC); - - cudaFree(voxelEC); - cudaFree(orienEC); - cudaFree(lutEC); - cudaFree(segmentsPerBlockEC); - cudaFree(offsetPerBlockEC); - - cudaFree(lutISO); - - cudaFree(voxelICt); - cudaFree(fiberICt); - cudaFree(orienICt); - cudaFree(lengthICt); - cudaFree(fibersPerBlockICt); - cudaFree(offsetPerBlockICt); - - cudaFree(x); - cudaFree(y); - - printf("\t* reseting GPU ... "); - bool status = true; - status = status && cudaCheck( cudaDeviceReset() ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - } - - void setTransposeData( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - int nsegments) - { - bool status; - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - printf("\t* extra memory for operator A' ... 
"); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(voxelICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(fiberICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(orienICt), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(lengthICt), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(offsetPerBlockICt), nfibers*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* transfering memory for operator A' ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(voxelICt, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(fiberICt, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(orienICt, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(lengthICt, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - free(fibersPerBlock); - free(offsetPerBlock); - } - - void multiplyByX(float64_t* x, float64_t* y){ - // Copy vector x to the GPU - cudaMemcpy(this->x, x, ncols*sizeof(double), cudaMemcpyHostToDevice); - - // Multiply IC part in the GPU - multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, this->x, this->y); - - //cudaCheckKernel(); - - // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, this->x, this->y); - - //cudaCheckKernel(); - - // Multiply ISO part in the GPU - multiply_Ax_ISOpart<<>>(lutISO, this->x, this->y); - - //cudaCheckKernel(); - - // Copy back result to CPU - cudaMemcpy(y, this->y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - } - - void multiplyByY(float64_t* v_in, float64_t* v_out){ - - // Copy vector y to the GPU - //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); - //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); - - // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(voxelICt, fiberICt, orienICt, lengthICt, fibersPerBlockICt, offsetPerBlockICt, lutIC, x, y); - - //cudaCheckKernel();//*/ - - // Multiply EC part in the GPU - multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); - - //cudaCheckKernel(); - - // Multiply ISO part in the GPU - multiply_Aty_ISOpart<<>>(lutISO, x, y); - - //cudaCheckKernel();//*/ - - // Copy back result to CPU - cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); - - /*printf("\n\n VECTOR X EC PART:\n"); - for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) - printf("%lf ", x[i]); - printf("\n\n");//*/ - } -}; - -bool cudaCheck(cudaError_t cudaStatus){ - return cudaStatus == cudaSuccess; -} - -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* 
compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ - - // fill arrays with zeros - memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - - // count compartments per block - for(int i = 0; i < NUM_COMPARTMENTS; i++) - compartmentsPerBlock[data[i]]++; - - // calculate offset per block - offsetPerBlock[0] = 0; - for(int i = 1; i < NUM_BLOCKS; i++) - offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; -} - -/*CudaLinearOperator::CudaLinearOperator( +CudaLinearOperator::CudaLinearOperator( + // pointers to IC data in CPU memory uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float* lengthIC, float* lutIC, - + // pointers to EC data in CPU memory uint32_t* voxelEC, uint16_t* orienEC, float* lutEC, - + // pointer to ISO data in CPU memory float* lutISO, - + // dataset constant values int nsegments, int nvoxels, int nfibers, @@ -444,6 +24,7 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar int nzeppelins, int nballs) { + this->nsegments = nsegments; this->nvoxels = nvoxels; this->nfibers = nfibers; this->nrows = nvoxels * nsamples; @@ -568,9 +149,9 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar free(segmentsPerBlock); free(offsetPerBlock); -}*/ +} -/*CudaLinearOperator::~CudaLinearOperator(){ +CudaLinearOperator::~CudaLinearOperator(){ cudaFree(voxelIC); cudaFree(fiberIC); cudaFree(orienIC); @@ -602,14 +183,13 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar status = status && cudaCheck( cudaDeviceReset() ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); -}*/ +} -/*void CudaLinearOperator::setTransposeData( +void CudaLinearOperator::setTransposeData( uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, - float32_t* lengths, - int nsegments) + float32_t* lengths) { bool status; uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); @@ -641,7 +221,82 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar free(fibersPerBlock); free(offsetPerBlock); -}*/ +} + +void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ + // Copy vector x to the GPU + cudaMemcpy(x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + + // Multiply IC part in the GPU + multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, x, y); + + //cudaCheckKernel(); + + // Multiply EC part in the GPU + multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + multiply_Ax_ISOpart<<>>(lutISO, x, y); + + //cudaCheckKernel(); + + // Copy back result to CPU + cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); +} + +void Tdot(float64_t* v_in, float64_t* v_out){ + + // Copy vector y to the GPU + //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); + //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); + cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); + + // Multiply IC part in the GPU + multiply_Aty_ICpart<<>>(TvoxelIC, TfiberIC, TorienIC, TlengthIC, TfibersPerBlockIC, ToffsetPerBlockIC, lutIC, x, y); + + //cudaCheckKernel();//*/ + + // Multiply EC part in the GPU + multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); + + //cudaCheckKernel(); + + // Multiply ISO part in the 
GPU + multiply_Aty_ISOpart<<>>(lutISO, x, y); + + //cudaCheckKernel();//*/ + + // Copy back result to CPU + cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); + + /*printf("\n\n VECTOR X EC PART:\n"); + for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) + printf("%lf ", x[i]); + printf("\n\n");//*/ +} + +bool cudaCheck(cudaError_t cudaStatus){ + return cudaStatus == cudaSuccess; +} + +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ + + // fill arrays with zeros + memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + + // count compartments per block + for(int i = 0; i < NUM_COMPARTMENTS; i++) + compartmentsPerBlock[data[i]]++; + + // calculate offset per block + offsetPerBlock[0] = 0; + for(int i = 1; i < NUM_BLOCKS; i++) + offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; +} + __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, @@ -919,53 +574,3 @@ __global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ } }//*/ -/*void CudaLinearOperator::multiplyByX(float64_t* x, float64_t* y){ - - // Copy vector x to the GPU - cudaMemcpy(this->x, x, ncols*sizeof(double), cudaMemcpyHostToDevice); - - // Multiply IC part in the GPU - multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, this->x, this->y); - - //cudaCheckKernel(); - - // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, this->x, this->y); - - //cudaCheckKernel(); - - // Multiply ISO part in the GPU - multiply_Ax_ISOpart<<>>(lutISO, this->x, this->y); - - //cudaCheckKernel(); - - // Copy back result to CPU - cudaMemcpy(y, this->y, nrows*sizeof(double), cudaMemcpyDeviceToHost); -}*/ - -/*void CudaLinearOperator::multiplyByY(float64_t* v_in, float64_t* v_out){ - - // Copy vector y to the GPU - //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); - //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); - - // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(voxelICt, fiberICt, orienICt, lengthICt, fibersPerBlockICt, offsetPerBlockICt, lutIC, x, y); - - //cudaCheckKernel(); - - // Multiply EC part in the GPU - multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); - - //cudaCheckKernel(); - - // Multiply ISO part in the GPU - multiply_Aty_ISOpart<<>>(lutISO, x, y); - - //cudaCheckKernel(); - - // Copy back result to CPU - cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); - -}*/ \ No newline at end of file diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh new file mode 100644 index 00000000..44102fac --- /dev/null +++ b/commit/operator_withCUDA.cuh @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef unsigned int uint32_t; +typedef unsigned short int uint16_t; +typedef float float32_t; +typedef double float64_t; + +bool cudaCheck(cudaError_t cudaStatus); +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +__global__ void multiply_Ax_ICpart( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + 
uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ICpart( + uint32_t* TvoxelIC, + uint32_t* TfiberIC, + uint16_t* TorienIC, + float32_t* TlengthIC, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ISOpart( + float* lut, + double* x, + double* y); + +// constant values in GPU +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + +class CudaLinearOperator { + + // pointers to IC data in GPU memory + uint32_t* voxelIC; + uint32_t* fiberIC; + uint16_t* orienIC; + float32_t* lengthIC; + + // pointers to IC data (transpose) in GPU memory + uint32_t* TvoxelIC; + uint32_t* TfiberIC; + uint16_t* TorienIC; + float32_t* TlengthIC; + uint32_t* TfibersPerBlockIC; + uint32_t* ToffsetPerBlockIC; + + // auxiliar arrays for GPU + uint32_t* segmentsPerBlockIC; + uint32_t* offsetPerBlockIC; + uint32_t* segmentsPerBlockEC; + uint32_t* offsetPerBlockEC; + + // pointers to EC data in GPU memory + uint32_t* voxelEC; + uint16_t* orienEC; + + // pointers to LUTs in GPU memory + float32_t* lutIC; + float32_t* lutEC; + float32_t* lutISO; + + // pointers to vector x and y + float64_t* x; + float64_t* y; + + // dimensions of the operator + int nrows; + int ncols; + int nvoxels; + int nfibers; + int nsegments; + + public: + CudaLinearOperator( + uint32_t* voxelIC, + uint32_t* fiberIC, + uint16_t* orienIC, + float* lengthIC, + float* lutIC, + + uint32_t* voxelEC, + uint16_t* orienEC, + float* lutEC, + + float* lutISO, + + int nsegments, + int nvoxels, + int nfibers, + int npeaks, + int norientations, + int nsamples, + int ndiameters, + int nzeppelins, + int nballs); + } + ~CudaLinearOperator(); + + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, int nsegments); + + void dot(float64_t* v_in, float64_t* v_out); + void Tdot(float64_t* v_in, float64_t* v_out); +}; \ No newline at end of file From 86c9d324b97a343ee597759f9085aa595f35adb3 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 11:28:23 -0600 Subject: [PATCH 040/190] Reverting wrong file merging and modifying some variable names --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index cc26a26f..76ee8f78 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -621,7 +621,7 @@ cdef class Evaluation : self.THREADS['ECt'] = None if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( n+1, dtype=np.uint32 ) + self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) N = np.floor( 
self.DICTIONARY['nV']/nthreads ) for i in xrange(1,nthreads) : self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N From 242c7fbd6b9ced1808904285c5542427039c7290 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 11:32:51 -0600 Subject: [PATCH 041/190] Reverting wrong file merging and modifying some variable names --- commit/operator_withCUDA.cu | 24 ++++++++++++------------ commit/operator_withCUDA.cuh | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 5c5df373..c151f35e 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -199,23 +199,23 @@ void CudaLinearOperator::setTransposeData( printf("\t* extra memory for operator A' ... "); status = true; - status = status && cudaCheck( cudaMalloc((void**)&(voxelICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(fiberICt), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(orienICt), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(lengthICt), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(fibersPerBlockICt), nfibers*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(offsetPerBlockICt), nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(TvoxelIC), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(TfiberIC), nsegments*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(TorienIC), nsegments*sizeof(uint16_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(TlengthIC), nsegments*sizeof(float32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(TfibersPerBlockIC), nfibers*sizeof(uint32_t)) ); + status = status && cudaCheck( cudaMalloc((void**)&(ToffsetPerBlockIC), nfibers*sizeof(uint32_t)) ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); printf("\t* transfering memory for operator A' ... 
"); status = true; - status = status && cudaCheck( cudaMemcpy(voxelICt, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(fiberICt, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(orienICt, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(lengthICt, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(fibersPerBlockICt, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(offsetPerBlockICt, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + status = status && cudaCheck( cudaMemcpy(ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); if (status) printf("[ OK ]\n"); else printf("[ ERROR ]\n"); diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 44102fac..9f1ffa93 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -150,7 +150,7 @@ class CudaLinearOperator { } ~CudaLinearOperator(); - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, int nsegments); + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void dot(float64_t* v_in, float64_t* v_out); void Tdot(float64_t* v_in, float64_t* v_out); From f4b13827962c041f04ba4c3cdc2a1c933cfa7111 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 11:38:59 -0600 Subject: [PATCH 042/190] Reverting wrong file merging and modifying some variable names --- commit/cudaoperator.pyx | 2 +- commit/operator_withCUDA.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 8471709d..ef19b38f 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -5,7 +5,7 @@ import cython import numpy as np cimport numpy as np -cdef extern from "operator_withCUDA.cu": +cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": C_CudaLinearOperator( np.uint32_t*, diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index c151f35e..d0df5051 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -246,7 +246,7 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); } -void Tdot(float64_t* v_in, float64_t* v_out){ +void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); From a66b5a3a97a2be4f0c8b0b21451b6158c0966906 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 11:42:14 -0600 Subject: [PATCH 
From a66b5a3a97a2be4f0c8b0b21451b6158c0966906 Mon Sep 17 00:00:00 2001
From: ErickHernandezGutierrez
Date: Sat, 28 Mar 2020 11:42:14 -0600
Subject: [PATCH 043/190] Reverting wrong file merging and modifying some variable names

---
 commit/cudaoperator.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx
index ef19b38f..967f955a 100644
--- a/commit/cudaoperator.pyx
+++ b/commit/cudaoperator.pyx
@@ -30,9 +30,9 @@ cdef extern from "operator_withCUDA.cuh":
             int,
             int)

-        void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, int)
-        void multiplyByX(np.float64_t*, np.float64_t*)
-        void multiplyByY(np.float64_t*, np.float64_t*)
+        void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*)
+        void dot(np.float64_t*, np.float64_t*)
+        void Tdot(np.float64_t*, np.float64_t*)

 cdef class CudaLinearOperator :
     """This class is a wrapper to the C code for performing marix-vector multiplications
@@ -183,7 +183,7 @@ cdef class CudaLinearOperator :
             ISOv = self.DICTIONARY['ISO']['v']
             self.ISOv = &ISOv[0]

-        self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0], self.n)
+        self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0])

         idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' )
         self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ]
@@ -240,10 +240,10 @@ cdef class CudaLinearOperator :
         if not self.adjoint :
             # DIRECT PRODUCT A*x
             print('MULTIPLICO Ax')
-            self.A.multiplyByX(&v_in[0], &v_out[0])
+            self.A.dot(&v_in[0], &v_out[0])
         else :
             # INVERSE PRODUCT A'*y
             print('MULTIPLICO A\'y')
-            self.A.multiplyByY(&v_in[0], &v_out[0])
+            self.A.Tdot(&v_in[0], &v_out[0])

         return v_out

From f8c5b0c95059371fff11850c6ae35ba786a5da2b Mon Sep 17 00:00:00 2001
From: ErickHernandezGutierrez
Date: Sat, 28 Mar 2020 11:47:40 -0600
Subject: [PATCH 044/190] Reverting wrong file merging and modifying some variable names

---
 commit/operator_withCUDA.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh
index 9f1ffa93..2567fbe8 100644
--- a/commit/operator_withCUDA.cuh
+++ b/commit/operator_withCUDA.cuh
@@ -147,7 +147,7 @@ class CudaLinearOperator {
         int ndiameters,
         int nzeppelins,
         int nballs);
-        }
+    ~CudaLinearOperator();

     void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths);

From 211744b64ac79bbb39985858fbca4b786a32eba5 Mon Sep 17 00:00:00 2001
From: ErickHernandezGutierrez
Date: Sat, 28 Mar 2020 11:49:22 -0600
Subject: [PATCH 045/190] Reverting wrong file merging and modifying some variable names

---
 commit/operator_withCUDA.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu
index d0df5051..dc44c01e 100644
--- a/commit/operator_withCUDA.cu
+++ b/commit/operator_withCUDA.cu
@@ -168,12 +168,12 @@ CudaLinearOperator::~CudaLinearOperator(){

     cudaFree(lutISO);

-    cudaFree(voxelICt);
-    cudaFree(fiberICt);
-    cudaFree(orienICt);
-    cudaFree(lengthICt);
-    cudaFree(fibersPerBlockICt);
-    cudaFree(offsetPerBlockICt);
+    cudaFree(TvoxelIC);
+    cudaFree(TfiberIC);
+    cudaFree(TorienIC);
+    cudaFree(TlengthIC);
+    cudaFree(TfibersPerBlockIC);
+    cudaFree(ToffsetPerBlockIC);

     cudaFree(x);
     cudaFree(y);
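The constructor reworked in the next patch uploads all of its scalar sizes (NUM_VOXELS, NUM_FIBERS, ...) with cudaMemcpyToSymbol() into __constant__ variables, and patch 053 later in this series moves those declarations back to file scope, since CUDA does not allow __constant__ variables to be non-static class members. A stand-alone sketch of that pattern, with an invented kernel name used purely for illustration:

    #include <cstdio>
    #include <cuda_runtime.h>

    // __constant__ variables live at file (translation-unit) scope;
    // the host writes them with cudaMemcpyToSymbol(), kernels read them directly.
    __constant__ int NUM_VOXELS;

    // Hypothetical kernel, only to show the read side.
    __global__ void print_num_voxels()
    {
        if ( blockIdx.x == 0 && threadIdx.x == 0 )
            printf("NUM_VOXELS = %d\n", NUM_VOXELS);
    }

    int main()
    {
        int nvoxels = 1000;
        cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int));
        print_num_voxels<<<1, 1>>>();
        cudaDeviceSynchronize();
        return 0;
    }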
From fe9eb384d5c8dc700ed8722f7a62c11c5ad3feb5 Mon Sep 17 00:00:00 2001
From: ErickHernandezGutierrez
Date: Sat, 28 Mar 2020 19:55:16 -0600
Subject: [PATCH 046/190] Adding CUDA error checker

---
 commit/core.pyx | 5 +
 commit/operator_withCUDA.cu | 301 +++++++++++++++++------------------
 commit/operator_withCUDA.cuh | 51 +++---
 3 files changed, 180 insertions(+), 177 deletions(-)

diff --git a/commit/core.pyx b/commit/core.pyx
index 76ee8f78..d129c18c 100755
--- a/commit/core.pyx
+++ b/commit/core.pyx
@@ -483,7 +483,12 @@ cdef class Evaluation :
         self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ]

         import commit.cudaoperator
+        print( '\t* building dictionary in GPU ... ' )
         self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS )
+        if gpu_A.status == True:
+            print( '[ OK ]' )
+        else:
+            print( '[ WRONG ]' )

         print( ' [ OK ]' )

diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu
index dc44c01e..76756087 100644
--- a/commit/operator_withCUDA.cu
+++ b/commit/operator_withCUDA.cu
@@ -32,157 +32,154 @@ CudaLinearOperator::CudaLinearOperator(
     int size_lutic = ndiameters*norientations*nsamples;
     int size_lutec = nzeppelins*norientations*nsamples;
     int size_lutiso = nballs*nsamples;
-    bool status;
+    //bool status;
     uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t));
     uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t));

+    cudaStatus = true;
+
     // copy constant values to GPU
-    printf("\t* constant global values ... ");
-    status = true;
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) );
-    status = status && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) );
-    if (status) printf("[ OK ]\n");
-    else printf("[ ERROR ]\n");
+    //printf("\t* constant global values ... 
"); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + /*if (cudaStatus) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ // alloc memory in GPU for vectors x and y - printf("\t* memory for vectors x and y ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->x), ncols*sizeof(float64_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->y), nrows*sizeof(float64_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); + //printf("\t* memory for vectors x and y ... "); + //status = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->x), ncols*sizeof(float64_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->y), nrows*sizeof(float64_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ // alloc GPU memory for segments - printf("\t* memory for LUT (IC part) ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->lutIC), size_lutic*sizeof(float32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* copying LUT in GPU (IC part) ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(this->lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* allocating memory for LUT in GPU (EC part) ... "); - status = cudaCheck( cudaMalloc((void**)&(this->lutEC), size_lutec*sizeof(float32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* copying LUT in GPU (EC part) ... "); - status = cudaCheck( cudaMemcpy(this->lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* allocating memory for LUT in GPU (ISO part) ... "); - status = cudaCheck( cudaMalloc((void**)&(this->lutISO), size_lutiso*sizeof(float32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* copying LUT in GPU (ISO part) ... "); - status = cudaCheck( cudaMemcpy(this->lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* preprocessing data for GPU ... 
"); + //printf("\t* memory for LUT (IC part) ... "); + //status = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lutIC), size_lutic*sizeof(float32_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* copying LUT in GPU (IC part) ... "); + //status = true; + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* allocating memory for LUT in GPU (EC part) ... "); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lutEC), size_lutec*sizeof(float32_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* copying LUT in GPU (EC part) ... "); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* allocating memory for LUT in GPU (ISO part) ... "); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lutISO), size_lutiso*sizeof(float32_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* copying LUT in GPU (ISO part) ... "); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* preprocessing data for GPU ... "); preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - printf("\n"); - - printf("\t* fiber segments memory allocation ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->voxelIC), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->fiberIC), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->orienIC), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->lengthIC), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockIC), nvoxels*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockIC), nvoxels*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* transfering fiber segments ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(this->voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); + //printf("\n"); + + /*printf("\t* fiber segments memory allocation ... 
"); + status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->voxelIC), nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->fiberIC), nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->orienIC), nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lengthIC), nsegments*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockIC), nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockIC), nvoxels*sizeof(uint32_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + /*printf("\t* transfering fiber segments ... "); + status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ // ---------------------------------------- EC DATA ---------------------------------------- // - printf("\t* allocating memory for operator A in GPU (EC part) ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(this->voxelEC), npeaks*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->orienEC), npeaks*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockEC), nvoxels*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockEC), nvoxels*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* preprocessing EC data for GPU ... "); + /*printf("\t* allocating memory for operator A in GPU (EC part) ... "); + status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->voxelEC), npeaks*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->orienEC), npeaks*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockEC), nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockEC), nvoxels*sizeof(uint32_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + //printf("\t* preprocessing EC data for GPU ... "); preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - printf("\n"); + //printf("\n"); - printf("\t* copying operator A to GPU (EC part) ... 
"); - status = true; - status = status && cudaCheck( cudaMemcpy(this->voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(this->offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); + /*printf("\t* copying operator A to GPU (EC part) ... "); + status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ free(segmentsPerBlock); free(offsetPerBlock); } CudaLinearOperator::~CudaLinearOperator(){ - cudaFree(voxelIC); - cudaFree(fiberIC); - cudaFree(orienIC); - cudaFree(lengthIC); - cudaFree(lutIC); - cudaFree(segmentsPerBlockIC); - cudaFree(offsetPerBlockIC); - - cudaFree(voxelEC); - cudaFree(orienEC); - cudaFree(lutEC); - cudaFree(segmentsPerBlockEC); - cudaFree(offsetPerBlockEC); - - cudaFree(lutISO); - - cudaFree(TvoxelIC); - cudaFree(TfiberIC); - cudaFree(TorienIC); - cudaFree(TlengthIC); - cudaFree(TfibersPerBlockIC); - cudaFree(ToffsetPerBlockIC); - - cudaFree(x); - cudaFree(y); - - printf("\t* reseting GPU ... "); - bool status = true; - status = status && cudaCheck( cudaDeviceReset() ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); + cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(orienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(orienEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lutISO) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TvoxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TfiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TorienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TlengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TfibersPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(ToffsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(x) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(y) ); + + /*printf("\t* reseting GPU ... 
"); + bool status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ } void CudaLinearOperator::setTransposeData( @@ -191,33 +188,33 @@ void CudaLinearOperator::setTransposeData( uint16_t* orienIDs, float32_t* lengths) { - bool status; + //bool status; uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - printf("\t* extra memory for operator A' ... "); - status = true; - status = status && cudaCheck( cudaMalloc((void**)&(TvoxelIC), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(TfiberIC), nsegments*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(TorienIC), nsegments*sizeof(uint16_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(TlengthIC), nsegments*sizeof(float32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(TfibersPerBlockIC), nfibers*sizeof(uint32_t)) ); - status = status && cudaCheck( cudaMalloc((void**)&(ToffsetPerBlockIC), nfibers*sizeof(uint32_t)) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); - - printf("\t* transfering memory for operator A' ... "); - status = true; - status = status && cudaCheck( cudaMemcpy(TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - status = status && cudaCheck( cudaMemcpy(ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n"); + /*printf("\t* extra memory for operator A' ... "); + status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TvoxelIC), nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TfiberIC), nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TorienIC), nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TlengthIC), nsegments*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TfibersPerBlockIC), nfibers*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(ToffsetPerBlockIC), nfibers*sizeof(uint32_t)) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ + + /*printf("\t* transfering memory for operator A' ... 
"); + status = true;//*/ + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + /*if (status) printf("[ OK ]\n"); + else printf("[ ERROR ]\n");//*/ free(fibersPerBlock); free(offsetPerBlock); diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 2567fbe8..6b03dfe8 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -67,21 +67,6 @@ __global__ void multiply_Aty_ISOpart( double* x, double* y); -// constant values in GPU -__constant__ int NUM_VOXELS; -__constant__ int NUM_FIBERS; -__constant__ int NUM_PEAKS; -__constant__ int NUM_ORIENTATIONS; -__constant__ int NUM_SAMPLES; -__constant__ int NUM_DIAMETERS; -__constant__ int NUM_ZEPPELINS; -__constant__ int NUM_BALLS; -__constant__ int NUM_ROWS; -__constant__ int NUM_COLS; -__constant__ int SIZE_LUTIC; -__constant__ int SIZE_LUTEC; -__constant__ int SIZE_LUTISO; - class CudaLinearOperator { // pointers to IC data in GPU memory @@ -89,25 +74,23 @@ class CudaLinearOperator { uint32_t* fiberIC; uint16_t* orienIC; float32_t* lengthIC; + uint32_t* segmentsPerBlockIC; + uint32_t* offsetPerBlockIC; // pointers to IC data (transpose) in GPU memory uint32_t* TvoxelIC; uint32_t* TfiberIC; uint16_t* TorienIC; float32_t* TlengthIC; - uint32_t* TfibersPerBlockIC; - uint32_t* ToffsetPerBlockIC; + uint32_t* TfibersPerBlockIC; + uint32_t* ToffsetPerBlockIC; - // auxiliar arrays for GPU - uint32_t* segmentsPerBlockIC; - uint32_t* offsetPerBlockIC; + // pointers to EC data in GPU memory + uint32_t* voxelEC; + uint16_t* orienEC; uint32_t* segmentsPerBlockEC; uint32_t* offsetPerBlockEC; - // pointers to EC data in GPU memory - uint32_t* voxelEC; - uint16_t* orienEC; - // pointers to LUTs in GPU memory float32_t* lutIC; float32_t* lutEC; @@ -117,13 +100,31 @@ class CudaLinearOperator { float64_t* x; float64_t* y; - // dimensions of the operator + // constant values in GPU + __constant__ int NUM_VOXELS; + __constant__ int NUM_FIBERS; + __constant__ int NUM_PEAKS; + __constant__ int NUM_ORIENTATIONS; + __constant__ int NUM_SAMPLES; + __constant__ int NUM_DIAMETERS; + __constant__ int NUM_ZEPPELINS; + __constant__ int NUM_BALLS; + __constant__ int NUM_ROWS; + __constant__ int NUM_COLS; + __constant__ int SIZE_LUTIC; + __constant__ int SIZE_LUTEC; + __constant__ int SIZE_LUTISO; + + // constant values in CPU int nrows; int ncols; int nvoxels; int nfibers; int nsegments; + // CUDA GPU status + bool cudaStatus; + public: CudaLinearOperator( uint32_t* voxelIC, From a0e5213295c223917a067467bf1eb4283c8e4c9a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:00:12 -0600 Subject: [PATCH 047/190] Adding CUDA error checker --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx 
index d129c18c..ebafb260 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -485,7 +485,7 @@ cdef class Evaluation : import commit.cudaoperator print( '\t* building dictionary in GPU ... ' ) self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if gpu_A.status == True: + if gpu_A.cuda_status == True: print( '[ OK ]' ) else: print( '[ WRONG ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 967f955a..ac6fd902 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -41,6 +41,7 @@ cdef class CudaLinearOperator : """ cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs cdef public int adjoint, n1, n2 + cdef public bool status cdef DICTIONARY cdef KERNELS @@ -205,7 +206,6 @@ cdef class CudaLinearOperator : C.adjoint = 1 - C.adjoint return C - @property def shape( self ) : """Size of the explicit matrix.""" @@ -247,3 +247,8 @@ cdef class CudaLinearOperator : self.A.Tdot(&v_in[0], &v_out[0]) return v_out + + @property + def cuda_status( self ): + """Return status of CUDA GPU""" + return self.A.cudaStatus \ No newline at end of file From 6f6740a4a6bdc68f27649685ba6456f29213aea9 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:03:02 -0600 Subject: [PATCH 048/190] Adding CUDA error checker --- commit/cudaoperator.pyx | 3 ++- commit/operator_withCUDA.cuh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index ac6fd902..99b93d5f 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,6 +30,7 @@ cdef extern from "operator_withCUDA.cuh": int, int) + bool getCudaStatus() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) void dot(np.float64_t*, np.float64_t*) void Tdot(np.float64_t*, np.float64_t*) @@ -251,4 +252,4 @@ cdef class CudaLinearOperator : @property def cuda_status( self ): """Return status of CUDA GPU""" - return self.A.cudaStatus \ No newline at end of file + return self.A.getCudaStatus() \ No newline at end of file diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 6b03dfe8..df441a99 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -151,6 +151,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); + bool getCudaStatus() const { return cudaStatus; } void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void dot(float64_t* v_in, float64_t* v_out); From e07412756d5ef92e3233e3a7f8a8094b46c912d2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:03:51 -0600 Subject: [PATCH 049/190] Adding CUDA error checker --- commit/cudaoperator.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 99b93d5f..6aa4f435 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -42,7 +42,6 @@ cdef class CudaLinearOperator : """ cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs cdef public int adjoint, n1, n2 - cdef public bool status cdef DICTIONARY cdef KERNELS From aea527c5c244250375abf3cefda2e740385c548a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:05:53 -0600 Subject: [PATCH 050/190] Adding CUDA error checker --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 2 +- commit/operator_withCUDA.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index ebafb260..29d17391 100755 --- a/commit/core.pyx +++ 
b/commit/core.pyx @@ -485,7 +485,7 @@ cdef class Evaluation : import commit.cudaoperator print( '\t* building dictionary in GPU ... ' ) self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if gpu_A.cuda_status == True: + if gpu_A.cuda_status == 1: print( '[ OK ]' ) else: print( '[ WRONG ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 6aa4f435..30b6a4ad 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,7 +30,7 @@ cdef extern from "operator_withCUDA.cuh": int, int) - bool getCudaStatus() + int getCudaStatus() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) void dot(np.float64_t*, np.float64_t*) void Tdot(np.float64_t*, np.float64_t*) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index df441a99..78fd9242 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -151,7 +151,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); - bool getCudaStatus() const { return cudaStatus; } + int getCudaStatus() { return (int)cudaStatus; } void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void dot(float64_t* v_in, float64_t* v_out); From 2046caef2dcdac0ef392e5ab24e572492ca8f703 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:06:51 -0600 Subject: [PATCH 051/190] Adding CUDA error checker --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 29d17391..777ca1ca 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -485,7 +485,7 @@ cdef class Evaluation : import commit.cudaoperator print( '\t* building dictionary in GPU ... ' ) self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if gpu_A.cuda_status == 1: + if gpu_A.cuda_status() == 1: print( '[ OK ]' ) else: print( '[ WRONG ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 30b6a4ad..69697997 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -248,7 +248,6 @@ cdef class CudaLinearOperator : return v_out - @property def cuda_status( self ): """Return status of CUDA GPU""" return self.A.getCudaStatus() \ No newline at end of file From 06be9b9011f25d8f7a30c8bf4935f5de225e8231 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:08:20 -0600 Subject: [PATCH 052/190] Adding CUDA error checker --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 777ca1ca..ea060889 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -485,7 +485,7 @@ cdef class Evaluation : import commit.cudaoperator print( '\t* building dictionary in GPU ... 
' ) self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if gpu_A.cuda_status() == 1: + if self.gpu_A.cuda_status == 1: print( '[ OK ]' ) else: print( '[ WRONG ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 69697997..30b6a4ad 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -248,6 +248,7 @@ cdef class CudaLinearOperator : return v_out + @property def cuda_status( self ): """Return status of CUDA GPU""" return self.A.getCudaStatus() \ No newline at end of file From 1253405e688778e2241bda4026fd4be77315b6e7 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 28 Mar 2020 20:09:48 -0600 Subject: [PATCH 053/190] Adding CUDA error checker --- commit/operator_withCUDA.cuh | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 78fd9242..64705204 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -67,6 +67,21 @@ __global__ void multiply_Aty_ISOpart( double* x, double* y); +// constant values in GPU +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + class CudaLinearOperator { // pointers to IC data in GPU memory @@ -100,21 +115,6 @@ class CudaLinearOperator { float64_t* x; float64_t* y; - // constant values in GPU - __constant__ int NUM_VOXELS; - __constant__ int NUM_FIBERS; - __constant__ int NUM_PEAKS; - __constant__ int NUM_ORIENTATIONS; - __constant__ int NUM_SAMPLES; - __constant__ int NUM_DIAMETERS; - __constant__ int NUM_ZEPPELINS; - __constant__ int NUM_BALLS; - __constant__ int NUM_ROWS; - __constant__ int NUM_COLS; - __constant__ int SIZE_LUTIC; - __constant__ int SIZE_LUTEC; - __constant__ int SIZE_LUTISO; - // constant values in CPU int nrows; int ncols; From 0dbe1dcff4ead316c71d7b9816ffeaccc6698b50 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 03:10:41 -0600 Subject: [PATCH 054/190] Adding CudaLinearOperator to build_dictionary() --- commit/core.pyx | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index ea060889..f3974a7f 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -468,7 +468,7 @@ cdef class Evaluation : # post-processing # --------------- - print( '\t* post-processing...', end="" ) + print( '\t* post-processing...' ) sys.stdout.flush() # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) @@ -482,14 +482,6 @@ cdef class Evaluation : self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] - import commit.cudaoperator - print( '\t* building dictionary in GPU ... 
' ) - self.gpu_A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if self.gpu_A.cuda_status == 1: - print( '[ OK ]' ) - else: - print( '[ WRONG ]' ) - print( ' [ OK ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) @@ -671,7 +663,17 @@ cdef class Evaluation : import commit.operator.operator else : reload( sys.modules['commit.operator.operator'] ) - self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + + if self.THREADS['n'] > 0: + self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + else: + import commit.cudaoperator + #print( '\t* building dictionary in GPU ... ' ) + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + """if self.gpu_A.cuda_status == 1: + print( '[ OK ]' ) + else: + print( '[ ERROR ]' )""" print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) From e1c3319ea528255e928eba3ccd21b0e8d590eab6 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 03:19:13 -0600 Subject: [PATCH 055/190] Adding CudaLinearOperator to build_dictionary() --- commit/core.pyx | 232 +++++++++++++++++++++++++----------------------- 1 file changed, 120 insertions(+), 112 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index f3974a7f..ef539263 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -513,123 +513,131 @@ cdef class Evaluation : self.THREADS = {} self.THREADS['n'] = nthreads + self.THREADS['IC'] = None + self.THREADS['EC'] = None + self.THREADS['ISO'] = None + self.THREADS['ICt'] = None + self.THREADS['ECt'] = None + self.THREADS['ISOt'] = None + + if nthreads > 0: + + cdef : + long [:] C + long t, tot, i1, i2, N, c + int i + + tic = time.time() + print( '\n-> Distributing workload to different threads:' ) + print( '\t* number of threads : %d' % nthreads ) + + # Distribute load for the computation of A*x product + print( '\t* A operator...', end="" ) + sys.stdout.flush() - cdef : - long [:] C - long t, tot, i1, i2, N, c - int i - - tic = time.time() - print( '\n-> Distributing workload to different threads:' ) - print( '\t* number of threads : %d' % nthreads ) - - # Distribute load for the computation of A*x product - print( '\t* A operator...', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - if nthreads > 1 : - N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) - t = 1 - tot = 0 - C = np.bincount( self.DICTIONARY['IC']['v'] ) - for c in C : - tot += c - if tot >= N : - self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot - t += 1 - tot = 0 - self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' 
) - else : - self.THREADS['IC'] = None - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['EC'] = None - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['ISO'] = None + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + if nthreads > 1 : + N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) + t = 1 + tot = 0 + C = np.bincount( self.DICTIONARY['IC']['v'] ) + for c in C : + tot += c + if tot >= N : + self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot + t += 1 + tot = 0 + self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) + else : + self.THREADS['IC'] = None + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + else : + self.THREADS['IC'] = None + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' 
) + else : + self.THREADS['ISO'] = None - print( ' [ OK ]' ) + print( ' [ OK ]' ) - # Distribute load for the computation of At*y product - print( '\t* A\' operator...', end="" ) - sys.stdout.flush() + # Distribute load for the computation of At*y product + print( '\t* A\' operator...', end="" ) + sys.stdout.flush() - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) - if nthreads > 1 : - idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) - C = np.bincount( self.DICTIONARY['IC']['fiber'] ) - t = tot = i1 = i2 = 0 - N = np.floor(self.DICTIONARY['IC']['n']/nthreads) - for c in C : - i2 += c - tot += c - if tot >= N : - self.THREADS['ICt'][ i1:i2 ] = t - t += 1 - if t==nthreads-1 : - break - i1 = i2 - tot = c - self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) + if nthreads > 1 : + idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) + C = np.bincount( self.DICTIONARY['IC']['fiber'] ) + t = tot = i1 = i2 = 0 + N = np.floor(self.DICTIONARY['IC']['n']/nthreads) + for c in C : + i2 += c + tot += c + if tot >= N : + self.THREADS['ICt'][ i1:i2 ] = t + t += 1 + if t==nthreads-1 : + break + i1 = i2 + tot = c + self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() - else : - self.THREADS['ICt'] = None - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N - self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['ECt'] = None - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['nV']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N - self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['ISOt'] = None + else : + self.THREADS['ICt'] = None + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N + self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' 
) + else : + self.THREADS['ECt'] = None + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['nV']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N + self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + else : + self.THREADS['ISOt'] = None print( '[ OK ]' ) From b187eed6b2ca3d26f44bb93791a21c5e37fcd68a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 03:21:22 -0600 Subject: [PATCH 056/190] Adding CudaLinearOperator to build_dictionary() --- commit/core.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index ef539263..898ba681 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -520,14 +520,14 @@ cdef class Evaluation : self.THREADS['ECt'] = None self.THREADS['ISOt'] = None - if nthreads > 0: + cdef : + long [:] C + long t, tot, i1, i2, N, c + int i - cdef : - long [:] C - long t, tot, i1, i2, N, c - int i + tic = time.time() - tic = time.time() + if nthreads > 0: print( '\n-> Distributing workload to different threads:' ) print( '\t* number of threads : %d' % nthreads ) From 1ab2b2866c98cd69e01ff6a16e3ae45f70bc6860 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 14:59:48 -0600 Subject: [PATCH 057/190] Fixing bug with A'y operation in CUDA --- commit/core.pyx | 6 +++--- commit/cudaoperator.pyx | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 898ba681..75530c41 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -678,10 +678,10 @@ cdef class Evaluation : import commit.cudaoperator #print( '\t* building dictionary in GPU ... 
' ) self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - """if self.gpu_A.cuda_status == 1: - print( '[ OK ]' ) + if self.gpu_A.cuda_status == 1: + print( '[ CUDA OK ]' ) else: - print( '[ ERROR ]' )""" + print( '[ CUDA ERROR ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 30b6a4ad..326319fd 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,8 +30,8 @@ cdef extern from "operator_withCUDA.cuh": int, int) - int getCudaStatus() - void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + int getCudaStatus() + void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) void dot(np.float64_t*, np.float64_t*) void Tdot(np.float64_t*, np.float64_t*) @@ -186,6 +186,7 @@ cdef class CudaLinearOperator : self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + """ idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] From 71099f6c82d3d39649f09d1e56792f354bc2d0fe Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 15:06:21 -0600 Subject: [PATCH 058/190] Fixing bug with A'y operation in CUDA --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 75530c41..09056935 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -678,7 +678,7 @@ cdef class Evaluation : import commit.cudaoperator #print( '\t* building dictionary in GPU ... ' ) self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if self.gpu_A.cuda_status == 1: + if self.A.cuda_status == 1: print( '[ CUDA OK ]' ) else: print( '[ CUDA ERROR ]' ) From 2e19a237a33cb561ae7f58ea5d541847df4a3a1a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 15:38:13 -0600 Subject: [PATCH 059/190] Fixing bug with A'y operation in CUDA --- commit/core.pyx | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/commit/core.pyx b/commit/core.pyx index 09056935..caa9bd2f 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -482,6 +482,15 @@ cdef class Evaluation : self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] + + import commit.cudaoperator + print( '\t* building dictionary in GPU ... ' ) + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + if self.A.cuda_status == 1: + print( '[ CUDA OK ]' ) + else: + print( '[ CUDA ERROR ]' ) + print( ' [ OK ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) @@ -674,6 +683,7 @@ cdef class Evaluation : if self.THREADS['n'] > 0: self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + """ else: import commit.cudaoperator #print( '\t* building dictionary in GPU ... 
' ) @@ -682,6 +692,7 @@ cdef class Evaluation : print( '[ CUDA OK ]' ) else: print( '[ CUDA ERROR ]' ) + """ print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) From e00cc6ea1bf1bf84121225e85ce2f9eae0d96529 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 16:05:34 -0600 Subject: [PATCH 060/190] Fixing bug with A'y operation in CUDA --- commit/operator_withCUDA.cu | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 76756087..709f73eb 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -243,30 +243,50 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); } +void cudaCheckKernel(){ + cudaError_t cudaStatus; + + cudaStatus = cudaGetLastError(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); + else + printf("\t* kernel launch... [ OK ]\n"); + + cudaStatus = cudaDeviceSynchronize(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); + else + printf("\t* cudaDeviceSynchronize() after launching kernel... [ OK ]\n"); +} + void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); + bool cudaStatus = cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]"); + else printf("\t* tranfering y to GPU ... [ OK ]"); // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(TvoxelIC, TfiberIC, TorienIC, TlengthIC, TfibersPerBlockIC, ToffsetPerBlockIC, lutIC, x, y); - //cudaCheckKernel();//*/ + cudaCheckKernel(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(lutISO, x, y); - //cudaCheckKernel();//*/ + cudaCheckKernel(); // Copy back result to CPU - cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); + bool cudaStatus = cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]"); + else printf("\t* tranfering x to CPU ... 
[ OK ]"); /*printf("\n\n VECTOR X EC PART:\n"); for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) From 75b7219e63852c97441ccfb33bbe9cf27d1453fa Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 16:07:20 -0600 Subject: [PATCH 061/190] Fixing bug with A'y operation in CUDA --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 709f73eb..cbe5cd41 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -284,7 +284,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ cudaCheckKernel(); // Copy back result to CPU - bool cudaStatus = cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); + cudaStatus = cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]"); else printf("\t* tranfering x to CPU ... [ OK ]"); From c2c6a8d5629b5abc40c1fbd8dcc5f55ef2d950df Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 19:55:14 -0600 Subject: [PATCH 062/190] Solving bug in operation A'y in GPU --- commit/cudaoperator.pyx | 1 + commit/operator_withCUDA.cu | 80 +++++++++++++++++++++++++----------- commit/operator_withCUDA.cuh | 3 +- 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 326319fd..9135cd30 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -32,6 +32,7 @@ cdef extern from "operator_withCUDA.cuh": int getCudaStatus() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + void destroy() void dot(np.float64_t*, np.float64_t*) void Tdot(np.float64_t*, np.float64_t*) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index cbe5cd41..fa4ff338 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -153,6 +153,32 @@ CudaLinearOperator::CudaLinearOperator( } CudaLinearOperator::~CudaLinearOperator(){ + /*cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(orienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(orienEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(lutISO) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TvoxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TfiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TorienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TlengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(TfibersPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(ToffsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(x) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(y) ); + + cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() );//*/ +} + 
+CudaLinearOperator::destroy(){ cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(orienIC) ); @@ -220,43 +246,47 @@ void CudaLinearOperator::setTransposeData( free(offsetPerBlock); } +void cudaCheckKernel(){ + cudaError_t cudaStatus; + + cudaStatus = cudaGetLastError(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); + else + printf("\t* kernel launch... [ OK ]\n"); + + cudaStatus = cudaDeviceSynchronize(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); + else + printf("\t* cudaDeviceSynchronize() after launching kernel... [ OK ]\n"); +} + void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ // Copy vector x to the GPU - cudaMemcpy(x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + bool cudaStatus = cudaMemcpy(x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]\n"); + else printf("\t* tranfering x to GPU ... [ OK ]\n"); // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, x, y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(lutISO, x, y); - //cudaCheckKernel(); + cudaCheckKernel(); // Copy back result to CPU - cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); -} - -void cudaCheckKernel(){ - cudaError_t cudaStatus; - - cudaStatus = cudaGetLastError(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); - else - printf("\t* kernel launch... [ OK ]\n"); - - cudaStatus = cudaDeviceSynchronize(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); - else - printf("\t* cudaDeviceSynchronize() after launching kernel... [ OK ]\n"); + cudaStatus = cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]\n"); + else printf("\t* tranfering y to CPU ... [ OK ]\n"); } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ @@ -265,8 +295,8 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); bool cudaStatus = cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]"); - else printf("\t* tranfering y to GPU ... [ OK ]"); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]\n"); + else printf("\t* tranfering y to GPU ... 
[ OK ]\n"); // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(TvoxelIC, TfiberIC, TorienIC, TlengthIC, TfibersPerBlockIC, ToffsetPerBlockIC, lutIC, x, y); @@ -285,8 +315,8 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy back result to CPU cudaStatus = cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]"); - else printf("\t* tranfering x to CPU ... [ OK ]"); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]\n"); + else printf("\t* tranfering x to CPU ... [ OK ]\n"); /*printf("\n\n VECTOR X EC PART:\n"); for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 64705204..5955a874 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -151,8 +151,9 @@ class CudaLinearOperator { ~CudaLinearOperator(); - int getCudaStatus() { return (int)cudaStatus; } + int getCudaStatus() { return (int)cudaStatus; } void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); + void destroy(); void dot(float64_t* v_in, float64_t* v_out); void Tdot(float64_t* v_in, float64_t* v_out); From 2e1559ad6440f2b2bfe32dbb98e89ba58b22400d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 19:57:42 -0600 Subject: [PATCH 063/190] Solving bug with A'y operation in CUDA --- commit/cudaoperator.pyx | 5 ++++- commit/operator_withCUDA.cu | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 9135cd30..98fdc9cf 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -253,4 +253,7 @@ cdef class CudaLinearOperator : @property def cuda_status( self ): """Return status of CUDA GPU""" - return self.A.getCudaStatus() \ No newline at end of file + return self.A.getCudaStatus() + + def destroy( self ): + self.A.destroy() \ No newline at end of file diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index fa4ff338..6ce42392 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -179,6 +179,7 @@ CudaLinearOperator::~CudaLinearOperator(){ } CudaLinearOperator::destroy(){ + printf("\t* destroying and reseting GPU\n"); cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(orienIC) ); From 484df81f1083a377111a4a31ec28aab00f76602d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 20:08:36 -0600 Subject: [PATCH 064/190] Solving bug with A'y operation in CUDA --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 6ce42392..3524065e 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -178,7 +178,7 @@ CudaLinearOperator::~CudaLinearOperator(){ cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() );//*/ } -CudaLinearOperator::destroy(){ +void CudaLinearOperator::destroy(){ printf("\t* destroying and reseting GPU\n"); cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); From f06d11cd6cbceb78ef4a6fac8529d84bf7358b38 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 29 Mar 2020 23:28:19 -0600 Subject: [PATCH 065/190] Solving 
bug with operation A'y in CUDA --- commit/operator_withCUDA.cu | 68 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 3524065e..da21131b 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -179,34 +179,35 @@ CudaLinearOperator::~CudaLinearOperator(){ } void CudaLinearOperator::destroy(){ - printf("\t* destroying and reseting GPU\n"); - cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(orienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(orienEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lutISO) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TvoxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TfiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TorienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TlengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TfibersPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(ToffsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(x) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(y) ); + printf("\t* destroying and reseting GPU ... \n"); + bool status = true; + status = status && cudaCheck( cudaFree(voxelIC) ); + status = status && cudaCheck( cudaFree(fiberIC) ); + status = status && cudaCheck( cudaFree(orienIC) ); + status = status && cudaCheck( cudaFree(lengthIC) ); + status = status && cudaCheck( cudaFree(lutIC) ); + status = status && cudaCheck( cudaFree(segmentsPerBlockIC) ); + status = status && cudaCheck( cudaFree(offsetPerBlockIC) ); + status = status && cudaCheck( cudaFree(voxelEC) ); + status = status && cudaCheck( cudaFree(orienEC) ); + status = status && cudaCheck( cudaFree(lutEC) ); + status = status && cudaCheck( cudaFree(segmentsPerBlockEC) ); + status = status && cudaCheck( cudaFree(offsetPerBlockEC) ); + status = status && cudaCheck( cudaFree(lutISO) ); + status = status && cudaCheck( cudaFree(TvoxelIC) ); + status = status && cudaCheck( cudaFree(TfiberIC) ); + status = status && cudaCheck( cudaFree(TorienIC) ); + status = status && cudaCheck( cudaFree(TlengthIC) ); + status = status && cudaCheck( cudaFree(TfibersPerBlockIC) ); + status = status && cudaCheck( cudaFree(ToffsetPerBlockIC) ); + status = status && cudaCheck( cudaFree(x) ); + status = status && cudaCheck( cudaFree(y) ); /*printf("\t* reseting GPU ... 
"); bool status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ + status = status && cudaCheck( cudaDeviceReset() ); + if (status) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n");//*/ } void CudaLinearOperator::setTransposeData( @@ -264,9 +265,11 @@ void cudaCheckKernel(){ } void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ + cudaError_t cudaStatus; + // Copy vector x to the GPU - bool cudaStatus = cudaMemcpy(x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]\n"); + cudaStatus = cudaMemcpy(x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to GPU ... [ OK ]\n"); // Multiply IC part in the GPU @@ -286,17 +289,18 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ // Copy back result to CPU cudaStatus = cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]\n"); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to CPU ... [ OK ]\n"); } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ + cudaError_t cudaStatus; // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - bool cudaStatus = cudaCheck( cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice) ); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]\n"); + cudaStatus = cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... [ OK ]\n"); // Multiply IC part in the GPU @@ -315,8 +319,8 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ cudaCheckKernel(); // Copy back result to CPU - cudaStatus = cudaCheck( cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost) ); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]\n"); + cudaStatus = cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... [ OK ]\n"); /*printf("\n\n VECTOR X EC PART:\n"); From 6c335cd86b909b3f40ff7458b106b1ae6e400436 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 00:12:48 -0600 Subject: [PATCH 066/190] Solving bug with operation A'y in CUDA --- commit/core.pyx | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index caa9bd2f..f7b5871a 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -482,15 +482,6 @@ cdef class Evaluation : self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] - - import commit.cudaoperator - print( '\t* building dictionary in GPU ... 
' ) - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - if self.A.cuda_status == 1: - print( '[ CUDA OK ]' ) - else: - print( '[ CUDA ERROR ]' ) - print( ' [ OK ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) @@ -683,16 +674,15 @@ cdef class Evaluation : if self.THREADS['n'] > 0: self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - """ + else: import commit.cudaoperator #print( '\t* building dictionary in GPU ... ' ) self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) if self.A.cuda_status == 1: - print( '[ CUDA OK ]' ) + print( '[ OPERATOR OK ]' ) else: - print( '[ CUDA ERROR ]' ) - """ + print( '[ OPERATOR ERROR ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) From 6cb89943b957f3c98ff28586905bba5846f5b4b7 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:20:45 -0600 Subject: [PATCH 067/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 98fdc9cf..8a5c4465 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -97,6 +97,11 @@ cdef class CudaLinearOperator : self.n1 = self.nV*self.nS self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) + print('Required GPU Memory = %f GB' % (gpumem*1E-6)) + if gpumen > 8.0: + raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) + # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] self.ICf = &ICf[0] From 1c32ed75d0bc0206add676731b32bb03395a3d32 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:23:09 -0600 Subject: [PATCH 068/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 8a5c4465..7d616452 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -100,7 +100,7 @@ cdef class CudaLinearOperator : gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) print('Required GPU Memory = %f GB' % (gpumem*1E-6)) if gpumen > 8.0: - raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) + print( 'GPU Memory exceeded!!!!!!' ) # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] From 02b1c67917af10c4ea80ed09d18153b999bdc0c6 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:24:37 -0600 Subject: [PATCH 069/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 7d616452..2b00a0fa 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -97,10 +97,12 @@ cdef class CudaLinearOperator : self.n1 = self.nV*self.nS self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + """ gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) print('Required GPU Memory = %f GB' % (gpumem*1E-6)) if gpumen > 8.0: print( 'GPU Memory exceeded!!!!!!' 
) + """ # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] From 2bea86b7532f7f04def539aef250625347d5b03b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:26:11 -0600 Subject: [PATCH 070/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 2b00a0fa..b459b865 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -97,12 +97,11 @@ cdef class CudaLinearOperator : self.n1 = self.nV*self.nS self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - """ - gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) + + cdef float gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) print('Required GPU Memory = %f GB' % (gpumem*1E-6)) if gpumen > 8.0: print( 'GPU Memory exceeded!!!!!!' ) - """ # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] From 38e39a83e6ffc433809bd2717e5e4636470c41a5 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:28:17 -0600 Subject: [PATCH 071/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index b459b865..995d9810 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -42,6 +42,7 @@ cdef class CudaLinearOperator : that uses information from the DICTIONARY, KERNELS and THREADS data structures. """ cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef float gpumem cdef public int adjoint, n1, n2 cdef DICTIONARY @@ -98,10 +99,10 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - cdef float gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) - print('Required GPU Memory = %f GB' % (gpumem*1E-6)) - if gpumen > 8.0: - print( 'GPU Memory exceeded!!!!!!' ) + self.gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) + print('Required GPU Memory = %f GB' % (self.gpumem*1E-6)) + if self.gpumen > 8.0: + raise RuntimeError( 'GPU Memory exceeded!!!!!!' 
) # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] From 58f09dc4feeeebe6d86c00f60a9d9f7daa041c64 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:31:49 -0600 Subject: [PATCH 072/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 995d9810..f2211760 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -99,7 +99,7 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - self.gpumem = 14*self.n + 6*self.nE + 16*nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) + self.gpumem = 14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) print('Required GPU Memory = %f GB' % (self.gpumem*1E-6)) if self.gpumen > 8.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) From 65a101ea99661c9da58ea28061e2251150d0a53d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:33:58 -0600 Subject: [PATCH 073/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index f2211760..da780f05 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -100,7 +100,7 @@ cdef class CudaLinearOperator : self.gpumem = 14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) - print('Required GPU Memory = %f GB' % (self.gpumem*1E-6)) + print('Required GPU Memory = %f GB' % (self.gpumem*1E-9)) if self.gpumen > 8.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) From 4d59900efa44b7068e0345193377ab1b66f69a48 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:37:21 -0600 Subject: [PATCH 074/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index da780f05..1a22593e 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -42,7 +42,6 @@ cdef class CudaLinearOperator : that uses information from the DICTIONARY, KERNELS and THREADS data structures. """ cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef float gpumem cdef public int adjoint, n1, n2 cdef DICTIONARY @@ -99,9 +98,9 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - self.gpumem = 14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) - print('Required GPU Memory = %f GB' % (self.gpumem*1E-9)) - if self.gpumen > 8.0: + cdef int gpumem = 14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) + print('Required GPU Memory = %f GB' % (gpumem*1E-9)) + if gpumem > 8.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' 
) # get C pointers to arrays in DICTIONARY From 06396e07c57f24966934640356206442195a75ae Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:40:34 -0600 Subject: [PATCH 075/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 1a22593e..fac0aa60 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -98,9 +98,9 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - cdef int gpumem = 14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) - print('Required GPU Memory = %f GB' % (gpumem*1E-9)) - if gpumem > 8.0: + cdef int gpumem = 1E-6 * (14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) + print('Required GPU Memory = %f MB' % (gpumem)) + if gpumem > 8000.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) # get C pointers to arrays in DICTIONARY From f20936d5c7b6f2430e5cac1b370febe978c85d7d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:42:36 -0600 Subject: [PATCH 076/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index fac0aa60..6d843794 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -98,8 +98,8 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - cdef int gpumem = 1E-6 * (14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) - print('Required GPU Memory = %f MB' % (gpumem)) + cdef float gpumem = 1E-6 * (14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) + print('Required GPU Memory = %f MB' % gpumem) if gpumem > 8000.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) From b4cb321cc57648571b5967d147b9adfbcd4ff693 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:45:30 -0600 Subject: [PATCH 077/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 6d843794..e1a6e696 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -98,7 +98,7 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - cdef float gpumem = 1E-6 * (14*self.n + 6*self.nE + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) + cdef float gpumem = 1E-6 * (28*self.n + 6*self.nE + 8*(self.nF) + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) print('Required GPU Memory = %f MB' % gpumem) if gpumem > 8000.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' 
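The run of small patches above estimates the operator's device-memory footprint from the dictionary dimensions and rejects it against a hard-coded limit (8 GB / 8000 MB). Shown here only as a hedged alternative sketch, not what these patches do, the free memory can also be queried from the driver with cudaMemGetInfo():

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        size_t freeBytes = 0, totalBytes = 0;
        if ( cudaMemGetInfo( &freeBytes, &totalBytes ) != cudaSuccess ) {
            fprintf( stderr, "no usable CUDA device\n" );
            return 1;
        }

        // 'required' would come from the dictionary sizes; a fixed value is used here
        size_t required = (size_t)2 * 1024 * 1024 * 1024;   // pretend the operator needs 2 GB

        printf( "Required = %.1f MB, free = %.1f MB of %.1f MB\n",
                required*1E-6, freeBytes*1E-6, totalBytes*1E-6 );
        if ( required > freeBytes )
            fprintf( stderr, "GPU memory exceeded\n" );
        return 0;
    }
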
) From c4f0b9d4026a68fd3a836a4a4faa4355cba21082 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 30 Mar 2020 14:53:19 -0600 Subject: [PATCH 078/190] Adding memory size and CUDA version checker --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index e1a6e696..c661723c 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -98,7 +98,7 @@ cdef class CudaLinearOperator : self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - cdef float gpumem = 1E-6 * (28*self.n + 6*self.nE + 8*(self.nF) + 16*self.nV + 4*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) + cdef double gpumem = 1E-6 * (28.0*self.n + 6.0*self.nE + 8.0*(self.nF) + 16.0*self.nV + 4.0*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) print('Required GPU Memory = %f MB' % gpumem) if gpumem > 8000.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) From eda206107353805668ae00a225d540348e534631 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 14:26:35 -0600 Subject: [PATCH 079/190] Solving bug with operation A'y in CUDA --- commit/operator_withCUDA.cu | 285 ++++++++++++++++------------------- commit/operator_withCUDA.cuh | 62 ++++---- 2 files changed, 160 insertions(+), 187 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index da21131b..7a247367 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -34,9 +34,6 @@ CudaLinearOperator::CudaLinearOperator( int size_lutiso = nballs*nsamples; //bool status; - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - cudaStatus = true; // copy constant values to GPU @@ -59,97 +56,76 @@ CudaLinearOperator::CudaLinearOperator( // alloc memory in GPU for vectors x and y - //printf("\t* memory for vectors x and y ... "); - //status = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->x), ncols*sizeof(float64_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->y), nrows*sizeof(float64_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - // alloc GPU memory for segments - //printf("\t* memory for LUT (IC part) ... "); - //status = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lutIC), size_lutic*sizeof(float32_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* copying LUT in GPU (IC part) ... "); - //status = true; - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* allocating memory for LUT in GPU (EC part) ... "); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lutEC), size_lutec*sizeof(float32_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* copying LUT in GPU (EC part) ... "); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* allocating memory for LUT in GPU (ISO part) ... 
"); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lutISO), size_lutiso*sizeof(float32_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* copying LUT in GPU (ISO part) ... "); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* preprocessing data for GPU ... "); - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - //printf("\n"); - - /*printf("\t* fiber segments memory allocation ... "); - status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->voxelIC), nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->fiberIC), nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->orienIC), nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->lengthIC), nsegments*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockIC), nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockIC), nvoxels*sizeof(uint32_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - /*printf("\t* transfering fiber segments ... "); - status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - // ---------------------------------------- EC DATA ---------------------------------------- // - /*printf("\t* allocating memory for operator A in GPU (EC part) ... "); - status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->voxelEC), npeaks*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->orienEC), npeaks*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->segmentsPerBlockEC), nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(this->offsetPerBlockEC), nvoxels*sizeof(uint32_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - //printf("\t* preprocessing EC data for GPU ... "); - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - //printf("\n"); - - /*printf("\t* copying operator A to GPU (EC part) ... 
"); - status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(this->offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - free(segmentsPerBlock); - free(offsetPerBlock); + if (gpu_x == NULL) cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); + if (gpu_y == NULL) cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); + + // setup LUTs + if (gpu_lutIC == NULL){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + } + + if (gpu_lutEC == NULL){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + } + + if (gpu_lutISO == NULL){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + } + + if (gpu_voxelIC == NULL || gpu_fiberIC == NULL || gpu_orienIC == NULL || gpu_lengthIC == NULL) { + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + } + + if (gpu_segmentsPerBlockIC == NULL || gpu_offsetPerBlockIC == NULL) { + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, 
nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(segmentsPerBlock); + free(offsetPerBlock); + } + + if (gpu_voxelEC == NULL || gpu_orienEC == NULL) { + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + } + + if (gpu_segmentsPerBlockEC == NULL || gpu_offsetPerBlockEC == NULL) { + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); + + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(segmentsPerBlock); + free(offsetPerBlock); + } } CudaLinearOperator::~CudaLinearOperator(){ @@ -179,29 +155,29 @@ CudaLinearOperator::~CudaLinearOperator(){ } void CudaLinearOperator::destroy(){ - printf("\t* destroying and reseting GPU ... \n"); + printf("\t* destroying and reseting GPU ... "); bool status = true; - status = status && cudaCheck( cudaFree(voxelIC) ); - status = status && cudaCheck( cudaFree(fiberIC) ); - status = status && cudaCheck( cudaFree(orienIC) ); - status = status && cudaCheck( cudaFree(lengthIC) ); - status = status && cudaCheck( cudaFree(lutIC) ); - status = status && cudaCheck( cudaFree(segmentsPerBlockIC) ); - status = status && cudaCheck( cudaFree(offsetPerBlockIC) ); - status = status && cudaCheck( cudaFree(voxelEC) ); - status = status && cudaCheck( cudaFree(orienEC) ); - status = status && cudaCheck( cudaFree(lutEC) ); - status = status && cudaCheck( cudaFree(segmentsPerBlockEC) ); - status = status && cudaCheck( cudaFree(offsetPerBlockEC) ); - status = status && cudaCheck( cudaFree(lutISO) ); - status = status && cudaCheck( cudaFree(TvoxelIC) ); - status = status && cudaCheck( cudaFree(TfiberIC) ); - status = status && cudaCheck( cudaFree(TorienIC) ); - status = status && cudaCheck( cudaFree(TlengthIC) ); - status = status && cudaCheck( cudaFree(TfibersPerBlockIC) ); - status = status && cudaCheck( cudaFree(ToffsetPerBlockIC) ); - status = status && cudaCheck( cudaFree(x) ); - status = status && cudaCheck( cudaFree(y) ); + status = status && cudaCheck( cudaFree(gpu_voxelIC) ); + status = status && cudaCheck( cudaFree(gpu_fiberIC) ); + status = status && cudaCheck( cudaFree(gpu_orienIC) ); + status = status && cudaCheck( cudaFree(gpu_lengthIC) ); + status = status && cudaCheck( cudaFree(gpu_lutIC) ); + status = status && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); + status = status && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); + status = status && cudaCheck( cudaFree(gpu_voxelEC) ); + status = status && cudaCheck( cudaFree(gpu_orienEC) ); + status = status && cudaCheck( cudaFree(gpu_lutEC) ); + status = status && 
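The rewritten constructor above allocates each gpu_* buffer only when its pointer is still NULL, so building the operator a second time can reuse memory that already sits on the device. A stripped-down sketch of that allocate-once guard (pointer name and size are illustrative):

    #include <cstdio>
    #include <cuda_runtime.h>

    // file-scope pointer, NULL until the first successful allocation
    static double* gpu_buffer = NULL;

    static bool ensureBuffer( size_t nElements )
    {
        if ( gpu_buffer != NULL )
            return true;                                  // already allocated: reuse it
        cudaError_t err = cudaMalloc( (void**)&gpu_buffer, nElements*sizeof(double) );
        if ( err != cudaSuccess ) {
            fprintf( stderr, "cudaMalloc: %s\n", cudaGetErrorString(err) );
            gpu_buffer = NULL;
            return false;
        }
        return true;
    }

    int main()
    {
        bool first  = ensureBuffer( 1000 );               // allocates
        bool second = ensureBuffer( 1000 );               // no-op, pointer is already set
        printf( "first=%d second=%d\n", (int)first, (int)second );
        cudaFree( gpu_buffer );
        return 0;
    }
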
cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); + status = status && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); + status = status && cudaCheck( cudaFree(gpu_lutISO) ); + status = status && cudaCheck( cudaFree(gpu_TvoxelIC) ); + status = status && cudaCheck( cudaFree(gpu_TfiberIC) ); + status = status && cudaCheck( cudaFree(gpu_TorienIC) ); + status = status && cudaCheck( cudaFree(gpu_TlengthIC) ); + status = status && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); + status = status && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); + status = status && cudaCheck( cudaFree(gpu_x) ); + status = status && cudaCheck( cudaFree(gpu_y) ); /*printf("\t* reseting GPU ... "); bool status = true;//*/ @@ -216,36 +192,33 @@ void CudaLinearOperator::setTransposeData( uint16_t* orienIDs, float32_t* lengths) { - //bool status; - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - /*printf("\t* extra memory for operator A' ... "); - status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TvoxelIC), nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TfiberIC), nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TorienIC), nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TlengthIC), nsegments*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(TfibersPerBlockIC), nfibers*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&(ToffsetPerBlockIC), nfibers*sizeof(uint32_t)) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - /*printf("\t* transfering memory for operator A' ... 
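preprocessDataForGPU(), called above for the IC and EC segments and again for the transpose data in setTransposeData(), fills one counter and one offset per block (voxel or fiber). Its body is not shown in these patches; a plausible reading, sketched below for illustration only, is a histogram over the already-sorted block IDs followed by an exclusive prefix sum:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Given segment block IDs sorted in ascending order, compute how many segments
    // fall into each block and where each block's run of segments starts.
    static void groupSegments( const std::vector<uint32_t>& blockId, uint32_t nblocks,
                               std::vector<uint32_t>& perBlock, std::vector<uint32_t>& offset )
    {
        perBlock.assign( nblocks, 0 );
        offset.assign( nblocks, 0 );
        for ( uint32_t id : blockId )
            perBlock[id]++;                               // histogram of segments per block
        for ( uint32_t b = 1; b < nblocks; b++ )
            offset[b] = offset[b-1] + perBlock[b-1];      // exclusive prefix sum
    }

    int main()
    {
        std::vector<uint32_t> voxelOfSegment = { 0, 0, 1, 3, 3, 3 };   // already sorted
        std::vector<uint32_t> perBlock, offset;
        groupSegments( voxelOfSegment, 4, perBlock, offset );
        for ( uint32_t b = 0; b < 4; b++ )
            printf( "block %u: %u segments starting at %u\n", b, perBlock[b], offset[b] );
        return 0;
    }
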
"); - status = true;//*/ - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - /*if (status) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - free(fibersPerBlock); - free(offsetPerBlock); + if (gpu_TvoxelIC == NULL || gpu_TfiberIC == NULL || gpu_TorienIC == NULL || gpu_TlengthIC == NULL) { + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + } + + if (gpu_TfibersPerBlockIC == NULL || gpu_ToffsetPerBlockIC == NULL) { + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(fibersPerBlock); + free(offsetPerBlock); + } } void cudaCheckKernel(){ @@ -268,27 +241,27 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ cudaError_t cudaStatus; // Copy vector x to the GPU - cudaStatus = cudaMemcpy(x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to GPU ... 
[ OK ]\n"); // Multiply IC part in the GPU - multiply_Ax_ICpart<<>>(voxelIC, fiberIC, orienIC, lengthIC, segmentsPerBlockIC, offsetPerBlockIC, lutIC, x, y); + multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); cudaCheckKernel(); // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); + multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); cudaCheckKernel(); // Multiply ISO part in the GPU - multiply_Ax_ISOpart<<>>(lutISO, x, y); + multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); cudaCheckKernel(); // Copy back result to CPU - cudaStatus = cudaMemcpy(v_out, y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + cudaStatus = cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to CPU ... [ OK ]\n"); } @@ -299,27 +272,27 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaStatus = cudaMemcpy(y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... [ OK ]\n"); // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(TvoxelIC, TfiberIC, TorienIC, TlengthIC, TfibersPerBlockIC, ToffsetPerBlockIC, lutIC, x, y); + multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); cudaCheckKernel(); // Multiply EC part in the GPU - multiply_Aty_ECpart<<>>(voxelEC, orienEC, segmentsPerBlockEC, offsetPerBlockEC, lutEC, x, y); + multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); cudaCheckKernel(); // Multiply ISO part in the GPU - multiply_Aty_ISOpart<<>>(lutISO, x, y); + multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); cudaCheckKernel(); // Copy back result to CPU - cudaStatus = cudaMemcpy(v_out, x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... 
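dot() above launches three kernels that all receive the same output vector gpu_y: one for the intracellular (IC) segments, one for the extracellular (EC) peaks and one for the isotropic (ISO) compartments, matching the three column blocks that make up the operator's columns. A dense CPU sketch of that block structure, purely illustrative and nothing like the real sparse kernels:

    #include <cstdio>
    #include <vector>

    // y += B * xpart for one dense block B of size nrows x ncols
    static void addBlock( const std::vector<double>& B, int nrows, int ncols,
                          const double* xpart, std::vector<double>& y )
    {
        for ( int r = 0; r < nrows; r++ )
            for ( int c = 0; c < ncols; c++ )
                y[r] += B[r*ncols + c] * xpart[c];
    }

    int main()
    {
        const int nrows = 4, nIC = 3, nEC = 2, nISO = 1;
        std::vector<double> A_IC ( nrows*nIC,  1.0 );
        std::vector<double> A_EC ( nrows*nEC,  0.5 );
        std::vector<double> A_ISO( nrows*nISO, 0.1 );
        std::vector<double> x( nIC + nEC + nISO, 1.0 );
        std::vector<double> y( nrows, 0.0 );

        // each block adds its own contribution to the same y, one block per "kernel"
        addBlock( A_IC,  nrows, nIC,  &x[0],         y );
        addBlock( A_EC,  nrows, nEC,  &x[nIC],       y );
        addBlock( A_ISO, nrows, nISO, &x[nIC + nEC], y );

        for ( double v : y ) printf( "%g\n", v );
        return 0;
    }
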
[ OK ]\n"); diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 5955a874..cefa3e18 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -82,38 +82,38 @@ __constant__ int SIZE_LUTIC; __constant__ int SIZE_LUTEC; __constant__ int SIZE_LUTISO; -class CudaLinearOperator { +// pointers to IC data in GPU memory +static uint32_t* gpu_voxelIC; +static uint32_t* gpu_fiberIC; +static uint16_t* gpu_orienIC; +static float32_t* gpu_lengthIC; +static uint32_t* gpu_segmentsPerBlockIC; +static uint32_t* gpu_offsetPerBlockIC; + +// pointers to IC data (transpose) in GPU memory +static uint32_t* gpu_TvoxelIC; +static uint32_t* gpu_TfiberIC; +static uint16_t* gpu_TorienIC; +static float32_t* gpu_TlengthIC; +static uint32_t* gpu_TfibersPerBlockIC; +static uint32_t* gpu_ToffsetPerBlockIC; + +// pointers to EC data in GPU memory +static uint32_t* gpu_voxelEC; +static uint16_t* gpu_orienEC; +static uint32_t* gpu_segmentsPerBlockEC; +static uint32_t* gpu_offsetPerBlockEC; + +// pointers to LUTs in GPU memory +static float32_t* gpu_lutIC; +static float32_t* gpu_lutEC; +static float32_t* gpu_lutISO; + +// pointers to vector x and y +static float64_t* gpu_x; +static float64_t* gpu_y; - // pointers to IC data in GPU memory - uint32_t* voxelIC; - uint32_t* fiberIC; - uint16_t* orienIC; - float32_t* lengthIC; - uint32_t* segmentsPerBlockIC; - uint32_t* offsetPerBlockIC; - - // pointers to IC data (transpose) in GPU memory - uint32_t* TvoxelIC; - uint32_t* TfiberIC; - uint16_t* TorienIC; - float32_t* TlengthIC; - uint32_t* TfibersPerBlockIC; - uint32_t* ToffsetPerBlockIC; - - // pointers to EC data in GPU memory - uint32_t* voxelEC; - uint16_t* orienEC; - uint32_t* segmentsPerBlockEC; - uint32_t* offsetPerBlockEC; - - // pointers to LUTs in GPU memory - float32_t* lutIC; - float32_t* lutEC; - float32_t* lutISO; - - // pointers to vector x and y - float64_t* x; - float64_t* y; +class CudaLinearOperator { // constant values in CPU int nrows; From b2adc3306ccdf11c9bb052b340d11be7492ce432 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 14:57:02 -0600 Subject: [PATCH 080/190] Solving bug with operation A'y in CUDA --- commit/core.pyx | 1 + commit/cudaoperator.pyx | 52 ++++++++++++++++++------------------- commit/operator_withCUDA.cu | 32 +++++++++++------------ 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index f7b5871a..d2118af9 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -680,6 +680,7 @@ cdef class Evaluation : #print( '\t* building dictionary in GPU ... 
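The header above keeps the dictionary dimensions in __constant__ device variables (SIZE_LUTIC, SIZE_LUTEC, SIZE_LUTISO and friends). Such variables are written from the host with cudaMemcpyToSymbol(), presumably what the constructor's 'copy constant values to GPU' step does, and every kernel can then read them without extra arguments. A minimal, self-contained illustration of the mechanism (the kernel is invented for the example):

    #include <cstdio>
    #include <cuda_runtime.h>

    __constant__ int SIZE_LUTISO;            // read-only, visible to every kernel

    __global__ void readConstant( int* out )
    {
        *out = SIZE_LUTISO;                  // kernels read it like a global variable
    }

    int main()
    {
        int host_value = 100;
        cudaMemcpyToSymbol( SIZE_LUTISO, &host_value, sizeof(int) );   // host writes it once

        int* d_out = NULL;
        int h_out = 0;
        cudaMalloc( (void**)&d_out, sizeof(int) );
        readConstant<<<1,1>>>( d_out );
        cudaMemcpy( &h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost );
        printf( "SIZE_LUTISO on the device = %d\n", h_out );
        cudaFree( d_out );
        return 0;
    }
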
' ) self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) if self.A.cuda_status == 1: + self.A.set_transpose_data() print( '[ OPERATOR OK ]' ) else: print( '[ OPERATOR ERROR ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index c661723c..187da1af 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -168,31 +168,6 @@ cdef class CudaLinearOperator : self.nI ) - - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - - ICf = self.DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - ICl = self.DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - ICv = self.DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - ICo = self.DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - ECv = self.DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - ECo = self.DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - ISOv = self.DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - """ idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] @@ -262,4 +237,29 @@ cdef class CudaLinearOperator : return self.A.getCudaStatus() def destroy( self ): - self.A.destroy() \ No newline at end of file + self.A.destroy() + + def set_transpose_data(): + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + + ICf = self.DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + ICl = self.DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + ICv = self.DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + ICo = self.DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + ECv = self.DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + ECo = self.DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + ISOv = self.DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) \ No newline at end of file diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 7a247367..9dfb31e2 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -238,63 +238,63 @@ void cudaCheckKernel(){ } void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - cudaError_t cudaStatus; + //cudaError_t cudaStatus; // Copy vector x to the GPU cudaStatus = cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering x to GPU ... [ OK ]\n"); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering x to GPU ... 
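set_transpose_data() in the .pyx hunk above re-sorts the IC segments with np.lexsort so that, for the transpose product A'y, all segments of the same fiber (and, within a fiber, the same orientation) are contiguous before being handed to setTransposeData(). The same grouping written as a small C++ sort over segment records (field names invented for the sketch):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Segment { uint32_t voxel, fiber; uint16_t orien; float len; };

    int main()
    {
        std::vector<Segment> seg = {
            {5, 2, 1, 0.3f}, {1, 0, 4, 0.7f}, {2, 2, 0, 0.5f}, {9, 0, 4, 0.2f}
        };

        // primary key: fiber, secondary key: orientation -- what np.lexsort([o, fiber]) does
        std::stable_sort( seg.begin(), seg.end(),
            []( const Segment& a, const Segment& b ) {
                if ( a.fiber != b.fiber ) return a.fiber < b.fiber;
                return a.orien < b.orien;
            } );

        for ( const Segment& s : seg )
            printf( "fiber %u  orien %u  voxel %u  len %.1f\n",
                    s.fiber, (unsigned)s.orien, s.voxel, s.len );
        return 0;
    }
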
[ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Copy back result to CPU cudaStatus = cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering y to CPU ... [ OK ]\n"); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/ } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - cudaError_t cudaStatus; + //cudaError_t cudaStatus; // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering y to GPU ... [ OK ]\n"); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Copy back result to CPU cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering x to CPU ... [ OK ]\n"); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering x to CPU ... 
[ OK ]\n");//*/ /*printf("\n\n VECTOR X EC PART:\n"); for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) From 258b4bcc4f37ef2d6d298ae503f02edd1d1e9557 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 15:00:26 -0600 Subject: [PATCH 081/190] Solving bug with operation A'y in CUDA --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 187da1af..320166a6 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -239,7 +239,7 @@ cdef class CudaLinearOperator : def destroy( self ): self.A.destroy() - def set_transpose_data(): + def set_transpose_data( self ): idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] From bd4eaf9fa23350ec5f9d2af45cefab88335ebb86 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 15:04:47 -0600 Subject: [PATCH 082/190] Solving bug with operation A'y in CUDA --- commit/cudaoperator.pyx | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 320166a6..d8e32005 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -247,19 +247,13 @@ cdef class CudaLinearOperator : self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - ICf = self.DICTIONARY['IC']['fiber'] + cdef unsigned int [::1] ICf = self.DICTIONARY['IC']['fiber'] self.ICf = &ICf[0] - ICl = self.DICTIONARY['IC']['len'] + cdef float [::1] ICl = self.DICTIONARY['IC']['len'] self.ICl = &ICl[0] - ICv = self.DICTIONARY['IC']['v'] + cdef unsigned int [::1] ICv = self.DICTIONARY['IC']['v'] self.ICv = &ICv[0] - ICo = self.DICTIONARY['IC']['o'] + cdef unsigned short [::1] ICo = self.DICTIONARY['IC']['o'] self.ICo = &ICo[0] - ECv = self.DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - ECo = self.DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - ISOv = self.DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) \ No newline at end of file From f6c22b322a4d34d6165d2115bf7c59a473770ce2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:17:16 -0600 Subject: [PATCH 083/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 71 ++++++--------- commit/operator_withCUDA.cu | 169 +++++++++++++++++------------------ commit/operator_withCUDA.cuh | 46 +++++++++- 4 files changed, 154 insertions(+), 134 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index d2118af9..f18573bc 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -678,7 +678,7 @@ cdef class Evaluation : else: import commit.cudaoperator #print( '\t* building dictionary in GPU ... ' ) - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS. 
fcall=True ) if self.A.cuda_status == 1: self.A.set_transpose_data() print( '[ OPERATOR OK ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index d8e32005..c1f0cf1b 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -28,7 +28,9 @@ cdef extern from "operator_withCUDA.cuh": int, int, int, - int) + int, + + bool) int getCudaStatus() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) @@ -70,7 +72,7 @@ cdef class CudaLinearOperator : cdef C_CudaLinearOperator* A - def __init__( self, DICTIONARY, KERNELS, THREADS ) : + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = False ) : """Set the pointers to the data structures used by the C code.""" self.DICTIONARY = DICTIONARY self.KERNELS = KERNELS @@ -92,16 +94,17 @@ cdef class CudaLinearOperator : else : self.nS = KERNELS['wmr'].shape[1] - self.adjoint = 0 # direct of inverse product + self.adjoint = 0 # direct of inverse product self.n1 = self.nV*self.nS self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - cdef double gpumem = 1E-6 * (28.0*self.n + 6.0*self.nE + 8.0*(self.nF) + 16.0*self.nV + 4.0*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) + """ + cdef double gpumem = 1E-6 * ( 28.0*self.n + 6.0*self.nE + 8.0*(self.nF) + 16.0*self.nV + 4.0*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) print('Required GPU Memory = %f MB' % gpumem) if gpumem > 8000.0: raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) + """ # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] @@ -127,23 +130,6 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] - """# get C pointers to arrays in THREADS - cdef unsigned int [::1] ICthreads = THREADS['IC'] - self.ICthreads = &ICthreads[0] - cdef unsigned int [::1] ECthreads = THREADS['EC'] - self.ECthreads = &ECthreads[0] - cdef unsigned int [::1] ISOthreads = THREADS['ISO'] - self.ISOthreads = &ISOthreads[0] - - cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] - self.ICthreadsT = &ICthreadsT[0] - cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] - self.ECthreadsT = &ECthreadsT[0] - cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] - self.ISOthreadsT = &ISOthreadsT[0] """ - - #sort here - self.A = new C_CudaLinearOperator( &ICv[0], &ICf[0], @@ -165,22 +151,10 @@ cdef class CudaLinearOperator : self.nS, self.nR, self.nT, - self.nI - ) + self.nI, - """ - idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - - idx = np.argsort( self.DICTIONARY['EC']['v'], kind='mergesort' ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx - #""" + fcall + ) @property def T( self ) : @@ -222,38 +196,43 @@ cdef class CudaLinearOperator : # Call the cython function to read the memory pointers if not self.adjoint : # DIRECT PRODUCT A*x - print('MULTIPLICO Ax') self.A.dot(&v_in[0], &v_out[0]) else : # INVERSE PRODUCT A'*y - print('MULTIPLICO A\'y') self.A.Tdot(&v_in[0], &v_out[0]) return v_out @property def cuda_status( self ): - """Return status of CUDA GPU""" + """Return status of the 
CUDA GPU""" return self.A.getCudaStatus() def destroy( self ): + """Free all memory of the CUDA GPU""" self.A.destroy() def set_transpose_data( self ): + """Send A' data to the CUDA GPU""" idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - cdef unsigned int [::1] ICf = self.DICTIONARY['IC']['fiber'] + cdef unsigned int [::1] ICf = self.DICTIONARY['IC']['fiber'] + cdef float [::1] ICl = self.DICTIONARY['IC']['len'] + cdef unsigned int [::1] ICv = self.DICTIONARY['IC']['v'] + cdef unsigned short [::1] ICo = self.DICTIONARY['IC']['o'] + self.ICf = &ICf[0] - cdef float [::1] ICl = self.DICTIONARY['IC']['len'] self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = self.DICTIONARY['IC']['v'] self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = self.DICTIONARY['IC']['o'] self.ICo = &ICo[0] - self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) \ No newline at end of file + self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + + def gpu_compatibility( self ): + """Check if the available GPU is compatible""" + return 0 diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 9dfb31e2..07b879d7 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -22,109 +22,106 @@ CudaLinearOperator::CudaLinearOperator( int nsamples, int ndiameters, int nzeppelins, - int nballs) + int nballs, + + bool fcall) { this->nsegments = nsegments; - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; - int size_lutic = ndiameters*norientations*nsamples; - int size_lutec = nzeppelins*norientations*nsamples; - int size_lutiso = nballs*nsamples; - //bool status; - - cudaStatus = true; - - // copy constant values to GPU - //printf("\t* constant global values ... 
"); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); - /*if (cudaStatus) printf("[ OK ]\n"); - else printf("[ ERROR ]\n");//*/ - - - // alloc memory in GPU for vectors x and y - if (gpu_x == NULL) cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); - if (gpu_y == NULL) cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); - - // setup LUTs - if (gpu_lutIC == NULL){ + this->nvoxels = nvoxels; + this->nfibers = nfibers; + this->nrows = nvoxels * nsamples; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + + if (fcall) { + print("\t* configuring dictionary in CUDA GPU ... 
\n"); + + int size_lutic = ndiameters*norientations*nsamples; + int size_lutec = nzeppelins*norientations*nsamples; + int size_lutiso = nballs*nsamples; + + size_t required_mem = 28*nsegments + 6.0*npeaks + 8.0*nfibers + 16.0*nvoxels + 4.0*(size_lutic + size_lutec + size_lutiso + this->nrows + this->ncols); + checkCompatibility(required_mem); + + cudaStatus = true; + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + + // alloc memory in GPU for vectors x and y + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); + + // setup LUTs cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - } - if (gpu_lutEC == NULL){ cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - } - if (gpu_lutISO == NULL){ cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - } - - if (gpu_voxelIC == NULL || gpu_fiberIC == NULL || gpu_orienIC == NULL || gpu_lengthIC == NULL) { - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), 
cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - } - - if (gpu_segmentsPerBlockIC == NULL || gpu_offsetPerBlockIC == NULL) { - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + //if (gpu_voxelIC == NULL || gpu_fiberIC == NULL || gpu_orienIC == NULL || gpu_lengthIC == NULL) { + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + //} + + //if (gpu_segmentsPerBlockIC == NULL || gpu_offsetPerBlockIC == NULL) { + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); - free(segmentsPerBlock); - free(offsetPerBlock); - } + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - if (gpu_voxelEC == NULL || gpu_orienEC == NULL) { - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); + //free(segmentsPerBlock); + //free(offsetPerBlock); + //} - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - } + //if (gpu_voxelEC == NULL || gpu_orienEC == NULL) { + cudaStatus = 
cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); - if (gpu_segmentsPerBlockEC == NULL || gpu_offsetPerBlockEC == NULL) { - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + //} - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); + //if (gpu_segmentsPerBlockEC == NULL || gpu_offsetPerBlockEC == NULL) { + //uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + //uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - free(segmentsPerBlock); - free(offsetPerBlock); + free(segmentsPerBlock); + free(offsetPerBlock); + //} } } @@ -192,7 +189,7 @@ void CudaLinearOperator::setTransposeData( uint16_t* orienIDs, float32_t* lengths) { - if (gpu_TvoxelIC == NULL || gpu_TfiberIC == NULL || gpu_TorienIC == NULL || gpu_TlengthIC == NULL) { + //if (gpu_TvoxelIC == NULL || gpu_TfiberIC == NULL || gpu_TorienIC == NULL || gpu_TlengthIC == NULL) { cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); @@ -202,9 +199,9 @@ void CudaLinearOperator::setTransposeData( cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - } + //} - if (gpu_TfibersPerBlockIC == NULL || gpu_ToffsetPerBlockIC == NULL) { + //if (gpu_TfibersPerBlockIC == NULL || gpu_ToffsetPerBlockIC == NULL) { uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) 
malloc(nfibers*sizeof(uint32_t)); @@ -218,7 +215,7 @@ void CudaLinearOperator::setTransposeData( free(fibersPerBlock); free(offsetPerBlock); - } + //} } void cudaCheckKernel(){ diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index cefa3e18..57ba26cc 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -16,6 +16,48 @@ typedef double float64_t; bool cudaCheck(cudaError_t cudaStatus); void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); +bool checkCompatibility(size_t required_mem, int gpu_id = 0) { + int num_gpus; + cudaError_t cudaStatus; + + //printf("-> Checking availability of CUDA:\n"); + cudaStatus = cudaGetDeviceCount(&num_gpus); + + if (num_gpus <= 0 || num_gpus <= gpu_id) { + print("\t* the selected GPU does not exist or is not detected \n"); + return false; + } + + if (required_mem <= gpu_properties.totalGlobalMem) { + printf("\t* using %f GB of total %f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + else { + printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + + if(cudaStatus == cudaSuccess){ + cudaDeviceProp gpu_properties; + cudaGetDeviceProperties(&gpu_properties, gpu_id); + + printf("\t* checking availability of CUDA ... [ OK ]\n"); + printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); + printf("\t* using GPU %s with ID %d... \n", gpu_properties.name, gpu_id); + + if(gpu_properties.major >= 5){ + printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); + } + else{ + printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); + return false; + } + + return true; + } + else{ + printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); + return false; + } +} __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, @@ -147,7 +189,9 @@ class CudaLinearOperator { int nsamples, int ndiameters, int nzeppelins, - int nballs); + int nballs, + + bool fcall); ~CudaLinearOperator(); From 9ccdcb8f873237cfb637ffea92775f76afbb6dc3 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:19:28 -0600 Subject: [PATCH 084/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index f18573bc..8dba44f7 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -678,7 +678,7 @@ cdef class Evaluation : else: import commit.cudaoperator #print( '\t* building dictionary in GPU ... ' ) - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS. 
fcall=True ) + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=True ) if self.A.cuda_status == 1: self.A.set_transpose_data() print( '[ OPERATOR OK ]' ) From e718fa393acaf7c6bb9816bb6100b731da9831c8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:21:45 -0600 Subject: [PATCH 085/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 57ba26cc..de623897 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -24,17 +24,10 @@ bool checkCompatibility(size_t required_mem, int gpu_id = 0) { cudaStatus = cudaGetDeviceCount(&num_gpus); if (num_gpus <= 0 || num_gpus <= gpu_id) { - print("\t* the selected GPU does not exist or is not detected \n"); + printf("\t* the selected GPU does not exist or is not detected \n"); return false; } - if (required_mem <= gpu_properties.totalGlobalMem) { - printf("\t* using %f GB of total %f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - else { - printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - if(cudaStatus == cudaSuccess){ cudaDeviceProp gpu_properties; cudaGetDeviceProperties(&gpu_properties, gpu_id); @@ -43,6 +36,13 @@ bool checkCompatibility(size_t required_mem, int gpu_id = 0) { printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); printf("\t* using GPU %s with ID %d... \n", gpu_properties.name, gpu_id); + if (required_mem <= gpu_properties.totalGlobalMem) { + printf("\t* using %f GB of total %f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + else { + printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + if(gpu_properties.major >= 5){ printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); } From 76fa0c7fcd69275c60169d1de3b86216d7968104 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:22:58 -0600 Subject: [PATCH 086/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 07b879d7..5757afac 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -33,7 +33,7 @@ CudaLinearOperator::CudaLinearOperator( this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; if (fcall) { - print("\t* configuring dictionary in CUDA GPU ... \n"); + printf("\t* configuring dictionary in CUDA GPU ... 
\n"); int size_lutic = ndiameters*norientations*nsamples; int size_lutec = nzeppelins*norientations*nsamples; From 4d9cc30684de6c71f91dcd348bac1d1facdb5af2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:25:50 -0600 Subject: [PATCH 087/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 43 ++++++++++++++++++++++++++++++++++++ commit/operator_withCUDA.cuh | 43 +----------------------------------- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 5757afac..33bd4722 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -1,5 +1,48 @@ #include "operator_withCUDA.cuh" +bool checkCompatibility(size_t required_mem, int gpu_id = 0) { + int num_gpus; + cudaError_t cudaStatus; + + //printf("-> Checking availability of CUDA:\n"); + cudaStatus = cudaGetDeviceCount(&num_gpus); + + if (num_gpus <= 0 || num_gpus <= gpu_id) { + printf("\t* the selected GPU does not exist or is not detected \n"); + return false; + } + + if(cudaStatus == cudaSuccess){ + cudaDeviceProp gpu_properties; + cudaGetDeviceProperties(&gpu_properties, gpu_id); + + printf("\t* checking availability of CUDA ... [ OK ]\n"); + printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); + printf("\t* using GPU %s with ID %d... \n", gpu_properties.name, gpu_id); + + if (required_mem <= gpu_properties.totalGlobalMem) { + printf("\t* using %f GB of total %f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + else { + printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + + if(gpu_properties.major >= 5){ + printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); + } + else{ + printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); + return false; + } + + return true; + } + else{ + printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); + return false; + } +} + CudaLinearOperator::CudaLinearOperator( // pointers to IC data in CPU memory uint32_t* voxelIC, diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index de623897..6eb7d644 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -16,48 +16,7 @@ typedef double float64_t; bool cudaCheck(cudaError_t cudaStatus); void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); -bool checkCompatibility(size_t required_mem, int gpu_id = 0) { - int num_gpus; - cudaError_t cudaStatus; - - //printf("-> Checking availability of CUDA:\n"); - cudaStatus = cudaGetDeviceCount(&num_gpus); - - if (num_gpus <= 0 || num_gpus <= gpu_id) { - printf("\t* the selected GPU does not exist or is not detected \n"); - return false; - } - - if(cudaStatus == cudaSuccess){ - cudaDeviceProp gpu_properties; - cudaGetDeviceProperties(&gpu_properties, gpu_id); - - printf("\t* checking availability of CUDA ... [ OK ]\n"); - printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); - printf("\t* using GPU %s with ID %d... \n", gpu_properties.name, gpu_id); - - if (required_mem <= gpu_properties.totalGlobalMem) { - printf("\t* using %f GB of total %f GB... 
[ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - else { - printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - - if(gpu_properties.major >= 5){ - printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); - } - else{ - printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); - return false; - } - - return true; - } - else{ - printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); - return false; - } -} +bool checkCompatibility(size_t required_mem, int gpu_id = 0); __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, From 2aa0ac94f01868bdf92f65ce2abf2fc5948928cf Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:28:17 -0600 Subject: [PATCH 088/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 4 ++-- commit/operator_withCUDA.cuh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 33bd4722..de062bdb 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -1,6 +1,6 @@ #include "operator_withCUDA.cuh" -bool checkCompatibility(size_t required_mem, int gpu_id = 0) { +bool checkCompatibility(size_t required_mem, int gpu_id) { int num_gpus; cudaError_t cudaStatus; @@ -83,7 +83,7 @@ CudaLinearOperator::CudaLinearOperator( int size_lutiso = nballs*nsamples; size_t required_mem = 28*nsegments + 6.0*npeaks + 8.0*nfibers + 16.0*nvoxels + 4.0*(size_lutic + size_lutec + size_lutiso + this->nrows + this->ncols); - checkCompatibility(required_mem); + checkCompatibility(required_mem, 0); cudaStatus = true; diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 6eb7d644..83ae35c0 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -16,7 +16,7 @@ typedef double float64_t; bool cudaCheck(cudaError_t cudaStatus); void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); -bool checkCompatibility(size_t required_mem, int gpu_id = 0); +bool checkCompatibility(size_t required_mem, int gpu_id); __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, From 9b8b9f8bfc1ed003d90f5bf9408c4733f03288cc Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 20:45:32 -0600 Subject: [PATCH 089/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 4 ++-- commit/operator_withCUDA.cu | 4 ++-- commit/operator_withCUDA.cuh | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 8dba44f7..9e0d6101 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -678,7 +678,7 @@ cdef class Evaluation : else: import commit.cudaoperator #print( '\t* building dictionary in GPU ... 
' ) - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=True ) + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) if self.A.cuda_status == 1: self.A.set_transpose_data() print( '[ OPERATOR OK ]' ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index c1f0cf1b..9c694f9f 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -30,7 +30,7 @@ cdef extern from "operator_withCUDA.cuh": int, int, - bool) + int) int getCudaStatus() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) @@ -72,7 +72,7 @@ cdef class CudaLinearOperator : cdef C_CudaLinearOperator* A - def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = False ) : + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : """Set the pointers to the data structures used by the C code.""" self.DICTIONARY = DICTIONARY self.KERNELS = KERNELS diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index de062bdb..f2515880 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -67,7 +67,7 @@ CudaLinearOperator::CudaLinearOperator( int nzeppelins, int nballs, - bool fcall) + int fcall) { this->nsegments = nsegments; this->nvoxels = nvoxels; @@ -82,7 +82,7 @@ CudaLinearOperator::CudaLinearOperator( int size_lutec = nzeppelins*norientations*nsamples; int size_lutiso = nballs*nsamples; - size_t required_mem = 28*nsegments + 6.0*npeaks + 8.0*nfibers + 16.0*nvoxels + 4.0*(size_lutic + size_lutec + size_lutiso + this->nrows + this->ncols); + size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); checkCompatibility(required_mem, 0); cudaStatus = true; diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 83ae35c0..231a4f77 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -150,7 +150,7 @@ class CudaLinearOperator { int nzeppelins, int nballs, - bool fcall); + int fcall); ~CudaLinearOperator(); From 9b529aaf6d581988022dacf905de96ef4b0dd986 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 21:44:10 -0600 Subject: [PATCH 090/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 12 ++-- commit/operator_withCUDA.cu | 118 ++++++++++++++++++++++++------------ 2 files changed, 84 insertions(+), 46 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 9e0d6101..d1d4c3e7 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -468,7 +468,7 @@ cdef class Evaluation : # post-processing # --------------- - print( '\t* post-processing...' 
) + print( '\t* post-processing...', end="" ) sys.stdout.flush() # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) @@ -525,14 +525,14 @@ cdef class Evaluation : long t, tot, i1, i2, N, c int i - tic = time.time() - if nthreads > 0: print( '\n-> Distributing workload to different threads:' ) print( '\t* number of threads : %d' % nthreads ) + tic = time.time() + # Distribute load for the computation of A*x product - print( '\t* A operator...', end="" ) + print( '\t* A operator...', end="" ) sys.stdout.flush() if self.DICTIONARY['IC']['n'] > 0 : @@ -639,9 +639,9 @@ cdef class Evaluation : else : self.THREADS['ISOt'] = None - print( '[ OK ]' ) + print( '[ OK ]' ) - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) def build_operator( self ) : diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index f2515880..bb577a23 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -18,10 +18,10 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { printf("\t* checking availability of CUDA ... [ OK ]\n"); printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); - printf("\t* using GPU %s with ID %d... \n", gpu_properties.name, gpu_id); + printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name); if (required_mem <= gpu_properties.totalGlobalMem) { - printf("\t* using %f GB of total %f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); } else { printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); @@ -76,7 +76,7 @@ CudaLinearOperator::CudaLinearOperator( this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; if (fcall) { - printf("\t* configuring dictionary in CUDA GPU ... \n"); + //printf("\t* configuring dictionary in CUDA GPU ... \n"); int size_lutic = ndiameters*norientations*nsamples; int size_lutec = nzeppelins*norientations*nsamples; @@ -87,6 +87,8 @@ CudaLinearOperator::CudaLinearOperator( cudaStatus = true; + printf("\t* constant values ... "); + cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); @@ -100,12 +102,16 @@ CudaLinearOperator::CudaLinearOperator( cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); // alloc memory in GPU for vectors x and y cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); // setup LUTs + printf("\t* loading LUT ... 
"); + cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); @@ -114,20 +120,11 @@ CudaLinearOperator::CudaLinearOperator( cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); - //if (gpu_voxelIC == NULL || gpu_fiberIC == NULL || gpu_orienIC == NULL || gpu_lengthIC == NULL) { - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - //} - - //if (gpu_segmentsPerBlockIC == NULL || gpu_offsetPerBlockIC == NULL) { + printf("\t* pre-processing ... "); + cudaStatus = true; uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); @@ -141,7 +138,22 @@ CudaLinearOperator::CudaLinearOperator( //free(segmentsPerBlock); //free(offsetPerBlock); - //} + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t*A operator ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); //if (gpu_voxelEC == NULL || gpu_orienEC == NULL) { cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); @@ -164,6 +176,11 @@ CudaLinearOperator::CudaLinearOperator( free(segmentsPerBlock); free(offsetPerBlock); + + printf("\t* A' operator ... 
"); + cudaStatus = true; + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); //} } } @@ -195,35 +212,56 @@ CudaLinearOperator::~CudaLinearOperator(){ } void CudaLinearOperator::destroy(){ - printf("\t* destroying and reseting GPU ... "); - bool status = true; - status = status && cudaCheck( cudaFree(gpu_voxelIC) ); - status = status && cudaCheck( cudaFree(gpu_fiberIC) ); - status = status && cudaCheck( cudaFree(gpu_orienIC) ); - status = status && cudaCheck( cudaFree(gpu_lengthIC) ); - status = status && cudaCheck( cudaFree(gpu_lutIC) ); - status = status && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); - status = status && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); - status = status && cudaCheck( cudaFree(gpu_voxelEC) ); - status = status && cudaCheck( cudaFree(gpu_orienEC) ); - status = status && cudaCheck( cudaFree(gpu_lutEC) ); - status = status && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); - status = status && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); - status = status && cudaCheck( cudaFree(gpu_lutISO) ); + bool cudaStatus; + + printf("-> Deleting GPU memory:\n"); + + printf("\t* deleting A ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting A' ... "); + cudaStatus = true; status = status && cudaCheck( cudaFree(gpu_TvoxelIC) ); status = status && cudaCheck( cudaFree(gpu_TfiberIC) ); status = status && cudaCheck( cudaFree(gpu_TorienIC) ); status = status && cudaCheck( cudaFree(gpu_TlengthIC) ); status = status && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); status = status && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); - status = status && cudaCheck( cudaFree(gpu_x) ); - status = status && cudaCheck( cudaFree(gpu_y) ); - - /*printf("\t* reseting GPU ... "); - bool status = true;//*/ - status = status && cudaCheck( cudaDeviceReset() ); - if (status) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n");//*/ + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting LUT ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting vectors x and y ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* reseting GPU ... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); } void CudaLinearOperator::setTransposeData( From 948d63869434d0c23d7d2b72dbd5031b6e690b9f Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 31 Mar 2020 22:22:25 -0600 Subject: [PATCH 091/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index bb577a23..955e0171 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -233,12 +233,12 @@ void CudaLinearOperator::destroy(){ printf("\t* deleting A' ... "); cudaStatus = true; - status = status && cudaCheck( cudaFree(gpu_TvoxelIC) ); - status = status && cudaCheck( cudaFree(gpu_TfiberIC) ); - status = status && cudaCheck( cudaFree(gpu_TorienIC) ); - status = status && cudaCheck( cudaFree(gpu_TlengthIC) ); - status = status && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); - status = status && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TorienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TlengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); From 8c9e1ecbe27145fdbcd7b3e64ebb5232f31e7cd5 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 01:16:16 -0600 Subject: [PATCH 092/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 39 +--- commit/cudaoperator.pyx | 33 +-- commit/operator_withCUDA.cu | 378 +++++++++++++++-------------------- commit/operator_withCUDA.cuh | 4 +- 4 files changed, 190 insertions(+), 264 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index d1d4c3e7..f4a7fdee 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -71,7 +71,6 @@ cdef class Evaluation : cdef public A cdef public x cdef public CONFIG - cdef public gpu_A def __init__( self, study_path, subject ) : """Setup the data structures with default values. 
@@ -399,14 +398,6 @@ cdef class Evaluation : self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] del idx - """ - idx = np.argsort( self.DICTIONARY['IC']['v'], kind='mergesort' ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - """ # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length # NB: it works in conjunction with the normalization of the kernels @@ -436,10 +427,6 @@ cdef class Evaluation : self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] del idx - """idx = np.argsort( self.DICTIONARY['EC']['v'], kind='mergesort' ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx """ print( ' [ %d segments ]' % self.DICTIONARY['EC']['nE'] ) @@ -532,7 +519,7 @@ cdef class Evaluation : tic = time.time() # Distribute load for the computation of A*x product - print( '\t* A operator...', end="" ) + print( '\t* A operator... ', end="" ) sys.stdout.flush() if self.DICTIONARY['IC']['n'] > 0 : @@ -554,8 +541,6 @@ cdef class Evaluation : if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : self.THREADS = None raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['IC'] = None if self.DICTIONARY['EC']['nE'] > 0 : self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) @@ -567,8 +552,6 @@ cdef class Evaluation : if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : self.THREADS = None raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['IC'] = None if self.DICTIONARY['nV'] > 0 : self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) @@ -580,13 +563,11 @@ cdef class Evaluation : if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : self.THREADS = None raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['ISO'] = None - print( ' [ OK ]' ) + print( '[ OK ]' ) # Distribute load for the computation of At*y product - print( '\t* A\' operator...', end="" ) + print( '\t* A\' operator... ', end="" ) sys.stdout.flush() if self.DICTIONARY['IC']['n'] > 0 : @@ -608,9 +589,6 @@ cdef class Evaluation : tot = c self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() - else : - self.THREADS['ICt'] = None - if self.DICTIONARY['EC']['nE'] > 0 : self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) @@ -622,8 +600,6 @@ cdef class Evaluation : if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : self.THREADS = None raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' 
) - else : - self.THREADS['ECt'] = None if self.DICTIONARY['nV'] > 0 : self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) @@ -636,8 +612,6 @@ cdef class Evaluation : if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : self.THREADS = None raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - else : - self.THREADS['ISOt'] = None print( '[ OK ]' ) @@ -674,16 +648,9 @@ cdef class Evaluation : if self.THREADS['n'] > 0: self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - else: import commit.cudaoperator - #print( '\t* building dictionary in GPU ... ' ) self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - if self.A.cuda_status == 1: - self.A.set_transpose_data() - print( '[ OPERATOR OK ]' ) - else: - print( '[ OPERATOR ERROR ]' ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 9c694f9f..80673fdd 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -98,13 +98,6 @@ cdef class CudaLinearOperator : self.n1 = self.nV*self.nS self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - """ - cdef double gpumem = 1E-6 * ( 28.0*self.n + 6.0*self.nE + 8.0*(self.nF) + 16.0*self.nV + 4.0*(self.nR*self.ndirs*self.nS + self.nT*self.ndirs*self.nS + self.nI*self.nS + self.n1 + self.n2) ) - print('Required GPU Memory = %f MB' % gpumem) - if gpumem > 8000.0: - raise RuntimeError( 'GPU Memory exceeded!!!!!!' ) - """ # get C pointers to arrays in DICTIONARY cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] @@ -151,10 +144,27 @@ cdef class CudaLinearOperator : self.nS, self.nR, self.nT, - self.nI, + self.nI) + + if fcall == 1: + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + cdef unsigned int [::1] ICf = self.DICTIONARY['IC']['fiber'] + cdef float [::1] ICl = self.DICTIONARY['IC']['len'] + cdef unsigned int [::1] ICv = self.DICTIONARY['IC']['v'] + cdef unsigned short [::1] ICo = self.DICTIONARY['IC']['o'] + + self.ICf = &ICf[0] + self.ICl = &ICl[0] + self.ICv = &ICv[0] + self.ICo = &ICo[0] - fcall - ) + self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) @property def T( self ) : @@ -233,6 +243,3 @@ cdef class CudaLinearOperator : self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - def gpu_compatibility( self ): - """Check if the available GPU is compatible""" - return 0 diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 955e0171..3f61da55 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -1,5 +1,25 @@ #include "operator_withCUDA.cuh" +bool cudaCheck(cudaError_t cudaStatus){ + return cudaStatus == cudaSuccess; +} + +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ + + // fill arrays with zeros + memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + + // count compartments per block + for(int i = 0; i < 
NUM_COMPARTMENTS; i++) + compartmentsPerBlock[data[i]]++; + + // calculate offset per block + offsetPerBlock[0] = 0; + for(int i = 1; i < NUM_BLOCKS; i++) + offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; +} + bool checkCompatibility(size_t required_mem, int gpu_id) { int num_gpus; cudaError_t cudaStatus; @@ -65,9 +85,7 @@ CudaLinearOperator::CudaLinearOperator( int nsamples, int ndiameters, int nzeppelins, - int nballs, - - int fcall) + int nballs) { this->nsegments = nsegments; this->nvoxels = nvoxels; @@ -75,148 +93,106 @@ CudaLinearOperator::CudaLinearOperator( this->nrows = nvoxels * nsamples; this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; - if (fcall) { - //printf("\t* configuring dictionary in CUDA GPU ... \n"); - - int size_lutic = ndiameters*norientations*nsamples; - int size_lutec = nzeppelins*norientations*nsamples; - int size_lutiso = nballs*nsamples; - - size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); - checkCompatibility(required_mem, 0); - - cudaStatus = true; - - printf("\t* constant values ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc memory in GPU for vectors x and y - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); - - // setup LUTs - printf("\t* loading LUT ... 
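For reference, preprocessDataForGPU above only does two passes: it counts how many compartments fall into each of the NUM_BLOCKS voxels, then turns the counts into start offsets with an exclusive prefix sum; the offsets delimit contiguous runs because the dictionary has already been sorted by voxel beforehand. A small sketch of the same bookkeeping, with hypothetical toy values:

    import numpy as np

    # hypothetical, already-sorted voxel IDs of the IC segments
    voxel_ids = np.array([0, 0, 1, 3, 3, 3], dtype=np.uint32)
    nvoxels   = 5

    # count compartments per block, then exclusive prefix sum of the counts
    segments_per_block = np.bincount(voxel_ids, minlength=nvoxels)                  # [2 1 0 3 0]
    offset_per_block   = np.concatenate(([0], np.cumsum(segments_per_block)[:-1]))  # [0 2 3 3 6]

    # block b can then locate its run of segments as
    # offset_per_block[b] : offset_per_block[b] + segments_per_block[b]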
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* pre-processing ... "); - cudaStatus = true; - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - //free(segmentsPerBlock); - //free(offsetPerBlock); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t*A operator ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - //if (gpu_voxelEC == NULL || gpu_orienEC == NULL) { - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - //} - - //if (gpu_segmentsPerBlockEC == NULL || gpu_offsetPerBlockEC == NULL) { - //uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - //uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); 
- preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(segmentsPerBlock); - free(offsetPerBlock); - - printf("\t* A' operator ... "); - cudaStatus = true; - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - //} - } -} + int size_lutic = ndiameters*norientations*nsamples; + int size_lutec = nzeppelins*norientations*nsamples; + int size_lutiso = nballs*nsamples; + + size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); + checkCompatibility(required_mem, 0); + + // transfer constant values to the GPU + printf("\t* constant values ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc memory in GPU for vectors x and y + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); + + // pre-process data for GPU + printf("\t* pre-processing ... 
"); + cudaStatus = true; + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(segmentsPerBlock); + free(offsetPerBlock); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc and transfer LUTs + printf("\t* loading LUT ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc and transfer operator A + printf("\t* A operator... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); -CudaLinearOperator::~CudaLinearOperator(){ - /*cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(fiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(orienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(voxelEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(orienEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(segmentsPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(offsetPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(lutISO) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TvoxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TfiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TorienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TlengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(TfibersPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(ToffsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(x) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(y) ); - - cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() );//*/ } +CudaLinearOperator::~CudaLinearOperator() {} + void CudaLinearOperator::destroy(){ bool cudaStatus; printf("-> Deleting GPU memory:\n"); - printf("\t* deleting A ... "); + printf("\t* deleting A... "); cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); @@ -231,7 +207,7 @@ void CudaLinearOperator::destroy(){ if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); - printf("\t* deleting A' ... "); + printf("\t* deleting A'... 
"); cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); @@ -242,61 +218,60 @@ void CudaLinearOperator::destroy(){ if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); - printf("\t* deleting LUT ... "); + printf("\t* deleting x&y ... "); cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); - printf("\t* deleting vectors x and y ... "); + printf("\t* deleting LUT... "); cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); - printf("\t* reseting GPU ... "); + printf("\t* reseting GPU... "); cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); } -void CudaLinearOperator::setTransposeData( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths) +void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths) { - //if (gpu_TvoxelIC == NULL || gpu_TfiberIC == NULL || gpu_TorienIC == NULL || gpu_TlengthIC == NULL) { - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - //} - - //if (gpu_TfibersPerBlockIC == NULL || gpu_ToffsetPerBlockIC == NULL) { - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, 
nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(fibersPerBlock); - free(offsetPerBlock); - //} + printf("\t* A' operator... "); + cudaStatus = true; + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(fibersPerBlock); + free(offsetPerBlock); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); } void cudaCheckKernel(){ @@ -316,7 +291,7 @@ void cudaCheckKernel(){ } void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - //cudaError_t cudaStatus; + cudaError_t cudaStatus; // Copy vector x to the GPU cudaStatus = cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); @@ -346,7 +321,7 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - //cudaError_t cudaStatus; + cudaError_t cudaStatus; // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); @@ -380,38 +355,17 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ printf("\n\n");//*/ } -bool cudaCheck(cudaError_t cudaStatus){ - return cudaStatus == cudaSuccess; -} - -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ - - // fill arrays with zeros - memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - - // count compartments per block - for(int i = 0; i < NUM_COMPARTMENTS; i++) - compartmentsPerBlock[data[i]]++; - - // calculate offset per block - offsetPerBlock[0] = 0; - for(int i = 1; i < NUM_BLOCKS; i++) - offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; -} - - -__global__ void multiply_Ax_ICpart( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - 
float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y){ - +// ------------------------------------------------------- KERNELS ------------------------------------------------------- // +__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ __shared__ float64_t shmem[1024]; uint32_t bid = blockIdx.x; diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 231a4f77..435ddf56 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -148,9 +148,7 @@ class CudaLinearOperator { int nsamples, int ndiameters, int nzeppelins, - int nballs, - - int fcall); + int nballs); ~CudaLinearOperator(); From 07287e74e415cfaacf2c8ac47f2f20604b8b9613 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 01:18:59 -0600 Subject: [PATCH 093/190] Adding fcall flag to CudaLinearOperator --- commit/cudaoperator.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 80673fdd..caf7d2fb 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -28,8 +28,6 @@ cdef extern from "operator_withCUDA.cuh": int, int, int, - int, - int) int getCudaStatus() From 4bf271a506bada27416e291113eece731b842048 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 01:22:42 -0600 Subject: [PATCH 094/190] Adding fcall flag to CudaLinearOperator --- commit/cudaoperator.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index caf7d2fb..3d35a632 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -152,10 +152,10 @@ cdef class CudaLinearOperator : self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - cdef unsigned int [::1] ICf = self.DICTIONARY['IC']['fiber'] - cdef float [::1] ICl = self.DICTIONARY['IC']['len'] - cdef unsigned int [::1] ICv = self.DICTIONARY['IC']['v'] - cdef unsigned short [::1] ICo = self.DICTIONARY['IC']['o'] + ICf = self.DICTIONARY['IC']['fiber'] + ICl = self.DICTIONARY['IC']['len'] + ICv = self.DICTIONARY['IC']['v'] + ICo = self.DICTIONARY['IC']['o'] self.ICf = &ICf[0] self.ICl = &ICl[0] From 7a23b1b1243bb254f1b4f2bac6e0a698c5ff9770 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 01:26:10 -0600 Subject: [PATCH 095/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index f4a7fdee..bf9c4d25 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -90,7 +90,6 @@ cdef class Evaluation : self.THREADS = None # set by "set_threads" method self.A = None # set by "build_operator" method self.x = None # set by "fit" method - self.gpu_A = None # store all the parameters of an evaluation with COMMIT self.CONFIG = {} From d911b00ba3dd6b6796371d336054d90289153e3a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 01:32:40 -0600 Subject: [PATCH 096/190] Adding fcall flag to CudaLinearOperator --- commit/cudaoperator.pyx | 46 ++++++++++++++++++------------------- commit/operator_withCUDA.cu | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx 
index 3d35a632..7b68eb2b 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -121,30 +121,30 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] - self.A = new C_CudaLinearOperator( - &ICv[0], - &ICf[0], - &ICo[0], - &ICl[0], - &wmrSFP[0,0,0], - - &ECv[0], - &ECo[0], - &wmhSFP[0,0,0], - - &isoSFP[0,0], - - self.n, - self.nV, - self.nF, - self.nE, - self.ndirs, - self.nS, - self.nR, - self.nT, - self.nI) - if fcall == 1: + self.A = new C_CudaLinearOperator( + &ICv[0], + &ICf[0], + &ICo[0], + &ICl[0], + &wmrSFP[0,0,0], + + &ECv[0], + &ECo[0], + &wmhSFP[0,0,0], + + &isoSFP[0,0], + + self.n, + self.nV, + self.nF, + self.nE, + self.ndirs, + self.nS, + self.nR, + self.nT, + self.nI) + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 3f61da55..628613b1 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -218,7 +218,7 @@ void CudaLinearOperator::destroy(){ if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); - printf("\t* deleting x&y ... "); + printf("\t* deleting x&y... "); cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); From 35c7fab95c9683c73e4cd1ccd8f7535a797b5b8a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 18:49:40 -0600 Subject: [PATCH 097/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 1 + commit/cudaoperator.pyx | 2 ++ 2 files changed, 3 insertions(+) diff --git a/commit/core.pyx b/commit/core.pyx index bf9c4d25..1bd619a6 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -650,6 +650,7 @@ cdef class Evaluation : else: import commit.cudaoperator self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) + self.A.set_transpose_data() print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 7b68eb2b..8d85d264 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -145,6 +145,7 @@ cdef class CudaLinearOperator : self.nT, self.nI) + """ idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] @@ -163,6 +164,7 @@ cdef class CudaLinearOperator : self.ICo = &ICo[0] self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + """ @property def T( self ) : From f7ac4969ba855e3a1d24800dc63930d62467317f Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 19:02:56 -0600 Subject: [PATCH 098/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 2 +- commit/cudaoperator.pyx | 2 -- commit/operator_withCUDA.cu | 10 +++++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 1bd619a6..da6fa714 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -650,7 +650,7 @@ cdef class Evaluation : else: import commit.cudaoperator self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - self.A.set_transpose_data() + #self.A.set_transpose_data() print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 8d85d264..7b68eb2b 100644 --- 
a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -145,7 +145,6 @@ cdef class CudaLinearOperator : self.nT, self.nI) - """ idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] @@ -164,7 +163,6 @@ cdef class CudaLinearOperator : self.ICo = &ICo[0] self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - """ @property def T( self ) : diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 628613b1..98d48845 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -326,27 +326,27 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckKernel(); + cudaCheckKernel(); // Copy back result to CPU cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... [ OK ]\n");//*/ /*printf("\n\n VECTOR X EC PART:\n"); From d0a855e8e501a959155d8f871b4efcaf89a0a59d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 19:15:17 -0600 Subject: [PATCH 099/190] Adding fcall flag to CudaLinearOperator --- commit/cudaoperator.pyx | 1 + commit/operator_withCUDA.cu | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 7b68eb2b..b337cf36 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -207,6 +207,7 @@ cdef class CudaLinearOperator : self.A.dot(&v_in[0], &v_out[0]) else : # INVERSE PRODUCT A'*y + print('transpuesta') self.A.Tdot(&v_in[0], &v_out[0]) return v_out diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 98d48845..91b369cf 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -295,27 +295,27 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ // Copy vector x to the GPU cudaStatus = cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... 
[ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckKernel(); + cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckKernel(); + cudaCheckKernel(); // Copy back result to CPU cudaStatus = cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/ } From 7cfbad6e70bfeb58b467e3461689babd4a8ae69e Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 19:46:56 -0600 Subject: [PATCH 100/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 91b369cf..7f11bfe6 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -250,6 +250,8 @@ void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + if(fibersPerBlock == NULL || offsetPerBlock == NULL) printf("problemas\n"); + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); From 99c332d0df29531a3d25fceb8798f3b030c97711 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 20:15:38 -0600 Subject: [PATCH 101/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 7f11bfe6..161dd5c2 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -332,7 +332,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ else printf("\t* tranfering y to GPU ... 
[ OK ]\n");//*/ // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + /*multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); cudaCheckKernel(); @@ -344,7 +344,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckKernel(); + cudaCheckKernel();//*/ // Copy back result to CPU cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); From 90df1b24997b6b68cd4b2358120d12c0e6e2142d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 20:19:41 -0600 Subject: [PATCH 102/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 161dd5c2..980f9fe4 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -327,7 +327,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + /*cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ @@ -347,7 +347,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ cudaCheckKernel();//*/ // Copy back result to CPU - cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + /*cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... [ OK ]\n");//*/ From 4f6ef08338b60fea443b47410d668fa7d3d954d9 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 20:43:49 -0600 Subject: [PATCH 103/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 980f9fe4..00588360 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -327,7 +327,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - /*cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... 
[ OK ]\n");//*/ From da1df874b96b86a3a378f539799d4ed16b8fd6f0 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 21:25:24 -0600 Subject: [PATCH 104/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 00588360..6248bf2e 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -120,8 +120,12 @@ CudaLinearOperator::CudaLinearOperator( else printf("[ CUDA ERROR ]\n"); // alloc memory in GPU for vectors x and y + printf("\t* vectors x&y ... "); + cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); // pre-process data for GPU printf("\t* pre-processing ... "); @@ -327,7 +331,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + /*cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ @@ -347,7 +351,7 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ cudaCheckKernel();//*/ // Copy back result to CPU - /*cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... [ OK ]\n");//*/ From b51f9b50bcd26cc7393acb3f5d8077968a37f5d0 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 21:35:29 -0600 Subject: [PATCH 105/190] Adding fcall flag to CudaLinearOperator --- commit/operator_withCUDA.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 6248bf2e..86f3757c 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -351,6 +351,8 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ cudaCheckKernel();//*/ // Copy back result to CPU + if (gpu_y == NULL) printf("------------------------problemas gpy_y--------------------------------\n"); + if (v_out == NULL) printf("------------------------problemas v_out--------------------------------\n"); cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... 
[ OK ]\n");//*/ From e442dc5a55e45c9db8411b8518c987b54fd0fc49 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 22:04:19 -0600 Subject: [PATCH 106/190] Adding fcall flag to CudaLinearOperator --- commit/core.pyx | 1 - commit/cudaoperator.pyx | 48 +++++++++++++++++++----------------- commit/operator_withCUDA.cu | 18 ++++++++------ commit/operator_withCUDA.cuh | 4 ++- 4 files changed, 38 insertions(+), 33 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index da6fa714..bf9c4d25 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -650,7 +650,6 @@ cdef class Evaluation : else: import commit.cudaoperator self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - #self.A.set_transpose_data() print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index b337cf36..03aced31 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -121,30 +121,32 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] - if fcall == 1: - self.A = new C_CudaLinearOperator( - &ICv[0], - &ICf[0], - &ICo[0], - &ICl[0], - &wmrSFP[0,0,0], - - &ECv[0], - &ECo[0], - &wmhSFP[0,0,0], - - &isoSFP[0,0], - - self.n, - self.nV, - self.nF, - self.nE, - self.ndirs, - self.nS, - self.nR, - self.nT, - self.nI) + self.A = new C_CudaLinearOperator( + &ICv[0], + &ICf[0], + &ICo[0], + &ICl[0], + &wmrSFP[0,0,0], + + &ECv[0], + &ECo[0], + &wmhSFP[0,0,0], + + &isoSFP[0,0], + + self.n, + self.nV, + self.nF, + self.nE, + self.ndirs, + self.nS, + self.nR, + self.nT, + self.nI, + + fcall) + if fcall == 1: idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 86f3757c..fc046e57 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -85,7 +85,9 @@ CudaLinearOperator::CudaLinearOperator( int nsamples, int ndiameters, int nzeppelins, - int nballs) + int nballs, + + int fcall) { this->nsegments = nsegments; this->nvoxels = nvoxels; @@ -93,6 +95,7 @@ CudaLinearOperator::CudaLinearOperator( this->nrows = nvoxels * nsamples; this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + if (fcall == 1) { int size_lutic = ndiameters*norientations*nsamples; int size_lutec = nzeppelins*norientations*nsamples; int size_lutiso = nballs*nsamples; @@ -186,6 +189,7 @@ CudaLinearOperator::CudaLinearOperator( cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); + } } @@ -331,28 +335,26 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - /*cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... 
[ OK ]\n");//*/ // Multiply IC part in the GPU - /*multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckKernel();//*/ + //cudaCheckKernel(); // Copy back result to CPU - if (gpu_y == NULL) printf("------------------------problemas gpy_y--------------------------------\n"); - if (v_out == NULL) printf("------------------------problemas v_out--------------------------------\n"); cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... [ OK ]\n");//*/ diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 435ddf56..231a4f77 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -148,7 +148,9 @@ class CudaLinearOperator { int nsamples, int ndiameters, int nzeppelins, - int nballs); + int nballs, + + int fcall); ~CudaLinearOperator(); From 5ebdad2addb9a1048361ca99691b50c34a65b047 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 22:05:58 -0600 Subject: [PATCH 107/190] Adding fcall flag to CudaLinearOperator --- commit/cudaoperator.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 03aced31..386adb01 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -28,6 +28,8 @@ cdef extern from "operator_withCUDA.cuh": int, int, int, + int, + int) int getCudaStatus() From e0250e83667a2ea38bc5a9f5b3ca7019196f5f19 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 1 Apr 2020 22:41:25 -0600 Subject: [PATCH 108/190] Adding fcall flag to CudaLinearOperator --- commit/cudaoperator.pyx | 27 --------------------------- commit/operator_withCUDA.cu | 28 ++++++++++++++-------------- 2 files changed, 14 insertions(+), 41 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 386adb01..3cc6591a 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -211,38 +211,11 @@ cdef class CudaLinearOperator : self.A.dot(&v_in[0], &v_out[0]) else : # INVERSE PRODUCT A'*y - print('transpuesta') self.A.Tdot(&v_in[0], &v_out[0]) return v_out - @property - def cuda_status( self ): - """Return status of the CUDA GPU""" - return self.A.getCudaStatus() - def destroy( self ): """Free all memory of the CUDA GPU""" self.A.destroy() - def set_transpose_data( self ): - """Send A' data to the CUDA GPU""" - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - cdef unsigned int [::1] ICf = self.DICTIONARY['IC']['fiber'] - cdef float [::1] ICl = 
self.DICTIONARY['IC']['len'] - cdef unsigned int [::1] ICv = self.DICTIONARY['IC']['v'] - cdef unsigned short [::1] ICo = self.DICTIONARY['IC']['o'] - - self.ICf = &ICf[0] - self.ICl = &ICl[0] - self.ICv = &ICv[0] - self.ICo = &ICo[0] - - self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index fc046e57..01f3e6de 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -198,7 +198,7 @@ CudaLinearOperator::~CudaLinearOperator() {} void CudaLinearOperator::destroy(){ bool cudaStatus; - printf("-> Deleting GPU memory:\n"); + printf("\n-> Deleting GPU memory:\n"); printf("\t* deleting A... "); cudaStatus = true; @@ -301,42 +301,42 @@ void cudaCheckKernel(){ } void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - cudaError_t cudaStatus; + //cudaError_t cudaStatus; // Copy vector x to the GPU - cudaStatus = cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckKernel(); + //cudaCheckKernel(); // Copy back result to CPU - cudaStatus = cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/ } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - cudaError_t cudaStatus; + //cudaError_t cudaStatus; // Copy vector y to the GPU //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaStatus = cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU @@ -355,8 +355,8 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ //cudaCheckKernel(); // Copy back result to CPU - cudaStatus = cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... 
[ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); else printf("\t* tranfering x to CPU ... [ OK ]\n");//*/ /*printf("\n\n VECTOR X EC PART:\n"); From 9a7dd02de47ccf3da02840a9c0211d341e56b7e7 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 3 Apr 2020 13:29:45 -0600 Subject: [PATCH 109/190] Minor clenup --- .gitignore | 4 +- CHANGELOG.md | 32 +- commit/core.pyx | 1758 +++---- commit/cudaoperator.pyx | 442 +- commit/operator/operator.pyx | 382 +- commit/operator/operator.pyxbld | 66 +- commit/operator/operator_noLUT.c | 374 +- commit/operator/operator_withLUT.c | 4494 ++++++++--------- commit/operator_withCUDA.cu | 1263 +++-- commit/solvers.py | 806 +-- commit/trk2dictionary/trk2dictionary.pyx | 912 ++-- commit/trk2dictionary/trk2dictionary_c.cpp | 1216 ++--- doc/tutorials/AdvancedSolvers/README.md | 314 +- .../AdvancedSolvers/tutorial_solvers.ipynb | 528 +- 14 files changed, 6284 insertions(+), 6307 deletions(-) diff --git a/.gitignore b/.gitignore index b170dda4..7411a854 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ -build -.ipynb_checkpoints +build +.ipynb_checkpoints diff --git a/CHANGELOG.md b/CHANGELOG.md index 00441eca..026290cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,16 @@ - -# Change Log -All notable changes to COMMIT will be documented in this file. - -## [1.3] - 2019-10-30 - -This version of COMMIT *is not compatible* with [AMICO](https://github.com/daducci/AMICO) v1.0.1 of below. If you update COMMIT to this version, please update AMICO to version 1.1.0 or above. - -### Added -- Changelog file to keep tracking of the COMMIT versions. - -### Changed -- Added compatibility with low resolution LUTs. - -### Fixed -- Nothing. + +# Change Log +All notable changes to COMMIT will be documented in this file. + +## [1.3] - 2019-10-30 + +This version of COMMIT *is not compatible* with [AMICO](https://github.com/daducci/AMICO) v1.0.1 of below. If you update COMMIT to this version, please update AMICO to version 1.1.0 or above. + +### Added +- Changelog file to keep tracking of the COMMIT versions. + +### Changed +- Added compatibility with low resolution LUTs. + +### Fixed +- Nothing. diff --git a/commit/core.pyx b/commit/core.pyx index bf9c4d25..4f7462e0 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -1,879 +1,879 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False -from __future__ import print_function -cimport cython -import numpy as np -cimport numpy as np - -import time -import glob -import sys -from os import makedirs, remove -from os.path import exists, join as pjoin, isfile -import nibabel -import pickle -import commit.models -import commit.solvers -import amico.scheme -import amico.lut -import pyximport -pyximport.install( reload_support=True, language_level=3 ) - - -def setup( lmax = 12, ndirs = 32761 ) : - """General setup/initialization of the COMMIT framework. 
- - Parameters - ---------- - lmax : int - Maximum SH order to use for the rotation phase (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - - if not amico.lut.is_valid(ndirs): - raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - amico.lut.precompute_rotation_matrices( lmax, ndirs ) - -def load_dictionary_info(filename): - """Function to load dictionary info file - - Parameters - ---------- - filename : string - This value is always COMMIT_PATH + dictionary_info.pickle - """ - if not isfile( filename ): - raise RuntimeError( 'Dictionary is outdated or not found. Execute ''trk2dictionary'' script first.' ) - with open( filename, 'rb' ) as dictionary_info_file: - if sys.version_info.major == 3: - aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) - # Pickle files written by Python 2 are loaded with byte - # keys, whereas those written by Python 3 are loaded with - # str keys, even when both are written using protocol=2 - result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} - return result_aux - else: - return pickle.load( dictionary_info_file ) - -cdef class Evaluation : - """Class to hold all the information (data and parameters) when performing an - evaluation with the COMMIT framework. - """ - cdef public niiDWI - cdef public niiDWI_img - cdef public scheme - cdef public model - cdef public KERNELS - cdef public DICTIONARY - cdef public THREADS - cdef public A - cdef public x - cdef public CONFIG - - def __init__( self, study_path, subject ) : - """Setup the data structures with default values. - - Parameters - ---------- - study_path : string - The path to the folder containing all the subjects from one study - subject : string - The path (relative to previous folder) to the subject folder - """ - self.niiDWI = None # set by "load_data" method - self.scheme = None # set by "load_data" method - self.model = None # set by "set_model" method - self.KERNELS = None # set by "load_kernels" method - self.DICTIONARY = None # set by "load_dictionary" method - self.THREADS = None # set by "set_threads" method - self.A = None # set by "build_operator" method - self.x = None # set by "fit" method - - # store all the parameters of an evaluation with COMMIT - self.CONFIG = {} - self.set_config('study_path', study_path) - self.set_config('subject', subject) - self.set_config('DATA_path', pjoin( study_path, subject )) - - self.set_config('doNormalizeSignal', True) - self.set_config('doMergeB0', False) - self.set_config('doNormalizeKernels', True) - self.set_config('doDemean', False) - self.set_config('doNormalizeMaps', False) - - - - def set_config( self, key, value ) : - self.CONFIG[ key ] = value - - def get_config( self, key ) : - return self.CONFIG.get( key ) - - - def load_data( self, dwi_filename = 'DWI.nii', scheme_filename = 'DWI.scheme', b0_thr = 0 ) : - """Load the diffusion signal and its corresponding acquisition scheme. 
- - Parameters - ---------- - dwi_filename : string - The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') - scheme_filename : string - The file name of the corresponding acquisition scheme (default : 'DWI.scheme') - b0_thr : float - The threshold below which a b-value is considered a b0 (default : 0) - """ - - # Loading data and acquisition scheme - tic = time.time() - print( '\n-> Loading data:' ) - - print( '\t* DWI signal...' ) - self.set_config('dwi_filename', dwi_filename) - self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) - self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) - if self.niiDWI_img.ndim ==3 : - self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) - hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() - self.set_config('dim', self.niiDWI_img.shape[0:3]) - self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) - print( '\t\t- dim = %d x %d x %d x %d' % self.niiDWI_img.shape ) - print( '\t\t- pixdim = %.3f x %.3f x %.3f' % self.get_config('pixdim') ) - - print( '\t* Acquisition scheme...' ) - self.set_config('scheme_filename', scheme_filename) - self.set_config('b0_thr', b0_thr) - self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) - print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) - print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end="" ) - for i in xrange(len(self.scheme.shells)) : - print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end="" ) - print() - - if self.scheme.nS != self.niiDWI_img.shape[3] : - raise ValueError( 'Scheme does not match with DWI data' ) - - if self.scheme.dwi_count == 0 : - raise ValueError( 'There are no DWI volumes in the data' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - # Preprocessing - tic = time.time() - print( '\n-> Preprocessing:' ) - - if self.get_config('doNormalizeSignal') : - if self.scheme.b0_count > 0 : - print( '\t* Normalizing to b0...', end="" ) - sys.stdout.flush() - mean = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) - idx = mean <= 0 - mean[ idx ] = 1 - mean = 1 / mean - mean[ idx ] = 0 - for i in xrange(self.scheme.nS) : - self.niiDWI_img[:,:,:,i] *= mean - else : - print( '\t* There are no b0 volume(s) for normalization...', end="" ) - print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) - - if self.scheme.b0_count > 1 : - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)...', end="" ) - mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) - self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) - else : - print( '\t* Keeping all b0 volume(s)...', end="" ) - print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) - - if self.get_config('doDemean') : - print( '\t* Demeaning signal...', end="" ) - sys.stdout.flush() - mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) - self.niiDWI_img = self.niiDWI_img - mean - print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_model( self, model_name ) : - """Set the model to use to describe the signal contributions in each voxel. 
- - Parameters - ---------- - model_name : string - The name of the model (must match a class name in "commit.models" module) - """ - # Call the specific model constructor - if hasattr(commit.models, model_name ) : - self.model = getattr(commit.models,model_name)() - else : - raise ValueError( 'Model "%s" not recognized' % model_name ) - - self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) - - - def generate_kernels( self, regenerate = False, lmax = 12, ndirs = 32761 ) : - """Generate the high-resolution response functions for each compartment. - Dispatch to the proper function, depending on the model. - - Parameters - ---------- - regenerate : boolean - Regenerate kernels if they already exist (default : False) - lmax : int - Maximum SH order to use for the rotation procedure (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - if not amico.lut.is_valid(ndirs): - raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - if self.scheme is None : - raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' ) - if self.model is None : - raise RuntimeError( 'Model not set; call "set_model()" method first.' ) - - # store some values for later use - self.set_config('lmax', lmax) - self.set_config('ndirs', ndirs) - self.model.scheme = self.scheme - - print( '\n-> Simulating with "%s" model:' % self.model.name ) - - # check if kernels were already generated - tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) - if len(tmp)>0 and not regenerate : - print( ' [ Kernels already computed. Call "generate_kernels( regenerate=True )" to force regeneration. ]' ) - return - - # create folder or delete existing files (if any) - if not exists( self.get_config('ATOMS_path') ) : - makedirs( self.get_config('ATOMS_path') ) - else : - for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : - remove( f ) - - # auxiliary data structures - aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) - idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) - - # Dispatch to the right handler for each model - tic = time.time() - self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def load_kernels( self ) : - """Load rotated kernels and project to the specific gradient scheme of this subject. - Dispatch to the proper function, depending on the model. - """ - if self.model is None : - raise RuntimeError( 'Model not set; call "set_model()" method first.' ) - if self.scheme is None : - raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' 
) - - tic = time.time() - print( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) - - # auxiliary data structures - idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) - - # Dispatch to the right handler for each model - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)...', end="" ) - else : - print( '\t* Keeping all b0 volume(s)...', end="" ) - self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) - nIC = self.KERNELS['wmr'].shape[0] - nEC = self.KERNELS['wmh'].shape[0] - nISO = self.KERNELS['iso'].shape[0] - print( '[ OK ]' ) - - - # ensure contiguous arrays for C part - self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) - self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) - self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) - - # De-mean kernels - if self.get_config('doDemean') : - print( '\t* Demeaning signal...', end="" ) - for j in xrange(self.get_config('ndirs')) : - for i in xrange(nIC) : - self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() - for i in xrange(nEC) : - self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() - for i in xrange(nISO) : - self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() - print( '[ OK ]' ) - - # Normalize atoms - if self.get_config('doNormalizeKernels') : - print( '\t* Normalizing...', end="" ) - - self.KERNELS['wmr_norm'] = np.zeros( nIC ) - for i in xrange(nIC) : - self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] - - self.KERNELS['wmh_norm'] = np.zeros( nEC ) - for i in xrange(nEC) : - self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] - - self.KERNELS['iso_norm'] = np.zeros( nISO ) - for i in xrange(nISO) : - self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) - self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] - - print( '[ OK ]' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - cpdef load_dictionary( self, path, use_mask = False ) : - """Load the sparse structure previously created with "trk2dictionary" script. - - Parameters - ---------- - path : string - Folder containing the output of the trk2dictionary script (relative to subject path) - use_mask : boolean - If False (default) the optimization will be conducted only on the voxels actually - traversed by tracts. If True, the mask specified in trk2dictionary - (i.e. "filename_mask" paramater) will be used instead. - NB: if no mask was specified in trk2dictionary, the "tdi" and - "mask" masks are equivalent and this parameter is not influent. - """ - if self.niiDWI is None : - raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) - - tic = time.time() - print( '\n-> Loading the dictionary:' ) - self.DICTIONARY = {} - self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) - - # load mask - self.set_config('dictionary_mask', 'mask' if use_mask else 'tdi' ) - mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) - if not exists( mask_filename ) : - mask_filename += '.gz' - if not exists( mask_filename ) : - raise RuntimeError( 'Dictionary not found. 
Execute ''trk2dictionary'' script first.' ); - niiMASK = nibabel.load( mask_filename ) - self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) - - # segments from the tracts - # ------------------------ - print( '\t* segments from the tracts...', end="" ) - sys.stdout.flush() - - dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) - - self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] - - if self.DICTIONARY['ndirs'] != self.get_config('ndirs'): - raise RuntimeError( 'Dictionary is outdated. Execute ''trk2dictionary'' script first.' ) - - self.DICTIONARY['TRK'] = {} - self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) - self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) - - self.DICTIONARY['IC'] = {} - self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) - self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size - self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size - - # reorder the segments based on the "v" field - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - - # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length - # NB: it works in conjunction with the normalization of the kernels - cdef : - np.float32_t [:] sl = self.DICTIONARY['IC']['len'] - np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] - np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] - int s - if self.get_config('doNormalizeKernels') : - for s in xrange(self.DICTIONARY['IC']['n']) : - sl[s] /= tl[ f[s] ] - - print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) - - # segments from the peaks - # ----------------------- - print( '\t* segments from the peaks...', end="" ) - sys.stdout.flush() - - self.DICTIONARY['EC'] = {} - self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size - - # reorder the segments based on the "v" field - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx - - print( ' [ %d segments ]' % self.DICTIONARY['EC']['nE'] ) - - # isotropic compartments - # ---------------------- - print( '\t* isotropic contributions...', end="" 
) - sys.stdout.flush() - - self.DICTIONARY['ISO'] = {} - - self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() - - vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) - vx = vx.astype(np.int32) - vy = vy.astype(np.int32) - vz = vz.astype(np.int32) - self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) - del vx, vy, vz - - # reorder the segments based on the "v" field - idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) - self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] - del idx - - print( ' [ %d voxels ]' % self.DICTIONARY['nV'] ) - - # post-processing - # --------------- - print( '\t* post-processing...', end="" ) - sys.stdout.flush() - - # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) - idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] - self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) - - lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() - for i in xrange(idx.size) : - lut[ idx[i] ] = i - self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] - self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] - self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] - - print( ' [ OK ]' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_threads( self, nthreads = None ) : - """Set the number of threads to use for the matrix-vector operations with A and A'. - - Parameters - ---------- - nthreads : integer - Number of threads to use (nthreads = None ---> all the CPU threads available in the system - nthreads = 0 ---> enable CUDA GPU acceleration) - """ - if nthreads is None : - # Set to the number of CPUs in the system - try : - import multiprocessing - nthreads = multiprocessing.cpu_count() - except : - nthreads = 1 - - if nthreads < 0 or nthreads > 255 : - raise RuntimeError( 'Number of threads must be between 0 and 255' ) - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.KERNELS is None : - raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) - - self.THREADS = {} - self.THREADS['n'] = nthreads - self.THREADS['IC'] = None - self.THREADS['EC'] = None - self.THREADS['ISO'] = None - self.THREADS['ICt'] = None - self.THREADS['ECt'] = None - self.THREADS['ISOt'] = None - - cdef : - long [:] C - long t, tot, i1, i2, N, c - int i - - if nthreads > 0: - print( '\n-> Distributing workload to different threads:' ) - print( '\t* number of threads : %d' % nthreads ) - - tic = time.time() - - # Distribute load for the computation of A*x product - print( '\t* A operator... 
', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - if nthreads > 1 : - N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) - t = 1 - tot = 0 - C = np.bincount( self.DICTIONARY['IC']['v'] ) - for c in C : - tot += c - if tot >= N : - self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot - t += 1 - tot = 0 - self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - # Distribute load for the computation of At*y product - print( '\t* A\' operator... ', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) - if nthreads > 1 : - idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) - C = np.bincount( self.DICTIONARY['IC']['fiber'] ) - t = tot = i1 = i2 = 0 - N = np.floor(self.DICTIONARY['IC']['n']/nthreads) - for c in C : - i2 += c - tot += c - if tot >= N : - self.THREADS['ICt'][ i1:i2 ] = t - t += 1 - if t==nthreads-1 : - break - i1 = i2 - tot = c - self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N - self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' 
) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['nV']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N - self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def build_operator( self ) : - """Compile/build the operator for computing the matrix-vector multiplications by A and A' - using the informations from self.DICTIONARY, self.KERNELS and self.THREADS. - NB: needs to call this function to update pointers to data structures in case - the data is changed in self.DICTIONARY, self.KERNELS or self.THREADS. - """ - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.KERNELS is None : - raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) - if self.THREADS is None : - raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) - - tic = time.time() - print( '\n-> Building linear operator A:' ) - - # need to pass these parameters at runtime for compiling the C code - from commit.operator import config - config.nTHREADS = self.THREADS['n'] - config.model = self.model.id - config.nIC = self.KERNELS['wmr'].shape[0] - config.nEC = self.KERNELS['wmh'].shape[0] - config.nISO = self.KERNELS['iso'].shape[0] - if not 'commit.operator.operator' in sys.modules : - import commit.operator.operator - else : - reload( sys.modules['commit.operator.operator'] ) - - if self.THREADS['n'] > 0: - self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - else: - import commit.cudaoperator - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - def get_y( self ): - """ - Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. - NB: this can be run only after having loaded the dictionary and the data. - """ - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.niiDWI is None : - raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) - return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) - - def fit( self, tol_fun = 1e-3, tol_x = 1e-6, max_iter = 100, verbose = 1, x0 = None, regularisation = None ) : - """Fit the model to the data. - - Parameters - ---------- - tol_fun : float - Tolerance on the objective function (default : 1e-3) - max_iter : integer - Maximum number of iterations (default : 100) - verbose : integer - Level of verbosity: 0=no print, 1=print progress (default : 1) - x0 : np.array - Initial guess for the solution of the problem (default : None) - regularisation : commit.solvers.init_regularisation object - Python dictionary that describes the wanted regularisation term. 
- Check the documentation of commit.solvers.init_regularisation to see - how to properly define the wanted mathematical formulation - ( default : None ) - """ - if self.niiDWI is None : - raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.KERNELS is None : - raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) - if self.THREADS is None : - raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) - if self.A is None : - raise RuntimeError( 'Operator not built; call "build_operator()" first.' ) - - if x0 is not None : - if x0.shape[0] != self.A.shape[1] : - raise RuntimeError( 'x0: dimension does not match the number of columns of the dictionary.' ) - if regularisation is None : - regularisation = commit.solvers.init_regularisation(self) - - self.CONFIG['optimization'] = {} - self.CONFIG['optimization']['tol_fun'] = tol_fun - self.CONFIG['optimization']['tol_x'] = tol_x - self.CONFIG['optimization']['max_iter'] = max_iter - self.CONFIG['optimization']['verbose'] = verbose - self.CONFIG['optimization']['regularisation'] = regularisation - - # run solver - t = time.time() - print( '\n-> Fit model' ) - - self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) - - self.CONFIG['optimization']['fit_details'] = opt_details - self.CONFIG['optimization']['fit_time'] = time.time()-t - - print( ' [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) - - - def save_results( self, path_suffix = None, save_opt_details = True, save_coeff = False ) : - """Save the output (coefficients, errors, maps etc). - - Parameters - ---------- - path_suffix : string - Text to be appended to "Results" to create the output path (default : None) - save_opt_details : boolean - Save everything in a pickle file containing the following list L: - L[0]: dictionary with all the configuration details - L[1]: np.array obtained through the optimisation process with the normalised kernels - L[2]: np.array renormalisation of L[1] - (default : True) - save_coeff : boolean - Save three txt files containing the coefficients related to each - compartment and a pickle file containing the dictionary with all - the configuration details. - (default : False) - """ - if self.x is None : - raise RuntimeError( 'Model not fitted to the data; call "fit()" first.' 
) - - RESULTS_path = 'Results_' + self.model.id - if path_suffix : - self.set_config('path_suffix', path_suffix) - RESULTS_path = RESULTS_path + path_suffix - - print( '\n-> Saving results to "%s/*":' % RESULTS_path ) - tic = time.time() - - # create folder or delete existing files (if any) - RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) - if not exists( RESULTS_path ) : - makedirs( RESULTS_path ) - else : - for f in glob.glob( pjoin(RESULTS_path,'*') ) : - remove( f ) - self.set_config('RESULTS_path', RESULTS_path) - - # Configuration and results - print( '\t* configuration and results:' ) - - nF = self.DICTIONARY['IC']['nF'] - nE = self.DICTIONARY['EC']['nE'] - nV = self.DICTIONARY['nV'] - norm_fib = np.ones( nF ) - # x is the x of the original problem - # self.x is the x preconditioned - if self.get_config('doNormalizeKernels') : - # renormalize the coefficients - norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) - norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) - norm3 = np.repeat(self.KERNELS['iso_norm'],nV) - norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) - x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) - else : - x = self.x - if save_opt_details: - print( '\t\t- pickle... ', end="" ) - sys.stdout.flush() - with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : - pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) - print( '[ OK ]' ) - if save_coeff: - print( '\t\t- txt... ', end="" ) - sys.stdout.flush() - np.savetxt(pjoin(RESULTS_path,'xic.txt'), x[0:nF]) - np.savetxt(pjoin(RESULTS_path,'xec.txt'), x[nF:nF+nE]) - np.savetxt(pjoin(RESULTS_path,'xiso.txt'), x[(nF+nE):]) - with open( pjoin(RESULTS_path,'config.pickle'), 'wb+' ) as fid : - pickle.dump( self.CONFIG, fid, protocol=2 ) - print( '[ OK ]' ) - - - # Map of wovelwise errors - print( '\t* fitting errors:' ) - - not_NaN = np.ones( self.get_config('dim'), dtype=np.float32 ) * 1e-16 # avoid division by 0 - - niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() - niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) - niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() - - y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) - y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) - - print( '\t\t- RMSE...', end="" ) - sys.stdout.flush() - tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = tmp.max() - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) - print( ' [ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - print( '\t\t- NRMSE...', end="" ) - sys.stdout.flush() - tmp = np.sum(y_mea**2,axis=1) - idx = np.where( tmp < 1E-12 ) - tmp[ idx ] = 1 - tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) - tmp[ idx ] = 0 - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = 1 - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) - print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - # Map of compartment contributions - print( '\t* voxelwise contributions:' ) - - print( '\t\t- intra-axonal', end="" ) 
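For reference, the per-voxel error maps computed just above reduce to a few NumPy reductions. The following is a minimal, self-contained sketch of the RMSE and NRMSE formulas as used here; the y_mea/y_est arrays are synthetic stand-ins, not the actual COMMIT signals.

import numpy as np

# Synthetic stand-ins: nV voxels x nS samples (illustrative values only)
nV, nS = 1000, 60
rng = np.random.default_rng(0)
y_mea = rng.random((nV, nS)).astype(np.float32)
y_est = y_mea + 0.05 * rng.standard_normal((nV, nS)).astype(np.float32)

# RMSE per voxel: root mean squared residual over the samples
rmse = np.sqrt(np.mean((y_mea - y_est) ** 2, axis=1))

# NRMSE per voxel: residual energy normalized by the measured-signal energy;
# voxels with (near-)zero signal energy are forced to 0 to avoid division by zero
den = np.sum(y_mea ** 2, axis=1)
zero = den < 1e-12
den[zero] = 1
nrmse = np.sqrt(np.sum((y_mea - y_est) ** 2, axis=1) / den)
nrmse[zero] = 0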
- sys.stdout.flush() - niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmr']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, - weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] - ).astype(np.float32) - niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- extra-axonal', end="" ) - sys.stdout.flush() - niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmh']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) - niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- isotropic', end="" ) - sys.stdout.flush() - niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['iso']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] - xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) - niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( ' [ OK ]' ) - - if self.get_config('doNormalizeMaps') : - niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) - niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) - niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) - else: - niiIC = nibabel.Nifti1Image( niiIC_img, affine ) - niiEC = nibabel.Nifti1Image( niiEC_img, affine ) - niiISO = nibabel.Nifti1Image( niiISO_img, affine ) - - nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) - nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) - nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) - - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False +from __future__ import print_function +cimport cython +import numpy as np +cimport numpy as np + +import time +import glob +import sys +from os import makedirs, remove +from os.path import exists, join as pjoin, isfile +import nibabel +import pickle +import commit.models +import commit.solvers +import amico.scheme +import amico.lut +import pyximport +pyximport.install( reload_support=True, language_level=3 ) + + +def setup( lmax = 12, ndirs = 32761 ) : + """General setup/initialization of the COMMIT framework. 
+ + Parameters + ---------- + lmax : int + Maximum SH order to use for the rotation phase (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + + if not amico.lut.is_valid(ndirs): + raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + amico.lut.precompute_rotation_matrices( lmax, ndirs ) + +def load_dictionary_info(filename): + """Function to load dictionary info file + + Parameters + ---------- + filename : string + This value is always COMMIT_PATH + dictionary_info.pickle + """ + if not isfile( filename ): + raise RuntimeError( 'Dictionary is outdated or not found. Execute ''trk2dictionary'' script first.' ) + with open( filename, 'rb' ) as dictionary_info_file: + if sys.version_info.major == 3: + aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) + # Pickle files written by Python 2 are loaded with byte + # keys, whereas those written by Python 3 are loaded with + # str keys, even when both are written using protocol=2 + result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} + return result_aux + else: + return pickle.load( dictionary_info_file ) + +cdef class Evaluation : + """Class to hold all the information (data and parameters) when performing an + evaluation with the COMMIT framework. + """ + cdef public niiDWI + cdef public niiDWI_img + cdef public scheme + cdef public model + cdef public KERNELS + cdef public DICTIONARY + cdef public THREADS + cdef public A + cdef public x + cdef public CONFIG + + def __init__( self, study_path, subject ) : + """Setup the data structures with default values. + + Parameters + ---------- + study_path : string + The path to the folder containing all the subjects from one study + subject : string + The path (relative to previous folder) to the subject folder + """ + self.niiDWI = None # set by "load_data" method + self.scheme = None # set by "load_data" method + self.model = None # set by "set_model" method + self.KERNELS = None # set by "load_kernels" method + self.DICTIONARY = None # set by "load_dictionary" method + self.THREADS = None # set by "set_threads" method + self.A = None # set by "build_operator" method + self.x = None # set by "fit" method + + # store all the parameters of an evaluation with COMMIT + self.CONFIG = {} + self.set_config('study_path', study_path) + self.set_config('subject', subject) + self.set_config('DATA_path', pjoin( study_path, subject )) + + self.set_config('doNormalizeSignal', True) + self.set_config('doMergeB0', False) + self.set_config('doNormalizeKernels', True) + self.set_config('doDemean', False) + self.set_config('doNormalizeMaps', False) + + + + def set_config( self, key, value ) : + self.CONFIG[ key ] = value + + def get_config( self, key ) : + return self.CONFIG.get( key ) + + + def load_data( self, dwi_filename = 'DWI.nii', scheme_filename = 'DWI.scheme', b0_thr = 0 ) : + """Load the diffusion signal and its corresponding acquisition scheme. 
+ + Parameters + ---------- + dwi_filename : string + The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') + scheme_filename : string + The file name of the corresponding acquisition scheme (default : 'DWI.scheme') + b0_thr : float + The threshold below which a b-value is considered a b0 (default : 0) + """ + + # Loading data and acquisition scheme + tic = time.time() + print( '\n-> Loading data:' ) + + print( '\t* DWI signal...' ) + self.set_config('dwi_filename', dwi_filename) + self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) + self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) + if self.niiDWI_img.ndim ==3 : + self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) + hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() + self.set_config('dim', self.niiDWI_img.shape[0:3]) + self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) + print( '\t\t- dim = %d x %d x %d x %d' % self.niiDWI_img.shape ) + print( '\t\t- pixdim = %.3f x %.3f x %.3f' % self.get_config('pixdim') ) + + print( '\t* Acquisition scheme...' ) + self.set_config('scheme_filename', scheme_filename) + self.set_config('b0_thr', b0_thr) + self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) + print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) + print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end="" ) + for i in xrange(len(self.scheme.shells)) : + print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end="" ) + print() + + if self.scheme.nS != self.niiDWI_img.shape[3] : + raise ValueError( 'Scheme does not match with DWI data' ) + + if self.scheme.dwi_count == 0 : + raise ValueError( 'There are no DWI volumes in the data' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + # Preprocessing + tic = time.time() + print( '\n-> Preprocessing:' ) + + if self.get_config('doNormalizeSignal') : + if self.scheme.b0_count > 0 : + print( '\t* Normalizing to b0...', end="" ) + sys.stdout.flush() + mean = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) + idx = mean <= 0 + mean[ idx ] = 1 + mean = 1 / mean + mean[ idx ] = 0 + for i in xrange(self.scheme.nS) : + self.niiDWI_img[:,:,:,i] *= mean + else : + print( '\t* There are no b0 volume(s) for normalization...', end="" ) + print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) + + if self.scheme.b0_count > 1 : + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)...', end="" ) + mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) + self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) + else : + print( '\t* Keeping all b0 volume(s)...', end="" ) + print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) + + if self.get_config('doDemean') : + print( '\t* Demeaning signal...', end="" ) + sys.stdout.flush() + mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) + self.niiDWI_img = self.niiDWI_img - mean + print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_model( self, model_name ) : + """Set the model to use to describe the signal contributions in each voxel. 
+ + Parameters + ---------- + model_name : string + The name of the model (must match a class name in "commit.models" module) + """ + # Call the specific model constructor + if hasattr(commit.models, model_name ) : + self.model = getattr(commit.models,model_name)() + else : + raise ValueError( 'Model "%s" not recognized' % model_name ) + + self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) + + + def generate_kernels( self, regenerate = False, lmax = 12, ndirs = 32761 ) : + """Generate the high-resolution response functions for each compartment. + Dispatch to the proper function, depending on the model. + + Parameters + ---------- + regenerate : boolean + Regenerate kernels if they already exist (default : False) + lmax : int + Maximum SH order to use for the rotation procedure (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + if not amico.lut.is_valid(ndirs): + raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + if self.scheme is None : + raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' ) + if self.model is None : + raise RuntimeError( 'Model not set; call "set_model()" method first.' ) + + # store some values for later use + self.set_config('lmax', lmax) + self.set_config('ndirs', ndirs) + self.model.scheme = self.scheme + + print( '\n-> Simulating with "%s" model:' % self.model.name ) + + # check if kernels were already generated + tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) + if len(tmp)>0 and not regenerate : + print( ' [ Kernels already computed. Call "generate_kernels( regenerate=True )" to force regeneration. ]' ) + return + + # create folder or delete existing files (if any) + if not exists( self.get_config('ATOMS_path') ) : + makedirs( self.get_config('ATOMS_path') ) + else : + for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : + remove( f ) + + # auxiliary data structures + aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) + idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) + + # Dispatch to the right handler for each model + tic = time.time() + self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def load_kernels( self ) : + """Load rotated kernels and project to the specific gradient scheme of this subject. + Dispatch to the proper function, depending on the model. + """ + if self.model is None : + raise RuntimeError( 'Model not set; call "set_model()" method first.' ) + if self.scheme is None : + raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' 
) + + tic = time.time() + print( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) + + # auxiliary data structures + idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) + + # Dispatch to the right handler for each model + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)...', end="" ) + else : + print( '\t* Keeping all b0 volume(s)...', end="" ) + self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) + nIC = self.KERNELS['wmr'].shape[0] + nEC = self.KERNELS['wmh'].shape[0] + nISO = self.KERNELS['iso'].shape[0] + print( '[ OK ]' ) + + + # ensure contiguous arrays for C part + self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) + self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) + self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) + + # De-mean kernels + if self.get_config('doDemean') : + print( '\t* Demeaning signal...', end="" ) + for j in xrange(self.get_config('ndirs')) : + for i in xrange(nIC) : + self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() + for i in xrange(nEC) : + self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() + for i in xrange(nISO) : + self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() + print( '[ OK ]' ) + + # Normalize atoms + if self.get_config('doNormalizeKernels') : + print( '\t* Normalizing...', end="" ) + + self.KERNELS['wmr_norm'] = np.zeros( nIC ) + for i in xrange(nIC) : + self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] + + self.KERNELS['wmh_norm'] = np.zeros( nEC ) + for i in xrange(nEC) : + self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] + + self.KERNELS['iso_norm'] = np.zeros( nISO ) + for i in xrange(nISO) : + self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) + self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] + + print( '[ OK ]' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + cpdef load_dictionary( self, path, use_mask = False ) : + """Load the sparse structure previously created with "trk2dictionary" script. + + Parameters + ---------- + path : string + Folder containing the output of the trk2dictionary script (relative to subject path) + use_mask : boolean + If False (default) the optimization will be conducted only on the voxels actually + traversed by tracts. If True, the mask specified in trk2dictionary + (i.e. "filename_mask" paramater) will be used instead. + NB: if no mask was specified in trk2dictionary, the "tdi" and + "mask" masks are equivalent and this parameter is not influent. + """ + if self.niiDWI is None : + raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) + + tic = time.time() + print( '\n-> Loading the dictionary:' ) + self.DICTIONARY = {} + self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) + + # load mask + self.set_config('dictionary_mask', 'mask' if use_mask else 'tdi' ) + mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) + if not exists( mask_filename ) : + mask_filename += '.gz' + if not exists( mask_filename ) : + raise RuntimeError( 'Dictionary not found. 
Execute ''trk2dictionary'' script first.' ); + niiMASK = nibabel.load( mask_filename ) + self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) + + # segments from the tracts + # ------------------------ + print( '\t* segments from the tracts...', end="" ) + sys.stdout.flush() + + dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) + + self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] + + if self.DICTIONARY['ndirs'] != self.get_config('ndirs'): + raise RuntimeError( 'Dictionary is outdated. Execute ''trk2dictionary'' script first.' ) + + self.DICTIONARY['TRK'] = {} + self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) + self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) + + self.DICTIONARY['IC'] = {} + self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) + self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) + self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) + self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) + self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size + self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size + + # reorder the segments based on the "v" field + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx + + # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length + # NB: it works in conjunction with the normalization of the kernels + cdef : + np.float32_t [:] sl = self.DICTIONARY['IC']['len'] + np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] + np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] + int s + if self.get_config('doNormalizeKernels') : + for s in xrange(self.DICTIONARY['IC']['n']) : + sl[s] /= tl[ f[s] ] + + print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) + + # segments from the peaks + # ----------------------- + print( '\t* segments from the peaks...', end="" ) + sys.stdout.flush() + + self.DICTIONARY['EC'] = {} + self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) + self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) + self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size + + # reorder the segments based on the "v" field + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx + + print( ' [ %d segments ]' % self.DICTIONARY['EC']['nE'] ) + + # isotropic compartments + # ---------------------- + print( '\t* isotropic contributions...', end="" 
) + sys.stdout.flush() + + self.DICTIONARY['ISO'] = {} + + self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() + + vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) + vx = vx.astype(np.int32) + vy = vy.astype(np.int32) + vz = vz.astype(np.int32) + self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) + del vx, vy, vz + + # reorder the segments based on the "v" field + idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) + self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] + del idx + + print( ' [ %d voxels ]' % self.DICTIONARY['nV'] ) + + # post-processing + # --------------- + print( '\t* post-processing...', end="" ) + sys.stdout.flush() + + # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) + idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] + self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) + + lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() + for i in xrange(idx.size) : + lut[ idx[i] ] = i + self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] + self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] + self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] + + print( ' [ OK ]' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_threads( self, nthreads = None ) : + """Set the number of threads to use for the matrix-vector operations with A and A'. + + Parameters + ---------- + nthreads : integer + Number of threads to use (nthreads = None ---> all the CPU threads available in the system + nthreads = 0 ---> enable CUDA GPU acceleration) + """ + if nthreads is None : + # Set to the number of CPUs in the system + try : + import multiprocessing + nthreads = multiprocessing.cpu_count() + except : + nthreads = 1 + + if nthreads < 0 or nthreads > 255 : + raise RuntimeError( 'Number of threads must be between 0 and 255' ) + if self.DICTIONARY is None : + raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) + if self.KERNELS is None : + raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) + + self.THREADS = {} + self.THREADS['n'] = nthreads + self.THREADS['IC'] = None + self.THREADS['EC'] = None + self.THREADS['ISO'] = None + self.THREADS['ICt'] = None + self.THREADS['ECt'] = None + self.THREADS['ISOt'] = None + + cdef : + long [:] C + long t, tot, i1, i2, N, c + int i + + if nthreads > 0: + print( '\n-> Distributing workload to different threads:' ) + print( '\t* number of threads : %d' % nthreads ) + + tic = time.time() + + # Distribute load for the computation of A*x product + print( '\t* A operator... 
', end="" ) + sys.stdout.flush() + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + if nthreads > 1 : + N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) + t = 1 + tot = 0 + C = np.bincount( self.DICTIONARY['IC']['v'] ) + for c in C : + tot += c + if tot >= N : + self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot + t += 1 + tot = 0 + self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + + print( '[ OK ]' ) + + # Distribute load for the computation of At*y product + print( '\t* A\' operator... ', end="" ) + sys.stdout.flush() + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) + if nthreads > 1 : + idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) + C = np.bincount( self.DICTIONARY['IC']['fiber'] ) + t = tot = i1 = i2 = 0 + N = np.floor(self.DICTIONARY['IC']['n']/nthreads) + for c in C : + i2 += c + tot += c + if tot >= N : + self.THREADS['ICt'][ i1:i2 ] = t + t += 1 + if t==nthreads-1 : + break + i1 = i2 + tot = c + self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N + self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' 
) + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['nV']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N + self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + + print( '[ OK ]' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def build_operator( self ) : + """Compile/build the operator for computing the matrix-vector multiplications by A and A' + using the informations from self.DICTIONARY, self.KERNELS and self.THREADS. + NB: needs to call this function to update pointers to data structures in case + the data is changed in self.DICTIONARY, self.KERNELS or self.THREADS. + """ + if self.DICTIONARY is None : + raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) + if self.KERNELS is None : + raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) + if self.THREADS is None : + raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) + + tic = time.time() + print( '\n-> Building linear operator A:' ) + + # need to pass these parameters at runtime for compiling the C code + from commit.operator import config + config.nTHREADS = self.THREADS['n'] + config.model = self.model.id + config.nIC = self.KERNELS['wmr'].shape[0] + config.nEC = self.KERNELS['wmh'].shape[0] + config.nISO = self.KERNELS['iso'].shape[0] + if not 'commit.operator.operator' in sys.modules : + import commit.operator.operator + else : + reload( sys.modules['commit.operator.operator'] ) + + if self.THREADS['n'] > 0: + self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + else: + import commit.cudaoperator + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + def get_y( self ): + """ + Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. + NB: this can be run only after having loaded the dictionary and the data. + """ + if self.DICTIONARY is None : + raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) + if self.niiDWI is None : + raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) + return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) + + def fit( self, tol_fun = 1e-3, tol_x = 1e-6, max_iter = 100, verbose = 1, x0 = None, regularisation = None ) : + """Fit the model to the data. + + Parameters + ---------- + tol_fun : float + Tolerance on the objective function (default : 1e-3) + max_iter : integer + Maximum number of iterations (default : 100) + verbose : integer + Level of verbosity: 0=no print, 1=print progress (default : 1) + x0 : np.array + Initial guess for the solution of the problem (default : None) + regularisation : commit.solvers.init_regularisation object + Python dictionary that describes the wanted regularisation term. 
+ Check the documentation of commit.solvers.init_regularisation to see + how to properly define the wanted mathematical formulation + ( default : None ) + """ + if self.niiDWI is None : + raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) + if self.DICTIONARY is None : + raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) + if self.KERNELS is None : + raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) + if self.THREADS is None : + raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) + if self.A is None : + raise RuntimeError( 'Operator not built; call "build_operator()" first.' ) + + if x0 is not None : + if x0.shape[0] != self.A.shape[1] : + raise RuntimeError( 'x0: dimension does not match the number of columns of the dictionary.' ) + if regularisation is None : + regularisation = commit.solvers.init_regularisation(self) + + self.CONFIG['optimization'] = {} + self.CONFIG['optimization']['tol_fun'] = tol_fun + self.CONFIG['optimization']['tol_x'] = tol_x + self.CONFIG['optimization']['max_iter'] = max_iter + self.CONFIG['optimization']['verbose'] = verbose + self.CONFIG['optimization']['regularisation'] = regularisation + + # run solver + t = time.time() + print( '\n-> Fit model' ) + + self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) + + self.CONFIG['optimization']['fit_details'] = opt_details + self.CONFIG['optimization']['fit_time'] = time.time()-t + + print( ' [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) + + + def save_results( self, path_suffix = None, save_opt_details = True, save_coeff = False ) : + """Save the output (coefficients, errors, maps etc). + + Parameters + ---------- + path_suffix : string + Text to be appended to "Results" to create the output path (default : None) + save_opt_details : boolean + Save everything in a pickle file containing the following list L: + L[0]: dictionary with all the configuration details + L[1]: np.array obtained through the optimisation process with the normalised kernels + L[2]: np.array renormalisation of L[1] + (default : True) + save_coeff : boolean + Save three txt files containing the coefficients related to each + compartment and a pickle file containing the dictionary with all + the configuration details. + (default : False) + """ + if self.x is None : + raise RuntimeError( 'Model not fitted to the data; call "fit()" first.' 
) + + RESULTS_path = 'Results_' + self.model.id + if path_suffix : + self.set_config('path_suffix', path_suffix) + RESULTS_path = RESULTS_path + path_suffix + + print( '\n-> Saving results to "%s/*":' % RESULTS_path ) + tic = time.time() + + # create folder or delete existing files (if any) + RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) + if not exists( RESULTS_path ) : + makedirs( RESULTS_path ) + else : + for f in glob.glob( pjoin(RESULTS_path,'*') ) : + remove( f ) + self.set_config('RESULTS_path', RESULTS_path) + + # Configuration and results + print( '\t* configuration and results:' ) + + nF = self.DICTIONARY['IC']['nF'] + nE = self.DICTIONARY['EC']['nE'] + nV = self.DICTIONARY['nV'] + norm_fib = np.ones( nF ) + # x is the x of the original problem + # self.x is the x preconditioned + if self.get_config('doNormalizeKernels') : + # renormalize the coefficients + norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) + norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) + norm3 = np.repeat(self.KERNELS['iso_norm'],nV) + norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) + x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) + else : + x = self.x + if save_opt_details: + print( '\t\t- pickle... ', end="" ) + sys.stdout.flush() + with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : + pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) + print( '[ OK ]' ) + if save_coeff: + print( '\t\t- txt... ', end="" ) + sys.stdout.flush() + np.savetxt(pjoin(RESULTS_path,'xic.txt'), x[0:nF]) + np.savetxt(pjoin(RESULTS_path,'xec.txt'), x[nF:nF+nE]) + np.savetxt(pjoin(RESULTS_path,'xiso.txt'), x[(nF+nE):]) + with open( pjoin(RESULTS_path,'config.pickle'), 'wb+' ) as fid : + pickle.dump( self.CONFIG, fid, protocol=2 ) + print( '[ OK ]' ) + + + # Map of wovelwise errors + print( '\t* fitting errors:' ) + + not_NaN = np.ones( self.get_config('dim'), dtype=np.float32 ) * 1e-16 # avoid division by 0 + + niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() + niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) + niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() + + y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) + y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) + + print( '\t\t- RMSE...', end="" ) + sys.stdout.flush() + tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = tmp.max() + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) + print( ' [ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + print( '\t\t- NRMSE...', end="" ) + sys.stdout.flush() + tmp = np.sum(y_mea**2,axis=1) + idx = np.where( tmp < 1E-12 ) + tmp[ idx ] = 1 + tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) + tmp[ idx ] = 0 + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = 1 + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) + print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + # Map of compartment contributions + print( '\t* voxelwise contributions:' ) + + print( '\t\t- intra-axonal', end="" ) 
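As a usage note for the GPU code path introduced by this patch: the sketch below walks a full evaluation through the API defined in this file, assuming the package exposes Evaluation and setup() in the usual way and that a dictionary folder was already produced with the trk2dictionary script. The paths, the model name and the tracking subfolder are placeholders; the only CUDA-specific step is set_threads(0), which makes build_operator() instantiate the CUDA operator instead of the multi-threaded CPU one.

import commit
import commit.core

commit.core.setup()                                    # precompute rotation matrices (lmax=12, ndirs=32761 by default)

mit = commit.Evaluation('/data/Study01', 'Subject01')  # placeholder study/subject folders
mit.load_data(dwi_filename='DWI.nii', scheme_filename='DWI.scheme', b0_thr=0)
mit.set_model('StickZeppelinBall')                     # placeholder: any class name available in commit.models
mit.generate_kernels(regenerate=False, lmax=12, ndirs=32761)
mit.load_kernels()
mit.load_dictionary('Tracking/COMMIT')                 # folder previously created by the trk2dictionary script
mit.set_threads(0)                                     # 0 -> CUDA GPU acceleration; >0 -> that many CPU threads; None -> all CPU threads
mit.build_operator()                                   # with nthreads == 0 this builds commit.cudaoperator.CudaLinearOperator
mit.fit(tol_fun=1e-3, max_iter=1000)
mit.save_results()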
+ sys.stdout.flush() + niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmr']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, + weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] + ).astype(np.float32) + niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- extra-axonal', end="" ) + sys.stdout.flush() + niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmh']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) + niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- isotropic', end="" ) + sys.stdout.flush() + niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['iso']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] + xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) + niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( ' [ OK ]' ) + + if self.get_config('doNormalizeMaps') : + niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) + niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) + niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) + else: + niiIC = nibabel.Nifti1Image( niiIC_img, affine ) + niiEC = nibabel.Nifti1Image( niiEC_img, affine ) + niiISO = nibabel.Nifti1Image( niiISO_img, affine ) + + nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) + nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) + nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) + + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 3cc6591a..30d2450d 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -1,221 +1,221 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -cimport numpy as np - -cdef extern from "operator_withCUDA.cuh": - cdef cppclass C_CudaLinearOperator "CudaLinearOperator": - C_CudaLinearOperator( - np.uint32_t*, - np.uint32_t*, - np.uint16_t*, - np.float32_t*, - np.float32_t*, - - np.uint32_t*, - np.uint16_t*, - np.float32_t*, - - np.float32_t*, - - int, - int, - int, - int, - int, - int, - int, - int, - int, - - int) - - int getCudaStatus() - void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) - void destroy() - void dot(np.float64_t*, np.float64_t*) - void Tdot(np.float64_t*, np.float64_t*) - -cdef class CudaLinearOperator : - """This class is a wrapper to the C code for performing marix-vector multiplications - with the COMMIT linear operator A. The multiplications are done using C code - that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
- """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - cdef C_CudaLinearOperator* A - - - def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = &wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - self.A = new C_CudaLinearOperator( - &ICv[0], - &ICf[0], - &ICo[0], - &ICl[0], - &wmrSFP[0,0,0], - - &ECv[0], - &ECo[0], - &wmhSFP[0,0,0], - - &isoSFP[0,0], - - self.n, - self.nV, - self.nF, - self.nE, - self.ndirs, - self.nS, - self.nR, - self.nT, - self.nI, - - fcall) - - if fcall == 1: - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - ICf = self.DICTIONARY['IC']['fiber'] - ICl = self.DICTIONARY['IC']['len'] - ICv = self.DICTIONARY['IC']['v'] - ICo = self.DICTIONARY['IC']['o'] - - self.ICf = &ICf[0] - self.ICl = &ICl[0] - self.ICv = &ICv[0] - self.ICo = &ICo[0] - - self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - - @property - def T( self ) : - """Transpose of the 
explicit matrix.""" - C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. - - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - raise RuntimeError( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - self.A.dot(&v_in[0], &v_out[0]) - else : - # INVERSE PRODUCT A'*y - self.A.Tdot(&v_in[0], &v_out[0]) - - return v_out - - def destroy( self ): - """Free all memory of the CUDA GPU""" - self.A.destroy() - +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np + +cdef extern from "operator_withCUDA.cuh": + cdef cppclass C_CudaLinearOperator "CudaLinearOperator": + C_CudaLinearOperator( + np.uint32_t*, + np.uint32_t*, + np.uint16_t*, + np.float32_t*, + np.float32_t*, + + np.uint32_t*, + np.uint16_t*, + np.float32_t*, + + np.float32_t*, + + int, + int, + int, + int, + int, + int, + int, + int, + int, + + int) + + int getCudaStatus() + void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + void destroy() + void dot(np.float64_t*, np.float64_t*) + void Tdot(np.float64_t*, np.float64_t*) + +cdef class CudaLinearOperator : + """This class is a wrapper to the C code for performing marix-vector multiplications + with the COMMIT linear operator A. The multiplications are done using C code + that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
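+
+    A minimal usage sketch, assuming DICTIONARY, KERNELS and THREADS have
+    already been built by commit.core (x is any float64 vector of length
+    A.shape[1]):
+
+        A = CudaLinearOperator( DICTIONARY, KERNELS, THREADS )
+        y = A.dot( x )      # direct product A*x
+        g = A.T.dot( y )    # adjoint product A'*y
+        A.destroy()         # free all GPU memory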
+ """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + cdef C_CudaLinearOperator* A + + + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + self.A = new C_CudaLinearOperator( + &ICv[0], + &ICf[0], + &ICo[0], + &ICl[0], + &wmrSFP[0,0,0], + + &ECv[0], + &ECo[0], + &wmhSFP[0,0,0], + + &isoSFP[0,0], + + self.n, + self.nV, + self.nF, + self.nE, + self.ndirs, + self.nS, + self.nR, + self.nT, + self.nI, + + fcall) + + if fcall == 1: + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + ICf = self.DICTIONARY['IC']['fiber'] + ICl = self.DICTIONARY['IC']['len'] + ICv = self.DICTIONARY['IC']['v'] + ICo = self.DICTIONARY['IC']['o'] + + self.ICf = &ICf[0] + self.ICl = &ICl[0] + self.ICv = &ICv[0] + self.ICo = &ICo[0] + + self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + + @property + def T( self ) : + """Transpose of the 
explicit matrix.""" + C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + raise RuntimeError( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + self.A.dot(&v_in[0], &v_out[0]) + else : + # INVERSE PRODUCT A'*y + self.A.Tdot(&v_in[0], &v_out[0]) + + return v_out + + def destroy( self ): + """Free all memory of the CUDA GPU""" + self.A.destroy() + diff --git a/commit/operator/operator.pyx b/commit/operator/operator.pyx index 6d83202a..72ed8655 100755 --- a/commit/operator/operator.pyx +++ b/commit/operator/operator.pyx @@ -1,191 +1,191 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -cimport numpy as np - -# Interfaces to actual C code performing the multiplications -cdef extern void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads -) nogil - -cdef extern void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT -) nogil - - - -cdef class LinearOperator : - """This class is a wrapper to the C code for performing marix-vector multiplications - with the COMMIT linear operator A. The multiplications are done using C code - that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
- """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - - - def __init__( self, DICTIONARY, KERNELS, THREADS ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = &wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - # get C pointers to arrays in THREADS - cdef unsigned int [::1] ICthreads = THREADS['IC'] - self.ICthreads = &ICthreads[0] - cdef unsigned int [::1] ECthreads = THREADS['EC'] - self.ECthreads = &ECthreads[0] - cdef unsigned int [::1] ISOthreads = THREADS['ISO'] - self.ISOthreads = &ISOthreads[0] - - cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] - self.ICthreadsT = &ICthreadsT[0] - cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] - self.ECthreadsT = &ECthreadsT[0] - cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] - self.ISOthreadsT = &ISOthreadsT[0] - - - @property - def T( self ) : - """Transpose of the explicit matrix.""" - C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. 
- - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - raise RuntimeError( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - with nogil : - COMMIT_A( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreads, self.ECthreads, self.ISOthreads - ) - else : - # INVERSE PRODUCT A'*y - with nogil : - COMMIT_At( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT - ) - - return v_out +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np + +# Interfaces to actual C code performing the multiplications +cdef extern void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads +) nogil + +cdef extern void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT +) nogil + + + +cdef class LinearOperator : + """This class is a wrapper to the C code for performing marix-vector multiplications + with the COMMIT linear operator A. The multiplications are done using C code + that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
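+
+    The matrix is never stored explicitly: its shape is
+    ( nV*nS, nR*nF + nT*nE + nI*nV ) and dot() dispatches to the
+    multi-threaded C routines COMMIT_A / COMMIT_At.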
+ """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # get C pointers to arrays in THREADS + cdef unsigned int [::1] ICthreads = THREADS['IC'] + self.ICthreads = &ICthreads[0] + cdef unsigned int [::1] ECthreads = THREADS['EC'] + self.ECthreads = &ECthreads[0] + cdef unsigned int [::1] ISOthreads = THREADS['ISO'] + self.ISOthreads = &ISOthreads[0] + + cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] + self.ICthreadsT = &ICthreadsT[0] + cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] + self.ECthreadsT = &ECthreadsT[0] + cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] + self.ISOthreadsT = &ISOthreadsT[0] + + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. 
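+
+        Depending on `adjoint`, either the direct product A*x (COMMIT_A) or the
+        adjoint product A'*y (COMMIT_At) is computed; `v_in` must be a
+        C-contiguous float64 array of length `shape[1]`.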
+ + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + raise RuntimeError( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + with nogil : + COMMIT_A( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreads, self.ECthreads, self.ISOthreads + ) + else : + # INVERSE PRODUCT A'*y + with nogil : + COMMIT_At( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT + ) + + return v_out diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld index 6e5b9f12..cd71f249 100755 --- a/commit/operator/operator.pyxbld +++ b/commit/operator/operator.pyxbld @@ -1,33 +1,33 @@ -import numpy -from os import utime -from os.path import dirname, join -from distutils.extension import Extension - -# pass parameters to the compiler at runtime -# [TODO] find a way to avoid using this fake module -from commit.operator import config - -def make_ext(modname, pyxfilename): - - if ( config.nTHREADS is None or config.nTHREADS < 0 or config.nTHREADS > 255 ): - raise RuntimeError( 'config.nTHREADS must be between 1 and 255' ) - if ( config.nIC is None or config.nIC < 0 or config.nIC > 20 ): - raise RuntimeError( 'config.nIC must be in the range [0..20]' ) - if ( config.nEC is None or config.nEC < 0 or config.nEC > 20 ): - raise RuntimeError( 'config.nEC must be in the range [0..20]' ) - if ( config.nISO is None or config.nISO < 0 or config.nISO > 20 ): - raise RuntimeError( 'config.nISO must be in the range [0..20]' ) - - # Force recompilation - if config.model=="VolumeFractions" : - filename = "operator_noLUT.c" - else : - filename = "operator_withLUT.c" - path = dirname(pyxfilename) - utime( join(path,filename), None) - return Extension(name=modname, - sources=[pyxfilename,join(path,filename)], - include_dirs=[numpy.get_include()], - define_macros = [('nTHREADS',config.nTHREADS), ('nIC',config.nIC), ('nEC',config.nEC), ('nISO',config.nISO)], - extra_compile_args=['-w', '-O3', '-Ofast'], - ) +import numpy +from os import utime +from os.path import dirname, join +from distutils.extension import Extension + +# pass parameters to the compiler at runtime +# [TODO] find a way to avoid using this fake module +from commit.operator import config + +def make_ext(modname, pyxfilename): + + if ( config.nTHREADS is None or config.nTHREADS < 0 or config.nTHREADS > 255 ): + raise RuntimeError( 'config.nTHREADS must be between 1 and 255' ) + if ( config.nIC is None or config.nIC < 0 or config.nIC > 20 ): + raise RuntimeError( 'config.nIC must be in the range [0..20]' ) + if ( config.nEC is None or config.nEC < 0 or config.nEC > 20 ): + raise RuntimeError( 'config.nEC must be in the range [0..20]' ) + if ( config.nISO is None or config.nISO < 0 or config.nISO > 20 ): + raise RuntimeError( 'config.nISO must be in the range [0..20]' ) + + # Force 
recompilation + if config.model=="VolumeFractions" : + filename = "operator_noLUT.c" + else : + filename = "operator_withLUT.c" + path = dirname(pyxfilename) + utime( join(path,filename), None) + return Extension(name=modname, + sources=[pyxfilename,join(path,filename)], + include_dirs=[numpy.get_include()], + define_macros = [('nTHREADS',config.nTHREADS), ('nIC',config.nIC), ('nEC',config.nEC), ('nISO',config.nISO)], + extra_compile_args=['-w', '-O3', '-Ofast'], + ) diff --git a/commit/operator/operator_noLUT.c b/commit/operator/operator_noLUT.c index d8b6706b..fe1269fd 100644 --- a/commit/operator/operator_noLUT.c +++ b/commit/operator/operator_noLUT.c @@ -1,187 +1,187 @@ -#include -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<0 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n; -double *x, *Y; -uint32_t *ICthreads, *ISOthreads; -uint8_t *ICthreadsT; -uint32_t *ISOthreadsT; -uint32_t *ICf, *ICv, *ISOv; -float *ICl; - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - double x0; - double *xPtr; - uint32_t *t_v, *t_vEnd, *t_f; - float *t_l; - - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = x[*t_f]; - if ( x0 != 0 ) - Y[*t_v] += (double)(*t_l) * x0; - t_f++; - t_v++; - t_l++; - } - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - xPtr = x + nF + ISOthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = *xPtr++; - if ( x0 != 0 ) - Y[*t_v] += x0; - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreads = _ICthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - xPtr = x + nF + ISOthreadsT[id]; - - while( t_v != t_vEnd ) - (*xPtr++) += Y[*t_v++]; -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = _ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreadsT = _ICthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run 
SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<0 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n; +double *x, *Y; +uint32_t *ICthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ISOthreadsT; +uint32_t *ICf, *ICv, *ISOv; +float *ICl; + + +// ==================================================== +// Compute a sub-block of the A*x MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + double x0; + double *xPtr; + uint32_t *t_v, *t_vEnd, *t_f; + float *t_l; + + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = x[*t_f]; + if ( x0 != 0 ) + Y[*t_v] += (double)(*t_l) * x0; + t_f++; + t_v++; + t_l++; + } + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + xPtr = x + nF + ISOthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = *xPtr++; + if ( x0 != 0 ) + Y[*t_v] += x0; + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreads = _ICthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + xPtr = x + nF + ISOthreadsT[id]; + + while( t_v != t_vEnd ) + (*xPtr++) += Y[*t_v++]; +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreadsT = _ICthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<0 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n, nE, nV, nS, ndirs; -double *x, *Y; -uint32_t *ICthreads, *ECthreads, *ISOthreads; -uint8_t *ICthreadsT; -uint32_t *ECthreadsT, *ISOthreadsT; -uint32_t *ICf, *ICv, *ECv, *ISOv; -uint16_t *ICo, *ECo; -float 
*ICl; -float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; -float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; -float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; - - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - int offset; - double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; - double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; - double *Yptr, *YptrEnd; - float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; - uint32_t *t_v, *t_vEnd, *t_f; - uint16_t *t_o; - float *t_l; - -#if nIC>=1 - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_o = ICo + ICthreads[id]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x_Ptr0 = x + *t_f; - x0 = *x_Ptr0; - #if nIC>=2 - x_Ptr1 = x_Ptr0 + nF; - x1 = *x_Ptr1; - #endif - #if nIC>=3 - x_Ptr2 = x_Ptr1 + nF; - x2 = *x_Ptr2; - #endif - #if nIC>=4 - x_Ptr3 = x_Ptr2 + nF; - x3 = *x_Ptr3; - #endif - #if nIC>=5 - x_Ptr4 = x_Ptr3 + nF; - x4 = *x_Ptr4; - #endif - #if nIC>=6 - x_Ptr5 = x_Ptr4 + nF; - x5 = *x_Ptr5; - #endif - #if nIC>=7 - x_Ptr6 = x_Ptr5 + nF; - x6 = *x_Ptr6; - #endif - #if nIC>=8 - x_Ptr7 = x_Ptr6 + nF; - x7 = *x_Ptr7; - #endif - #if nIC>=9 - x_Ptr8 = x_Ptr7 + nF; - x8 = *x_Ptr8; - #endif - #if nIC>=10 - x_Ptr9 = x_Ptr8 + nF; - x9 = *x_Ptr9; - #endif - #if nIC>=11 - x_Ptr10 = x_Ptr9 + nF; - x10 = *x_Ptr10; - #endif - #if nIC>=12 - x_Ptr11 = x_Ptr10 + nF; - x11 = *x_Ptr11; - #endif - #if nIC>=13 - x_Ptr12 = x_Ptr11 + nF; - x12 = *x_Ptr12; - #endif - #if nIC>=14 - x_Ptr13 = x_Ptr12 + nF; - x13 = *x_Ptr13; - #endif - #if nIC>=15 - x_Ptr14 = x_Ptr13 + nF; - x14 = *x_Ptr14; - #endif - #if nIC>=16 - x_Ptr15 = x_Ptr14 + nF; - x15 = *x_Ptr15; - #endif - #if nIC>=17 - x_Ptr16 = x_Ptr15 + nF; - x16 = *x_Ptr16; - #endif - #if nIC>=18 - x_Ptr17 = x_Ptr16 + nF; - x17 = *x_Ptr17; - #endif - #if nIC>=19 - x_Ptr18 = x_Ptr17 + nF; - x18 = *x_Ptr18; - #endif - #if nIC>=20 - x_Ptr19 = x_Ptr18 + nF; - x19 = *x_Ptr19; - #endif - - if ( x0 != 0 - #if nIC>=2 - || x1 != 0 - #endif - #if nIC>=3 - || x2 != 0 - #endif - #if nIC>=4 - || x3 != 0 - #endif - #if nIC>=5 - || x4 != 0 - #endif - #if nIC>=6 - || x5 != 0 - #endif - #if nIC>=7 - || x6 != 0 - #endif - #if nIC>=8 - || x7 != 0 - #endif - #if nIC>=9 - || x8 != 0 - #endif - #if nIC>=10 - || x9 != 0 - #endif - #if nIC>=11 - || x10 != 0 - #endif - #if nIC>=12 - || x11 != 0 - #endif - #if nIC>=13 - || x12 != 0 - #endif - #if nIC>=14 - || x13 != 0 - #endif - #if nIC>=15 - || x14 != 0 - #endif - #if nIC>=16 - || x15 != 
0 - #endif - #if nIC>=17 - || x16 != 0 - #endif - #if nIC>=18 - || x17 != 0 - #endif - #if nIC>=19 - || x18 != 0 - #endif - #if nIC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - w = (double)(*t_l); - offset = nS * (*t_o); - SFP0ptr = wmrSFP0 + offset; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - #endif - #if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += w * ( - x0 * (*SFP0ptr++) - #if nIC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nIC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nIC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nIC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nIC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nIC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nIC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nIC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nIC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nIC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nIC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nIC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nIC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nIC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nIC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nIC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nIC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nIC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nIC>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - - t_f++; - t_v++; - t_o++; - t_l++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreads[id]; - t_vEnd = ECv + ECthreads[id+1]; - t_o = ECo + ECthreads[id]; - - x_Ptr0 = x + nIC*nF + ECthreads[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; - #if nEC>=2 - x1 = *x_Ptr1++; - #endif - #if nEC>=3 - x2 = *x_Ptr2++; - #endif - #if nEC>=4 - x3 = *x_Ptr3++; - 
#endif - #if nEC>=5 - x4 = *x_Ptr4++; - #endif - #if nEC>=6 - x5 = *x_Ptr5++; - #endif - #if nEC>=7 - x6 = *x_Ptr6++; - #endif - #if nEC>=8 - x7 = *x_Ptr7++; - #endif - #if nEC>=9 - x8 = *x_Ptr8++; - #endif - #if nEC>=10 - x9 = *x_Ptr9++; - #endif - #if nEC>=11 - x10 = *x_Ptr10++; - #endif - #if nEC>=12 - x11 = *x_Ptr11++; - #endif - #if nEC>=13 - x12 = *x_Ptr12++; - #endif - #if nEC>=14 - x13 = *x_Ptr13++; - #endif - #if nEC>=15 - x14 = *x_Ptr14++; - #endif - #if nEC>=16 - x15 = *x_Ptr15++; - #endif - #if nEC>=17 - x16 = *x_Ptr16++; - #endif - #if nEC>=18 - x17 = *x_Ptr17++; - #endif - #if nEC>=19 - x18 = *x_Ptr18++; - #endif - #if nEC>=20 - x19 = *x_Ptr19++; - #endif - if ( - x0 != 0 - #if nEC>=2 - || x1 != 0 - #endif - #if nEC>=3 - || x2 != 0 - #endif - #if nEC>=4 - || x3 != 0 - #endif - #if nEC>=5 - || x4 != 0 - #endif - #if nEC>=6 - || x5 != 0 - #endif - #if nEC>=7 - || x6 != 0 - #endif - #if nEC>=8 - || x7 != 0 - #endif - #if nEC>=9 - || x8 != 0 - #endif - #if nEC>=10 - || x9 != 0 - #endif - #if nEC>=11 - || x10 != 0 - #endif - #if nEC>=12 - || x11 != 0 - #endif - #if nEC>=13 - || x12 != 0 - #endif - #if nEC>=14 - || x13 != 0 - #endif - #if nEC>=15 - || x14 != 0 - #endif - #if nEC>=16 - || x15 != 0 - #endif - #if nEC>=17 - || x16 != 0 - #endif - #if nEC>=18 - || x17 != 0 - #endif - #if nEC>=19 - || x18 != 0 - #endif - #if nEC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - SFP0ptr = wmhSFP0 + offset; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nEC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nEC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nEC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nEC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nEC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nEC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nEC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nEC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nEC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nEC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nEC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nEC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nEC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nEC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nEC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nEC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nEC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nEC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nEC>=20 - + x19 * (*SFP19ptr++) - #endif - - ); - } - t_v++; - t_o++; - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + 
ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; - #if nISO>=2 - x1 = *x_Ptr1++; - #endif - #if nISO>=3 - x2 = *x_Ptr2++; - #endif - #if nISO>=4 - x3 = *x_Ptr3++; - #endif - #if nISO>=5 - x4 = *x_Ptr4++; - #endif - #if nISO>=6 - x5 = *x_Ptr5++; - #endif - #if nISO>=7 - x6 = *x_Ptr6++; - #endif - #if nISO>=8 - x7 = *x_Ptr7++; - #endif - #if nISO>=9 - x8 = *x_Ptr8++; - #endif - #if nISO>=10 - x9 = *x_Ptr9++; - #endif - #if nISO>=11 - x10 = *x_Ptr10++; - #endif - #if nISO>=12 - x11 = *x_Ptr11++; - #endif - #if nISO>=13 - x12 = *x_Ptr12++; - #endif - #if nISO>=14 - x13 = *x_Ptr13++; - #endif - #if nISO>=15 - x14 = *x_Ptr14++; - #endif - #if nISO>=16 - x15 = *x_Ptr15++; - #endif - #if nISO>=17 - x16 = *x_Ptr16++; - #endif - #if nISO>=18 - x17 = *x_Ptr17++; - #endif - #if nISO>=19 - x18 = *x_Ptr18++; - #endif - #if nISO>=20 - x19 = *x_Ptr19++; - #endif - - if ( - x0 != 0 - #if nISO>=2 - || x1 != 0 - #endif - #if nISO>=3 - || x2 != 0 - #endif - #if nISO>=4 - || x3 != 0 - #endif - #if nISO>=5 - || x4 != 0 - #endif - #if nISO>=6 - || x5 != 0 - #endif - #if nISO>=7 - || x6 != 0 - #endif - #if nISO>=8 - || x7 != 0 - #endif - #if nISO>=9 - || x8 != 0 - #endif - #if nISO>=10 - || x9 != 0 - #endif - #if nISO>=11 - || x10 != 0 - #endif - #if nISO>=12 - || x11 != 0 - #endif - #if nISO>=13 - || x12 != 0 - #endif - #if nISO>=14 - || x13 != 0 - #endif - #if nISO>=15 - || x14 != 0 - #endif - #if nISO>=16 - || x15 != 0 - #endif - #if nISO>=17 - || x16 != 0 - #endif - #if nISO>=18 - || x17 != 0 - #endif - #if nISO>=19 - || x18 != 0 - #endif - #if nISO>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif 
- - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nISO>=2 - + x1 * (*SFP1ptr++) - #endif - #if nISO>=3 - + x2 * (*SFP2ptr++) - #endif - #if nISO>=4 - + x3 * (*SFP3ptr++) - #endif - #if nISO>=5 - + x4 * (*SFP4ptr++) - #endif - #if nISO>=6 - + x5 * (*SFP5ptr++) - #endif - #if nISO>=7 - + x6 * (*SFP6ptr++) - #endif - #if nISO>=8 - + x7 * (*SFP7ptr++) - #endif - #if nISO>=9 - + x8 * (*SFP8ptr++) - #endif - #if nISO>=10 - + x9 * (*SFP9ptr++) - #endif - #if nISO>=11 - + x10 * (*SFP10ptr++) - #endif - #if nISO>=12 - + x11 * (*SFP11ptr++) - #endif - #if nISO>=13 - + x12 * (*SFP12ptr++) - #endif - #if nISO>=14 - + x13 * (*SFP13ptr++) - #endif - #if nISO>=15 - + x14 * (*SFP14ptr++) - #endif - #if nISO>=16 - + x15 * (*SFP15ptr++) - #endif - #if nISO>=17 - + x16 * (*SFP16ptr++) - #endif - #if nISO>=18 - + x17 * (*SFP17ptr++) - #endif - #if nISO>=19 - + x18 * (*SFP18ptr++) - #endif - #if nISO>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + _ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + _ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if 
nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreads = _ICthreads; - ECthreads = _ECthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // intra-cellular compartments - t_v = ICv; - t_vEnd = ICv + n; - t_o = ICo; - t_l = ICl; - t_f = ICf; - t_t = ICthreadsT; - - while( t_v != t_vEnd ) - { - // in this case, I need to walk throug because the segments are ordered in "voxel order" - if ( *t_t == id ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - - Y_tmp = *Yptr; - SFP0ptr = wmrSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 
+ offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nIC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - w = (double)(*t_l); - x[*t_f] += w * x0; - #if nIC>=2 - x[*t_f+nF] += w * x1; - #endif - #if nIC>=3 - x[*t_f+2*nF] += w * x2; - #endif - #if nIC>=4 - x[*t_f+3*nF] += w * x3; - #endif - #if nIC>=5 - x[*t_f+4*nF] += w * x4; - #endif - #if nIC>=6 - x[*t_f+5*nF] += w * x5; - #endif - #if nIC>=7 - x[*t_f+6*nF] += w * x6; - #endif - #if nIC>=8 - x[*t_f+7*nF] += w * x7; - #endif - #if nIC>=9 - x[*t_f+8*nF] += w * x8; - #endif - #if nIC>=10 - x[*t_f+9*nF] += w * x9; - #endif - #if nIC>=11 - x[*t_f+10*nF] += w * x10; - #endif - #if nIC>=12 - x[*t_f+11*nF] += w * x11; - #endif - #if nIC>=13 - x[*t_f+12*nF] += w * x12; - #endif - #if nIC>=14 - x[*t_f+13*nF] += w * x13; - #endif - #if nIC>=15 - x[*t_f+14*nF] += w * x14; - #endif - #if nIC>=16 - x[*t_f+15*nF] += w * x15; - #endif - #if nIC>=17 - x[*t_f+16*nF] += w * x16; - #endif - #if nIC>=18 - x[*t_f+17*nF] += w * x17; - #endif - #if nIC>=19 - x[*t_f+18*nF] += w * x18; - #endif - #if nIC>=20 - x[*t_f+19*nF] += w * x19; - #endif - } - - t_f++; - t_v++; - t_o++; - t_l++; - t_t++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreadsT[id]; - t_vEnd = ECv + ECthreadsT[id+1]; - t_o = ECo + ECthreadsT[id]; - - x_Ptr0 = x + nIC*nF + ECthreadsT[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - offset = nS * (*t_o++); - - Y_tmp = *Yptr; - SFP0ptr = wmhSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - 
#endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - (*x_Ptr0++) += x0; - #if nEC>=2 - (*x_Ptr1++) += x1; - #endif - #if nEC>=3 - (*x_Ptr2++) += x2; - #endif - #if nEC>=4 - (*x_Ptr3++) += x3; - #endif - #if nEC>=5 - (*x_Ptr4++) += x4; - #endif - #if nEC>=6 - (*x_Ptr5++) += x5; - #endif - #if nEC>=7 - (*x_Ptr6++) += x6; - #endif - #if nEC>=8 - (*x_Ptr7++) += x7; - #endif - #if nEC>=9 - (*x_Ptr8++) += x8; - #endif - #if nEC>=10 - (*x_Ptr9++) += x9; - #endif - #if nEC>=11 - (*x_Ptr10++) += x10; - #endif - #if nEC>=12 - (*x_Ptr11++) += x11; - #endif - #if nEC>=13 - (*x_Ptr12++) += x12; - #endif - #if nEC>=14 - (*x_Ptr13++) += x13; - #endif - #if nEC>=15 - (*x_Ptr14++) += x14; - #endif - #if nEC>=16 - (*x_Ptr15++) += x15; - #endif - #if nEC>=17 - (*x_Ptr16++) += x16; - #endif - #if nEC>=18 - (*x_Ptr17++) += x17; - #endif - #if nEC>=19 - (*x_Ptr18++) += x18; - #endif - #if nEC>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = 
ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - Y_tmp = *Yptr; - x0 = (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nISO>=7 - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nISO>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 += 
(*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - (*x_Ptr0++) += x0; - #if nISO>=2 - (*x_Ptr1++) += x1; - #endif - #if nISO>=3 - (*x_Ptr2++) += x2; - #endif - #if nISO>=4 - (*x_Ptr3++) += x3; - #endif - #if nISO>=5 - (*x_Ptr4++) += x4; - #endif - #if nISO>=6 - (*x_Ptr5++) += x5; - #endif - #if nISO>=7 - (*x_Ptr6++) += x6; - #endif - #if nISO>=8 - (*x_Ptr7++) += x7; - #endif - #if nISO>=9 - (*x_Ptr8++) += x8; - #endif - #if nISO>=10 - (*x_Ptr9++) += x9; - #endif - #if nISO>=11 - (*x_Ptr10++) += x10; - #endif - #if nISO>=12 - (*x_Ptr11++) += x11; - #endif - #if nISO>=13 - (*x_Ptr12++) += x12; - #endif - #if nISO>=14 - (*x_Ptr13++) += x13; - #endif - #if nISO>=15 - (*x_Ptr14++) += x14; - #endif - #if nISO>=16 - (*x_Ptr15++) += x15; - #endif - #if nISO>=17 - (*x_Ptr16++) += x16; - #endif - #if nISO>=18 - (*x_Ptr17++) += x17; - #endif - #if nISO>=19 - (*x_Ptr18++) += x18; - #endif - #if nISO>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + _ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - 
wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + _ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreadsT = _ICthreadsT; - ECthreadsT = _ECthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<0 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n, nE, nV, nS, ndirs; +double *x, *Y; +uint32_t *ICthreads, *ECthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ECthreadsT, *ISOthreadsT; +uint32_t *ICf, *ICv, *ECv, *ISOv; +uint16_t *ICo, *ECo; +float *ICl; +float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; +float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; +float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; + + + +// ==================================================== +// Compute a sub-block of the A*x MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = 
(long)ptr; + int offset; + double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; + double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; + double *Yptr, *YptrEnd; + float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; + uint32_t *t_v, *t_vEnd, *t_f; + uint16_t *t_o; + float *t_l; + +#if nIC>=1 + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_o = ICo + ICthreads[id]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x_Ptr0 = x + *t_f; + x0 = *x_Ptr0; + #if nIC>=2 + x_Ptr1 = x_Ptr0 + nF; + x1 = *x_Ptr1; + #endif + #if nIC>=3 + x_Ptr2 = x_Ptr1 + nF; + x2 = *x_Ptr2; + #endif + #if nIC>=4 + x_Ptr3 = x_Ptr2 + nF; + x3 = *x_Ptr3; + #endif + #if nIC>=5 + x_Ptr4 = x_Ptr3 + nF; + x4 = *x_Ptr4; + #endif + #if nIC>=6 + x_Ptr5 = x_Ptr4 + nF; + x5 = *x_Ptr5; + #endif + #if nIC>=7 + x_Ptr6 = x_Ptr5 + nF; + x6 = *x_Ptr6; + #endif + #if nIC>=8 + x_Ptr7 = x_Ptr6 + nF; + x7 = *x_Ptr7; + #endif + #if nIC>=9 + x_Ptr8 = x_Ptr7 + nF; + x8 = *x_Ptr8; + #endif + #if nIC>=10 + x_Ptr9 = x_Ptr8 + nF; + x9 = *x_Ptr9; + #endif + #if nIC>=11 + x_Ptr10 = x_Ptr9 + nF; + x10 = *x_Ptr10; + #endif + #if nIC>=12 + x_Ptr11 = x_Ptr10 + nF; + x11 = *x_Ptr11; + #endif + #if nIC>=13 + x_Ptr12 = x_Ptr11 + nF; + x12 = *x_Ptr12; + #endif + #if nIC>=14 + x_Ptr13 = x_Ptr12 + nF; + x13 = *x_Ptr13; + #endif + #if nIC>=15 + x_Ptr14 = x_Ptr13 + nF; + x14 = *x_Ptr14; + #endif + #if nIC>=16 + x_Ptr15 = x_Ptr14 + nF; + x15 = *x_Ptr15; + #endif + #if nIC>=17 + x_Ptr16 = x_Ptr15 + nF; + x16 = *x_Ptr16; + #endif + #if nIC>=18 + x_Ptr17 = x_Ptr16 + nF; + x17 = *x_Ptr17; + #endif + #if nIC>=19 + x_Ptr18 = x_Ptr17 + nF; + x18 = *x_Ptr18; + #endif + #if nIC>=20 + x_Ptr19 = x_Ptr18 + nF; + x19 = *x_Ptr19; + #endif + + if ( x0 != 0 + #if nIC>=2 + || x1 != 0 + #endif + #if nIC>=3 + || x2 != 0 + #endif + #if nIC>=4 + || x3 != 0 + #endif + #if nIC>=5 + || x4 != 0 + #endif + #if nIC>=6 + || x5 != 0 + #endif + #if nIC>=7 + || x6 != 0 + #endif + #if nIC>=8 + || x7 != 0 + #endif + #if nIC>=9 + || x8 != 0 + #endif + #if nIC>=10 + || x9 != 0 + #endif + #if nIC>=11 + || x10 != 0 + #endif + #if nIC>=12 + || x11 != 0 + #endif + #if nIC>=13 + || x12 != 0 + #endif + #if nIC>=14 + || x13 != 0 + #endif + #if nIC>=15 + || x14 != 0 + #endif + #if nIC>=16 + || x15 != 0 + #endif + #if nIC>=17 + || x16 != 0 + #endif + #if nIC>=18 + || x17 != 0 + #endif + #if nIC>=19 + || x18 != 0 + #endif + #if nIC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + w = (double)(*t_l); + offset = nS * (*t_o); + SFP0ptr = wmrSFP0 + offset; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + #endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + #endif + #if nIC>=13 
+ SFP12ptr = wmrSFP12 + offset; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += w * ( + x0 * (*SFP0ptr++) + #if nIC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nIC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nIC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nIC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nIC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nIC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nIC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nIC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nIC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nIC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nIC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nIC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nIC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nIC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nIC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nIC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nIC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nIC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nIC>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + + t_f++; + t_v++; + t_o++; + t_l++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreads[id]; + t_vEnd = ECv + ECthreads[id+1]; + t_o = ECo + ECthreads[id]; + + x_Ptr0 = x + nIC*nF + ECthreads[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nEC>=2 + x1 = *x_Ptr1++; + #endif + #if nEC>=3 + x2 = *x_Ptr2++; + #endif + #if nEC>=4 + x3 = *x_Ptr3++; + #endif + #if nEC>=5 + x4 = *x_Ptr4++; + #endif + #if nEC>=6 + x5 = *x_Ptr5++; + #endif + #if nEC>=7 + x6 = *x_Ptr6++; + #endif + #if nEC>=8 + x7 = *x_Ptr7++; + #endif + #if nEC>=9 + x8 = *x_Ptr8++; + #endif + #if nEC>=10 + x9 = *x_Ptr9++; + #endif + #if nEC>=11 + x10 = *x_Ptr10++; + #endif + #if nEC>=12 + x11 = *x_Ptr11++; + #endif + #if nEC>=13 + x12 = *x_Ptr12++; + #endif + #if nEC>=14 + x13 = *x_Ptr13++; + #endif + #if nEC>=15 + x14 = *x_Ptr14++; + #endif + #if nEC>=16 + x15 = *x_Ptr15++; + #endif + #if nEC>=17 + x16 = *x_Ptr16++; + #endif + #if nEC>=18 + x17 = *x_Ptr17++; + #endif + #if nEC>=19 + x18 = *x_Ptr18++; + #endif + #if nEC>=20 + x19 = *x_Ptr19++; + #endif + if ( + x0 != 0 + #if nEC>=2 + || x1 != 0 + #endif + #if nEC>=3 + || x2 != 0 + #endif + #if nEC>=4 + || x3 != 0 + #endif + #if nEC>=5 + || x4 != 0 + #endif + #if nEC>=6 + || x5 != 0 + #endif + #if nEC>=7 + 
|| x6 != 0 + #endif + #if nEC>=8 + || x7 != 0 + #endif + #if nEC>=9 + || x8 != 0 + #endif + #if nEC>=10 + || x9 != 0 + #endif + #if nEC>=11 + || x10 != 0 + #endif + #if nEC>=12 + || x11 != 0 + #endif + #if nEC>=13 + || x12 != 0 + #endif + #if nEC>=14 + || x13 != 0 + #endif + #if nEC>=15 + || x14 != 0 + #endif + #if nEC>=16 + || x15 != 0 + #endif + #if nEC>=17 + || x16 != 0 + #endif + #if nEC>=18 + || x17 != 0 + #endif + #if nEC>=19 + || x18 != 0 + #endif + #if nEC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + SFP0ptr = wmhSFP0 + offset; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + #endif + #if nEC>=12 + SFP11ptr = wmhSFP11 + offset; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nEC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nEC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nEC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nEC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nEC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nEC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nEC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nEC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nEC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nEC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nEC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nEC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nEC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nEC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nEC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nEC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nEC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nEC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nEC>=20 + + x19 * (*SFP19ptr++) + #endif + + ); + } + t_v++; + t_o++; + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + x_Ptr14 = x_Ptr13 + nV; + #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + 
x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nISO>=2 + x1 = *x_Ptr1++; + #endif + #if nISO>=3 + x2 = *x_Ptr2++; + #endif + #if nISO>=4 + x3 = *x_Ptr3++; + #endif + #if nISO>=5 + x4 = *x_Ptr4++; + #endif + #if nISO>=6 + x5 = *x_Ptr5++; + #endif + #if nISO>=7 + x6 = *x_Ptr6++; + #endif + #if nISO>=8 + x7 = *x_Ptr7++; + #endif + #if nISO>=9 + x8 = *x_Ptr8++; + #endif + #if nISO>=10 + x9 = *x_Ptr9++; + #endif + #if nISO>=11 + x10 = *x_Ptr10++; + #endif + #if nISO>=12 + x11 = *x_Ptr11++; + #endif + #if nISO>=13 + x12 = *x_Ptr12++; + #endif + #if nISO>=14 + x13 = *x_Ptr13++; + #endif + #if nISO>=15 + x14 = *x_Ptr14++; + #endif + #if nISO>=16 + x15 = *x_Ptr15++; + #endif + #if nISO>=17 + x16 = *x_Ptr16++; + #endif + #if nISO>=18 + x17 = *x_Ptr17++; + #endif + #if nISO>=19 + x18 = *x_Ptr18++; + #endif + #if nISO>=20 + x19 = *x_Ptr19++; + #endif + + if ( + x0 != 0 + #if nISO>=2 + || x1 != 0 + #endif + #if nISO>=3 + || x2 != 0 + #endif + #if nISO>=4 + || x3 != 0 + #endif + #if nISO>=5 + || x4 != 0 + #endif + #if nISO>=6 + || x5 != 0 + #endif + #if nISO>=7 + || x6 != 0 + #endif + #if nISO>=8 + || x7 != 0 + #endif + #if nISO>=9 + || x8 != 0 + #endif + #if nISO>=10 + || x9 != 0 + #endif + #if nISO>=11 + || x10 != 0 + #endif + #if nISO>=12 + || x11 != 0 + #endif + #if nISO>=13 + || x12 != 0 + #endif + #if nISO>=14 + || x13 != 0 + #endif + #if nISO>=15 + || x14 != 0 + #endif + #if nISO>=16 + || x15 != 0 + #endif + #if nISO>=17 + || x16 != 0 + #endif + #if nISO>=18 + || x17 != 0 + #endif + #if nISO>=19 + || x18 != 0 + #endif + #if nISO>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nISO>=2 + + x1 * (*SFP1ptr++) + #endif + #if nISO>=3 + + x2 * (*SFP2ptr++) + #endif + #if nISO>=4 + + x3 * (*SFP3ptr++) + #endif + #if nISO>=5 + + x4 * (*SFP4ptr++) + #endif + #if nISO>=6 + + x5 * (*SFP5ptr++) + #endif + #if nISO>=7 + + x6 * (*SFP6ptr++) + #endif + #if nISO>=8 + + x7 * (*SFP7ptr++) + #endif + #if nISO>=9 + + x8 * (*SFP8ptr++) + #endif + #if nISO>=10 + + x9 * (*SFP9ptr++) + #endif + #if nISO>=11 + + x10 * (*SFP10ptr++) + #endif + #if nISO>=12 + + x11 * (*SFP11ptr++) + #endif + #if nISO>=13 + + x12 * (*SFP12ptr++) + #endif + #if nISO>=14 + + x13 * (*SFP13ptr++) + #endif + #if nISO>=15 + + x14 * (*SFP14ptr++) + #endif + #if nISO>=16 + + x15 * (*SFP15ptr++) + #endif + #if nISO>=17 + + x16 * (*SFP16ptr++) + #endif + #if nISO>=18 + + x17 * (*SFP17ptr++) + #endif + #if nISO>=19 + + x18 * 
(*SFP18ptr++) + #endif + #if nISO>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 
+ isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreads = _ICthreads; + ECthreads = _ECthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // intra-cellular compartments + t_v = ICv; + t_vEnd = ICv + n; + t_o = ICo; + t_l = ICl; + t_f = ICf; + t_t = ICthreadsT; + + while( t_v != t_vEnd ) + { + // in this case, I need to walk throug because the segments are ordered in "voxel order" + if ( *t_t == id ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + + Y_tmp = *Yptr; + SFP0ptr = wmrSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nIC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + 
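/* The launch loop above follows the usual pthread fan-out/join pattern: each
   worker receives its integer id through the void* argument and recovers it
   with `int id = (long)ptr;`.  A minimal sketch of that pattern, with a
   hypothetical NTHREADS constant and worker body (illustrative only, not
   compiled here): */
#if 0
#include <pthread.h>

enum { NTHREADS = 4 };                      // stand-in for the -DnTHREADS=... macro

static void *worker( void *ptr )
{
    long id = (long)ptr;                    // same cast as `int id = (long)ptr;`
    (void)id;                               // ... process the block assigned to `id` ...
    return NULL;
}

static void run_blocks( void )
{
    pthread_t threads[NTHREADS];
    for( int t = 0; t < NTHREADS; t++ )
        pthread_create( &threads[t], NULL, worker, (void *)(long)t );
    for( int t = 0; t < NTHREADS; t++ )
        pthread_join( threads[t], NULL );
}
#endif
/* In this transpose kernel the IC segments are stored in voxel order, so each
   thread walks the whole list and keeps only the entries whose owner byte in
   ICthreadsT equals its id; this appears intended to keep all updates to a
   given x[fiber] entry on a single thread and so avoid concurrent writes. */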
#if nIC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + w = (double)(*t_l); + x[*t_f] += w * x0; + #if nIC>=2 + x[*t_f+nF] += w * x1; + #endif + #if nIC>=3 + x[*t_f+2*nF] += w * x2; + #endif + #if nIC>=4 + x[*t_f+3*nF] += w * x3; + #endif + #if nIC>=5 + x[*t_f+4*nF] += w * x4; + #endif + #if nIC>=6 + x[*t_f+5*nF] += w * x5; + #endif + #if nIC>=7 + x[*t_f+6*nF] += w * x6; + #endif + #if nIC>=8 + x[*t_f+7*nF] += w * x7; + #endif + #if nIC>=9 + x[*t_f+8*nF] += w * x8; + #endif + #if nIC>=10 + x[*t_f+9*nF] += w * x9; + #endif + #if nIC>=11 + x[*t_f+10*nF] += w * x10; + #endif + #if nIC>=12 + x[*t_f+11*nF] += w * x11; + #endif + #if nIC>=13 + x[*t_f+12*nF] += w * x12; + #endif + #if nIC>=14 + x[*t_f+13*nF] += w * x13; + #endif + #if nIC>=15 + x[*t_f+14*nF] += w * x14; + #endif + #if nIC>=16 + x[*t_f+15*nF] += w * x15; + #endif + #if nIC>=17 + x[*t_f+16*nF] += w * x16; + #endif + #if nIC>=18 + x[*t_f+17*nF] += w * x17; + #endif + #if nIC>=19 + x[*t_f+18*nF] += w * x18; + #endif + #if nIC>=20 + x[*t_f+19*nF] += w * x19; + #endif + } + + t_f++; + t_v++; + t_o++; + t_l++; + t_t++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreadsT[id]; + t_vEnd = ECv + ECthreadsT[id+1]; + t_o = ECo + ECthreadsT[id]; + + x_Ptr0 = x + nIC*nF + ECthreadsT[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + offset = nS * (*t_o++); + + Y_tmp = *Yptr; + SFP0ptr = wmhSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + SFP11ptr = wmhSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + x12 
= (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + (*x_Ptr0++) += x0; + #if nEC>=2 + (*x_Ptr1++) += x1; + #endif + #if nEC>=3 + (*x_Ptr2++) += x2; + #endif + #if nEC>=4 + (*x_Ptr3++) += x3; + #endif + #if nEC>=5 + (*x_Ptr4++) += x4; + #endif + #if nEC>=6 + (*x_Ptr5++) += x5; + #endif + #if nEC>=7 + (*x_Ptr6++) += x6; + #endif + #if nEC>=8 + (*x_Ptr7++) += x7; + #endif + #if nEC>=9 + (*x_Ptr8++) += x8; + #endif + #if nEC>=10 + (*x_Ptr9++) += x9; + #endif + #if nEC>=11 + (*x_Ptr10++) += x10; + #endif + #if nEC>=12 + (*x_Ptr11++) += x11; + #endif + #if nEC>=13 + (*x_Ptr12++) += x12; + #endif + #if nEC>=14 + (*x_Ptr13++) += x13; + #endif + #if nEC>=15 + (*x_Ptr14++) += x14; + #endif + #if nEC>=16 + (*x_Ptr15++) += x15; + #endif + #if nEC>=17 + (*x_Ptr16++) += x16; + #endif + #if nEC>=18 + (*x_Ptr17++) += x17; + #endif + #if nEC>=19 + (*x_Ptr18++) += x18; + #endif + #if nEC>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + x_Ptr14 = x_Ptr13 + nV; + #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + 
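/* The base offsets used throughout this file document the layout of the
   coefficient vector x: the nIC intra-cellular maps come first (nF entries
   each), then the nEC extra-cellular maps (nE entries each), then the nISO
   isotropic maps (nV entries each), for nIC*nF + nEC*nE + nISO*nV columns in
   total.  The helpers below mirror that pointer arithmetic; their names are
   hypothetical and the block is illustrative only, not compiled. */
#if 0
#include <stddef.h>

// column index of the k-th intra-cellular map for fiber f
static size_t x_idx_ic( int k, size_t f, size_t nF )
{ return (size_t)k*nF + f; }

// column index of the k-th extra-cellular map for EC compartment e
static size_t x_idx_ec( int k, size_t e, int nIC, size_t nF, size_t nE )
{ return (size_t)nIC*nF + (size_t)k*nE + e; }

// column index of the k-th isotropic map for voxel v
static size_t x_idx_iso( int k, size_t v, int nIC, size_t nF, int nEC, size_t nE, size_t nV )
{ return (size_t)nIC*nF + (size_t)nEC*nE + (size_t)k*nV + v; }
#endif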
#endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + Y_tmp = *Yptr; + x0 = (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + (*x_Ptr0++) += x0; + #if nISO>=2 + (*x_Ptr1++) += x1; + #endif + #if nISO>=3 + (*x_Ptr2++) += x2; + #endif + #if nISO>=4 + (*x_Ptr3++) += x3; + #endif + #if nISO>=5 + (*x_Ptr4++) += x4; + #endif + #if nISO>=6 + (*x_Ptr5++) += x5; 
+ #endif + #if nISO>=7 + (*x_Ptr6++) += x6; + #endif + #if nISO>=8 + (*x_Ptr7++) += x7; + #endif + #if nISO>=9 + (*x_Ptr8++) += x8; + #endif + #if nISO>=10 + (*x_Ptr9++) += x9; + #endif + #if nISO>=11 + (*x_Ptr10++) += x10; + #endif + #if nISO>=12 + (*x_Ptr11++) += x11; + #endif + #if nISO>=13 + (*x_Ptr12++) += x12; + #endif + #if nISO>=14 + (*x_Ptr13++) += x13; + #endif + #if nISO>=15 + (*x_Ptr14++) += x14; + #endif + #if nISO>=16 + (*x_Ptr15++) += x15; + #endif + #if nISO>=17 + (*x_Ptr16++) += x16; + #endif + #if nISO>=18 + (*x_Ptr17++) += x17; + #endif + #if nISO>=19 + (*x_Ptr18++) += x18; + #endif + #if nISO>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif 
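/* The cascades of #if blocks above and below only slice the flat lookup
   tables passed in from Cython: the k-th intra- or extra-cellular response
   function starts k*ndirs*nS floats into _wmrSFP / _wmhSFP (one nS-sample
   profile per precomputed direction), while the k-th isotropic profile starts
   k*nS floats into _isoSFP.  A loop-form sketch of the same slicing, using
   hypothetical pointer tables instead of the wmrSFP0..wmrSFP19-style globals
   (illustrative only, not compiled): */
#if 0
#include <stddef.h>

// Slice the flat LUTs into one pointer per response function.
static void slice_luts( const float *_wmrSFP, const float *_wmhSFP, const float *_isoSFP,
                        int nIC, int nEC, int nISO, int ndirs, int nS,
                        const float *wmrSFP[], const float *wmhSFP[], const float *isoSFP[] )
{
    for( int k = 0; k < nIC;  k++ ) wmrSFP[k] = _wmrSFP + (size_t)k * ndirs * nS;   // ndirs*nS samples per rotated kernel
    for( int k = 0; k < nEC;  k++ ) wmhSFP[k] = _wmhSFP + (size_t)k * ndirs * nS;
    for( int k = 0; k < nISO; k++ ) isoSFP[k] = _isoSFP + (size_t)k * nS;           // isotropic kernels have no direction axis
}
#endif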
+ #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreadsT = _ICthreadsT; + ECthreadsT = _ECthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t Checking availability of CUDA:\n"); - cudaStatus = cudaGetDeviceCount(&num_gpus); - - if (num_gpus <= 0 || num_gpus <= gpu_id) { - printf("\t* the selected GPU does not exist or is not detected \n"); - return false; - } - - if(cudaStatus == cudaSuccess){ - cudaDeviceProp gpu_properties; - cudaGetDeviceProperties(&gpu_properties, gpu_id); - - printf("\t* checking availability of CUDA ... [ OK ]\n"); - printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); - printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name); - - if (required_mem <= gpu_properties.totalGlobalMem) { - printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - else { - printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - - if(gpu_properties.major >= 5){ - printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); - } - else{ - printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); - return false; - } - - return true; - } - else{ - printf("\t* checking availability of CUDA ... 
[ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); - return false; - } -} - -CudaLinearOperator::CudaLinearOperator( - // pointers to IC data in CPU memory - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float* lengthIC, - float* lutIC, - // pointers to EC data in CPU memory - uint32_t* voxelEC, - uint16_t* orienEC, - float* lutEC, - // pointer to ISO data in CPU memory - float* lutISO, - // dataset constant values - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int ndiameters, - int nzeppelins, - int nballs, - - int fcall) -{ - this->nsegments = nsegments; - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; - - if (fcall == 1) { - int size_lutic = ndiameters*norientations*nsamples; - int size_lutec = nzeppelins*norientations*nsamples; - int size_lutiso = nballs*nsamples; - - size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); - checkCompatibility(required_mem, 0); - - // transfer constant values to the GPU - printf("\t* constant values ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc memory in GPU for vectors x and y - printf("\t* vectors x&y ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // pre-process data for GPU - printf("\t* pre-processing ... 
"); - cudaStatus = true; - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(segmentsPerBlock); - free(offsetPerBlock); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc and transfer LUTs - printf("\t* loading LUT ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc and transfer operator A - printf("\t* A operator... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - } - -} - -CudaLinearOperator::~CudaLinearOperator() {} - -void CudaLinearOperator::destroy(){ - bool cudaStatus; - - printf("\n-> Deleting GPU memory:\n"); - - printf("\t* deleting A... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* deleting A'... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TorienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TlengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* deleting x&y... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* deleting LUT... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* reseting GPU... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); -} - -void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths) -{ - printf("\t* A' operator... "); - cudaStatus = true; - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - if(fibersPerBlock == NULL || offsetPerBlock == NULL) printf("problemas\n"); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(fibersPerBlock); - free(offsetPerBlock); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); -} - -void cudaCheckKernel(){ - cudaError_t cudaStatus; - - cudaStatus = cudaGetLastError(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); - else - printf("\t* kernel launch... [ OK ]\n"); - - cudaStatus = cudaDeviceSynchronize(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); - else - printf("\t* cudaDeviceSynchronize() after launching kernel... [ OK ]\n"); -} - -void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - //cudaError_t cudaStatus; - - // Copy vector x to the GPU - cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering x to GPU ... 
[ OK ]\n");//*/ - - // Multiply IC part in the GPU - multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - - //cudaCheckKernel(); - - // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - - //cudaCheckKernel(); - - // Multiply ISO part in the GPU - multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - - //cudaCheckKernel(); - - // Copy back result to CPU - cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/ -} - -void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - - //cudaError_t cudaStatus; - // Copy vector y to the GPU - //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); - //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); - cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ - - // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - - //cudaCheckKernel(); - - // Multiply EC part in the GPU - multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - - //cudaCheckKernel(); - - // Multiply ISO part in the GPU - multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - - //cudaCheckKernel(); - - // Copy back result to CPU - cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering x to CPU ... 
[ OK ]\n");//*/ - - /*printf("\n\n VECTOR X EC PART:\n"); - for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) - printf("%lf ", x[i]); - printf("\n\n");//*/ -} - -// ------------------------------------------------------- KERNELS ------------------------------------------------------- // -__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[1024]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - uint32_t gid = threadIdx.x / 512; - uint32_t sid = threadIdx.x - 512*gid; - - shmem[tid] = 0.0; - - if(sid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; - uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; - - //segment_t* segment = segments + offset; - uint32_t* voxel = voxelIDs + offset; - uint32_t* fiber = fiberIDs + offset; - uint16_t* orien = orienIDs + offset; - float32_t* length = lengths + offset; - - float64_t sum = 0.0; - - for(int i = 0; i < nsegments; i++){ - int offset_lut = (*orien)*NUM_SAMPLES + sid; - - float64_t aux = 0.0; - for(int j = 0; j < NUM_DIAMETERS; j++){ - aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; - //aux += tex1Dfetch(tex_lutIC, offset_lut + j*num_orientations*num_samples) * x[(*fiber) + j*num_fibers]; - } - - sum += aux * (*length); - - fiber++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < NUM_SAMPLES) - y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; -} - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = segmentsPerBlock[bid]; - - //compartmentEC_t* excomp = excomps + offset; - uint32_t* voxel = voxelIDs + offset; - uint16_t* orien = orienIDs + offset; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; - - float64_t sum = 0.0; - for(int i = 0; i < nsegments; i++){ - uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; - - for(int j = 0; j < NUM_ZEPPELINS; j++) - sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; - //sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; - - orien++; - } - - y[(*voxel)*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - float64_t sum = 0.0; - for(int j = 0; j < NUM_BALLS; j++) - sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; - //sum += (double)(tex1Dfetch(tex_lutISO, j*num_samples + tid))*x[target + j*num_voxels]; - - - y[bid*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Aty_ICpart( - uint32_t* voxelICt, - uint32_t* fiberICt, - uint16_t* orienICt, - float32_t* lengthICt, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - 
uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - /*if(bid == 0 && tid == 0){ - for(int i = 0; i < 10; i++){ - printf("%d %d %d %f\n", voxelICt[i], fiberICt[i], orientICt[i], lengthICt[i]); - } - } - else if(bid != 0) return; - //__syncthreads();//*/ - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = offset + compartmentsPerBlock[bid]; - - //segment_t* segment = segments + offset; - uint32_t* voxel = voxelICt + offset; - uint32_t* fiber = fiberICt + offset; - uint16_t* orien = orienICt + offset; - float32_t* length = lengthICt + offset; - //uint fiber = segment->fiber; - - for(int j = 0; j < NUM_DIAMETERS; j++){ - int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - float64_t sum = 0.0; - //segment = segments + offset; - voxel = voxelICt + offset; - orien = orienICt + offset; - length = lengthICt + offset; - for(int i = offset; i < nsegments; i++){ - sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; - //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orient)*num_samples) )* y[(*voxel)*num_samples + tid]; - //segment++; - voxel++; - //fiber++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - //if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); - - if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - - __syncthreads(); - } -} - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t ncompartments = segmentsPerBlock[bid] + offset; - - //compartmentEC_t* peak = peaks + offset; - uint32_t* voxel = voxelEC + offset; - uint16_t* orien = orienEC + offset; - - for(int j = 0; j < NUM_ZEPPELINS; j++){ - uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - //peak = peaks + offset; - voxel = voxelEC + offset; - orien = orienEC + offset; - for(int i = offset; i < ncompartments; i++){ - //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orient)*num_samples + offset_lut) )* y[(*voxel)*num_samples + tid]; - shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; - __syncthreads(); - - //if(bid == 0){ - //printf("%lf\n", lut[(peak->orientation)*num_samples + lut_offset] * y[(peak->voxel)*num_samples + tid]); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - if(tid < 2) 
shmem[tid] += shmem[tid + 2]; __syncthreads(); - - if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; - //} - - //peak++; - voxel++; - orien++; - __syncthreads(); - } - } -} //*/ - -__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ - __shared__ double shmem[512]; - - uint bid = blockIdx.x; - uint tid = threadIdx.x; - uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - for(int j = 0; j < NUM_BALLS; j++){ - shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*num_samples + tid) )* y[bid*num_samples + tid]; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - - if(tid == 0) - x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - } -}//*/ - +#include "operator_withCUDA.cuh" + +bool cudaCheck(cudaError_t cudaStatus){ + return cudaStatus == cudaSuccess; +} + +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ + + // fill arrays with zeros + memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + + // count compartments per block + for(int i = 0; i < NUM_COMPARTMENTS; i++) + compartmentsPerBlock[data[i]]++; + + // calculate offset per block + offsetPerBlock[0] = 0; + for(int i = 1; i < NUM_BLOCKS; i++) + offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; +} + +bool checkCompatibility(size_t required_mem, int gpu_id) { + int num_gpus; + cudaError_t cudaStatus; + + cudaStatus = cudaGetDeviceCount(&num_gpus); + + if (num_gpus <= 0 || num_gpus <= gpu_id) { + printf("\t* the selected GPU does not exist or is not detected \n"); + return false; + } + + if(cudaStatus == cudaSuccess){ + cudaDeviceProp gpu_properties; + cudaGetDeviceProperties(&gpu_properties, gpu_id); + + printf("\t* checking availability of CUDA ... [ OK ]\n"); + printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); + printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name); + + if (required_mem <= gpu_properties.totalGlobalMem) { + printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + else { + printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + } + + if(gpu_properties.major >= 5){ + printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); + } + else{ + printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); + return false; + } + + return true; + } + else{ + printf("\t* checking availability of CUDA ... 
[ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); + return false; + } +} + +CudaLinearOperator::CudaLinearOperator( + // pointers to IC data in CPU memory + uint32_t* voxelIC, + uint32_t* fiberIC, + uint16_t* orienIC, + float* lengthIC, + float* lutIC, + // pointers to EC data in CPU memory + uint32_t* voxelEC, + uint16_t* orienEC, + float* lutEC, + // pointer to ISO data in CPU memory + float* lutISO, + // dataset constant values + int nsegments, + int nvoxels, + int nfibers, + int npeaks, + int norientations, + int nsamples, + int ndiameters, + int nzeppelins, + int nballs, + + int fcall) +{ + this->nsegments = nsegments; + this->nvoxels = nvoxels; + this->nfibers = nfibers; + this->nrows = nvoxels * nsamples; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + + if (fcall == 1) { + int size_lutic = ndiameters*norientations*nsamples; + int size_lutec = nzeppelins*norientations*nsamples; + int size_lutiso = nballs*nsamples; + + size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); + checkCompatibility(required_mem, 0); + + // transfer constant values to the GPU + printf("\t* constant values ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc memory in GPU for vectors x and y + printf("\t* vectors x&y ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // pre-process data for GPU + printf("\t* pre-processing ... 
"); + cudaStatus = true; + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(segmentsPerBlock); + free(offsetPerBlock); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc and transfer LUTs + printf("\t* loading LUT ... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc and transfer operator A + printf("\t* A operator... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + } + +} + +CudaLinearOperator::~CudaLinearOperator() {} + +void CudaLinearOperator::destroy(){ + bool cudaStatus; + + printf("\n-> Deleting GPU memory:\n"); + + printf("\t* deleting A... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting A'... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TorienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TlengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting x&y... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting LUT... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* reseting GPU... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); +} + +void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths) +{ + printf("\t* A' operator... "); + cudaStatus = true; + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + + if(fibersPerBlock == NULL || offsetPerBlock == NULL) printf("problemas\n"); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(fibersPerBlock); + free(offsetPerBlock); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); +} + +void cudaCheckKernel(){ + cudaError_t cudaStatus; + + cudaStatus = cudaGetLastError(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); + else + printf("\t* kernel launch... [ OK ]\n"); + + cudaStatus = cudaDeviceSynchronize(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); + else + printf("\t* cudaDeviceSynchronize() after launching kernel... [ OK ]\n"); +} + +void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ + //cudaError_t cudaStatus; + + // Copy vector x to the GPU + cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering x to GPU ... 
[ OK ]\n");//*/ + + // Multiply IC part in the GPU + multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Multiply EC part in the GPU + multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Copy back result to CPU + cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/ +} + +void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ + + //cudaError_t cudaStatus; + // Copy vector y to the GPU + //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); + //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); + cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ + + // Multiply IC part in the GPU + multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Multiply EC part in the GPU + multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Multiply ISO part in the GPU + multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + + //cudaCheckKernel(); + + // Copy back result to CPU + cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); + else printf("\t* tranfering x to CPU ... 
[ OK ]\n");//*/ + + /*printf("\n\n VECTOR X EC PART:\n"); + for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) + printf("%lf ", x[i]); + printf("\n\n");//*/ +} + +// ------------------------------------------------------- KERNELS ------------------------------------------------------- // +__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[1024]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + uint32_t gid = threadIdx.x / 512; + uint32_t sid = threadIdx.x - 512*gid; + + shmem[tid] = 0.0; + + if(sid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; + uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; + + uint32_t* voxel = voxelIDs + offset; + uint32_t* fiber = fiberIDs + offset; + uint16_t* orien = orienIDs + offset; + float32_t* length = lengths + offset; + + float64_t sum = 0.0; + + for(int i = 0; i < nsegments; i++){ + int offset_lut = (*orien)*NUM_SAMPLES + sid; + + float64_t aux = 0.0; + for(int j = 0; j < NUM_DIAMETERS; j++){ + aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; + //aux += tex1Dfetch(tex_lutIC, offset_lut + j*num_orientations*num_samples) * x[(*fiber) + j*num_fibers]; + } + + sum += aux * (*length); + + fiber++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < NUM_SAMPLES) + y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; +} + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = segmentsPerBlock[bid]; + + uint32_t* voxel = voxelIDs + offset; + uint16_t* orien = orienIDs + offset; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; + + float64_t sum = 0.0; + for(int i = 0; i < nsegments; i++){ + uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; + + for(int j = 0; j < NUM_ZEPPELINS; j++) + sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + //sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; + + orien++; + } + + y[(*voxel)*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + float64_t sum = 0.0; + for(int j = 0; j < NUM_BALLS; j++) + sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + //sum += (double)(tex1Dfetch(tex_lutISO, j*num_samples + tid))*x[target + j*num_voxels]; + + + y[bid*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Aty_ICpart( + uint32_t* voxelICt, + uint32_t* fiberICt, + uint16_t* orienICt, + float32_t* lengthICt, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + 
uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = offset + compartmentsPerBlock[bid]; + + uint32_t* voxel = voxelICt + offset; + uint32_t* fiber = fiberICt + offset; + uint16_t* orien = orienICt + offset; + float32_t* length = lengthICt + offset; + + for(int j = 0; j < NUM_DIAMETERS; j++){ + int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + float64_t sum = 0.0; + voxel = voxelICt + offset; + orien = orienICt + offset; + length = lengthICt + offset; + for(int i = offset; i < nsegments; i++){ + sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orient)*num_samples) )* y[(*voxel)*num_samples + tid]; + + voxel++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + + __syncthreads(); + } +} + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t ncompartments = segmentsPerBlock[bid] + offset; + + uint32_t* voxel = voxelEC + offset; + uint16_t* orien = orienEC + offset; + + for(int j = 0; j < NUM_ZEPPELINS; j++){ + uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + voxel = voxelEC + offset; + orien = orienEC + offset; + for(int i = offset; i < ncompartments; i++){ + //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orient)*num_samples + offset_lut) )* y[(*voxel)*num_samples + tid]; + shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); + + if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; + + voxel++; + orien++; + __syncthreads(); + } + } +} + +__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ + __shared__ double shmem[512]; + + uint bid = blockIdx.x; + uint tid = threadIdx.x; + uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + for(int j = 0; j < NUM_BALLS; j++){ + shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*num_samples + tid) )* y[bid*num_samples + 
tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) + x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + } +} + diff --git a/commit/solvers.py b/commit/solvers.py index ce4325fb..8e86d5c5 100755 --- a/commit/solvers.py +++ b/commit/solvers.py @@ -1,403 +1,403 @@ -""" -Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona - -This structure is based on the previous work of Rafael Carrillo and was -supported by the LTS5 laboratory at EPFL, Lausanne. -""" -from __future__ import print_function -import numpy as np -from math import sqrt -import sys -import warnings -eps = np.finfo(float).eps - -from commit.proximals import (non_negativity, - omega_group_sparsity, - prox_group_sparsity, - soft_thresholding, - projection_onto_l2_ball) -group_sparsity = -1 -non_negative = 0 -norm1 = 1 -norm2 = 2 -norminf = np.inf -list_regnorms = [group_sparsity, non_negative, norm1, norm2] -list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 - - -def init_regularisation(commit_evaluation, - regnorms = (non_negative, non_negative, non_negative), - structureIC = None, weightsIC = None, group_norm = 2, - lambdas = (.0,.0,.0) ): - """ - Initialise the data structure that defines Omega in - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - - Input - ----- - commit_evaluation - commit.Evaluation object : - dictionary and model have to be loaded beforehand. - - - regnorms - tuple : - this sets the penalty term to be used for each compartment. - Default = (non_negative,non_negative,non_negative). - - regnorms[0] corresponds to the Intracellular compartment - regnorms[1] corresponds to the Extracellular compartment - regnorms[2] corresponds to the Isotropic compartment - - Each regnorms[k] must be one of commit.solvers. - {group_sparsity, non_negative, norm1, norm2}. - - commit.solvers.group_sparsity considers both the non-overlapping - and the hierarchical group sparsity (see [1]). This option is - allowed only in the IC compartment. The mathematical formulation - of this term is - $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| - - commit.solvers.non_negative puts a non negativity constraint on the - coefficients corresponding to the compartment. This is the - default option for each compartment - - commit.solvers.norm1 penalises with the 1-norm of the coefficients - corresponding to the compartment. - - commit.solvers.norm2 penalises with the 2-norm of the coefficients - corresponding to the compartment. - - - structureIC - np.array(list(list)) : - group structure for the IC compartment. - This field is necessary only if regterm[0]=commit.solver.group_sparsity. - Example: - structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) - - that is equivalent to - [0,1,2,3,4,5] [6] - / \ - [0,2,5] [1,3,4] - which has two non overlapping groups, one of which is the union - of two other non-overlapping groups. - - - weightsIC - np.array(np.float64) : - this defines the weights associated to each group of structure IC. - - - group_norm - number : - norm type for the commit.solver.group_sparsity penalisation of the IC compartment. 
- Default: group_norm = commit.solver.norm2 - To be chosen among commit.solver.{norm2,norminf}. - - lambdas - tuple : - regularisation parameter for each compartment. - Default: lambdas = (0.0, 0.0, 0.0) - The lambdas correspond to the onse described in the mathematical - formulation of the regularisation term - $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ - - - References: - [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' - """ - regularisation = {} - - regularisation['startIC'] = 0 - regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) - regularisation['startEC'] = int( regularisation['sizeIC'] ) - regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) - regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) - regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) - - regularisation['normIC'] = regnorms[0] - regularisation['normEC'] = regnorms[1] - regularisation['normISO'] = regnorms[2] - - regularisation['lambdaIC'] = float( lambdas[0] ) - regularisation['lambdaEC'] = float( lambdas[1] ) - regularisation['lambdaISO'] = float( lambdas[2] ) - - # Solver-specific fields - regularisation['structureIC'] = structureIC - regularisation['weightsIC'] = weightsIC - regularisation['group_norm'] = group_norm - - return regularisation - - -def regularisation2omegaprox(regularisation): - lambdaIC = float(regularisation.get('lambdaIC')) - lambdaEC = float(regularisation.get('lambdaEC')) - lambdaISO = float(regularisation.get('lambdaISO')) - if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: - raise ValueError('Negative regularisation parameters are not allowed') - - normIC = regularisation.get('normIC') - normEC = regularisation.get('normEC') - normISO = regularisation.get('normISO') - if not normIC in list_regnorms: - raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normEC in list_regnorms: - raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normISO in list_regnorms: - raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - - ## NNLS case - if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - return omega, prox - - ## All other cases - # Intracellular Compartment - startIC = regularisation.get('startIC') - sizeIC = regularisation.get('sizeIC') - if lambdaIC == 0.0: - omegaIC = lambda x: 0.0 - proxIC = lambda x: x - elif normIC == norm2: - omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) - proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) - elif normIC == norm1: - omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) - proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) - elif normIC == non_negative: - omegaIC = lambda x: 0.0 - proxIC = lambda x: non_negativity(x, startIC, sizeIC) - elif normIC == group_sparsity: - structureIC = regularisation.get('structureIC') - groupWeightIC = regularisation.get('weightsIC') - if not len(structureIC) == len(groupWeightIC): - raise ValueError('Number of groups and weights do not coincide.') - 
group_norm = regularisation.get('group_norm') - if not group_norm in list_group_sparsity_norms: - raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) - - # convert to new data structure (needed for faster access) - N = np.sum([g.size for g in structureIC]) - groupIdxIC = np.zeros( (N,), dtype=np.int32 ) - groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) - pos = 0 - for i, g in enumerate(structureIC) : - groupSizeIC[i] = g.size - groupIdxIC[pos:(pos+g.size)] = g[:] - pos += g.size - - omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - else: - raise ValueError('Type of regularisation for IC compartment not recognized.') - - - # Extracellular Compartment - startEC = regularisation.get('startEC') - sizeEC = regularisation.get('sizeEC') - if lambdaEC == 0.0: - omegaEC = lambda x: 0.0 - proxEC = lambda x: x - elif normEC == norm2: - omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) - proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) - elif normEC == norm1: - omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) - proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) - elif normEC == non_negative: - omegaEC = lambda x: 0.0 - proxEC = lambda x: non_negativity(x, startEC, sizeEC) - else: - raise ValueError('Type of regularisation for EC compartment not recognized.') - - # Isotropic Compartment - startISO = regularisation.get('startISO') - sizeISO = regularisation.get('sizeISO') - if lambdaISO == 0.0: - omegaISO = lambda x: 0.0 - proxISO = lambda x: x - elif normISO == norm2: - omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) - proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) - elif normISO == norm1: - omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) - proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) - elif normISO == non_negative: - omegaISO = lambda x: 0.0 - proxISO = lambda x: non_negativity(x, startISO, sizeISO) - else: - raise ValueError('Type of regularisation for ISO compartment not recognized.') - - omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) - prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced - - return omega, prox - - -def evaluate_model(y, A, x, regularisation = None): - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - else: - omega, _ = regularisation2omegaprox(regularisation) - - return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) - - -def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the Omega described by 'regularisation'. - - Check the documentation of commit.solvers.init_regularisation to see how to - solve a specific problem. 
- """ - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, x.size) - else: - omega, prox = regularisation2omegaprox(regularisation) - - if x0 is None: - x0 = np.zeros(A.shape[1]) - - return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) - - -def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the FISTA algorithm described in [1]. - - The penalty term and its proximal operator must be defined in such a way - that they already contain the regularisation parameter. - - References: - [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding - Algorithm for Linear Inverse Problems` - """ - - # Initialization - res = -y.copy() - xhat = x0.copy() - x = np.zeros_like(xhat) - res += A.dot(xhat) - proximal( xhat ) - reg_term = omega( xhat ) - prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term - - told = 1 - beta = 0.9 - prev_x = xhat.copy() - grad = np.asarray(At.dot(res)) - qfval = prev_obj - - # Step size computation - L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 - mu = 1.9 / L - - # Main loop - if verbose >= 1 : - print() - print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x" ) - print( "------|---------------------------------|-----------------------------------------------|------------------------------" ) - iter = 1 - while True : - if verbose >= 1 : - print( "%4d |" % iter, end="" ) - sys.stdout.flush() - - # Smooth step - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Backtracking - while curr_obj > q : - # Smooth step - mu = beta*mu - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Global stopping criterion - abs_obj = abs(curr_obj - prev_obj) - rel_obj = abs_obj / curr_obj - abs_x = np.linalg.norm(x - prev_x) - rel_x = abs_x / ( np.linalg.norm(x) + eps ) - if verbose >= 1 : - print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) - - if abs_obj < eps : - criterion = "Absolute tolerance on the objective" - break - elif rel_obj < tol_fun : - criterion = "Relative tolerance on the objective" - break - elif abs_x < eps : - criterion = "Absolute tolerance on the unknown" - break - elif rel_x < tol_x : - criterion = "Relative tolerance on the unknown" - break - elif iter >= max_iter : - criterion = "Maximum number of iterations" - break - - # FISTA update - t = 0.5 * ( 1 + sqrt(1+4*told**2) ) - xhat = x + (told-1)/t * (x - prev_x) - - # Gradient computation - res = A.dot(xhat) - y - xarr = np.asarray(x) - - grad = np.asarray(At.dot(res)) - - # Update variables - iter += 1 - prev_obj = curr_obj - prev_x = x.copy() - told = t - qfval = 0.5 * np.linalg.norm(res)**2 - - - if verbose >= 1 : - print( "< Stopping criterion: %s >" % criterion ) - - opt_details = {} - opt_details['residual'] = 0.5*res_norm**2 - opt_details['regterm'] = reg_term_x - 
opt_details['cost_function'] = curr_obj - opt_details['abs_cost'] = abs_obj - opt_details['rel_cost'] = rel_obj - opt_details['abs_x'] = abs_x - opt_details['rel _x'] = rel_x - opt_details['iterations'] = iter - opt_details['stopping_criterion'] = criterion - - return x, opt_details +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +from __future__ import print_function +import numpy as np +from math import sqrt +import sys +import warnings +eps = np.finfo(float).eps + +from commit.proximals import (non_negativity, + omega_group_sparsity, + prox_group_sparsity, + soft_thresholding, + projection_onto_l2_ball) +group_sparsity = -1 +non_negative = 0 +norm1 = 1 +norm2 = 2 +norminf = np.inf +list_regnorms = [group_sparsity, non_negative, norm1, norm2] +list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 + + +def init_regularisation(commit_evaluation, + regnorms = (non_negative, non_negative, non_negative), + structureIC = None, weightsIC = None, group_norm = 2, + lambdas = (.0,.0,.0) ): + """ + Initialise the data structure that defines Omega in + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + + Input + ----- + commit_evaluation - commit.Evaluation object : + dictionary and model have to be loaded beforehand. + + + regnorms - tuple : + this sets the penalty term to be used for each compartment. + Default = (non_negative,non_negative,non_negative). + + regnorms[0] corresponds to the Intracellular compartment + regnorms[1] corresponds to the Extracellular compartment + regnorms[2] corresponds to the Isotropic compartment + + Each regnorms[k] must be one of commit.solvers. + {group_sparsity, non_negative, norm1, norm2}. + + commit.solvers.group_sparsity considers both the non-overlapping + and the hierarchical group sparsity (see [1]). This option is + allowed only in the IC compartment. The mathematical formulation + of this term is + $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| + + commit.solvers.non_negative puts a non negativity constraint on the + coefficients corresponding to the compartment. This is the + default option for each compartment + + commit.solvers.norm1 penalises with the 1-norm of the coefficients + corresponding to the compartment. + + commit.solvers.norm2 penalises with the 2-norm of the coefficients + corresponding to the compartment. + + + structureIC - np.array(list(list)) : + group structure for the IC compartment. + This field is necessary only if regterm[0]=commit.solver.group_sparsity. + Example: + structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) + + that is equivalent to + [0,1,2,3,4,5] [6] + / \ + [0,2,5] [1,3,4] + which has two non overlapping groups, one of which is the union + of two other non-overlapping groups. + + + weightsIC - np.array(np.float64) : + this defines the weights associated to each group of structure IC. + + + group_norm - number : + norm type for the commit.solver.group_sparsity penalisation of the IC compartment. + Default: group_norm = commit.solver.norm2 + To be chosen among commit.solver.{norm2,norminf}. + + lambdas - tuple : + regularisation parameter for each compartment. + Default: lambdas = (0.0, 0.0, 0.0) + The lambdas correspond to the onse described in the mathematical + formulation of the regularisation term + $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ + + + References: + [1] Jenatton et al. 
- 'Proximal Methods for Hierarchical Sparse Coding' + """ + regularisation = {} + + regularisation['startIC'] = 0 + regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) + regularisation['startEC'] = int( regularisation['sizeIC'] ) + regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) + regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) + regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) + + regularisation['normIC'] = regnorms[0] + regularisation['normEC'] = regnorms[1] + regularisation['normISO'] = regnorms[2] + + regularisation['lambdaIC'] = float( lambdas[0] ) + regularisation['lambdaEC'] = float( lambdas[1] ) + regularisation['lambdaISO'] = float( lambdas[2] ) + + # Solver-specific fields + regularisation['structureIC'] = structureIC + regularisation['weightsIC'] = weightsIC + regularisation['group_norm'] = group_norm + + return regularisation + + +def regularisation2omegaprox(regularisation): + lambdaIC = float(regularisation.get('lambdaIC')) + lambdaEC = float(regularisation.get('lambdaEC')) + lambdaISO = float(regularisation.get('lambdaISO')) + if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: + raise ValueError('Negative regularisation parameters are not allowed') + + normIC = regularisation.get('normIC') + normEC = regularisation.get('normEC') + normISO = regularisation.get('normISO') + if not normIC in list_regnorms: + raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normEC in list_regnorms: + raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normISO in list_regnorms: + raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + + ## NNLS case + if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + return omega, prox + + ## All other cases + # Intracellular Compartment + startIC = regularisation.get('startIC') + sizeIC = regularisation.get('sizeIC') + if lambdaIC == 0.0: + omegaIC = lambda x: 0.0 + proxIC = lambda x: x + elif normIC == norm2: + omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) + proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) + elif normIC == norm1: + omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) + proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) + elif normIC == non_negative: + omegaIC = lambda x: 0.0 + proxIC = lambda x: non_negativity(x, startIC, sizeIC) + elif normIC == group_sparsity: + structureIC = regularisation.get('structureIC') + groupWeightIC = regularisation.get('weightsIC') + if not len(structureIC) == len(groupWeightIC): + raise ValueError('Number of groups and weights do not coincide.') + group_norm = regularisation.get('group_norm') + if not group_norm in list_group_sparsity_norms: + raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' 
% str(list_group_sparsity_norms)) + + # convert to new data structure (needed for faster access) + N = np.sum([g.size for g in structureIC]) + groupIdxIC = np.zeros( (N,), dtype=np.int32 ) + groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) + pos = 0 + for i, g in enumerate(structureIC) : + groupSizeIC[i] = g.size + groupIdxIC[pos:(pos+g.size)] = g[:] + pos += g.size + + omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + else: + raise ValueError('Type of regularisation for IC compartment not recognized.') + + + # Extracellular Compartment + startEC = regularisation.get('startEC') + sizeEC = regularisation.get('sizeEC') + if lambdaEC == 0.0: + omegaEC = lambda x: 0.0 + proxEC = lambda x: x + elif normEC == norm2: + omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) + proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) + elif normEC == norm1: + omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) + proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) + elif normEC == non_negative: + omegaEC = lambda x: 0.0 + proxEC = lambda x: non_negativity(x, startEC, sizeEC) + else: + raise ValueError('Type of regularisation for EC compartment not recognized.') + + # Isotropic Compartment + startISO = regularisation.get('startISO') + sizeISO = regularisation.get('sizeISO') + if lambdaISO == 0.0: + omegaISO = lambda x: 0.0 + proxISO = lambda x: x + elif normISO == norm2: + omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) + proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) + elif normISO == norm1: + omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) + proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) + elif normISO == non_negative: + omegaISO = lambda x: 0.0 + proxISO = lambda x: non_negativity(x, startISO, sizeISO) + else: + raise ValueError('Type of regularisation for ISO compartment not recognized.') + + omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) + prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced + + return omega, prox + + +def evaluate_model(y, A, x, regularisation = None): + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + else: + omega, _ = regularisation2omegaprox(regularisation) + + return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) + + +def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the Omega described by 'regularisation'. + + Check the documentation of commit.solvers.init_regularisation to see how to + solve a specific problem. + """ + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, x.size) + else: + omega, prox = regularisation2omegaprox(regularisation) + + if x0 is None: + x0 = np.zeros(A.shape[1]) + + return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) + + +def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the FISTA algorithm described in [1]. 
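A self-contained NumPy sketch of the kind of (omega, prox) pair that regularisation2omegaprox() above assembles and that this solver consumes; the block layout, the L1 choice for the IC part and the value of lambdaIC are toy assumptions, and the in-place proximals only mimic commit.proximals rather than reproduce them:

import numpy as np

startIC, sizeIC, n = 0, 50, 80    # toy layout: IC block first, then EC/ISO coefficients
lambdaIC = 0.1                    # toy regularisation parameter

def omega(x):
    # penalty value; the regularisation parameter is already folded in,
    # exactly as required by the solver below
    return lambdaIC * np.sum(np.abs(x[startIC:startIC + sizeIC]))

def prox(x):
    # proximal map applied in place: soft-thresholding on the IC block,
    # then global non-negativity (mirroring the composition returned above)
    block = x[startIC:startIC + sizeIC]
    x[startIC:startIC + sizeIC] = np.sign(block) * np.maximum(np.abs(block) - lambdaIC, 0.0)
    np.maximum(x, 0.0, out=x)
    return x

x = np.random.randn(n)
print(omega(x), prox(x).min())    # penalty value; the vector is non-negative after the prox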
+ + The penalty term and its proximal operator must be defined in such a way + that they already contain the regularisation parameter. + + References: + [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding + Algorithm for Linear Inverse Problems` + """ + + # Initialization + res = -y.copy() + xhat = x0.copy() + x = np.zeros_like(xhat) + res += A.dot(xhat) + proximal( xhat ) + reg_term = omega( xhat ) + prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term + + told = 1 + beta = 0.9 + prev_x = xhat.copy() + grad = np.asarray(At.dot(res)) + qfval = prev_obj + + # Step size computation + L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 + mu = 1.9 / L + + # Main loop + if verbose >= 1 : + print() + print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x" ) + print( "------|---------------------------------|-----------------------------------------------|------------------------------" ) + iter = 1 + while True : + if verbose >= 1 : + print( "%4d |" % iter, end="" ) + sys.stdout.flush() + + # Smooth step + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Backtracking + while curr_obj > q : + # Smooth step + mu = beta*mu + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Global stopping criterion + abs_obj = abs(curr_obj - prev_obj) + rel_obj = abs_obj / curr_obj + abs_x = np.linalg.norm(x - prev_x) + rel_x = abs_x / ( np.linalg.norm(x) + eps ) + if verbose >= 1 : + print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) + + if abs_obj < eps : + criterion = "Absolute tolerance on the objective" + break + elif rel_obj < tol_fun : + criterion = "Relative tolerance on the objective" + break + elif abs_x < eps : + criterion = "Absolute tolerance on the unknown" + break + elif rel_x < tol_x : + criterion = "Relative tolerance on the unknown" + break + elif iter >= max_iter : + criterion = "Maximum number of iterations" + break + + # FISTA update + t = 0.5 * ( 1 + sqrt(1+4*told**2) ) + xhat = x + (told-1)/t * (x - prev_x) + + # Gradient computation + res = A.dot(xhat) - y + xarr = np.asarray(x) + + grad = np.asarray(At.dot(res)) + + # Update variables + iter += 1 + prev_obj = curr_obj + prev_x = x.copy() + told = t + qfval = 0.5 * np.linalg.norm(res)**2 + + + if verbose >= 1 : + print( "< Stopping criterion: %s >" % criterion ) + + opt_details = {} + opt_details['residual'] = 0.5*res_norm**2 + opt_details['regterm'] = reg_term_x + opt_details['cost_function'] = curr_obj + opt_details['abs_cost'] = abs_obj + opt_details['rel_cost'] = rel_obj + opt_details['abs_x'] = abs_x + opt_details['rel _x'] = rel_x + opt_details['iterations'] = iter + opt_details['stopping_criterion'] = criterion + + return x, opt_details diff --git a/commit/trk2dictionary/trk2dictionary.pyx b/commit/trk2dictionary/trk2dictionary.pyx index 410f92f1..c48d8366 100755 --- a/commit/trk2dictionary/trk2dictionary.pyx +++ b/commit/trk2dictionary/trk2dictionary.pyx @@ -1,456 +1,456 @@ 
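Stripped of the backtracking loop and the stopping tests, the update implemented by fista() above reduces to the following NumPy sketch; the toy operator A and the pure non-negativity proximal are illustrative stand-ins, not the library code:

import numpy as np

def fista_iteration(A, At, y, x_prev, xhat, t_prev, mu, omega, prox):
    # forward (gradient) step on the smooth term at the extrapolated point,
    # backward (proximal) step on Omega, then the FISTA momentum update
    grad = np.asarray(At.dot(A.dot(xhat) - y))
    x = xhat - mu * grad
    prox(x)                                              # in place, as in the solver above
    t = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t_prev ** 2))
    xhat_next = x + (t_prev - 1.0) / t * (x - x_prev)
    cost = 0.5 * np.linalg.norm(A.dot(x) - y) ** 2 + omega(x)
    return x, xhat_next, t, cost

# toy non-negative least-squares run
A = np.random.rand(20, 10)
y = A.dot(np.random.rand(10))
x, xhat, t = np.zeros(10), np.zeros(10), 1.0
mu = 1.0 / np.linalg.norm(A, 2) ** 2                     # conservative step, below the 1.9/L used above
for _ in range(100):
    x, xhat, t, cost = fista_iteration(A, A.T, y, x, xhat, t, mu,
                                       lambda v: 0.0,                       # NNLS: no penalty value
                                       lambda v: np.maximum(v, 0.0, out=v)) # in-place non-negativity
print(cost)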
-#!python -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False -from __future__ import print_function -import cython -import numpy as np -cimport numpy as np -import nibabel -from os.path import join, exists, splitext -from os import makedirs, remove -import time -import amico -import pickle - - -# Interface to actual C code -cdef extern from "trk2dictionary_c.cpp": - int trk2dictionary( - char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, - int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrAFFINE, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrArrayInvM, unsigned short ndirs, short* prtHashTable - ) nogil - - -cpdef run( filename_tractogram = None, path_out = None, filename_peaks = None, filename_mask = None, do_intersect = True, - fiber_shift = 0, points_to_skip = 0, vf_THR = 0.1, peaks_use_affine = False, - flip_peaks = [False,False,False], min_seg_len = 1e-3, gen_trk = True, - blur_radii = [], blur_samples = [], blur_sigma = 1.0, filename_trk = None, TCK_ref_image = None, ndirs = 32761 - ): - """Perform the conversion of a tractoram to the sparse data-structure internally - used by COMMIT to perform the matrix-vector multiplications with the operator A - during the inversion of the linear system. - - Parameters - ---------- - filename_tractogram : string - Path to the .trk or .tck file containing the tractogram to load. - - filename_trk : string - DEPRECATED. Use filename_tractogram instead. - - path_out : string - Path to the folder where to store the sparse data structure. - - filename_peaks : string - Path to the NIFTI file containing the peaks to use as extra-cellular contributions. - The data matrix should be 4D with last dimension 3*N, where N is the number - of peaks in each voxel. (default : no extra-cellular contributions) - - filename_mask : string - Path to a binary mask to restrict the analysis to specific areas. Segments - outside this mask are discarded. If not specified (default), the mask is created from - all voxels intersected by the tracts. - - do_intersect : boolean - If True then fiber segments that intersect voxel boundaries are splitted (default). - If False then the centroid of the segment is used as its voxel position. - - fiber_shift : float or list of three float - If necessary, apply a translation to fiber coordinates (default : 0) to account - for differences between the reference system of the tracking algorithm and COMMIT. - The value is specified in voxel units, eg 0.5 translates by half voxel. - Do noth use if you are using fiber_shiftX or fiber_shiftY or fiber_shiftZ. - - points_to_skip : integer - If necessary, discard first points at beginning/end of a fiber (default : 0). - - vf_THR : float - Discard peaks smaller than vf_THR * max peak (default : 0.1). - - peaks_use_affine : boolean - Whether to rotate the peaks according to the affine matrix (default : False). - - flip_peaks : list of three boolean - If necessary, flips peak orientations along each axis (default : no flipping). 
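To make the vf_THR and flip_peaks parameters above concrete, here is a small NumPy sketch of the per-voxel peak selection that the EC export applies (the peak values and the chosen flip are arbitrary examples, not library code; the remaining parameters of run() continue below):

import numpy as np

vf_THR = 0.1
flip_peaks = [False, True, False]            # example: flip the y component of every peak

peaks = np.random.randn(4, 3)                # toy voxel with 4 peaks (the NIFTI stores them as 3*N values)
flips = np.where(flip_peaks)[0]
peaks[:, flips] *= -1.0                      # apply the requested axis flips

norms = np.linalg.norm(peaks, axis=1)        # one norm per peak
keep = norms >= vf_THR * norms.max()         # discard peaks smaller than vf_THR * max peak
print(peaks[keep] / norms[keep, None])       # unit directions of the retained peaks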
- - min_seg_len : float - Discard segments <= than this length in mm (default : 1e-3) - - gen_trk : boolean - If True then generate a .trk file in the 'path_out' containing the fibers used in the dictionary (default : True) - - blur_radii : list of float - Translate each segment to given radii to assign a broader fiber contribution (default : []) - - blur_samples : list of integer - Segments are duplicated along a circle at a given radius; this parameter controls the number of samples to take over a given circle (defaut : []) - - blur_sigma: float - The contributions of the segments at different radii are damped as a Gaussian (default : 1.0) - - TCK_ref_image: string - Path to the NIFTI file containing the information about the geometry used for the tractogram .tck to load. - If it is not specified, it will try to use the information of filename_peaks or filename_mask. - - ndirs : int - Number of directions on the half of the sphere - """ - - filename = path_out + '/dictionary_info.pickle' - dictionary_info = {} - dictionary_info['filename_trk'] = filename_trk - dictionary_info['path_out'] = path_out - dictionary_info['filename_peaks'] = filename_peaks - dictionary_info['filename_mask'] = filename_mask - dictionary_info['do_intersect'] = do_intersect - dictionary_info['fiber_shift'] = fiber_shift - dictionary_info['points_to_skip'] = points_to_skip - dictionary_info['vf_THR'] = vf_THR - dictionary_info['peaks_use_affine'] = peaks_use_affine - dictionary_info['flip_peaks'] = flip_peaks - dictionary_info['min_seg_len'] = min_seg_len - dictionary_info['gen_trk'] = gen_trk - dictionary_info['blur_radii'] = blur_radii - dictionary_info['blur_samples'] = blur_samples - dictionary_info['blur_sigma'] = blur_sigma - dictionary_info['ndirs'] = ndirs - - # check the value of ndirs - if not amico.lut.is_valid(ndirs): - raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - # check conflicts of fiber_shift - if np.isscalar(fiber_shift) : - fiber_shiftX = fiber_shift - fiber_shiftY = fiber_shift - fiber_shiftZ = fiber_shift - elif len(fiber_shift) == 3 : - fiber_shiftX = fiber_shift[0] - fiber_shiftY = fiber_shift[1] - fiber_shiftZ = fiber_shift[2] - else : - raise RuntimeError( 'fiber_shift must be a scalar or a vector with 3 elements' ) - - tic = time.time() - print( '\n-> Creating the dictionary from tractogram:' ) - print( '\t* Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) - print( '\t* Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) - print( '\t* Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) - print( '\t* Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) - print( '\t* Points to skip = %d' % points_to_skip ) - print( '\t* Min segment len = %.2e' % min_seg_len ) - - # check blur params - cdef : - double [:] blurRadii - int [:] blurSamples - double [:] blurWeights - double* ptrBlurRadii - int* ptrBlurSamples - double* ptrBlurWeights - int nBlurRadii - float [:] ArrayInvM - float* ptrArrayInvM - - if len(blur_radii) != len(blur_samples) : - raise RuntimeError( 'number of radii and samples must match' ) - - # convert to numpy arrays (add fake radius for original segment) - nBlurRadii = len(blur_radii)+1 - blurRadii = np.array( [0.0]+blur_radii, np.double ) - blurSamples = np.array( [1]+blur_samples, np.int32 ) - - # compute weights for gaussian damping - 
blurWeights = np.empty_like( blurRadii ) - for i in xrange(nBlurRadii): - blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) - - if nBlurRadii == 1 : - print( '\t* Do not blur fibers' ) - else : - print( '\t* Blur fibers :' ) - print( '\t\t- sigma = %.3f' % blur_sigma ) - print( '\t\t- radii = [', end="" ) - for i in xrange( 1, blurRadii.size ) : - print( '%.3f' % blurRadii[i], end="" ) - print( ']' ) - print( '\t\t- samples = [', end="" ) - for i in xrange( 1, blurSamples.size ) : - print( '%5d' % blurSamples[i], end="" ) - print( ']' ) - print( '\t\t- weights = [', end="" ) - for i in xrange( 1, blurWeights.size ) : - print( '%.3f' % blurWeights[i], end="" ) - print( ']' ) - - ptrBlurRadii = &blurRadii[0] - ptrBlurSamples = &blurSamples[0] - ptrBlurWeights = &blurWeights[0] - - # minimum segment length - if min_seg_len < 0 : - raise RuntimeError( 'min_seg_len must be >= 0' ) - - - print( '\t* Loading data:' ) - - cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs) - cdef short* ptrHashTable = &htable[0] - - # fiber-tracts from .trk - print( '\t\t* tractogram' ) - - if (path_out is None): - raise RuntimeError( 'Path out not defined' ) - - if (filename_trk is None and filename_tractogram is None): - raise RuntimeError( 'Tractogram file not defined' ) - - if (filename_trk is not None and filename_tractogram is not None): - print('\t\t\t [WARNING] filename_tractogram will be used, filename_trk will not be considered') - - if (filename_trk is not None and filename_tractogram is None): - filename_tractogram = filename_trk - print('\t\t\t [WARNING] filename_trk parameter is deprecated, in the future use filename_tractogram ') - - extension = splitext(filename_tractogram)[1] #take extension of file - - if (extension != ".trk" and extension != ".tck") : - raise IOError( 'Invalid input file. Please enter tractogram file .trk or .tck' ) - try : #read the header of the file in the same way both in .trk and in .tck - hdr = nibabel.streamlines.load( filename_tractogram ).header - except : - raise IOError( 'Tractogram file not found' ) - - if (extension == ".trk"): #read header of .trk file - Nx = hdr['dimensions'][0] - Ny = hdr['dimensions'][1] - Nz = hdr['dimensions'][2] - Px = hdr['voxel_sizes'][0] - Py = hdr['voxel_sizes'][1] - Pz = hdr['voxel_sizes'][2] - - data_offset = 1000 - n_count = hdr['nb_streamlines'] - n_scalars = hdr['nb_scalars_per_point'] - n_properties = hdr['nb_properties_per_streamline'] - - if (extension == ".tck"): #read header of .tck file - #open file .nii and get header of this to get info on the structure - - if TCK_ref_image is None: - if filename_peaks is not None: - TCK_ref_image = filename_peaks - elif filename_mask is not None: - TCK_ref_image = filename_mask - else: - raise RuntimeError( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that.' 
) - - print ('\t\t\t- geometry taken from "%s"' %TCK_ref_image) - - #load the TCK_ref_image( .nii file ) with nibabel - nii_image = nibabel.load(TCK_ref_image) - #read the header of nii file - nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header() - - #set shape's of tractogram - Nx = nii_image.shape[0] - Ny = nii_image.shape[1] - Nz = nii_image.shape[2] - - #set distance's of control points - Px = nii_hdr['pixdim'][1] - Py = nii_hdr['pixdim'][2] - Pz = nii_hdr['pixdim'][3] - - #set offset and number of streamlines - data_offset = int(hdr['_offset_data']) #set offset - n_count = int(hdr['count']) #set number of fibers - - #set number of proprieties and number of scalar to zero, because there are not present in .tck file - n_scalars = 0 - n_properties = 0 - - print( '\t\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) ) - print( '\t\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) ) - print( '\t\t\t- %d fibers' % n_count ) - if Nx >= 2**16 or Nz >= 2**16 or Nz >= 2**16 : - raise RuntimeError( 'The max dim size is 2^16 voxels' ) - - # get the affine matrix - if (extension == ".tck"): - scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz])) - M = nii_hdr.get_best_affine() #get affine - - # Affine matrix without scaling, i.e. diagonal is 1 - M[:3, :3] = np.dot(scaleMat, M[:3, :3]) #delete scalar - - M = M.astype('= '2.0.0' else niiMASK.get_header() - print( '\t\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) ) - print( '\t\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) ) - if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or - abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) : - print( '\t\t [WARNING] dataset does not have the same geometry as the tractogram' ) - niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) ) - ptrMASK = &niiMASK_img[0,0,0] - else : - print( '\t\t* no mask specified to filter IC compartments' ) - ptrMASK = NULL - - # peaks file for EC contributions - cdef float* ptrPEAKS - cdef float [:, :, :, ::1] niiPEAKS_img - cdef int Np - cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) ) - cdef float* ptrTDI = &niiTDI_img[0,0,0] - cdef double [:, ::1] affine - cdef double* ptrAFFINE - if filename_peaks is not None : - print( '\t\t* EC orientations' ) - niiPEAKS = nibabel.load( filename_peaks ) - niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header() - print( '\t\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) ) - print( '\t\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) ) - print( '\t\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR ) - print( '\t\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) ) - print( '\t\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) ) - if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or - abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) : - print( "\t\t [WARNING] dataset does not have the same geometry as the tractogram" ) - if niiPEAKS.shape[3] % 3 : - raise RuntimeError( 'PEAKS dataset must have 3*k volumes' ) - if vf_THR < 0 or vf_THR > 1 : - raise RuntimeError( 'vf_THR must 
be between 0 and 1' ) - niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) ) - ptrPEAKS = &niiPEAKS_img[0,0,0,0] - Np = niiPEAKS.shape[3]/3 - - # affine matrix to rotate gradien directions (if required) - if peaks_use_affine : - affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T ) - else : - affine = np.ascontiguousarray( np.eye(3) ) - ptrAFFINE = &affine[0,0] - else : - print( '\t\t* no dataset specified for EC compartments' ) - Np = 0 - ptrPEAKS = NULL - ptrAFFINE = NULL - - # output path - print( '\t\t* output written to "%s"' % path_out ) - if not exists( path_out ): - makedirs( path_out ) - - # write dictionary info file - with open( filename, 'wb+' ) as dictionary_info_file: - pickle.dump(dictionary_info, dictionary_info_file, protocol=2) - - # calling actual C code - ret = trk2dictionary( filename_tractogram, data_offset, - Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, - fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, - ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, - ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, - nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); - if ret == 0 : - print( ' [ DICTIONARY not generated ]' ) - return None - - # create new TRK with only fibers in the WM mask - # create new dictionaty file (TRK or TCK) with only fibers in the WM mask - if gen_trk : - print ('\t* Generate tractogram matching the dictionary: ') - fib = nibabel.streamlines.load(filename_tractogram) - hdr = fib.header - - file_kept = np.fromfile( join(path_out,'dictionary_TRK_kept.dict'), dtype=np.bool_ ) - tractogram_out = fib.tractogram[ file_kept ] - hdr['count'] = len(tractogram_out) #set new number of fibers in the header - hdr['nb_streamlines'] = len(tractogram_out) - - #create a output dictionary file (TRK or TCK) in path_out - nibabel.streamlines.save( tractogram_out, join(path_out,'dictionary_TRK_fibers'+extension), header=hdr ) - print( '\t [ %d fibers kept ]' % np.count_nonzero( file_kept ) ) - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - # save TDI and MASK maps - if filename_mask is not None : - affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() - elif filename_peaks is not None : - affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() - else : - affine = np.diag( [Px, Py, Pz, 1] ) - - niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) - nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) - - if filename_mask is not None : - niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) - else : - niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) - nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) - - -cpdef convert_old_dictionary( path ): - """Perform the conversion of the files representing a dictionary, i.e. dictionary_*.dict, - from the old format to the new one, where the files *_{vx,vy,vz}.dict are replaced - by a single file *_v.dict (same for the files *_{ox,oy}.dict). - - Parameters - ---------- - path : string - Path to the folder containing the dictionary_*.dict files. 
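The voxel indices stored in the dictionary_*_v.dict files are linearised exactly as in the conversion body that follows, i.e. v = x + Nx*(y + Ny*z); a short NumPy sketch of the encoding and of how to invert it when inspecting those files (the volume dimensions are arbitrary):

import numpy as np

Nx, Ny, Nz = 90, 108, 90                       # toy volume dimensions
x, y, z = np.array([3, 10]), np.array([7, 0]), np.array([2, 41])

v = x + Nx * (y + Ny * z)                      # linear index, as written by trk2dictionary

# inverse mapping, useful when inspecting the .dict files
x2 = v % Nx
y2 = (v // Nx) % Ny
z2 = v // (Nx * Ny)
assert np.all(x2 == x) and np.all(y2 == y) and np.all(z2 == z)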
- """ - if not exists( join(path,'dictionary_IC_vx.dict') ): - raise RuntimeError( 'Folder does not contain dictionary files in the old format' ) - - niiTDI = nibabel.load( join(path,'dictionary_tdi.nii.gz') ) - Nx, Ny, Nz = niiTDI.shape[:3] - x = np.fromfile( join(path,'dictionary_IC_vx.dict'), dtype=np.uint16 ).astype(np.uint32) - y = np.fromfile( join(path,'dictionary_IC_vy.dict'), dtype=np.uint16 ).astype(np.uint32) - z = np.fromfile( join(path,'dictionary_IC_vz.dict'), dtype=np.uint16 ).astype(np.uint32) - v = x + Nx * ( y + Ny * z ) - v.tofile( join(path,'dictionary_IC_v.dict') ) - remove( join(path,'dictionary_IC_vx.dict') ) - remove( join(path,'dictionary_IC_vy.dict') ) - remove( join(path,'dictionary_IC_vz.dict') ) - - x = np.fromfile( join(path,'dictionary_EC_vx.dict'), dtype=np.uint8 ).astype(np.uint32) - y = np.fromfile( join(path,'dictionary_EC_vy.dict'), dtype=np.uint8 ).astype(np.uint32) - z = np.fromfile( join(path,'dictionary_EC_vz.dict'), dtype=np.uint8 ).astype(np.uint32) - v = x + Nx * ( y + Ny * z ) - v.tofile( join(path,'dictionary_EC_v.dict') ) - remove( join(path,'dictionary_EC_vx.dict') ) - remove( join(path,'dictionary_EC_vy.dict') ) - remove( join(path,'dictionary_EC_vz.dict') ) - - x = np.fromfile( join(path,'dictionary_IC_ox.dict'), dtype=np.uint8 ).astype(np.uint16) - y = np.fromfile( join(path,'dictionary_IC_oy.dict'), dtype=np.uint8 ).astype(np.uint16) - v = y + 181 * x - v.tofile( join(path,'dictionary_IC_o.dict') ) - remove( join(path,'dictionary_IC_ox.dict') ) - remove( join(path,'dictionary_IC_oy.dict') ) - - x = np.fromfile( join(path,'dictionary_EC_ox.dict'), dtype=np.uint8 ).astype(np.uint16) - y = np.fromfile( join(path,'dictionary_EC_oy.dict'), dtype=np.uint8 ).astype(np.uint16) - v = y + 181 * x - v.tofile( join(path,'dictionary_EC_o.dict') ) - remove( join(path,'dictionary_EC_ox.dict') ) - remove( join(path,'dictionary_EC_oy.dict') ) +#!python +# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False +from __future__ import print_function +import cython +import numpy as np +cimport numpy as np +import nibabel +from os.path import join, exists, splitext +from os import makedirs, remove +import time +import amico +import pickle + + +# Interface to actual C code +cdef extern from "trk2dictionary_c.cpp": + int trk2dictionary( + char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, + int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, + float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, + float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrAFFINE, + int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrArrayInvM, unsigned short ndirs, short* prtHashTable + ) nogil + + +cpdef run( filename_tractogram = None, path_out = None, filename_peaks = None, filename_mask = None, do_intersect = True, + fiber_shift = 0, points_to_skip = 0, vf_THR = 0.1, peaks_use_affine = False, + flip_peaks = [False,False,False], min_seg_len = 1e-3, gen_trk = True, + blur_radii = [], blur_samples = [], blur_sigma = 1.0, filename_trk = None, TCK_ref_image = None, ndirs = 32761 + ): + """Perform the conversion of a tractoram to the sparse data-structure internally + used by COMMIT to perform the matrix-vector multiplications with the operator A + during the inversion of the linear system. 
+ + Parameters + ---------- + filename_tractogram : string + Path to the .trk or .tck file containing the tractogram to load. + + filename_trk : string + DEPRECATED. Use filename_tractogram instead. + + path_out : string + Path to the folder where to store the sparse data structure. + + filename_peaks : string + Path to the NIFTI file containing the peaks to use as extra-cellular contributions. + The data matrix should be 4D with last dimension 3*N, where N is the number + of peaks in each voxel. (default : no extra-cellular contributions) + + filename_mask : string + Path to a binary mask to restrict the analysis to specific areas. Segments + outside this mask are discarded. If not specified (default), the mask is created from + all voxels intersected by the tracts. + + do_intersect : boolean + If True then fiber segments that intersect voxel boundaries are splitted (default). + If False then the centroid of the segment is used as its voxel position. + + fiber_shift : float or list of three float + If necessary, apply a translation to fiber coordinates (default : 0) to account + for differences between the reference system of the tracking algorithm and COMMIT. + The value is specified in voxel units, eg 0.5 translates by half voxel. + Do noth use if you are using fiber_shiftX or fiber_shiftY or fiber_shiftZ. + + points_to_skip : integer + If necessary, discard first points at beginning/end of a fiber (default : 0). + + vf_THR : float + Discard peaks smaller than vf_THR * max peak (default : 0.1). + + peaks_use_affine : boolean + Whether to rotate the peaks according to the affine matrix (default : False). + + flip_peaks : list of three boolean + If necessary, flips peak orientations along each axis (default : no flipping). + + min_seg_len : float + Discard segments <= than this length in mm (default : 1e-3) + + gen_trk : boolean + If True then generate a .trk file in the 'path_out' containing the fibers used in the dictionary (default : True) + + blur_radii : list of float + Translate each segment to given radii to assign a broader fiber contribution (default : []) + + blur_samples : list of integer + Segments are duplicated along a circle at a given radius; this parameter controls the number of samples to take over a given circle (defaut : []) + + blur_sigma: float + The contributions of the segments at different radii are damped as a Gaussian (default : 1.0) + + TCK_ref_image: string + Path to the NIFTI file containing the information about the geometry used for the tractogram .tck to load. + If it is not specified, it will try to use the information of filename_peaks or filename_mask. 
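The blur_radii, blur_samples and blur_sigma parameters above interact as follows; this is a plain-Python restatement of the weight computation done in the function body, with arbitrary example values (the parameter list continues with ndirs below):

import numpy as np

blur_radii = [0.5, 1.0, 1.5]          # example radii, in mm
blur_samples = [6, 10, 14]            # example number of duplicates per circle
blur_sigma = 1.0

# the original segment is kept as a fake radius 0 with a single sample,
# and each radius is damped with a Gaussian of width blur_sigma
radii = np.array([0.0] + blur_radii)
samples = np.array([1] + blur_samples, dtype=np.int32)
weights = np.exp(-radii**2 / (2.0 * blur_sigma**2))
print(dict(zip(radii, weights)))      # radius -> damping weight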
+ + ndirs : int + Number of directions on the half of the sphere + """ + + filename = path_out + '/dictionary_info.pickle' + dictionary_info = {} + dictionary_info['filename_trk'] = filename_trk + dictionary_info['path_out'] = path_out + dictionary_info['filename_peaks'] = filename_peaks + dictionary_info['filename_mask'] = filename_mask + dictionary_info['do_intersect'] = do_intersect + dictionary_info['fiber_shift'] = fiber_shift + dictionary_info['points_to_skip'] = points_to_skip + dictionary_info['vf_THR'] = vf_THR + dictionary_info['peaks_use_affine'] = peaks_use_affine + dictionary_info['flip_peaks'] = flip_peaks + dictionary_info['min_seg_len'] = min_seg_len + dictionary_info['gen_trk'] = gen_trk + dictionary_info['blur_radii'] = blur_radii + dictionary_info['blur_samples'] = blur_samples + dictionary_info['blur_sigma'] = blur_sigma + dictionary_info['ndirs'] = ndirs + + # check the value of ndirs + if not amico.lut.is_valid(ndirs): + raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + # check conflicts of fiber_shift + if np.isscalar(fiber_shift) : + fiber_shiftX = fiber_shift + fiber_shiftY = fiber_shift + fiber_shiftZ = fiber_shift + elif len(fiber_shift) == 3 : + fiber_shiftX = fiber_shift[0] + fiber_shiftY = fiber_shift[1] + fiber_shiftZ = fiber_shift[2] + else : + raise RuntimeError( 'fiber_shift must be a scalar or a vector with 3 elements' ) + + tic = time.time() + print( '\n-> Creating the dictionary from tractogram:' ) + print( '\t* Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) + print( '\t* Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) + print( '\t* Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) + print( '\t* Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) + print( '\t* Points to skip = %d' % points_to_skip ) + print( '\t* Min segment len = %.2e' % min_seg_len ) + + # check blur params + cdef : + double [:] blurRadii + int [:] blurSamples + double [:] blurWeights + double* ptrBlurRadii + int* ptrBlurSamples + double* ptrBlurWeights + int nBlurRadii + float [:] ArrayInvM + float* ptrArrayInvM + + if len(blur_radii) != len(blur_samples) : + raise RuntimeError( 'number of radii and samples must match' ) + + # convert to numpy arrays (add fake radius for original segment) + nBlurRadii = len(blur_radii)+1 + blurRadii = np.array( [0.0]+blur_radii, np.double ) + blurSamples = np.array( [1]+blur_samples, np.int32 ) + + # compute weights for gaussian damping + blurWeights = np.empty_like( blurRadii ) + for i in xrange(nBlurRadii): + blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) + + if nBlurRadii == 1 : + print( '\t* Do not blur fibers' ) + else : + print( '\t* Blur fibers :' ) + print( '\t\t- sigma = %.3f' % blur_sigma ) + print( '\t\t- radii = [', end="" ) + for i in xrange( 1, blurRadii.size ) : + print( '%.3f' % blurRadii[i], end="" ) + print( ']' ) + print( '\t\t- samples = [', end="" ) + for i in xrange( 1, blurSamples.size ) : + print( '%5d' % blurSamples[i], end="" ) + print( ']' ) + print( '\t\t- weights = [', end="" ) + for i in xrange( 1, blurWeights.size ) : + print( '%.3f' % blurWeights[i], end="" ) + print( ']' ) + + ptrBlurRadii = &blurRadii[0] + ptrBlurSamples = &blurSamples[0] + ptrBlurWeights = &blurWeights[0] + + # minimum segment length + if min_seg_len < 0 : + raise RuntimeError( 
'min_seg_len must be >= 0' ) + + + print( '\t* Loading data:' ) + + cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs) + cdef short* ptrHashTable = &htable[0] + + # fiber-tracts from .trk + print( '\t\t* tractogram' ) + + if (path_out is None): + raise RuntimeError( 'Path out not defined' ) + + if (filename_trk is None and filename_tractogram is None): + raise RuntimeError( 'Tractogram file not defined' ) + + if (filename_trk is not None and filename_tractogram is not None): + print('\t\t\t [WARNING] filename_tractogram will be used, filename_trk will not be considered') + + if (filename_trk is not None and filename_tractogram is None): + filename_tractogram = filename_trk + print('\t\t\t [WARNING] filename_trk parameter is deprecated, in the future use filename_tractogram ') + + extension = splitext(filename_tractogram)[1] #take extension of file + + if (extension != ".trk" and extension != ".tck") : + raise IOError( 'Invalid input file. Please enter tractogram file .trk or .tck' ) + try : #read the header of the file in the same way both in .trk and in .tck + hdr = nibabel.streamlines.load( filename_tractogram ).header + except : + raise IOError( 'Tractogram file not found' ) + + if (extension == ".trk"): #read header of .trk file + Nx = hdr['dimensions'][0] + Ny = hdr['dimensions'][1] + Nz = hdr['dimensions'][2] + Px = hdr['voxel_sizes'][0] + Py = hdr['voxel_sizes'][1] + Pz = hdr['voxel_sizes'][2] + + data_offset = 1000 + n_count = hdr['nb_streamlines'] + n_scalars = hdr['nb_scalars_per_point'] + n_properties = hdr['nb_properties_per_streamline'] + + if (extension == ".tck"): #read header of .tck file + #open file .nii and get header of this to get info on the structure + + if TCK_ref_image is None: + if filename_peaks is not None: + TCK_ref_image = filename_peaks + elif filename_mask is not None: + TCK_ref_image = filename_mask + else: + raise RuntimeError( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that.' ) + + print ('\t\t\t- geometry taken from "%s"' %TCK_ref_image) + + #load the TCK_ref_image( .nii file ) with nibabel + nii_image = nibabel.load(TCK_ref_image) + #read the header of nii file + nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header() + + #set shape's of tractogram + Nx = nii_image.shape[0] + Ny = nii_image.shape[1] + Nz = nii_image.shape[2] + + #set distance's of control points + Px = nii_hdr['pixdim'][1] + Py = nii_hdr['pixdim'][2] + Pz = nii_hdr['pixdim'][3] + + #set offset and number of streamlines + data_offset = int(hdr['_offset_data']) #set offset + n_count = int(hdr['count']) #set number of fibers + + #set number of proprieties and number of scalar to zero, because there are not present in .tck file + n_scalars = 0 + n_properties = 0 + + print( '\t\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) ) + print( '\t\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) ) + print( '\t\t\t- %d fibers' % n_count ) + if Nx >= 2**16 or Nz >= 2**16 or Nz >= 2**16 : + raise RuntimeError( 'The max dim size is 2^16 voxels' ) + + # get the affine matrix + if (extension == ".tck"): + scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz])) + M = nii_hdr.get_best_affine() #get affine + + # Affine matrix without scaling, i.e. 
diagonal is 1 + M[:3, :3] = np.dot(scaleMat, M[:3, :3]) #delete scalar + + M = M.astype('= '2.0.0' else niiMASK.get_header() + print( '\t\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) ) + print( '\t\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) ) + if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or + abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) : + print( '\t\t [WARNING] dataset does not have the same geometry as the tractogram' ) + niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) ) + ptrMASK = &niiMASK_img[0,0,0] + else : + print( '\t\t* no mask specified to filter IC compartments' ) + ptrMASK = NULL + + # peaks file for EC contributions + cdef float* ptrPEAKS + cdef float [:, :, :, ::1] niiPEAKS_img + cdef int Np + cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) ) + cdef float* ptrTDI = &niiTDI_img[0,0,0] + cdef double [:, ::1] affine + cdef double* ptrAFFINE + if filename_peaks is not None : + print( '\t\t* EC orientations' ) + niiPEAKS = nibabel.load( filename_peaks ) + niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header() + print( '\t\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) ) + print( '\t\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) ) + print( '\t\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR ) + print( '\t\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) ) + print( '\t\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) ) + if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or + abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) : + print( "\t\t [WARNING] dataset does not have the same geometry as the tractogram" ) + if niiPEAKS.shape[3] % 3 : + raise RuntimeError( 'PEAKS dataset must have 3*k volumes' ) + if vf_THR < 0 or vf_THR > 1 : + raise RuntimeError( 'vf_THR must be between 0 and 1' ) + niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) ) + ptrPEAKS = &niiPEAKS_img[0,0,0,0] + Np = niiPEAKS.shape[3]/3 + + # affine matrix to rotate gradien directions (if required) + if peaks_use_affine : + affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T ) + else : + affine = np.ascontiguousarray( np.eye(3) ) + ptrAFFINE = &affine[0,0] + else : + print( '\t\t* no dataset specified for EC compartments' ) + Np = 0 + ptrPEAKS = NULL + ptrAFFINE = NULL + + # output path + print( '\t\t* output written to "%s"' % path_out ) + if not exists( path_out ): + makedirs( path_out ) + + # write dictionary info file + with open( filename, 'wb+' ) as dictionary_info_file: + pickle.dump(dictionary_info, dictionary_info_file, protocol=2) + + # calling actual C code + ret = trk2dictionary( filename_tractogram, data_offset, + Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, + fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, + ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, + ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, + nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, 
ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); + if ret == 0 : + print( ' [ DICTIONARY not generated ]' ) + return None + + # create new TRK with only fibers in the WM mask + # create new dictionaty file (TRK or TCK) with only fibers in the WM mask + if gen_trk : + print ('\t* Generate tractogram matching the dictionary: ') + fib = nibabel.streamlines.load(filename_tractogram) + hdr = fib.header + + file_kept = np.fromfile( join(path_out,'dictionary_TRK_kept.dict'), dtype=np.bool_ ) + tractogram_out = fib.tractogram[ file_kept ] + hdr['count'] = len(tractogram_out) #set new number of fibers in the header + hdr['nb_streamlines'] = len(tractogram_out) + + #create a output dictionary file (TRK or TCK) in path_out + nibabel.streamlines.save( tractogram_out, join(path_out,'dictionary_TRK_fibers'+extension), header=hdr ) + print( '\t [ %d fibers kept ]' % np.count_nonzero( file_kept ) ) + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + # save TDI and MASK maps + if filename_mask is not None : + affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() + elif filename_peaks is not None : + affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() + else : + affine = np.diag( [Px, Py, Pz, 1] ) + + niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) + nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) + + if filename_mask is not None : + niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) + else : + niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) + nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) + + +cpdef convert_old_dictionary( path ): + """Perform the conversion of the files representing a dictionary, i.e. dictionary_*.dict, + from the old format to the new one, where the files *_{vx,vy,vz}.dict are replaced + by a single file *_v.dict (same for the files *_{ox,oy}.dict). + + Parameters + ---------- + path : string + Path to the folder containing the dictionary_*.dict files. 
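Alongside this conversion utility, the per-segment IC files written by the C code can be inspected directly with NumPy; the record widths below follow the fwrite calls in trk2dictionary_c.cpp (uint32 fiber index, uint32 linearised voxel, uint16 orientation index, float32 contribution), the file names are inferred from the pDict_IC_* handles, and the path is a placeholder:

import numpy as np
from os.path import join

path_out = 'subject01/CommitOutput'                   # placeholder, same folder passed to run()

f = np.fromfile(join(path_out, 'dictionary_IC_f.dict'), dtype=np.uint32)        # fiber id per segment
v = np.fromfile(join(path_out, 'dictionary_IC_v.dict'), dtype=np.uint32)        # x + Nx*(y + Ny*z)
o = np.fromfile(join(path_out, 'dictionary_IC_o.dict'), dtype=np.uint16)        # orientation index
length = np.fromfile(join(path_out, 'dictionary_IC_len.dict'), dtype=np.float32)  # weighted segment length

assert len(f) == len(v) == len(o) == len(length)
print('%d IC segments read back' % len(f))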
+ """ + if not exists( join(path,'dictionary_IC_vx.dict') ): + raise RuntimeError( 'Folder does not contain dictionary files in the old format' ) + + niiTDI = nibabel.load( join(path,'dictionary_tdi.nii.gz') ) + Nx, Ny, Nz = niiTDI.shape[:3] + x = np.fromfile( join(path,'dictionary_IC_vx.dict'), dtype=np.uint16 ).astype(np.uint32) + y = np.fromfile( join(path,'dictionary_IC_vy.dict'), dtype=np.uint16 ).astype(np.uint32) + z = np.fromfile( join(path,'dictionary_IC_vz.dict'), dtype=np.uint16 ).astype(np.uint32) + v = x + Nx * ( y + Ny * z ) + v.tofile( join(path,'dictionary_IC_v.dict') ) + remove( join(path,'dictionary_IC_vx.dict') ) + remove( join(path,'dictionary_IC_vy.dict') ) + remove( join(path,'dictionary_IC_vz.dict') ) + + x = np.fromfile( join(path,'dictionary_EC_vx.dict'), dtype=np.uint8 ).astype(np.uint32) + y = np.fromfile( join(path,'dictionary_EC_vy.dict'), dtype=np.uint8 ).astype(np.uint32) + z = np.fromfile( join(path,'dictionary_EC_vz.dict'), dtype=np.uint8 ).astype(np.uint32) + v = x + Nx * ( y + Ny * z ) + v.tofile( join(path,'dictionary_EC_v.dict') ) + remove( join(path,'dictionary_EC_vx.dict') ) + remove( join(path,'dictionary_EC_vy.dict') ) + remove( join(path,'dictionary_EC_vz.dict') ) + + x = np.fromfile( join(path,'dictionary_IC_ox.dict'), dtype=np.uint8 ).astype(np.uint16) + y = np.fromfile( join(path,'dictionary_IC_oy.dict'), dtype=np.uint8 ).astype(np.uint16) + v = y + 181 * x + v.tofile( join(path,'dictionary_IC_o.dict') ) + remove( join(path,'dictionary_IC_ox.dict') ) + remove( join(path,'dictionary_IC_oy.dict') ) + + x = np.fromfile( join(path,'dictionary_EC_ox.dict'), dtype=np.uint8 ).astype(np.uint16) + y = np.fromfile( join(path,'dictionary_EC_oy.dict'), dtype=np.uint8 ).astype(np.uint16) + v = y + 181 * x + v.tofile( join(path,'dictionary_EC_o.dict') ) + remove( join(path,'dictionary_EC_ox.dict') ) + remove( join(path,'dictionary_EC_oy.dict') ) diff --git a/commit/trk2dictionary/trk2dictionary_c.cpp b/commit/trk2dictionary/trk2dictionary_c.cpp index 1c88a798..0baefe51 100644 --- a/commit/trk2dictionary/trk2dictionary_c.cpp +++ b/commit/trk2dictionary/trk2dictionary_c.cpp @@ -1,608 +1,608 @@ -#include -#include -#include -#include -#include -#include "Vector.h" -#include "ProgressBar.h" -#include -#include - -#define MAX_FIB_LEN 10000 - - -// CLASS to store the segments of one fiber -class segKey -{ - public: - unsigned short x, y, z; - unsigned short o; - segKey(){} - - void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) - { - x = _x; - y = _y; - z = _z; - o = _o; - } - - bool const operator <(const segKey& seg) const - { - return o < seg.o || (o==seg.o && z FiberSegments; - -Vector dim; -Vector pixdim; -float* ptrMASK; -unsigned int nPointsToSkip; -float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; -bool doIntersect; -float minSegLen; - -std::vector radii; // radii for the extrusion -std::vector weights; // damping weight -std::vector sectors; // number of duplicates across the extrusion circle -double radiusSigma; // modulates the impact of each segment as function of radius - - -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); -void segmentForwardModel( const Vector& P1, const Vector& P2, double w, short* ptrHashTable ); -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); -unsigned int 
read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); - - -// ========================= -// Function called by CYTHON -// ========================= -int trk2dictionary( - char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, - float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrAFFINE, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* VetAffine, unsigned short ndirs, short* ptrHashTable -) -{ - /*=========================*/ - /* IC compartments */ - /*=========================*/ - float fiber[3][MAX_FIB_LEN]; - float fiberNorm, fiberLen; - unsigned int N, totICSegments = 0, totFibers = 0, v; - unsigned short o; - unsigned char kept; - Vector P; - std::string filename; - std::string OUTPUT_path(path_out); - std::map::iterator it; - - std::map FiberNorm; - std::map::iterator itNorm; - segInVoxKey inVoxKey; - - printf( "\t* Exporting IC compartments:\n" ); - - int isTRK; // var to check - - char *ext = strrchr(str_filename, '.'); //get the extension of input file - - if (strcmp(ext,".trk")==0) //for .trk file - isTRK = 1; - else if (strcmp(ext,".tck")==0)// for .tck file - isTRK = 0; - else - return 0; - - FILE* fpTractogram = fopen(str_filename,"rb"); //open - if (fpTractogram == NULL) return 0; - - if ( isTRK ) { // SKIP header on .trk - fseek(fpTractogram,data_offset,SEEK_SET); //skip the first 1000 bytes in the .trk file - } - else { // SKIP header on .tck - fseek(fpTractogram,data_offset,SEEK_SET); //skip the first offset bytes in the .tck file - } - - // set global variables - dim.Set( Nx, Ny, Nz ); - pixdim.Set( Px, Py, Pz ); - nPointsToSkip = points_to_skip; - fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates - fiberShiftYmm = fiber_shiftY * pixdim.y; - fiberShiftZmm = fiber_shiftZ * pixdim.z; - ptrMASK = _ptrMASK; - doIntersect = c > 0; - minSegLen = min_seg_len; - - radii.clear(); - sectors.clear(); - weights.clear(); - for(int i=0; i 0 ) - { - // add segments to files - fiberNorm = 0; - fiberLen = 0; - for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) - { - // NB: plese note inverted ordering for 'v' - v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); - o = it->first.o; - fwrite( &totFibers, 4, 1, pDict_IC_f ); - fwrite( &v, 4, 1, pDict_IC_v ); - fwrite( &o, 2, 1, pDict_IC_o ); - fwrite( &(it->second), 4, 1, pDict_IC_len ); - ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; - inVoxKey.set( it->first.x, it->first.y, it->first.z ); - FiberNorm[inVoxKey] += it->second; - fiberLen += it->second; - } - for (itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) - { - fiberNorm += pow(itNorm->second,2); - } - fiberNorm = sqrt(fiberNorm); - FiberNorm.clear(); - fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // actual length considered in optimization - fwrite( &fiberLen, 1, 4, pDict_TRK_len ); - totICSegments += FiberSegments.size(); - totFibers++; - kept = 1; - } - fwrite( &kept, 1, 1, pDict_TRK_kept ); - } - PROGRESS.close(); - - // write dictionary ndirs value - fwrite(&ndirs, 1, sizeof(unsigned short), pDict_ndirs); - fclose( fpTractogram ); - fclose( pDict_TRK_norm ); - fclose( pDict_IC_f ); - fclose( pDict_IC_v ); - fclose( pDict_IC_o ); - fclose( 
pDict_IC_len ); - fclose( pDict_TRK_len ); - fclose( pDict_TRK_kept ); - fclose( pDict_ndirs ); - - printf("\t [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); - - - /*=========================*/ - /* EC compartments */ - /*=========================*/ - unsigned int totECSegments = 0, totECVoxels = 0; - - printf( "\t* Exporting EC compartments:\n" ); - - filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); - filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); - - if ( ptrPEAKS != NULL ) - { - Vector dir; - double longitude, colatitude; - segKey ec_seg; - int ix, iy, iz, id, atLeastOne; - float peakMax; - float norms[ Np ]; - float *ptr; - int ox, oy; - - PROGRESS.reset( dim.z ); - for(iz=0; iz peakMax ) - peakMax = norms[id]; - } - - if ( peakMax > 0 ) - { - ec_seg.x = ix; - ec_seg.y = iy; - ec_seg.z = iz; - atLeastOne = 0; - for(id=0; id0 ) - totECVoxels++; - } - } - } - PROGRESS.close(); - } - - fclose( pDict_EC_v ); - fclose( pDict_EC_o ); - - printf("\t [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); - - return 1; -} - - -/********************************************************************************************************************/ -/* fiberForwardModel */ -/********************************************************************************************************************/ -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) -{ - static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; - static Vector vox, vmin, vmax, dir; - static double len, t, alpha, w, R; - static int i, j, k; - - FiberSegments.clear(); - //printf("RANGO -----------------------------> from %d to %d\n", nPointsToSkip, pts-1-nPointsToSkip); - for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, double w, short* ptrHashTable ) -{ - static Vector vox; - static Vector dir, dirTrue; - static double longitude, colatitude, len; - static segKey key; - static int ox, oy; - - // direction of the segment - dir.y = P2.y-P1.y; - if ( dir.y >= 0 ) - { - dir.x = P2.x-P1.x; - dir.z = P2.z-P1.z; - } - else - { - dir.x = P1.x-P2.x; - dir.y = P1.y-P2.y; - dir.z = P1.z-P2.z; - } - - // length of segment - len = dir.norm(); - if ( len <= minSegLen ) - return; - dir.Normalize(); - - // voxel of the segment is the centroid - vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); - vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); - vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); - if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) - return; - if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) - return; - - // add the segment to the data structure - longitude = atan2(dir.y, dir.x); - colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); - ox = (int)round(colatitude/M_PI*180.0); // theta // i1 - oy = (int)round(longitude/M_PI*180.0); // phi // i2 - key.set( vox.x, vox.y, vox.z, (unsigned short) ptrHashTable[ox*181 + oy] ); - FiberSegments[key] += w * len; -} - - -/********************************************************************************************************************/ -/* rayBoxIntersection */ -/********************************************************************************************************************/ -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t) -{ - static double tmin, 
tmax, tymin, tymax, tzmin, tzmax; - static Vector invrd; - - // inverse direction to catch float problems - invrd.x = 1.0 / direction.x; - invrd.y = 1.0 / direction.y; - invrd.z = 1.0 / direction.z; - - - if (invrd.x >= 0) - { - tmin = (vmin.x - origin.x) * invrd.x; - tmax = (vmax.x - origin.x) * invrd.x; - } - else - { - tmin = (vmax.x - origin.x) * invrd.x; - tmax = (vmin.x - origin.x) * invrd.x; - } - - if (invrd.y >= 0) - { - tymin = (vmin.y - origin.y) * invrd.y; - tymax = (vmax.y - origin.y) * invrd.y; - } - else - { - tymin = (vmax.y - origin.y) * invrd.y; - tymax = (vmin.y - origin.y) * invrd.y; - } - - if ( (tmin > tymax) || (tymin > tmax) ) return false; - if ( tymin > tmin) tmin = tymin; - if ( tymax < tmax) tmax = tymax; - - if (invrd.z >= 0) - { - tzmin = (vmin.z - origin.z) * invrd.z; - tzmax = (vmax.z - origin.z) * invrd.z; - }else - { - tzmin = (vmax.z - origin.z) * invrd.z; - tzmax = (vmin.z - origin.z) * invrd.z; - } - - if ( (tmin > tzmax) || (tzmin > tmax) ) return false; - if ( tzmin > tmin) tmin = tzmin; - if ( tzmax < tmax) tmax = tzmax; - - // check if values are valid - t = tmin; - if (t <= 0) t = tmax; - - return true; -} - - -// Read a fiber from file .trk -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) -{ - int N; - fread((char*)&N, 1, 4, fp); - - if ( N >= MAX_FIB_LEN || N <= 0 ) - return 0; - - float tmp[3]; - for(int i=0; i +#include +#include +#include +#include +#include "Vector.h" +#include "ProgressBar.h" +#include +#include + +#define MAX_FIB_LEN 10000 + + +// CLASS to store the segments of one fiber +class segKey +{ + public: + unsigned short x, y, z; + unsigned short o; + segKey(){} + + void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) + { + x = _x; + y = _y; + z = _z; + o = _o; + } + + bool const operator <(const segKey& seg) const + { + return o < seg.o || (o==seg.o && z FiberSegments; + +Vector dim; +Vector pixdim; +float* ptrMASK; +unsigned int nPointsToSkip; +float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; +bool doIntersect; +float minSegLen; + +std::vector radii; // radii for the extrusion +std::vector weights; // damping weight +std::vector sectors; // number of duplicates across the extrusion circle +double radiusSigma; // modulates the impact of each segment as function of radius + + +bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); +void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); +void segmentForwardModel( const Vector& P1, const Vector& P2, double w, short* ptrHashTable ); +unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); +unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); + + +// ========================= +// Function called by CYTHON +// ========================= +int trk2dictionary( + char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, + float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, + float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, + float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrAFFINE, + int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* VetAffine, unsigned short ndirs, short* ptrHashTable +) +{ + 
/*=========================*/ + /* IC compartments */ + /*=========================*/ + float fiber[3][MAX_FIB_LEN]; + float fiberNorm, fiberLen; + unsigned int N, totICSegments = 0, totFibers = 0, v; + unsigned short o; + unsigned char kept; + Vector P; + std::string filename; + std::string OUTPUT_path(path_out); + std::map::iterator it; + + std::map FiberNorm; + std::map::iterator itNorm; + segInVoxKey inVoxKey; + + printf( "\t* Exporting IC compartments:\n" ); + + int isTRK; // var to check + + char *ext = strrchr(str_filename, '.'); //get the extension of input file + + if (strcmp(ext,".trk")==0) //for .trk file + isTRK = 1; + else if (strcmp(ext,".tck")==0)// for .tck file + isTRK = 0; + else + return 0; + + FILE* fpTractogram = fopen(str_filename,"rb"); //open + if (fpTractogram == NULL) return 0; + + if ( isTRK ) { // SKIP header on .trk + fseek(fpTractogram,data_offset,SEEK_SET); //skip the first 1000 bytes in the .trk file + } + else { // SKIP header on .tck + fseek(fpTractogram,data_offset,SEEK_SET); //skip the first offset bytes in the .tck file + } + + // set global variables + dim.Set( Nx, Ny, Nz ); + pixdim.Set( Px, Py, Pz ); + nPointsToSkip = points_to_skip; + fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates + fiberShiftYmm = fiber_shiftY * pixdim.y; + fiberShiftZmm = fiber_shiftZ * pixdim.z; + ptrMASK = _ptrMASK; + doIntersect = c > 0; + minSegLen = min_seg_len; + + radii.clear(); + sectors.clear(); + weights.clear(); + for(int i=0; i 0 ) + { + // add segments to files + fiberNorm = 0; + fiberLen = 0; + for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) + { + // NB: plese note inverted ordering for 'v' + v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); + o = it->first.o; + fwrite( &totFibers, 4, 1, pDict_IC_f ); + fwrite( &v, 4, 1, pDict_IC_v ); + fwrite( &o, 2, 1, pDict_IC_o ); + fwrite( &(it->second), 4, 1, pDict_IC_len ); + ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; + inVoxKey.set( it->first.x, it->first.y, it->first.z ); + FiberNorm[inVoxKey] += it->second; + fiberLen += it->second; + } + for (itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) + { + fiberNorm += pow(itNorm->second,2); + } + fiberNorm = sqrt(fiberNorm); + FiberNorm.clear(); + fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // actual length considered in optimization + fwrite( &fiberLen, 1, 4, pDict_TRK_len ); + totICSegments += FiberSegments.size(); + totFibers++; + kept = 1; + } + fwrite( &kept, 1, 1, pDict_TRK_kept ); + } + PROGRESS.close(); + + // write dictionary ndirs value + fwrite(&ndirs, 1, sizeof(unsigned short), pDict_ndirs); + fclose( fpTractogram ); + fclose( pDict_TRK_norm ); + fclose( pDict_IC_f ); + fclose( pDict_IC_v ); + fclose( pDict_IC_o ); + fclose( pDict_IC_len ); + fclose( pDict_TRK_len ); + fclose( pDict_TRK_kept ); + fclose( pDict_ndirs ); + + printf("\t [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); + + + /*=========================*/ + /* EC compartments */ + /*=========================*/ + unsigned int totECSegments = 0, totECVoxels = 0; + + printf( "\t* Exporting EC compartments:\n" ); + + filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); + filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); + + if ( ptrPEAKS != NULL ) + { + Vector dir; + double longitude, colatitude; + segKey ec_seg; + int ix, iy, iz, id, atLeastOne; + float peakMax; + float norms[ Np ]; 
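A note on the orientation index `o` used throughout these dictionaries: `segmentForwardModel` quantises the segment direction to whole degrees of colatitude/longitude and looks the pair up in the 181x181 `ptrHashTable`, which presumably returns the index of the closest of the `ndirs` precomputed directions; the EC peak export below relies on the same table. A small Python sketch of that binning (the hash table itself is taken as given, exactly as in the original code):

```python
import numpy as np

def direction_to_orientation(direction, hash_table):
    """Map a 3D direction to a discretised orientation index (cf. segmentForwardModel).

    `hash_table` is assumed to be the flat 181*181 array passed to trk2dictionary()
    as `ptrHashTable`; it is not reconstructed here.
    """
    d = np.asarray(direction, dtype=float)
    if d[1] < 0:                                           # flip so that y >= 0: antipodal
        d = -d                                             # directions map to the same half-sphere
    colatitude = np.arctan2(np.hypot(d[0], d[1]), d[2])    # theta in [0, pi]
    longitude = np.arctan2(d[1], d[0])                     # phi in [0, pi] after the flip
    ox = int(round(np.degrees(colatitude)))                # 0..180
    oy = int(round(np.degrees(longitude)))                 # 0..180
    return int(hash_table[ox * 181 + oy])
```

The flip to y >= 0 identifies each direction with its opposite, which is why only half of the sphere (and hence a 181x181 table) needs to be tabulated.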
+ float *ptr; + int ox, oy; + + PROGRESS.reset( dim.z ); + for(iz=0; iz peakMax ) + peakMax = norms[id]; + } + + if ( peakMax > 0 ) + { + ec_seg.x = ix; + ec_seg.y = iy; + ec_seg.z = iz; + atLeastOne = 0; + for(id=0; id0 ) + totECVoxels++; + } + } + } + PROGRESS.close(); + } + + fclose( pDict_EC_v ); + fclose( pDict_EC_o ); + + printf("\t [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); + + return 1; +} + + +/********************************************************************************************************************/ +/* fiberForwardModel */ +/********************************************************************************************************************/ +void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) +{ + static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; + static Vector vox, vmin, vmax, dir; + static double len, t, alpha, w, R; + static int i, j, k; + + FiberSegments.clear(); + //printf("RANGO -----------------------------> from %d to %d\n", nPointsToSkip, pts-1-nPointsToSkip); + for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, double w, short* ptrHashTable ) +{ + static Vector vox; + static Vector dir, dirTrue; + static double longitude, colatitude, len; + static segKey key; + static int ox, oy; + + // direction of the segment + dir.y = P2.y-P1.y; + if ( dir.y >= 0 ) + { + dir.x = P2.x-P1.x; + dir.z = P2.z-P1.z; + } + else + { + dir.x = P1.x-P2.x; + dir.y = P1.y-P2.y; + dir.z = P1.z-P2.z; + } + + // length of segment + len = dir.norm(); + if ( len <= minSegLen ) + return; + dir.Normalize(); + + // voxel of the segment is the centroid + vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); + vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); + vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); + if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) + return; + if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) + return; + + // add the segment to the data structure + longitude = atan2(dir.y, dir.x); + colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); + ox = (int)round(colatitude/M_PI*180.0); // theta // i1 + oy = (int)round(longitude/M_PI*180.0); // phi // i2 + key.set( vox.x, vox.y, vox.z, (unsigned short) ptrHashTable[ox*181 + oy] ); + FiberSegments[key] += w * len; +} + + +/********************************************************************************************************************/ +/* rayBoxIntersection */ +/********************************************************************************************************************/ +bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t) +{ + static double tmin, tmax, tymin, tymax, tzmin, tzmax; + static Vector invrd; + + // inverse direction to catch float problems + invrd.x = 1.0 / direction.x; + invrd.y = 1.0 / direction.y; + invrd.z = 1.0 / direction.z; + + + if (invrd.x >= 0) + { + tmin = (vmin.x - origin.x) * invrd.x; + tmax = (vmax.x - origin.x) * invrd.x; + } + else + { + tmin = (vmax.x - origin.x) * invrd.x; + tmax = (vmin.x - origin.x) * invrd.x; + } + + if (invrd.y >= 0) + { + tymin = (vmin.y - origin.y) * invrd.y; + tymax = (vmax.y - origin.y) * invrd.y; + } + else + { + tymin = (vmax.y - origin.y) * invrd.y; + tymax = (vmin.y - origin.y) * invrd.y; + } + + if ( (tmin > tymax) || (tymin > tmax) ) return false; + if ( tymin > tmin) tmin = tymin; + if ( tymax < tmax) tmax = tymax; + + if 
(invrd.z >= 0) + { + tzmin = (vmin.z - origin.z) * invrd.z; + tzmax = (vmax.z - origin.z) * invrd.z; + }else + { + tzmin = (vmax.z - origin.z) * invrd.z; + tzmax = (vmin.z - origin.z) * invrd.z; + } + + if ( (tmin > tzmax) || (tzmin > tmax) ) return false; + if ( tzmin > tmin) tmin = tzmin; + if ( tzmax < tmax) tmax = tzmax; + + // check if values are valid + t = tmin; + if (t <= 0) t = tmax; + + return true; +} + + +// Read a fiber from file .trk +unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) +{ + int N; + fread((char*)&N, 1, 4, fp); + + if ( N >= MAX_FIB_LEN || N <= 0 ) + return 0; + + float tmp[3]; + for(int i=0; i**Make sure that your working directory is the folder where you unzipped the downloaded archive.** - - -```python -path_to_the_directory_with_the_unzipped_archive = '.' # edit this -cd path_to_the_directory_with_the_unzipped_archive -``` - -### Load the usual COMMIT structure - - -```python -from commit import trk2dictionary - -trk2dictionary.run( - filename_tractogram = 'LausanneTwoShell/fibers.trk', - path_out = 'LausanneTwoShell/CommitOutput', - filename_peaks = 'LausanneTwoShell/peaks.nii.gz', - filename_mask = 'LausanneTwoShell/WM.nii.gz', - fiber_shift = 0.5, - peaks_use_affine = True -) - -import commit -mit = commit.Evaluation( '.', 'LausanneTwoShell' ) -mit.load_data( 'DWI.nii', 'DWI.scheme' ) - -mit.set_model( 'StickZeppelinBall' ) - -d_par = 1.7E-3 # Parallel diffusivity [mm^2/s] -ICVFs = [ 0.7 ] # Intra-cellular volume fraction(s) [0..1] -d_ISOs = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s] - -mit.model.set( d_par, ICVFs, d_ISOs ) -mit.generate_kernels( regenerate=True ) -mit.load_kernels() - -mit.load_dictionary( 'CommitOutput' ) -mit.set_threads() -mit.build_operator() -``` - -### Perform clustering of the streamlines - -You will need `dipy`, which is among the requirements of COMMIT, hence there should be no problem. - -The `threshold` parameter has to be tuned for each brain. Do not consider our choice as a standard one. - - -```python -from nibabel import trackvis as tv -fname='LausanneTwoShell/fibers.trk' -streams, hdr = tv.read(fname) -streamlines = [i[0] for i in streams] - -from dipy.segment.clustering import QuickBundles -threshold = 15.0 -qb = QuickBundles(threshold=threshold) -clusters = qb.cluster(streamlines) - -import numpy as np -structureIC = np.array([c.indices for c in clusters]) -weightsIC = np.array([1.0/np.sqrt(len(c)) for c in structureIC]) -``` - -Notice that we defined `structure_IC` as a `numpy.array` that contains a list of lists containing the indices associated to each group. We know it sounds a little bit bizarre but it computationally convenient. - -### Define the regularisation term -Each compartment must be regularised separately. 
The user can choose among the following penalties: - -- $\sum_{g\in G}w_g\|x_g\|_k$ : `commit.solvers.group_sparsity` with $k\in \{2, \infty\}$ (only for IC compartment) - -- $\|x\|_1$ : `commit.solvers.norm1` - -- $\|x\|_2$ : `commit.solvers.norm2` - -- $\iota_{\ge 0}(x)$ : `commit.solvers.non_negative` (Default for all compartments) - -If the chosen regularisation for the IC compartment is $\sum_{g\in G}\|x_g\|_k$, we can define $k$ via the `group_norm` field, which must be one between - -- $\|x\|_2$ : `commit.solvers.norm2` (Default) - -- $\|x\|_\infty$ : `commit.solvers.norminf` - -In this example we consider the following penalties: - -- Intracellular: group sparsity with 2-norm of each group - -- Extracellular: 2-norm - -- Isotropic: 1-norm - - -```python -regnorms = [commit.solvers.group_sparsity, commit.solvers.norm2, commit.solvers.norm1] - -group_norm = 2 # each group is penalised with its 2-norm -``` - -The regularisation parameters are specified within the lambdas field. Again, do not consider our choice as a standard one. - - -```python -lambdas = [10.,10.,10.] -``` - -### Call the constructor of the data structure - - -```python -regterm = commit.solvers.init_regularisation(mit, - regnorms = regnorms, - structureIC = structureIC, - weightsIC = weightsIC, - group_norm = group_norm, - lambdas = lambdas) -``` - -### Call the fit function to perform the optimisation - - -```python -mit.fit(regularisation=regterm, max_iter=1000) -``` - -### Save the results - - -```python -suffix = 'IC'+str(regterm[0])+'EC'+str(regterm[1])+'ISO'+str(regterm[2]) -mit.save_results(path_suffix=suffix) -``` + +You can find the ipython notebook version of this tutorial [at this link](tutorial_solvers.ipynb). + +# Advanced solvers + +This tutorial shows how to exploit the advanced features of the COMMIT framework from the side of the **optimisation problem**. The general formulation is the following: +\begin{equation} +x^* = \arg\min_{x\in R^n_+} \frac12 \|Ax-y\|_2^2 + \lambda_{IC}\Omega_{IC}(x) + \lambda_{EC}\Omega_{EC}(x) + \lambda_{ISO}\Omega_{ISO}(x), +\end{equation} +where $A$ is the COMMIT dictionary, $n$ is defined in such a way that the product $Ax$ makes sense and $y$ is the datum that we want to fit. The three regularisation terms allow us to exploit ***distinct penalties for each compartment***. + +*Note*: before exploring this tutorial, you should follow the [Getting Started](https://github.com/daducci/COMMIT/tree/master/doc/tutorials/GettingStarted) tutorial. + + +### Download and unpack the data + +Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files: + +- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2; +- `DWI.scheme`: its corresponding acquisition scheme; +- `peaks.nii.gz`: main diffusion orientations estimated with CSD; +- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm; +- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image. + + +**Make sure that your working directory is the folder where you unzipped the downloaded archive.** + + +```python +path_to_the_directory_with_the_unzipped_archive = '.' 
# edit this +cd path_to_the_directory_with_the_unzipped_archive +``` + +### Load the usual COMMIT structure + + +```python +from commit import trk2dictionary + +trk2dictionary.run( + filename_tractogram = 'LausanneTwoShell/fibers.trk', + path_out = 'LausanneTwoShell/CommitOutput', + filename_peaks = 'LausanneTwoShell/peaks.nii.gz', + filename_mask = 'LausanneTwoShell/WM.nii.gz', + fiber_shift = 0.5, + peaks_use_affine = True +) + +import commit +mit = commit.Evaluation( '.', 'LausanneTwoShell' ) +mit.load_data( 'DWI.nii', 'DWI.scheme' ) + +mit.set_model( 'StickZeppelinBall' ) + +d_par = 1.7E-3 # Parallel diffusivity [mm^2/s] +ICVFs = [ 0.7 ] # Intra-cellular volume fraction(s) [0..1] +d_ISOs = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s] + +mit.model.set( d_par, ICVFs, d_ISOs ) +mit.generate_kernels( regenerate=True ) +mit.load_kernels() + +mit.load_dictionary( 'CommitOutput' ) +mit.set_threads() +mit.build_operator() +``` + +### Perform clustering of the streamlines + +You will need `dipy`, which is among the requirements of COMMIT, hence there should be no problem. + +The `threshold` parameter has to be tuned for each brain. Do not consider our choice as a standard one. + + +```python +from nibabel import trackvis as tv +fname='LausanneTwoShell/fibers.trk' +streams, hdr = tv.read(fname) +streamlines = [i[0] for i in streams] + +from dipy.segment.clustering import QuickBundles +threshold = 15.0 +qb = QuickBundles(threshold=threshold) +clusters = qb.cluster(streamlines) + +import numpy as np +structureIC = np.array([c.indices for c in clusters]) +weightsIC = np.array([1.0/np.sqrt(len(c)) for c in structureIC]) +``` + +Notice that we defined `structure_IC` as a `numpy.array` that contains a list of lists containing the indices associated to each group. We know it sounds a little bit bizarre but it computationally convenient. + +### Define the regularisation term +Each compartment must be regularised separately. The user can choose among the following penalties: + +- $\sum_{g\in G}w_g\|x_g\|_k$ : `commit.solvers.group_sparsity` with $k\in \{2, \infty\}$ (only for IC compartment) + +- $\|x\|_1$ : `commit.solvers.norm1` + +- $\|x\|_2$ : `commit.solvers.norm2` + +- $\iota_{\ge 0}(x)$ : `commit.solvers.non_negative` (Default for all compartments) + +If the chosen regularisation for the IC compartment is $\sum_{g\in G}\|x_g\|_k$, we can define $k$ via the `group_norm` field, which must be one between + +- $\|x\|_2$ : `commit.solvers.norm2` (Default) + +- $\|x\|_\infty$ : `commit.solvers.norminf` + +In this example we consider the following penalties: + +- Intracellular: group sparsity with 2-norm of each group + +- Extracellular: 2-norm + +- Isotropic: 1-norm + + +```python +regnorms = [commit.solvers.group_sparsity, commit.solvers.norm2, commit.solvers.norm1] + +group_norm = 2 # each group is penalised with its 2-norm +``` + +The regularisation parameters are specified within the lambdas field. Again, do not consider our choice as a standard one. + + +```python +lambdas = [10.,10.,10.] 
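# Note: the three entries are the regularisation weights of the formulation at the
# top of this tutorial, presumably in the same IC / EC / ISO order as `regnorms`;
# e.g. a hypothetical lambdas = [10., 0., 0.] would leave the EC and ISO
# compartments unregularised.
#
# For intuition only: with k = 2, the group-sparsity penalty sum_g w_g ||x_g||_2
# selected above for the IC compartment acts, inside proximal-type solvers, through
# the standard block soft-thresholding operator. Reference sketch (this is NOT
# COMMIT's internal implementation):
import numpy as np   # already imported earlier in the tutorial

def block_soft_threshold(x, groups, weights, lam, step=1.0):
    x = np.asarray(x, dtype=float).copy()
    for g, w in zip(groups, weights):              # groups: index arrays, e.g. structureIC
        thr = step * lam * w
        norm_g = np.linalg.norm(x[g])
        x[g] = 0.0 if norm_g <= thr else x[g] * (1.0 - thr / norm_g)
    return x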
+``` + +### Call the constructor of the data structure + + +```python +regterm = commit.solvers.init_regularisation(mit, + regnorms = regnorms, + structureIC = structureIC, + weightsIC = weightsIC, + group_norm = group_norm, + lambdas = lambdas) +``` + +### Call the fit function to perform the optimisation + + +```python +mit.fit(regularisation=regterm, max_iter=1000) +``` + +### Save the results + + +```python +suffix = 'IC'+str(regterm[0])+'EC'+str(regterm[1])+'ISO'+str(regterm[2]) +mit.save_results(path_suffix=suffix) +``` diff --git a/doc/tutorials/AdvancedSolvers/tutorial_solvers.ipynb b/doc/tutorials/AdvancedSolvers/tutorial_solvers.ipynb index dab7c2a4..288d1840 100644 --- a/doc/tutorials/AdvancedSolvers/tutorial_solvers.ipynb +++ b/doc/tutorials/AdvancedSolvers/tutorial_solvers.ipynb @@ -1,264 +1,264 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can find the text version of this tutorial [at this link](README.md)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Advanced solvers\n", - "\n", - "This tutorial shows how to exploit the advanced features of the COMMIT framework from the side of the **optimisation problem**. The general formulation is the following:\n", - "\\begin{equation}\n", - "x^* = \\arg\\min_{x\\in R^n_+} \\frac12 \\|Ax-y\\|_2^2 + \\lambda_{IC}\\Omega_{IC}(x) + \\lambda_{EC}\\Omega_{EC}(x) + \\lambda_{ISO}\\Omega_{ISO}(x),\n", - "\\end{equation}\n", - "where $A$ is the COMMIT dictionary, $n$ is defined in such a way that the product $Ax$ makes sense and $y$ is the datum that we want to fit. The three regularisation terms allow us to exploit ***distinct penalties for each compartment***.\n", - "\n", - "*Note*: before exploring this tutorial, you should follow the [Getting Started](https://github.com/daducci/COMMIT/tree/master/doc/tutorials/GettingStarted) tutorial.\n", - "\n", - "\n", - "### Download and unpack the data\n", - "\n", - "Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files:\n", - "\n", - "- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2;\n", - "- `DWI.scheme`: its corresponding acquisition scheme;\n", - "- `peaks.nii.gz`: main diffusion orientations estimated with CSD;\n", - "- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm;\n", - "- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image.\n", - "\n", - "\n", - "**Make sure that your working directory is the folder where you unzipped the downloaded archive.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "path_to_the_directory_with_the_unzipped_archive = '.' 
# edit this\n", - "cd path_to_the_directory_with_the_unzipped_archive" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load the usual COMMIT structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from commit import trk2dictionary\n", - "\n", - "trk2dictionary.run(\n", - " filename_tractogram = 'LausanneTwoShell/fibers.trk',\n", - " path_out = 'LausanneTwoShell/CommitOutput',\n", - " filename_peaks = 'LausanneTwoShell/peaks.nii.gz',\n", - " filename_mask = 'LausanneTwoShell/WM.nii.gz',\n", - " fiber_shift = 0.5,\n", - " peaks_use_affine = True\n", - ")\n", - "\n", - "import commit\n", - "mit = commit.Evaluation( '.', 'LausanneTwoShell' )\n", - "mit.load_data( 'DWI.nii', 'DWI.scheme' )\n", - "\n", - "mit.set_model( 'StickZeppelinBall' )\n", - "\n", - "d_par = 1.7E-3 # Parallel diffusivity [mm^2/s]\n", - "ICVFs = [ 0.7 ] # Intra-cellular volume fraction(s) [0..1]\n", - "d_ISOs = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s]\n", - "\n", - "mit.model.set( d_par, ICVFs, d_ISOs )\n", - "mit.generate_kernels( regenerate=True )\n", - "mit.load_kernels()\n", - "\n", - "mit.load_dictionary( 'CommitOutput' )\n", - "mit.set_threads()\n", - "mit.build_operator()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Perform clustering of the streamlines\n", - "\n", - "You will need `dipy`, which is among the requirements of COMMIT, hence there should be no problem.\n", - "\n", - "The `threshold` parameter has to be tuned for each brain. Do not consider our choice as a standard one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from nibabel import trackvis as tv\n", - "fname='LausanneTwoShell/fibers.trk'\n", - "streams, hdr = tv.read(fname)\n", - "streamlines = [i[0] for i in streams]\n", - "\n", - "from dipy.segment.clustering import QuickBundles\n", - "threshold = 15.0\n", - "qb = QuickBundles(threshold=threshold)\n", - "clusters = qb.cluster(streamlines)\n", - "\n", - "import numpy as np\n", - "structureIC = np.array([c.indices for c in clusters])\n", - "weightsIC = np.array([1.0/np.sqrt(len(c)) for c in structureIC])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that we defined `structure_IC` as a `numpy.array` that contains a list of lists containing the indices associated to each group. We know it sounds a little bit bizarre but it computationally convenient." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define the regularisation term\n", - "Each compartment must be regularised separately. 
The user can choose among the following penalties:\n", - "\n", - "- $\\sum_{g\\in G}w_g\\|x_g\\|_k$ : `commit.solvers.group_sparsity` with $k\\in \\{2, \\infty\\}$ (only for IC compartment)\n", - "\n", - "- $\\|x\\|_1$ : `commit.solvers.norm1`\n", - "\n", - "- $\\|x\\|_2$ : `commit.solvers.norm2`\n", - "\n", - "- $\\iota_{\\ge 0}(x)$ : `commit.solvers.non_negative` (Default for all compartments)\n", - "\n", - "If the chosen regularisation for the IC compartment is $\\sum_{g\\in G}\\|x_g\\|_k$, we can define $k$ via the `group_norm` field, which must be one between\n", - "\n", - "- $\\|x\\|_2$ : `commit.solvers.norm2` (Default)\n", - "\n", - "- $\\|x\\|_\\infty$ : `commit.solvers.norminf`\n", - "\n", - "In this example we consider the following penalties:\n", - "\n", - "- Intracellular: group sparsity with 2-norm of each group\n", - "\n", - "- Extracellular: 2-norm\n", - "\n", - "- Isotropic: 1-norm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "regnorms = [commit.solvers.group_sparsity, commit.solvers.norm2, commit.solvers.norm1]\n", - "\n", - "group_norm = 2 # each group is penalised with its 2-norm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The regularisation parameters are specified within the lambdas field. Again, do not consider our choice as a standard one." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "lambdas = [10.,10.,10.]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Call the constructor of the data structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "regterm = commit.solvers.init_regularisation(mit,\n", - " regnorms = regnorms,\n", - " structureIC = structureIC,\n", - " weightsIC = weightsIC,\n", - " group_norm = group_norm,\n", - " lambdas = lambdas)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Call the fit function to perform the optimisation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mit.fit(regularisation=regterm, max_iter=1000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save the results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "suffix = 'IC'+str(regterm[0])+'EC'+str(regterm[1])+'ISO'+str(regterm[2])\n", - "mit.save_results(path_suffix=suffix)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find the text version of this tutorial [at this link](README.md)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced solvers\n", + "\n", + "This tutorial shows how to exploit the advanced features of the COMMIT framework from the side of the **optimisation problem**. 
The general formulation is the following:\n", + "\\begin{equation}\n", + "x^* = \\arg\\min_{x\\in R^n_+} \\frac12 \\|Ax-y\\|_2^2 + \\lambda_{IC}\\Omega_{IC}(x) + \\lambda_{EC}\\Omega_{EC}(x) + \\lambda_{ISO}\\Omega_{ISO}(x),\n", + "\\end{equation}\n", + "where $A$ is the COMMIT dictionary, $n$ is defined in such a way that the product $Ax$ makes sense and $y$ is the datum that we want to fit. The three regularisation terms allow us to exploit ***distinct penalties for each compartment***.\n", + "\n", + "*Note*: before exploring this tutorial, you should follow the [Getting Started](https://github.com/daducci/COMMIT/tree/master/doc/tutorials/GettingStarted) tutorial.\n", + "\n", + "\n", + "### Download and unpack the data\n", + "\n", + "Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files:\n", + "\n", + "- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2;\n", + "- `DWI.scheme`: its corresponding acquisition scheme;\n", + "- `peaks.nii.gz`: main diffusion orientations estimated with CSD;\n", + "- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm;\n", + "- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image.\n", + "\n", + "\n", + "**Make sure that your working directory is the folder where you unzipped the downloaded archive.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path_to_the_directory_with_the_unzipped_archive = '.' # edit this\n", + "cd path_to_the_directory_with_the_unzipped_archive" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the usual COMMIT structure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from commit import trk2dictionary\n", + "\n", + "trk2dictionary.run(\n", + " filename_tractogram = 'LausanneTwoShell/fibers.trk',\n", + " path_out = 'LausanneTwoShell/CommitOutput',\n", + " filename_peaks = 'LausanneTwoShell/peaks.nii.gz',\n", + " filename_mask = 'LausanneTwoShell/WM.nii.gz',\n", + " fiber_shift = 0.5,\n", + " peaks_use_affine = True\n", + ")\n", + "\n", + "import commit\n", + "mit = commit.Evaluation( '.', 'LausanneTwoShell' )\n", + "mit.load_data( 'DWI.nii', 'DWI.scheme' )\n", + "\n", + "mit.set_model( 'StickZeppelinBall' )\n", + "\n", + "d_par = 1.7E-3 # Parallel diffusivity [mm^2/s]\n", + "ICVFs = [ 0.7 ] # Intra-cellular volume fraction(s) [0..1]\n", + "d_ISOs = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s]\n", + "\n", + "mit.model.set( d_par, ICVFs, d_ISOs )\n", + "mit.generate_kernels( regenerate=True )\n", + "mit.load_kernels()\n", + "\n", + "mit.load_dictionary( 'CommitOutput' )\n", + "mit.set_threads()\n", + "mit.build_operator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform clustering of the streamlines\n", + "\n", + "You will need `dipy`, which is among the requirements of COMMIT, hence there should be no problem.\n", + "\n", + "The `threshold` parameter has to be tuned for each brain. Do not consider our choice as a standard one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nibabel import trackvis as tv\n", + "fname='LausanneTwoShell/fibers.trk'\n", + "streams, hdr = tv.read(fname)\n", + "streamlines = [i[0] for i in streams]\n", + "\n", + "from dipy.segment.clustering import QuickBundles\n", + "threshold = 15.0\n", + "qb = QuickBundles(threshold=threshold)\n", + "clusters = qb.cluster(streamlines)\n", + "\n", + "import numpy as np\n", + "structureIC = np.array([c.indices for c in clusters])\n", + "weightsIC = np.array([1.0/np.sqrt(len(c)) for c in structureIC])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we defined `structure_IC` as a `numpy.array` that contains a list of lists containing the indices associated to each group. We know it sounds a little bit bizarre but it computationally convenient." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define the regularisation term\n", + "Each compartment must be regularised separately. The user can choose among the following penalties:\n", + "\n", + "- $\\sum_{g\\in G}w_g\\|x_g\\|_k$ : `commit.solvers.group_sparsity` with $k\\in \\{2, \\infty\\}$ (only for IC compartment)\n", + "\n", + "- $\\|x\\|_1$ : `commit.solvers.norm1`\n", + "\n", + "- $\\|x\\|_2$ : `commit.solvers.norm2`\n", + "\n", + "- $\\iota_{\\ge 0}(x)$ : `commit.solvers.non_negative` (Default for all compartments)\n", + "\n", + "If the chosen regularisation for the IC compartment is $\\sum_{g\\in G}\\|x_g\\|_k$, we can define $k$ via the `group_norm` field, which must be one between\n", + "\n", + "- $\\|x\\|_2$ : `commit.solvers.norm2` (Default)\n", + "\n", + "- $\\|x\\|_\\infty$ : `commit.solvers.norminf`\n", + "\n", + "In this example we consider the following penalties:\n", + "\n", + "- Intracellular: group sparsity with 2-norm of each group\n", + "\n", + "- Extracellular: 2-norm\n", + "\n", + "- Isotropic: 1-norm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "regnorms = [commit.solvers.group_sparsity, commit.solvers.norm2, commit.solvers.norm1]\n", + "\n", + "group_norm = 2 # each group is penalised with its 2-norm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The regularisation parameters are specified within the lambdas field. Again, do not consider our choice as a standard one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "lambdas = [10.,10.,10.]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Call the constructor of the data structure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "regterm = commit.solvers.init_regularisation(mit,\n", + " regnorms = regnorms,\n", + " structureIC = structureIC,\n", + " weightsIC = weightsIC,\n", + " group_norm = group_norm,\n", + " lambdas = lambdas)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Call the fit function to perform the optimisation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mit.fit(regularisation=regterm, max_iter=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "suffix = 'IC'+str(regterm[0])+'EC'+str(regterm[1])+'ISO'+str(regterm[2])\n", + "mit.save_results(path_suffix=suffix)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a120ff6f755c68a95cd7adcf069ef3c6dc2c3a31 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 13 Jun 2020 20:38:26 -0500 Subject: [PATCH 110/190] Adding texture memory to kernels --- commit/operator_withCUDA.cu | 52 +++++++++++++++++++++++++++--------- commit/operator_withCUDA.cuh | 5 ++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 086eb7b8..7cf5c94e 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -157,7 +157,7 @@ CudaLinearOperator::CudaLinearOperator( else printf("[ CUDA ERROR ]\n"); // alloc and transfer LUTs - printf("\t* loading LUT ... "); + printf("\t* loading LUTs ... "); cudaStatus = true; cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); @@ -170,6 +170,29 @@ CudaLinearOperator::CudaLinearOperator( if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); + // configure texture for LUTs + tex_lutIC.addressMode[0] = cudaAddressModeBorder; + tex_lutIC.addressMode[1] = cudaAddressModeBorder; + tex_lutIC.filterMode = cudaFilterModePoint; + tex_lutIC.normalized = false; + + tex_lutEC.addressMode[0] = cudaAddressModeBorder; + tex_lutEC.addressMode[1] = cudaAddressModeBorder; + tex_lutEC.filterMode = cudaFilterModePoint; + tex_lutEC.normalized = false; + + tex_lutISO.addressMode[0] = cudaAddressModeBorder; + tex_lutISO.addressMode[1] = cudaAddressModeBorder; + tex_lutISO.filterMode = cudaFilterModePoint; + tex_lutISO.normalized = false; + + printf("\t* linking LUTs to a texture memory ... 
"); + cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic * sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec * sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso * sizeof(float32_t)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + // alloc and transfer operator A printf("\t* A operator... "); cudaStatus = true; @@ -237,6 +260,9 @@ void CudaLinearOperator::destroy(){ cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); + cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutISO) ); if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); @@ -401,8 +427,8 @@ __global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, float64_t aux = 0.0; for(int j = 0; j < NUM_DIAMETERS; j++){ - aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; - //aux += tex1Dfetch(tex_lutIC, offset_lut + j*num_orientations*num_samples) * x[(*fiber) + j*num_fibers]; + //aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; + aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; } sum += aux * (*length); @@ -446,8 +472,8 @@ __global__ void multiply_Ax_ECpart( uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; for(int j = 0; j < NUM_ZEPPELINS; j++) - sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; - //sum += tex1Dfetch(tex_lutEC, offset_lut + j*num_orientations*num_samples) * x[target + j*num_excomps + i]; + //sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; orien++; } @@ -469,8 +495,8 @@ __global__ void multiply_Ax_ISOpart( float64_t sum = 0.0; for(int j = 0; j < NUM_BALLS; j++) - sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; - //sum += (double)(tex1Dfetch(tex_lutISO, j*num_samples + tid))*x[target + j*num_voxels]; + //sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; y[bid*NUM_SAMPLES + tid] += sum; @@ -512,8 +538,8 @@ __global__ void multiply_Aty_ICpart( orien = orienICt + offset; length = lengthICt + offset; for(int i = offset; i < nsegments; i++){ - sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; - //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orient)*num_samples) )* y[(*voxel)*num_samples + tid]; + //summ += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; voxel++; orien++; @@ -567,8 +593,8 @@ __global__ void multiply_Aty_ECpart( voxel = voxelEC + offset; orien = orienEC + offset; for(int i = offset; i < ncompartments; i++){ - //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, 
(*orient)*num_samples + offset_lut) )* y[(*voxel)*num_samples + tid]; - shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; __syncthreads(); if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); @@ -601,8 +627,8 @@ __global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ if(tid >= NUM_SAMPLES) return; for(int j = 0; j < NUM_BALLS; j++){ - shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*num_samples + tid) )* y[bid*num_samples + tid]; + //shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; __syncthreads(); if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 231a4f77..1af7a636 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -83,6 +83,11 @@ __constant__ int SIZE_LUTIC; __constant__ int SIZE_LUTEC; __constant__ int SIZE_LUTISO; +// textures in GPU +texture tex_lutIC; +texture tex_lutEC; +texture tex_lutISO; + // pointers to IC data in GPU memory static uint32_t* gpu_voxelIC; static uint32_t* gpu_fiberIC; From 512edb231ed70a2e6f35e4cb7e19d1a980fb067c Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 13 Jun 2020 20:54:04 -0500 Subject: [PATCH 111/190] Adding texture memory to kernels --- commit/operator_withCUDA.cu | 5 +++++ commit/operator_withCUDA.cuh | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 7cf5c94e..4f12f2a6 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -1,5 +1,10 @@ #include "operator_withCUDA.cuh" +// textures in GPU +texture tex_lutIC; +texture tex_lutEC; +texture tex_lutISO; + bool cudaCheck(cudaError_t cudaStatus){ return cudaStatus == cudaSuccess; } diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 1af7a636..231a4f77 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -83,11 +83,6 @@ __constant__ int SIZE_LUTIC; __constant__ int SIZE_LUTEC; __constant__ int SIZE_LUTISO; -// textures in GPU -texture tex_lutIC; -texture tex_lutEC; -texture tex_lutISO; - // pointers to IC data in GPU memory static uint32_t* gpu_voxelIC; static uint32_t* gpu_fiberIC; From ccc147b975833ba1e8c908295ac01b76a056d960 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 3 Jul 2020 13:00:28 -0500 Subject: [PATCH 112/190] removing textures for experiments --- commit/operator_withCUDA.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 4f12f2a6..0fdfb405 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -432,8 +432,8 @@ __global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, float64_t aux = 0.0; for(int j = 0; j < NUM_DIAMETERS; j++){ - //aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; - aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; + aux += 
(double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; + //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; } sum += aux * (*length); @@ -477,8 +477,8 @@ __global__ void multiply_Ax_ECpart( uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; for(int j = 0; j < NUM_ZEPPELINS; j++) - //sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; - sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; + sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; orien++; } @@ -500,8 +500,8 @@ __global__ void multiply_Ax_ISOpart( float64_t sum = 0.0; for(int j = 0; j < NUM_BALLS; j++) - //sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; - sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; + sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; y[bid*NUM_SAMPLES + tid] += sum; @@ -543,8 +543,8 @@ __global__ void multiply_Aty_ICpart( orien = orienICt + offset; length = lengthICt + offset; for(int i = offset; i < nsegments; i++){ - //summ += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; - sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; + summ += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; voxel++; orien++; @@ -598,8 +598,8 @@ __global__ void multiply_Aty_ECpart( voxel = voxelEC + offset; orien = orienEC + offset; for(int i = offset; i < ncompartments; i++){ - //shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; - shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; + shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; __syncthreads(); if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); @@ -632,8 +632,8 @@ __global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ if(tid >= NUM_SAMPLES) return; for(int j = 0; j < NUM_BALLS; j++){ - //shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; - shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; + shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; __syncthreads(); if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); From b62895d7717d2e9faee3046558b76a571eb3da3d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 3 Jul 2020 13:06:31 -0500 Subject: [PATCH 113/190] removing textures for experiments --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 0fdfb405..af25bf0b 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -543,7 +543,7 @@ __global__ void multiply_Aty_ICpart( orien = orienICt + offset; length = lengthICt + offset; for(int i = offset; i < nsegments; i++){ - summ += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; voxel++; From 1667493b5dff9466e0636ba8a1ccf04acfd1da3b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 21:28:53 -0500 Subject: [PATCH 114/190] Adding option to choose GPU --- commit/core.pyx | 1760 +++++------ commit/cudaoperator.pyx | 448 +-- commit/operator/operator.pyx | 382 +-- commit/operator/operator_withLUT.c | 4494 ++++++++++++++-------------- commit/operator_withCUDA.cu | 1317 ++++---- commit/operator_withCUDA.cuh | 57 +- commit/solvers.py | 806 ++--- setup.py | 480 ++- 8 files changed, 4897 insertions(+), 4847 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 4f7462e0..f88fe4fe 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -1,879 +1,881 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False -from __future__ import print_function -cimport cython -import numpy as np -cimport numpy as np - -import time -import glob -import sys -from os import makedirs, remove -from os.path import exists, join as pjoin, isfile -import nibabel -import pickle -import commit.models -import commit.solvers -import amico.scheme -import amico.lut -import pyximport -pyximport.install( reload_support=True, language_level=3 ) - - -def setup( lmax = 12, ndirs = 32761 ) : - """General setup/initialization of the COMMIT framework. - - Parameters - ---------- - lmax : int - Maximum SH order to use for the rotation phase (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - - if not amico.lut.is_valid(ndirs): - raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - amico.lut.precompute_rotation_matrices( lmax, ndirs ) - -def load_dictionary_info(filename): - """Function to load dictionary info file - - Parameters - ---------- - filename : string - This value is always COMMIT_PATH + dictionary_info.pickle - """ - if not isfile( filename ): - raise RuntimeError( 'Dictionary is outdated or not found. Execute ''trk2dictionary'' script first.' 
) - with open( filename, 'rb' ) as dictionary_info_file: - if sys.version_info.major == 3: - aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) - # Pickle files written by Python 2 are loaded with byte - # keys, whereas those written by Python 3 are loaded with - # str keys, even when both are written using protocol=2 - result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} - return result_aux - else: - return pickle.load( dictionary_info_file ) - -cdef class Evaluation : - """Class to hold all the information (data and parameters) when performing an - evaluation with the COMMIT framework. - """ - cdef public niiDWI - cdef public niiDWI_img - cdef public scheme - cdef public model - cdef public KERNELS - cdef public DICTIONARY - cdef public THREADS - cdef public A - cdef public x - cdef public CONFIG - - def __init__( self, study_path, subject ) : - """Setup the data structures with default values. - - Parameters - ---------- - study_path : string - The path to the folder containing all the subjects from one study - subject : string - The path (relative to previous folder) to the subject folder - """ - self.niiDWI = None # set by "load_data" method - self.scheme = None # set by "load_data" method - self.model = None # set by "set_model" method - self.KERNELS = None # set by "load_kernels" method - self.DICTIONARY = None # set by "load_dictionary" method - self.THREADS = None # set by "set_threads" method - self.A = None # set by "build_operator" method - self.x = None # set by "fit" method - - # store all the parameters of an evaluation with COMMIT - self.CONFIG = {} - self.set_config('study_path', study_path) - self.set_config('subject', subject) - self.set_config('DATA_path', pjoin( study_path, subject )) - - self.set_config('doNormalizeSignal', True) - self.set_config('doMergeB0', False) - self.set_config('doNormalizeKernels', True) - self.set_config('doDemean', False) - self.set_config('doNormalizeMaps', False) - - - - def set_config( self, key, value ) : - self.CONFIG[ key ] = value - - def get_config( self, key ) : - return self.CONFIG.get( key ) - - - def load_data( self, dwi_filename = 'DWI.nii', scheme_filename = 'DWI.scheme', b0_thr = 0 ) : - """Load the diffusion signal and its corresponding acquisition scheme. - - Parameters - ---------- - dwi_filename : string - The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') - scheme_filename : string - The file name of the corresponding acquisition scheme (default : 'DWI.scheme') - b0_thr : float - The threshold below which a b-value is considered a b0 (default : 0) - """ - - # Loading data and acquisition scheme - tic = time.time() - print( '\n-> Loading data:' ) - - print( '\t* DWI signal...' ) - self.set_config('dwi_filename', dwi_filename) - self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) - self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) - if self.niiDWI_img.ndim ==3 : - self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) - hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() - self.set_config('dim', self.niiDWI_img.shape[0:3]) - self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) - print( '\t\t- dim = %d x %d x %d x %d' % self.niiDWI_img.shape ) - print( '\t\t- pixdim = %.3f x %.3f x %.3f' % self.get_config('pixdim') ) - - print( '\t* Acquisition scheme...' 
) - self.set_config('scheme_filename', scheme_filename) - self.set_config('b0_thr', b0_thr) - self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) - print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) - print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end="" ) - for i in xrange(len(self.scheme.shells)) : - print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end="" ) - print() - - if self.scheme.nS != self.niiDWI_img.shape[3] : - raise ValueError( 'Scheme does not match with DWI data' ) - - if self.scheme.dwi_count == 0 : - raise ValueError( 'There are no DWI volumes in the data' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - # Preprocessing - tic = time.time() - print( '\n-> Preprocessing:' ) - - if self.get_config('doNormalizeSignal') : - if self.scheme.b0_count > 0 : - print( '\t* Normalizing to b0...', end="" ) - sys.stdout.flush() - mean = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) - idx = mean <= 0 - mean[ idx ] = 1 - mean = 1 / mean - mean[ idx ] = 0 - for i in xrange(self.scheme.nS) : - self.niiDWI_img[:,:,:,i] *= mean - else : - print( '\t* There are no b0 volume(s) for normalization...', end="" ) - print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) - - if self.scheme.b0_count > 1 : - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)...', end="" ) - mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) - self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) - else : - print( '\t* Keeping all b0 volume(s)...', end="" ) - print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) - - if self.get_config('doDemean') : - print( '\t* Demeaning signal...', end="" ) - sys.stdout.flush() - mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) - self.niiDWI_img = self.niiDWI_img - mean - print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_model( self, model_name ) : - """Set the model to use to describe the signal contributions in each voxel. - - Parameters - ---------- - model_name : string - The name of the model (must match a class name in "commit.models" module) - """ - # Call the specific model constructor - if hasattr(commit.models, model_name ) : - self.model = getattr(commit.models,model_name)() - else : - raise ValueError( 'Model "%s" not recognized' % model_name ) - - self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) - - - def generate_kernels( self, regenerate = False, lmax = 12, ndirs = 32761 ) : - """Generate the high-resolution response functions for each compartment. - Dispatch to the proper function, depending on the model. 
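Worth spelling out: the three-line guard in the b0 normalisation above is a "safe reciprocal". Voxels whose mean b0 is non-positive receive a scaling factor of 0, so their signal is zeroed rather than divided by zero. A compact NumPy equivalent (sketch; the function name is not from the original code):

```python
import numpy as np

def b0_scaling(mean_b0):
    """Per-voxel factor used to normalise the DWI signal to b0 (zeroes bad voxels)."""
    mean_b0 = np.asarray(mean_b0, dtype=float)
    good = mean_b0 > 0
    return np.where(good, 1.0 / np.where(good, mean_b0, 1.0), 0.0)
```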
- - Parameters - ---------- - regenerate : boolean - Regenerate kernels if they already exist (default : False) - lmax : int - Maximum SH order to use for the rotation procedure (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - if not amico.lut.is_valid(ndirs): - raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - if self.scheme is None : - raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' ) - if self.model is None : - raise RuntimeError( 'Model not set; call "set_model()" method first.' ) - - # store some values for later use - self.set_config('lmax', lmax) - self.set_config('ndirs', ndirs) - self.model.scheme = self.scheme - - print( '\n-> Simulating with "%s" model:' % self.model.name ) - - # check if kernels were already generated - tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) - if len(tmp)>0 and not regenerate : - print( ' [ Kernels already computed. Call "generate_kernels( regenerate=True )" to force regeneration. ]' ) - return - - # create folder or delete existing files (if any) - if not exists( self.get_config('ATOMS_path') ) : - makedirs( self.get_config('ATOMS_path') ) - else : - for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : - remove( f ) - - # auxiliary data structures - aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) - idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) - - # Dispatch to the right handler for each model - tic = time.time() - self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def load_kernels( self ) : - """Load rotated kernels and project to the specific gradient scheme of this subject. - Dispatch to the proper function, depending on the model. - """ - if self.model is None : - raise RuntimeError( 'Model not set; call "set_model()" method first.' ) - if self.scheme is None : - raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' 
) - - tic = time.time() - print( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) - - # auxiliary data structures - idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) - - # Dispatch to the right handler for each model - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)...', end="" ) - else : - print( '\t* Keeping all b0 volume(s)...', end="" ) - self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) - nIC = self.KERNELS['wmr'].shape[0] - nEC = self.KERNELS['wmh'].shape[0] - nISO = self.KERNELS['iso'].shape[0] - print( '[ OK ]' ) - - - # ensure contiguous arrays for C part - self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) - self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) - self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) - - # De-mean kernels - if self.get_config('doDemean') : - print( '\t* Demeaning signal...', end="" ) - for j in xrange(self.get_config('ndirs')) : - for i in xrange(nIC) : - self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() - for i in xrange(nEC) : - self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() - for i in xrange(nISO) : - self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() - print( '[ OK ]' ) - - # Normalize atoms - if self.get_config('doNormalizeKernels') : - print( '\t* Normalizing...', end="" ) - - self.KERNELS['wmr_norm'] = np.zeros( nIC ) - for i in xrange(nIC) : - self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] - - self.KERNELS['wmh_norm'] = np.zeros( nEC ) - for i in xrange(nEC) : - self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] - - self.KERNELS['iso_norm'] = np.zeros( nISO ) - for i in xrange(nISO) : - self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) - self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] - - print( '[ OK ]' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - cpdef load_dictionary( self, path, use_mask = False ) : - """Load the sparse structure previously created with "trk2dictionary" script. - - Parameters - ---------- - path : string - Folder containing the output of the trk2dictionary script (relative to subject path) - use_mask : boolean - If False (default) the optimization will be conducted only on the voxels actually - traversed by tracts. If True, the mask specified in trk2dictionary - (i.e. "filename_mask" paramater) will be used instead. - NB: if no mask was specified in trk2dictionary, the "tdi" and - "mask" masks are equivalent and this parameter is not influent. - """ - if self.niiDWI is None : - raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) - - tic = time.time() - print( '\n-> Loading the dictionary:' ) - self.DICTIONARY = {} - self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) - - # load mask - self.set_config('dictionary_mask', 'mask' if use_mask else 'tdi' ) - mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) - if not exists( mask_filename ) : - mask_filename += '.gz' - if not exists( mask_filename ) : - raise RuntimeError( 'Dictionary not found. 
Execute ''trk2dictionary'' script first.' ); - niiMASK = nibabel.load( mask_filename ) - self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) - - # segments from the tracts - # ------------------------ - print( '\t* segments from the tracts...', end="" ) - sys.stdout.flush() - - dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) - - self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] - - if self.DICTIONARY['ndirs'] != self.get_config('ndirs'): - raise RuntimeError( 'Dictionary is outdated. Execute ''trk2dictionary'' script first.' ) - - self.DICTIONARY['TRK'] = {} - self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) - self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) - - self.DICTIONARY['IC'] = {} - self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) - self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size - self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size - - # reorder the segments based on the "v" field - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - - # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length - # NB: it works in conjunction with the normalization of the kernels - cdef : - np.float32_t [:] sl = self.DICTIONARY['IC']['len'] - np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] - np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] - int s - if self.get_config('doNormalizeKernels') : - for s in xrange(self.DICTIONARY['IC']['n']) : - sl[s] /= tl[ f[s] ] - - print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) - - # segments from the peaks - # ----------------------- - print( '\t* segments from the peaks...', end="" ) - sys.stdout.flush() - - self.DICTIONARY['EC'] = {} - self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size - - # reorder the segments based on the "v" field - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx - - print( ' [ %d segments ]' % self.DICTIONARY['EC']['nE'] ) - - # isotropic compartments - # ---------------------- - print( '\t* isotropic contributions...', end="" 
) - sys.stdout.flush() - - self.DICTIONARY['ISO'] = {} - - self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() - - vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) - vx = vx.astype(np.int32) - vy = vy.astype(np.int32) - vz = vz.astype(np.int32) - self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) - del vx, vy, vz - - # reorder the segments based on the "v" field - idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) - self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] - del idx - - print( ' [ %d voxels ]' % self.DICTIONARY['nV'] ) - - # post-processing - # --------------- - print( '\t* post-processing...', end="" ) - sys.stdout.flush() - - # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) - idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] - self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) - - lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() - for i in xrange(idx.size) : - lut[ idx[i] ] = i - self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] - self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] - self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] - - print( ' [ OK ]' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_threads( self, nthreads = None ) : - """Set the number of threads to use for the matrix-vector operations with A and A'. - - Parameters - ---------- - nthreads : integer - Number of threads to use (nthreads = None ---> all the CPU threads available in the system - nthreads = 0 ---> enable CUDA GPU acceleration) - """ - if nthreads is None : - # Set to the number of CPUs in the system - try : - import multiprocessing - nthreads = multiprocessing.cpu_count() - except : - nthreads = 1 - - if nthreads < 0 or nthreads > 255 : - raise RuntimeError( 'Number of threads must be between 0 and 255' ) - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.KERNELS is None : - raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) - - self.THREADS = {} - self.THREADS['n'] = nthreads - self.THREADS['IC'] = None - self.THREADS['EC'] = None - self.THREADS['ISO'] = None - self.THREADS['ICt'] = None - self.THREADS['ECt'] = None - self.THREADS['ISOt'] = None - - cdef : - long [:] C - long t, tot, i1, i2, N, c - int i - - if nthreads > 0: - print( '\n-> Distributing workload to different threads:' ) - print( '\t* number of threads : %d' % nthreads ) - - tic = time.time() - - # Distribute load for the computation of A*x product - print( '\t* A operator... 
', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - if nthreads > 1 : - N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) - t = 1 - tot = 0 - C = np.bincount( self.DICTIONARY['IC']['v'] ) - for c in C : - tot += c - if tot >= N : - self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot - t += 1 - tot = 0 - self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - # Distribute load for the computation of At*y product - print( '\t* A\' operator... ', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) - if nthreads > 1 : - idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) - C = np.bincount( self.DICTIONARY['IC']['fiber'] ) - t = tot = i1 = i2 = 0 - N = np.floor(self.DICTIONARY['IC']['n']/nthreads) - for c in C : - i2 += c - tot += c - if tot >= N : - self.THREADS['ICt'][ i1:i2 ] = t - t += 1 - if t==nthreads-1 : - break - i1 = i2 - tot = c - self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N - self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' 
) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['nV']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N - self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def build_operator( self ) : - """Compile/build the operator for computing the matrix-vector multiplications by A and A' - using the informations from self.DICTIONARY, self.KERNELS and self.THREADS. - NB: needs to call this function to update pointers to data structures in case - the data is changed in self.DICTIONARY, self.KERNELS or self.THREADS. - """ - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.KERNELS is None : - raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) - if self.THREADS is None : - raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) - - tic = time.time() - print( '\n-> Building linear operator A:' ) - - # need to pass these parameters at runtime for compiling the C code - from commit.operator import config - config.nTHREADS = self.THREADS['n'] - config.model = self.model.id - config.nIC = self.KERNELS['wmr'].shape[0] - config.nEC = self.KERNELS['wmh'].shape[0] - config.nISO = self.KERNELS['iso'].shape[0] - if not 'commit.operator.operator' in sys.modules : - import commit.operator.operator - else : - reload( sys.modules['commit.operator.operator'] ) - - if self.THREADS['n'] > 0: - self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - else: - import commit.cudaoperator - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - def get_y( self ): - """ - Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. - NB: this can be run only after having loaded the dictionary and the data. - """ - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.niiDWI is None : - raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) - return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) - - def fit( self, tol_fun = 1e-3, tol_x = 1e-6, max_iter = 100, verbose = 1, x0 = None, regularisation = None ) : - """Fit the model to the data. - - Parameters - ---------- - tol_fun : float - Tolerance on the objective function (default : 1e-3) - max_iter : integer - Maximum number of iterations (default : 100) - verbose : integer - Level of verbosity: 0=no print, 1=print progress (default : 1) - x0 : np.array - Initial guess for the solution of the problem (default : None) - regularisation : commit.solvers.init_regularisation object - Python dictionary that describes the wanted regularisation term. 
- Check the documentation of commit.solvers.init_regularisation to see - how to properly define the wanted mathematical formulation - ( default : None ) - """ - if self.niiDWI is None : - raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) - if self.DICTIONARY is None : - raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) - if self.KERNELS is None : - raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) - if self.THREADS is None : - raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) - if self.A is None : - raise RuntimeError( 'Operator not built; call "build_operator()" first.' ) - - if x0 is not None : - if x0.shape[0] != self.A.shape[1] : - raise RuntimeError( 'x0: dimension does not match the number of columns of the dictionary.' ) - if regularisation is None : - regularisation = commit.solvers.init_regularisation(self) - - self.CONFIG['optimization'] = {} - self.CONFIG['optimization']['tol_fun'] = tol_fun - self.CONFIG['optimization']['tol_x'] = tol_x - self.CONFIG['optimization']['max_iter'] = max_iter - self.CONFIG['optimization']['verbose'] = verbose - self.CONFIG['optimization']['regularisation'] = regularisation - - # run solver - t = time.time() - print( '\n-> Fit model' ) - - self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) - - self.CONFIG['optimization']['fit_details'] = opt_details - self.CONFIG['optimization']['fit_time'] = time.time()-t - - print( ' [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) - - - def save_results( self, path_suffix = None, save_opt_details = True, save_coeff = False ) : - """Save the output (coefficients, errors, maps etc). - - Parameters - ---------- - path_suffix : string - Text to be appended to "Results" to create the output path (default : None) - save_opt_details : boolean - Save everything in a pickle file containing the following list L: - L[0]: dictionary with all the configuration details - L[1]: np.array obtained through the optimisation process with the normalised kernels - L[2]: np.array renormalisation of L[1] - (default : True) - save_coeff : boolean - Save three txt files containing the coefficients related to each - compartment and a pickle file containing the dictionary with all - the configuration details. - (default : False) - """ - if self.x is None : - raise RuntimeError( 'Model not fitted to the data; call "fit()" first.' 
) - - RESULTS_path = 'Results_' + self.model.id - if path_suffix : - self.set_config('path_suffix', path_suffix) - RESULTS_path = RESULTS_path + path_suffix - - print( '\n-> Saving results to "%s/*":' % RESULTS_path ) - tic = time.time() - - # create folder or delete existing files (if any) - RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) - if not exists( RESULTS_path ) : - makedirs( RESULTS_path ) - else : - for f in glob.glob( pjoin(RESULTS_path,'*') ) : - remove( f ) - self.set_config('RESULTS_path', RESULTS_path) - - # Configuration and results - print( '\t* configuration and results:' ) - - nF = self.DICTIONARY['IC']['nF'] - nE = self.DICTIONARY['EC']['nE'] - nV = self.DICTIONARY['nV'] - norm_fib = np.ones( nF ) - # x is the x of the original problem - # self.x is the x preconditioned - if self.get_config('doNormalizeKernels') : - # renormalize the coefficients - norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) - norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) - norm3 = np.repeat(self.KERNELS['iso_norm'],nV) - norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) - x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) - else : - x = self.x - if save_opt_details: - print( '\t\t- pickle... ', end="" ) - sys.stdout.flush() - with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : - pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) - print( '[ OK ]' ) - if save_coeff: - print( '\t\t- txt... ', end="" ) - sys.stdout.flush() - np.savetxt(pjoin(RESULTS_path,'xic.txt'), x[0:nF]) - np.savetxt(pjoin(RESULTS_path,'xec.txt'), x[nF:nF+nE]) - np.savetxt(pjoin(RESULTS_path,'xiso.txt'), x[(nF+nE):]) - with open( pjoin(RESULTS_path,'config.pickle'), 'wb+' ) as fid : - pickle.dump( self.CONFIG, fid, protocol=2 ) - print( '[ OK ]' ) - - - # Map of wovelwise errors - print( '\t* fitting errors:' ) - - not_NaN = np.ones( self.get_config('dim'), dtype=np.float32 ) * 1e-16 # avoid division by 0 - - niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() - niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) - niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() - - y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) - y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) - - print( '\t\t- RMSE...', end="" ) - sys.stdout.flush() - tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = tmp.max() - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) - print( ' [ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - print( '\t\t- NRMSE...', end="" ) - sys.stdout.flush() - tmp = np.sum(y_mea**2,axis=1) - idx = np.where( tmp < 1E-12 ) - tmp[ idx ] = 1 - tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) - tmp[ idx ] = 0 - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = 1 - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) - print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - # Map of compartment contributions - print( '\t* voxelwise contributions:' ) - - print( '\t\t- intra-axonal', end="" ) 
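The RMSE/NRMSE maps written just above (and reproduced unchanged in the new version of save_results further down) reduce to a few NumPy operations. The following standalone sketch, with random arrays standing in for the measured and estimated (nV, nS) signal matrices, shows the per-voxel NRMSE with the same guard against voxels of zero measured energy; the function name and toy data are illustrative only.

import numpy as np

def voxelwise_nrmse( y_mea, y_est ):
    """Normalised RMSE per voxel, guarding voxels whose measured energy is ~0."""
    den = np.sum( y_mea**2, axis=1 )          # measured signal energy per voxel
    idx = den < 1e-12                         # voxels where normalisation is meaningless
    den[idx] = 1                              # avoid division by zero
    nrmse = np.sqrt( np.sum( (y_mea - y_est)**2, axis=1 ) / den )
    nrmse[idx] = 0                            # report 0 where there is no signal
    return nrmse

# toy usage in place of the real (nV, nS) signal matrices
y_mea = np.random.rand( 5, 20 ).astype( np.float32 )
y_est = y_mea + 0.01 * np.random.rand( 5, 20 ).astype( np.float32 )
print( voxelwise_nrmse( y_mea, y_est ) )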
- sys.stdout.flush() - niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmr']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, - weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] - ).astype(np.float32) - niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- extra-axonal', end="" ) - sys.stdout.flush() - niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmh']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) - niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- isotropic', end="" ) - sys.stdout.flush() - niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['iso']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] - xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) - niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( ' [ OK ]' ) - - if self.get_config('doNormalizeMaps') : - niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) - niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) - niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) - else: - niiIC = nibabel.Nifti1Image( niiIC_img, affine ) - niiEC = nibabel.Nifti1Image( niiEC_img, affine ) - niiISO = nibabel.Nifti1Image( niiISO_img, affine ) - - nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) - nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) - nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) - - - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False +from __future__ import print_function +cimport cython +import numpy as np +cimport numpy as np + +import time +import glob +import sys +from os import makedirs, remove +from os.path import exists, join as pjoin, isfile +import nibabel +import pickle +import commit.models +import commit.solvers +import amico.scheme +import amico.lut +import pyximport +pyximport.install( reload_support=True, language_level=3 ) + + +def setup( lmax = 12, ndirs = 32761 ) : + """General setup/initialization of the COMMIT framework. 
+ + Parameters + ---------- + lmax : int + Maximum SH order to use for the rotation phase (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + + if not amico.lut.is_valid(ndirs): + raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + amico.lut.precompute_rotation_matrices( lmax, ndirs ) + +def load_dictionary_info(filename): + """Function to load dictionary info file + + Parameters + ---------- + filename : string + This value is always COMMIT_PATH + dictionary_info.pickle + """ + if not isfile( filename ): + raise RuntimeError( 'Dictionary is outdated or not found. Execute ''trk2dictionary'' script first.' ) + with open( filename, 'rb' ) as dictionary_info_file: + if sys.version_info.major == 3: + aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) + # Pickle files written by Python 2 are loaded with byte + # keys, whereas those written by Python 3 are loaded with + # str keys, even when both are written using protocol=2 + result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} + return result_aux + else: + return pickle.load( dictionary_info_file ) + +cdef class Evaluation : + """Class to hold all the information (data and parameters) when performing an + evaluation with the COMMIT framework. + """ + cdef public niiDWI + cdef public niiDWI_img + cdef public scheme + cdef public model + cdef public KERNELS + cdef public DICTIONARY + cdef public THREADS + cdef public A + cdef public x + cdef public CONFIG + + def __init__( self, study_path, subject ) : + """Setup the data structures with default values. + + Parameters + ---------- + study_path : string + The path to the folder containing all the subjects from one study + subject : string + The path (relative to previous folder) to the subject folder + """ + self.niiDWI = None # set by "load_data" method + self.scheme = None # set by "load_data" method + self.model = None # set by "set_model" method + self.KERNELS = None # set by "load_kernels" method + self.DICTIONARY = None # set by "load_dictionary" method + self.THREADS = None # set by "set_threads" method + self.A = None # set by "build_operator" method + self.x = None # set by "fit" method + + # store all the parameters of an evaluation with COMMIT + self.CONFIG = {} + self.set_config('study_path', study_path) + self.set_config('subject', subject) + self.set_config('DATA_path', pjoin( study_path, subject )) + + self.set_config('doNormalizeSignal', True) + self.set_config('doMergeB0', False) + self.set_config('doNormalizeKernels', True) + self.set_config('doDemean', False) + self.set_config('doNormalizeMaps', False) + + + + def set_config( self, key, value ) : + self.CONFIG[ key ] = value + + def get_config( self, key ) : + return self.CONFIG.get( key ) + + + def load_data( self, dwi_filename = 'DWI.nii', scheme_filename = 'DWI.scheme', b0_thr = 0 ) : + """Load the diffusion signal and its corresponding acquisition scheme. 
+ + Parameters + ---------- + dwi_filename : string + The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') + scheme_filename : string + The file name of the corresponding acquisition scheme (default : 'DWI.scheme') + b0_thr : float + The threshold below which a b-value is considered a b0 (default : 0) + """ + + # Loading data and acquisition scheme + tic = time.time() + print( '\n-> Loading data:' ) + + print( '\t* DWI signal...' ) + self.set_config('dwi_filename', dwi_filename) + self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) + self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) + if self.niiDWI_img.ndim ==3 : + self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) + hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() + self.set_config('dim', self.niiDWI_img.shape[0:3]) + self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) + print( '\t\t- dim = %d x %d x %d x %d' % self.niiDWI_img.shape ) + print( '\t\t- pixdim = %.3f x %.3f x %.3f' % self.get_config('pixdim') ) + + print( '\t* Acquisition scheme...' ) + self.set_config('scheme_filename', scheme_filename) + self.set_config('b0_thr', b0_thr) + self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) + print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) + print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end="" ) + for i in xrange(len(self.scheme.shells)) : + print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end="" ) + print() + + if self.scheme.nS != self.niiDWI_img.shape[3] : + raise ValueError( 'Scheme does not match with DWI data' ) + + if self.scheme.dwi_count == 0 : + raise ValueError( 'There are no DWI volumes in the data' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + # Preprocessing + tic = time.time() + print( '\n-> Preprocessing:' ) + + if self.get_config('doNormalizeSignal') : + if self.scheme.b0_count > 0 : + print( '\t* Normalizing to b0...', end="" ) + sys.stdout.flush() + mean = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) + idx = mean <= 0 + mean[ idx ] = 1 + mean = 1 / mean + mean[ idx ] = 0 + for i in xrange(self.scheme.nS) : + self.niiDWI_img[:,:,:,i] *= mean + else : + print( '\t* There are no b0 volume(s) for normalization...', end="" ) + print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) + + if self.scheme.b0_count > 1 : + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)...', end="" ) + mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) + self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) + else : + print( '\t* Keeping all b0 volume(s)...', end="" ) + print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) + + if self.get_config('doDemean') : + print( '\t* Demeaning signal...', end="" ) + sys.stdout.flush() + mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) + self.niiDWI_img = self.niiDWI_img - mean + print( '[ min=%.2f, mean=%.2f, max=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.mean(), self.niiDWI_img.max() ) ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_model( self, model_name ) : + """Set the model to use to describe the signal contributions in each voxel. 
+ + Parameters + ---------- + model_name : string + The name of the model (must match a class name in "commit.models" module) + """ + # Call the specific model constructor + if hasattr(commit.models, model_name ) : + self.model = getattr(commit.models,model_name)() + else : + raise ValueError( 'Model "%s" not recognized' % model_name ) + + self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) + + + def generate_kernels( self, regenerate = False, lmax = 12, ndirs = 32761 ) : + """Generate the high-resolution response functions for each compartment. + Dispatch to the proper function, depending on the model. + + Parameters + ---------- + regenerate : boolean + Regenerate kernels if they already exist (default : False) + lmax : int + Maximum SH order to use for the rotation procedure (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + if not amico.lut.is_valid(ndirs): + raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + if self.scheme is None : + raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' ) + if self.model is None : + raise RuntimeError( 'Model not set; call "set_model()" method first.' ) + + # store some values for later use + self.set_config('lmax', lmax) + self.set_config('ndirs', ndirs) + self.model.scheme = self.scheme + + print( '\n-> Simulating with "%s" model:' % self.model.name ) + + # check if kernels were already generated + tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) + if len(tmp)>0 and not regenerate : + print( ' [ Kernels already computed. Call "generate_kernels( regenerate=True )" to force regeneration. ]' ) + return + + # create folder or delete existing files (if any) + if not exists( self.get_config('ATOMS_path') ) : + makedirs( self.get_config('ATOMS_path') ) + else : + for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : + remove( f ) + + # auxiliary data structures + aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) + idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) + + # Dispatch to the right handler for each model + tic = time.time() + self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def load_kernels( self ) : + """Load rotated kernels and project to the specific gradient scheme of this subject. + Dispatch to the proper function, depending on the model. + """ + if self.model is None : + raise RuntimeError( 'Model not set; call "set_model()" method first.' ) + if self.scheme is None : + raise RuntimeError( 'Scheme not loaded; call "load_data()" first.' 
) + + tic = time.time() + print( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) + + # auxiliary data structures + idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) + + # Dispatch to the right handler for each model + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)...', end="" ) + else : + print( '\t* Keeping all b0 volume(s)...', end="" ) + self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) + nIC = self.KERNELS['wmr'].shape[0] + nEC = self.KERNELS['wmh'].shape[0] + nISO = self.KERNELS['iso'].shape[0] + print( '[ OK ]' ) + + + # ensure contiguous arrays for C part + self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) + self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) + self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) + + # De-mean kernels + if self.get_config('doDemean') : + print( '\t* Demeaning signal...', end="" ) + for j in xrange(self.get_config('ndirs')) : + for i in xrange(nIC) : + self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() + for i in xrange(nEC) : + self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() + for i in xrange(nISO) : + self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() + print( '[ OK ]' ) + + # Normalize atoms + if self.get_config('doNormalizeKernels') : + print( '\t* Normalizing...', end="" ) + + self.KERNELS['wmr_norm'] = np.zeros( nIC ) + for i in xrange(nIC) : + self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] + + self.KERNELS['wmh_norm'] = np.zeros( nEC ) + for i in xrange(nEC) : + self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] + + self.KERNELS['iso_norm'] = np.zeros( nISO ) + for i in xrange(nISO) : + self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) + self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] + + print( '[ OK ]' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + cpdef load_dictionary( self, path, use_mask = False ) : + """Load the sparse structure previously created with "trk2dictionary" script. + + Parameters + ---------- + path : string + Folder containing the output of the trk2dictionary script (relative to subject path) + use_mask : boolean + If False (default) the optimization will be conducted only on the voxels actually + traversed by tracts. If True, the mask specified in trk2dictionary + (i.e. "filename_mask" paramater) will be used instead. + NB: if no mask was specified in trk2dictionary, the "tdi" and + "mask" masks are equivalent and this parameter is not influent. + """ + if self.niiDWI is None : + raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) + + tic = time.time() + print( '\n-> Loading the dictionary:' ) + self.DICTIONARY = {} + self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) + + # load mask + self.set_config('dictionary_mask', 'mask' if use_mask else 'tdi' ) + mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) + if not exists( mask_filename ) : + mask_filename += '.gz' + if not exists( mask_filename ) : + raise RuntimeError( 'Dictionary not found. 
Execute ''trk2dictionary'' script first.' ); + niiMASK = nibabel.load( mask_filename ) + self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) + + # segments from the tracts + # ------------------------ + print( '\t* segments from the tracts...', end="" ) + sys.stdout.flush() + + dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) + + self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] + + if self.DICTIONARY['ndirs'] != self.get_config('ndirs'): + raise RuntimeError( 'Dictionary is outdated. Execute ''trk2dictionary'' script first.' ) + + self.DICTIONARY['TRK'] = {} + self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) + self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) + + self.DICTIONARY['IC'] = {} + self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) + self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) + self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) + self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) + self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size + self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size + + # reorder the segments based on the "v" field + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx + + # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length + # NB: it works in conjunction with the normalization of the kernels + cdef : + np.float32_t [:] sl = self.DICTIONARY['IC']['len'] + np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] + np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] + int s + if self.get_config('doNormalizeKernels') : + for s in xrange(self.DICTIONARY['IC']['n']) : + sl[s] /= tl[ f[s] ] + + print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) + + # segments from the peaks + # ----------------------- + print( '\t* segments from the peaks...', end="" ) + sys.stdout.flush() + + self.DICTIONARY['EC'] = {} + self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) + self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) + self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size + + # reorder the segments based on the "v" field + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx + + print( ' [ %d segments ]' % self.DICTIONARY['EC']['nE'] ) + + # isotropic compartments + # ---------------------- + print( '\t* isotropic contributions...', end="" 
) + sys.stdout.flush() + + self.DICTIONARY['ISO'] = {} + + self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() + + vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) + vx = vx.astype(np.int32) + vy = vy.astype(np.int32) + vz = vz.astype(np.int32) + self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) + del vx, vy, vz + + # reorder the segments based on the "v" field + idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) + self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] + del idx + + print( ' [ %d voxels ]' % self.DICTIONARY['nV'] ) + + # post-processing + # --------------- + print( '\t* post-processing...', end="" ) + sys.stdout.flush() + + # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) + idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] + self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) + + lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() + for i in xrange(idx.size) : + lut[ idx[i] ] = i + self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] + self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] + self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] + + print( ' [ OK ]' ) + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_threads( self, nthreads = None, select_gpu = 0 ) : + """Set the number of threads to use for the matrix-vector operations with A and A'. + + Parameters + ---------- + nthreads : integer + Number of threads to use (nthreads = None ---> all the CPU threads available in the system + nthreads = 0 ---> enable CUDA GPU acceleration) + """ + if nthreads is None : + # Set to the number of CPUs in the system + try : + import multiprocessing + nthreads = multiprocessing.cpu_count() + except : + nthreads = 1 + + if nthreads < 0 or nthreads > 255 : + raise RuntimeError( 'Number of threads must be between 0 and 255' ) + if self.DICTIONARY is None : + raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) + if self.KERNELS is None : + raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) + + self.THREADS = {} + self.THREADS['n'] = nthreads + self.THREADS['IC'] = None + self.THREADS['EC'] = None + self.THREADS['ISO'] = None + self.THREADS['ICt'] = None + self.THREADS['ECt'] = None + self.THREADS['ISOt'] = None + + cdef : + long [:] C + long t, tot, i1, i2, N, c + int i + + if nthreads > 0: + print( '\n-> Distributing workload to different threads:' ) + print( '\t* number of threads : %d' % nthreads ) + + tic = time.time() + + # Distribute load for the computation of A*x product + print( '\t* A operator... 
', end="" ) + sys.stdout.flush() + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + if nthreads > 1 : + N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) + t = 1 + tot = 0 + C = np.bincount( self.DICTIONARY['IC']['v'] ) + for c in C : + tot += c + if tot >= N : + self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot + t += 1 + tot = 0 + self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + + print( '[ OK ]' ) + + # Distribute load for the computation of At*y product + print( '\t* A\' operator... ', end="" ) + sys.stdout.flush() + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) + if nthreads > 1 : + idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) + C = np.bincount( self.DICTIONARY['IC']['fiber'] ) + t = tot = i1 = i2 = 0 + N = np.floor(self.DICTIONARY['IC']['n']/nthreads) + for c in C : + i2 += c + tot += c + if tot >= N : + self.THREADS['ICt'][ i1:i2 ] = t + t += 1 + if t==nthreads-1 : + break + i1 = i2 + tot = c + self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N + self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' 
)
+
+            if self.DICTIONARY['nV'] > 0 :
+                self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 )
+                N = np.floor( self.DICTIONARY['nV']/nthreads )
+                for i in xrange(1,nthreads) :
+                    self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N
+                self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV']
+
+                # check if some threads are not assigned any segment
+                if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) :
+                    self.THREADS = None
+                    raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' )
+        else:
+            self.THREADS['GPUID'] = select_gpu
+
+        print( '[ OK ]' )
+
+        print( ' [ %.1f seconds ]' % ( time.time() - tic ) )
+
+
+    def build_operator( self ) :
+        """Compile/build the operator for computing the matrix-vector multiplications by A and A'
+        using the information from self.DICTIONARY, self.KERNELS and self.THREADS.
+        NB: this function needs to be called again to refresh the pointers to the data structures
+        whenever the data in self.DICTIONARY, self.KERNELS or self.THREADS is changed.
+        """
+        if self.DICTIONARY is None :
+            raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' )
+        if self.KERNELS is None :
+            raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' )
+        if self.THREADS is None :
+            raise RuntimeError( 'Threads not set; call "set_threads()" first.' )
+
+        tic = time.time()
+        print( '\n-> Building linear operator A:' )
+
+        # need to pass these parameters at runtime for compiling the C code
+        from commit.operator import config
+        config.nTHREADS = self.THREADS['n']
+        config.model = self.model.id
+        config.nIC = self.KERNELS['wmr'].shape[0]
+        config.nEC = self.KERNELS['wmh'].shape[0]
+        config.nISO = self.KERNELS['iso'].shape[0]
+        if not 'commit.operator.operator' in sys.modules :
+            import commit.operator.operator
+        else :
+            reload( sys.modules['commit.operator.operator'] )
+
+        if self.THREADS['n'] > 0:
+            self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS )
+        else:
+            import commit.cudaoperator
+            self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, self.THREADS['GPUID'], fcall=1 )
+
+        print( ' [ %.1f seconds ]' % ( time.time() - tic ) )
+
+    def get_y( self ):
+        """
+        Returns a numpy array that corresponds to the 'y' vector of the optimisation problem.
+        NB: this can be run only after having loaded the dictionary and the data.
+        """
+        if self.DICTIONARY is None :
+            raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' )
+        if self.niiDWI is None :
+            raise RuntimeError( 'Data not loaded; call "load_data()" first.' )
+        return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64)
+
+    def fit( self, tol_fun = 1e-3, tol_x = 1e-6, max_iter = 100, verbose = 1, x0 = None, regularisation = None ) :
+        """Fit the model to the data.
+
+        Parameters
+        ----------
+        tol_fun : float
+            Tolerance on the objective function (default : 1e-3)
+        tol_x : float
+            Tolerance on the solution (default : 1e-6)
+        max_iter : integer
+            Maximum number of iterations (default : 100)
+        verbose : integer
+            Level of verbosity: 0=no print, 1=print progress (default : 1)
+        x0 : np.array
+            Initial guess for the solution of the problem (default : None)
+        regularisation : commit.solvers.init_regularisation object
+            Python dictionary that describes the wanted regularisation term.
+ Check the documentation of commit.solvers.init_regularisation to see + how to properly define the wanted mathematical formulation + ( default : None ) + """ + if self.niiDWI is None : + raise RuntimeError( 'Data not loaded; call "load_data()" first.' ) + if self.DICTIONARY is None : + raise RuntimeError( 'Dictionary not loaded; call "load_dictionary()" first.' ) + if self.KERNELS is None : + raise RuntimeError( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first.' ) + if self.THREADS is None : + raise RuntimeError( 'Threads not set; call "set_threads()" first.' ) + if self.A is None : + raise RuntimeError( 'Operator not built; call "build_operator()" first.' ) + + if x0 is not None : + if x0.shape[0] != self.A.shape[1] : + raise RuntimeError( 'x0: dimension does not match the number of columns of the dictionary.' ) + if regularisation is None : + regularisation = commit.solvers.init_regularisation(self) + + self.CONFIG['optimization'] = {} + self.CONFIG['optimization']['tol_fun'] = tol_fun + self.CONFIG['optimization']['tol_x'] = tol_x + self.CONFIG['optimization']['max_iter'] = max_iter + self.CONFIG['optimization']['verbose'] = verbose + self.CONFIG['optimization']['regularisation'] = regularisation + + # run solver + t = time.time() + print( '\n-> Fit model' ) + + self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) + + self.CONFIG['optimization']['fit_details'] = opt_details + self.CONFIG['optimization']['fit_time'] = time.time()-t + + print( ' [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) + + + def save_results( self, path_suffix = None, save_opt_details = True, save_coeff = False ) : + """Save the output (coefficients, errors, maps etc). + + Parameters + ---------- + path_suffix : string + Text to be appended to "Results" to create the output path (default : None) + save_opt_details : boolean + Save everything in a pickle file containing the following list L: + L[0]: dictionary with all the configuration details + L[1]: np.array obtained through the optimisation process with the normalised kernels + L[2]: np.array renormalisation of L[1] + (default : True) + save_coeff : boolean + Save three txt files containing the coefficients related to each + compartment and a pickle file containing the dictionary with all + the configuration details. + (default : False) + """ + if self.x is None : + raise RuntimeError( 'Model not fitted to the data; call "fit()" first.' 
) + + RESULTS_path = 'Results_' + self.model.id + if path_suffix : + self.set_config('path_suffix', path_suffix) + RESULTS_path = RESULTS_path + path_suffix + + print( '\n-> Saving results to "%s/*":' % RESULTS_path ) + tic = time.time() + + # create folder or delete existing files (if any) + RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) + if not exists( RESULTS_path ) : + makedirs( RESULTS_path ) + else : + for f in glob.glob( pjoin(RESULTS_path,'*') ) : + remove( f ) + self.set_config('RESULTS_path', RESULTS_path) + + # Configuration and results + print( '\t* configuration and results:' ) + + nF = self.DICTIONARY['IC']['nF'] + nE = self.DICTIONARY['EC']['nE'] + nV = self.DICTIONARY['nV'] + norm_fib = np.ones( nF ) + # x is the x of the original problem + # self.x is the x preconditioned + if self.get_config('doNormalizeKernels') : + # renormalize the coefficients + norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) + norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) + norm3 = np.repeat(self.KERNELS['iso_norm'],nV) + norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) + x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) + else : + x = self.x + if save_opt_details: + print( '\t\t- pickle... ', end="" ) + sys.stdout.flush() + with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : + pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) + print( '[ OK ]' ) + if save_coeff: + print( '\t\t- txt... ', end="" ) + sys.stdout.flush() + np.savetxt(pjoin(RESULTS_path,'xic.txt'), x[0:nF]) + np.savetxt(pjoin(RESULTS_path,'xec.txt'), x[nF:nF+nE]) + np.savetxt(pjoin(RESULTS_path,'xiso.txt'), x[(nF+nE):]) + with open( pjoin(RESULTS_path,'config.pickle'), 'wb+' ) as fid : + pickle.dump( self.CONFIG, fid, protocol=2 ) + print( '[ OK ]' ) + + + # Map of wovelwise errors + print( '\t* fitting errors:' ) + + not_NaN = np.ones( self.get_config('dim'), dtype=np.float32 ) * 1e-16 # avoid division by 0 + + niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() + niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) + niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() + + y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) + y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) + + print( '\t\t- RMSE...', end="" ) + sys.stdout.flush() + tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = tmp.max() + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) + print( ' [ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + print( '\t\t- NRMSE...', end="" ) + sys.stdout.flush() + tmp = np.sum(y_mea**2,axis=1) + idx = np.where( tmp < 1E-12 ) + tmp[ idx ] = 1 + tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) + tmp[ idx ] = 0 + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = 1 + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) + print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + # Map of compartment contributions + print( '\t* voxelwise contributions:' ) + + print( '\t\t- intra-axonal', end="" ) 
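The intra-axonal map assembled in the block that follows relies on np.bincount with weights to accumulate, for every voxel, coefficient times segment length over all streamline segments crossing it. A minimal sketch of that accumulation, using hypothetical toy arrays in place of the DICTIONARY['IC'] fields and assuming a single response-function radius (one coefficient per streamline), is:

import numpy as np

# hypothetical toy stand-ins for the DICTIONARY['IC'] arrays (one entry per segment)
seg_voxel = np.array( [0, 0, 1, 2, 2, 2], dtype=np.uint32 )    # voxel crossed by each segment
seg_fiber = np.array( [0, 1, 1, 0, 2, 2], dtype=np.uint32 )    # streamline each segment belongs to
seg_len   = np.array( [0.5, 0.2, 0.7, 0.3, 0.4, 0.1], dtype=np.float32 )
x_fiber   = np.array( [1.0, 2.0, 0.5] )                        # one coefficient per streamline
nV = 4                                                         # voxels in the mask

# per-voxel sum of (coefficient of the segment's streamline) * (segment length)
xv = np.bincount( seg_voxel,
                  weights=x_fiber[seg_fiber] * seg_len,
                  minlength=nV ).astype(np.float32)
print( xv )    # -> [ 0.9   1.4   0.55  0.  ]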
+ sys.stdout.flush() + niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmr']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, + weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] + ).astype(np.float32) + niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- extra-axonal', end="" ) + sys.stdout.flush() + niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmh']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) + niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- isotropic', end="" ) + sys.stdout.flush() + niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['iso']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] + xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) + niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( ' [ OK ]' ) + + if self.get_config('doNormalizeMaps') : + niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) + niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) + niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + not_NaN), affine ) + else: + niiIC = nibabel.Nifti1Image( niiIC_img, affine ) + niiEC = nibabel.Nifti1Image( niiEC_img, affine ) + niiISO = nibabel.Nifti1Image( niiISO_img, affine ) + + nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) + nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) + nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) + + + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 30d2450d..e1fb9d43 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -1,221 +1,227 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -cimport numpy as np - -cdef extern from "operator_withCUDA.cuh": - cdef cppclass C_CudaLinearOperator "CudaLinearOperator": - C_CudaLinearOperator( - np.uint32_t*, - np.uint32_t*, - np.uint16_t*, - np.float32_t*, - np.float32_t*, - - np.uint32_t*, - np.uint16_t*, - np.float32_t*, - - np.float32_t*, - - int, - int, - int, - int, - int, - int, - int, - int, - int, - - int) - - int getCudaStatus() - void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) - void destroy() - void dot(np.float64_t*, np.float64_t*) - void Tdot(np.float64_t*, np.float64_t*) - -cdef class CudaLinearOperator : - """This class is a wrapper to the C code for performing marix-vector multiplications - with the COMMIT linear operator A. The multiplications are done using C code - that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
- """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - cdef C_CudaLinearOperator* A - - - def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = &wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - self.A = new C_CudaLinearOperator( - &ICv[0], - &ICf[0], - &ICo[0], - &ICl[0], - &wmrSFP[0,0,0], - - &ECv[0], - &ECo[0], - &wmhSFP[0,0,0], - - &isoSFP[0,0], - - self.n, - self.nV, - self.nF, - self.nE, - self.ndirs, - self.nS, - self.nR, - self.nT, - self.nI, - - fcall) - - if fcall == 1: - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - ICf = self.DICTIONARY['IC']['fiber'] - ICl = self.DICTIONARY['IC']['len'] - ICv = self.DICTIONARY['IC']['v'] - ICo = self.DICTIONARY['IC']['o'] - - self.ICf = &ICf[0] - self.ICl = &ICl[0] - self.ICv = &ICv[0] - self.ICo = &ICo[0] - - self.A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - - @property - def T( self ) : - """Transpose of the 
explicit matrix.""" - C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. - - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - raise RuntimeError( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - self.A.dot(&v_in[0], &v_out[0]) - else : - # INVERSE PRODUCT A'*y - self.A.Tdot(&v_in[0], &v_out[0]) - - return v_out - - def destroy( self ): - """Free all memory of the CUDA GPU""" - self.A.destroy() - +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np + +cdef extern from "operator_withCUDA.cuh": + cdef cppclass C_CudaLinearOperator "CudaLinearOperator": + C_CudaLinearOperator( + np.uint32_t*, + np.uint32_t*, + np.uint16_t*, + np.float32_t*, + np.float32_t*, + + np.uint32_t*, + np.uint16_t*, + np.float32_t*, + + np.float32_t*, + + int, + int, + int, + int, + int, + int, + int, + int, + int, + + int, + int) + + int getCudaStatus() + void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + void destroy() + void dot(np.float64_t*, np.float64_t*) + void Tdot(np.float64_t*, np.float64_t*) + +cdef class CudaLinearOperator : + """This class is a wrapper to the CUDA C++ code for performing marix-vector multiplications + with the COMMIT linear operator A in a CUDA GPU. The multiplications are done using CUDA C++ code + that uses information from the DICTIONARY and KERNELS data structures. 
+ """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + # pointer to the operator in GPU memory + cdef C_CudaLinearOperator* GPU_COMMIT_A + + # these should be always None, they remain for compatibility + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0, gpu_id = 0 ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # create the operator in GPU memory + self.GPU_COMMIT_A = new C_CudaLinearOperator( + &ICv[0], + &ICf[0], + &ICo[0], + &ICl[0], + &wmrSFP[0,0,0], + + &ECv[0], + &ECo[0], + &wmhSFP[0,0,0], + + &isoSFP[0,0], + + self.n, + self.nV, + self.nF, + self.nE, + self.ndirs, + self.nS, + self.nR, + self.nT, + self.nI, + + fcall, + gpu_id) + + # create the transpose of the operator in GPU memory + if fcall == 1: + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + ICf = self.DICTIONARY['IC']['fiber'] + ICl = self.DICTIONARY['IC']['len'] + ICv = self.DICTIONARY['IC']['v'] + ICo = 
self.DICTIONARY['IC']['o'] + + self.ICf = &ICf[0] + self.ICl = &ICl[0] + self.ICv = &ICv[0] + self.ICo = &ICo[0] + + self.GPU_COMMIT_A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + raise RuntimeError( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + self.GPU_COMMIT_A.dot(&v_in[0], &v_out[0]) + else : + # INVERSE PRODUCT A'*y + self.GPU_COMMIT_A.Tdot(&v_in[0], &v_out[0]) + + return v_out + + def destroy( self ): + """Free all memory of the CUDA GPU""" + self.GPU_COMMIT_A.destroy() + diff --git a/commit/operator/operator.pyx b/commit/operator/operator.pyx index 72ed8655..6d83202a 100755 --- a/commit/operator/operator.pyx +++ b/commit/operator/operator.pyx @@ -1,191 +1,191 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -cimport numpy as np - -# Interfaces to actual C code performing the multiplications -cdef extern void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads -) nogil - -cdef extern void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT -) nogil - - - -cdef class LinearOperator : - """This class is a wrapper to the C code for performing marix-vector multiplications - with the COMMIT linear operator A. The multiplications are done using C code - that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
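The T/shape/dot pattern used above (an adjoint flag that .T flips and dot() consults) is easiest to see on a dense toy operator. The following is a minimal NumPy sketch of the same dispatch logic, purely for illustration; the real class delegates both products to the CUDA code and never materializes the matrix:

    import numpy as np

    class ToyLinearOperator:
        """Dense stand-in that mimics the adjoint-flag dispatch used above."""
        def __init__(self, M, adjoint=0):
            self.M = M
            self.adjoint = adjoint          # 0: apply A, 1: apply A'
        @property
        def T(self):
            return ToyLinearOperator(self.M, 1 - self.adjoint)
        @property
        def shape(self):
            n1, n2 = self.M.shape
            return (n1, n2) if not self.adjoint else (n2, n1)
        def dot(self, v):
            if v.size != self.shape[1]:
                raise RuntimeError("A.dot(): dimensions do not match")
            return self.M.dot(v) if not self.adjoint else self.M.T.dot(v)

    A = ToyLinearOperator(np.arange(6, dtype=float).reshape(2, 3))
    y = A.dot(np.ones(3))        # direct product  A @ x
    g = A.T.dot(y)               # adjoint product A' @ y
    print(A.shape, A.T.shape)    # (2, 3) (3, 2)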
- """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - - - def __init__( self, DICTIONARY, KERNELS, THREADS ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = &wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - # get C pointers to arrays in THREADS - cdef unsigned int [::1] ICthreads = THREADS['IC'] - self.ICthreads = &ICthreads[0] - cdef unsigned int [::1] ECthreads = THREADS['EC'] - self.ECthreads = &ECthreads[0] - cdef unsigned int [::1] ISOthreads = THREADS['ISO'] - self.ISOthreads = &ISOthreads[0] - - cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] - self.ICthreadsT = &ICthreadsT[0] - cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] - self.ECthreadsT = &ECthreadsT[0] - cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] - self.ISOthreadsT = &ISOthreadsT[0] - - - @property - def T( self ) : - """Transpose of the explicit matrix.""" - C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. 
- - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - raise RuntimeError( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - with nogil : - COMMIT_A( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreads, self.ECthreads, self.ISOthreads - ) - else : - # INVERSE PRODUCT A'*y - with nogil : - COMMIT_At( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT - ) - - return v_out +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np + +# Interfaces to actual C code performing the multiplications +cdef extern void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads +) nogil + +cdef extern void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT +) nogil + + + +cdef class LinearOperator : + """This class is a wrapper to the C code for performing marix-vector multiplications + with the COMMIT linear operator A. The multiplications are done using C code + that uses information from the DICTIONARY, KERNELS and THREADS data structures. 
+ """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # get C pointers to arrays in THREADS + cdef unsigned int [::1] ICthreads = THREADS['IC'] + self.ICthreads = &ICthreads[0] + cdef unsigned int [::1] ECthreads = THREADS['EC'] + self.ECthreads = &ECthreads[0] + cdef unsigned int [::1] ISOthreads = THREADS['ISO'] + self.ISOthreads = &ISOthreads[0] + + cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] + self.ICthreadsT = &ICthreadsT[0] + cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] + self.ECthreadsT = &ECthreadsT[0] + cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] + self.ISOthreadsT = &ISOthreadsT[0] + + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. 
+ + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + raise RuntimeError( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + with nogil : + COMMIT_A( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreads, self.ECthreads, self.ISOthreads + ) + else : + # INVERSE PRODUCT A'*y + with nogil : + COMMIT_At( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT + ) + + return v_out diff --git a/commit/operator/operator_withLUT.c b/commit/operator/operator_withLUT.c index 042dbe5f..1b6fd1ae 100644 --- a/commit/operator/operator_withLUT.c +++ b/commit/operator/operator_withLUT.c @@ -1,2247 +1,2247 @@ -#include -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<0 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n, nE, nV, nS, ndirs; -double *x, *Y; -uint32_t *ICthreads, *ECthreads, *ISOthreads; -uint8_t *ICthreadsT; -uint32_t *ECthreadsT, *ISOthreadsT; -uint32_t *ICf, *ICv, *ECv, *ISOv; -uint16_t *ICo, *ECo; -float *ICl; -float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; -float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; -float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; - - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - int offset; - double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; - double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; - double *Yptr, *YptrEnd; - float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; - uint32_t *t_v, *t_vEnd, *t_f; - uint16_t *t_o; - float *t_l; - -#if nIC>=1 - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_o = ICo + ICthreads[id]; - t_l = 
ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x_Ptr0 = x + *t_f; - x0 = *x_Ptr0; - #if nIC>=2 - x_Ptr1 = x_Ptr0 + nF; - x1 = *x_Ptr1; - #endif - #if nIC>=3 - x_Ptr2 = x_Ptr1 + nF; - x2 = *x_Ptr2; - #endif - #if nIC>=4 - x_Ptr3 = x_Ptr2 + nF; - x3 = *x_Ptr3; - #endif - #if nIC>=5 - x_Ptr4 = x_Ptr3 + nF; - x4 = *x_Ptr4; - #endif - #if nIC>=6 - x_Ptr5 = x_Ptr4 + nF; - x5 = *x_Ptr5; - #endif - #if nIC>=7 - x_Ptr6 = x_Ptr5 + nF; - x6 = *x_Ptr6; - #endif - #if nIC>=8 - x_Ptr7 = x_Ptr6 + nF; - x7 = *x_Ptr7; - #endif - #if nIC>=9 - x_Ptr8 = x_Ptr7 + nF; - x8 = *x_Ptr8; - #endif - #if nIC>=10 - x_Ptr9 = x_Ptr8 + nF; - x9 = *x_Ptr9; - #endif - #if nIC>=11 - x_Ptr10 = x_Ptr9 + nF; - x10 = *x_Ptr10; - #endif - #if nIC>=12 - x_Ptr11 = x_Ptr10 + nF; - x11 = *x_Ptr11; - #endif - #if nIC>=13 - x_Ptr12 = x_Ptr11 + nF; - x12 = *x_Ptr12; - #endif - #if nIC>=14 - x_Ptr13 = x_Ptr12 + nF; - x13 = *x_Ptr13; - #endif - #if nIC>=15 - x_Ptr14 = x_Ptr13 + nF; - x14 = *x_Ptr14; - #endif - #if nIC>=16 - x_Ptr15 = x_Ptr14 + nF; - x15 = *x_Ptr15; - #endif - #if nIC>=17 - x_Ptr16 = x_Ptr15 + nF; - x16 = *x_Ptr16; - #endif - #if nIC>=18 - x_Ptr17 = x_Ptr16 + nF; - x17 = *x_Ptr17; - #endif - #if nIC>=19 - x_Ptr18 = x_Ptr17 + nF; - x18 = *x_Ptr18; - #endif - #if nIC>=20 - x_Ptr19 = x_Ptr18 + nF; - x19 = *x_Ptr19; - #endif - - if ( x0 != 0 - #if nIC>=2 - || x1 != 0 - #endif - #if nIC>=3 - || x2 != 0 - #endif - #if nIC>=4 - || x3 != 0 - #endif - #if nIC>=5 - || x4 != 0 - #endif - #if nIC>=6 - || x5 != 0 - #endif - #if nIC>=7 - || x6 != 0 - #endif - #if nIC>=8 - || x7 != 0 - #endif - #if nIC>=9 - || x8 != 0 - #endif - #if nIC>=10 - || x9 != 0 - #endif - #if nIC>=11 - || x10 != 0 - #endif - #if nIC>=12 - || x11 != 0 - #endif - #if nIC>=13 - || x12 != 0 - #endif - #if nIC>=14 - || x13 != 0 - #endif - #if nIC>=15 - || x14 != 0 - #endif - #if nIC>=16 - || x15 != 0 - #endif - #if nIC>=17 - || x16 != 0 - #endif - #if nIC>=18 - || x17 != 0 - #endif - #if nIC>=19 - || x18 != 0 - #endif - #if nIC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - w = (double)(*t_l); - offset = nS * (*t_o); - SFP0ptr = wmrSFP0 + offset; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - #endif - #if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += w * ( - x0 * (*SFP0ptr++) - #if nIC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nIC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nIC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nIC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nIC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nIC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nIC>=8 - + x7 
* (*SFP7ptr++) - #endif - #if nIC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nIC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nIC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nIC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nIC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nIC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nIC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nIC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nIC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nIC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nIC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nIC>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - - t_f++; - t_v++; - t_o++; - t_l++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreads[id]; - t_vEnd = ECv + ECthreads[id+1]; - t_o = ECo + ECthreads[id]; - - x_Ptr0 = x + nIC*nF + ECthreads[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; - #if nEC>=2 - x1 = *x_Ptr1++; - #endif - #if nEC>=3 - x2 = *x_Ptr2++; - #endif - #if nEC>=4 - x3 = *x_Ptr3++; - #endif - #if nEC>=5 - x4 = *x_Ptr4++; - #endif - #if nEC>=6 - x5 = *x_Ptr5++; - #endif - #if nEC>=7 - x6 = *x_Ptr6++; - #endif - #if nEC>=8 - x7 = *x_Ptr7++; - #endif - #if nEC>=9 - x8 = *x_Ptr8++; - #endif - #if nEC>=10 - x9 = *x_Ptr9++; - #endif - #if nEC>=11 - x10 = *x_Ptr10++; - #endif - #if nEC>=12 - x11 = *x_Ptr11++; - #endif - #if nEC>=13 - x12 = *x_Ptr12++; - #endif - #if nEC>=14 - x13 = *x_Ptr13++; - #endif - #if nEC>=15 - x14 = *x_Ptr14++; - #endif - #if nEC>=16 - x15 = *x_Ptr15++; - #endif - #if nEC>=17 - x16 = *x_Ptr16++; - #endif - #if nEC>=18 - x17 = *x_Ptr17++; - #endif - #if nEC>=19 - x18 = *x_Ptr18++; - #endif - #if nEC>=20 - x19 = *x_Ptr19++; - #endif - if ( - x0 != 0 - #if nEC>=2 - || x1 != 0 - #endif - #if nEC>=3 - || x2 != 0 - #endif - #if nEC>=4 - || x3 != 0 - #endif - #if nEC>=5 - || x4 != 0 - #endif - #if nEC>=6 - || x5 != 0 - #endif - #if nEC>=7 - || x6 != 0 - #endif - #if nEC>=8 - || x7 != 0 - #endif - #if nEC>=9 - || x8 != 0 - #endif - #if nEC>=10 - || x9 != 0 - #endif - #if nEC>=11 - || x10 != 0 - #endif - #if nEC>=12 - || x11 != 0 - #endif - #if nEC>=13 - || x12 != 0 - #endif - #if nEC>=14 - || x13 != 0 - #endif - #if nEC>=15 - || x14 != 0 - #endif - #if nEC>=16 - || x15 != 0 - #endif - #if nEC>=17 - || x16 != 0 - #endif - #if nEC>=18 - || x17 != 0 - #endif - #if nEC>=19 - || x18 != 0 - #endif - #if nEC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - SFP0ptr = wmhSFP0 + offset; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - #endif - #if nEC>=5 - 
SFP4ptr = wmhSFP4 + offset; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nEC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nEC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nEC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nEC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nEC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nEC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nEC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nEC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nEC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nEC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nEC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nEC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nEC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nEC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nEC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nEC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nEC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nEC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nEC>=20 - + x19 * (*SFP19ptr++) - #endif - - ); - } - t_v++; - t_o++; - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; - #if nISO>=2 - x1 = *x_Ptr1++; - #endif - #if nISO>=3 - x2 = *x_Ptr2++; - #endif - #if nISO>=4 - x3 = *x_Ptr3++; - #endif - #if nISO>=5 - x4 = *x_Ptr4++; - #endif - #if nISO>=6 - x5 = *x_Ptr5++; - #endif - #if nISO>=7 - x6 = *x_Ptr6++; - #endif - #if nISO>=8 - x7 = *x_Ptr7++; - #endif - #if nISO>=9 - x8 = *x_Ptr8++; - #endif - #if nISO>=10 - x9 = *x_Ptr9++; - #endif - #if nISO>=11 - x10 = *x_Ptr10++; - #endif - #if nISO>=12 - x11 = *x_Ptr11++; - #endif - #if nISO>=13 - x12 = *x_Ptr12++; - #endif - #if nISO>=14 - x13 = *x_Ptr13++; - #endif - #if nISO>=15 - x14 = *x_Ptr14++; - #endif - #if 
nISO>=16 - x15 = *x_Ptr15++; - #endif - #if nISO>=17 - x16 = *x_Ptr16++; - #endif - #if nISO>=18 - x17 = *x_Ptr17++; - #endif - #if nISO>=19 - x18 = *x_Ptr18++; - #endif - #if nISO>=20 - x19 = *x_Ptr19++; - #endif - - if ( - x0 != 0 - #if nISO>=2 - || x1 != 0 - #endif - #if nISO>=3 - || x2 != 0 - #endif - #if nISO>=4 - || x3 != 0 - #endif - #if nISO>=5 - || x4 != 0 - #endif - #if nISO>=6 - || x5 != 0 - #endif - #if nISO>=7 - || x6 != 0 - #endif - #if nISO>=8 - || x7 != 0 - #endif - #if nISO>=9 - || x8 != 0 - #endif - #if nISO>=10 - || x9 != 0 - #endif - #if nISO>=11 - || x10 != 0 - #endif - #if nISO>=12 - || x11 != 0 - #endif - #if nISO>=13 - || x12 != 0 - #endif - #if nISO>=14 - || x13 != 0 - #endif - #if nISO>=15 - || x14 != 0 - #endif - #if nISO>=16 - || x15 != 0 - #endif - #if nISO>=17 - || x16 != 0 - #endif - #if nISO>=18 - || x17 != 0 - #endif - #if nISO>=19 - || x18 != 0 - #endif - #if nISO>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nISO>=2 - + x1 * (*SFP1ptr++) - #endif - #if nISO>=3 - + x2 * (*SFP2ptr++) - #endif - #if nISO>=4 - + x3 * (*SFP3ptr++) - #endif - #if nISO>=5 - + x4 * (*SFP4ptr++) - #endif - #if nISO>=6 - + x5 * (*SFP5ptr++) - #endif - #if nISO>=7 - + x6 * (*SFP6ptr++) - #endif - #if nISO>=8 - + x7 * (*SFP7ptr++) - #endif - #if nISO>=9 - + x8 * (*SFP8ptr++) - #endif - #if nISO>=10 - + x9 * (*SFP9ptr++) - #endif - #if nISO>=11 - + x10 * (*SFP10ptr++) - #endif - #if nISO>=12 - + x11 * (*SFP11ptr++) - #endif - #if nISO>=13 - + x12 * (*SFP12ptr++) - #endif - #if nISO>=14 - + x13 * (*SFP13ptr++) - #endif - #if nISO>=15 - + x14 * (*SFP14ptr++) - #endif - #if nISO>=16 - + x15 * (*SFP15ptr++) - #endif - #if nISO>=17 - + x16 * (*SFP16ptr++) - #endif - #if nISO>=18 - + x17 * (*SFP17ptr++) - #endif - #if nISO>=19 - + x18 * (*SFP18ptr++) - #endif - #if nISO>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - 
wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + _ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + _ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreads = _ICthreads; - ECthreads = _ECthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // intra-cellular compartments - t_v = ICv; - t_vEnd = ICv + n; - t_o = 
ICo; - t_l = ICl; - t_f = ICf; - t_t = ICthreadsT; - - while( t_v != t_vEnd ) - { - // in this case, I need to walk throug because the segments are ordered in "voxel order" - if ( *t_t == id ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - - Y_tmp = *Yptr; - SFP0ptr = wmrSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nIC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - w = (double)(*t_l); - x[*t_f] += w * x0; - #if nIC>=2 - x[*t_f+nF] += w * x1; - #endif - #if nIC>=3 - x[*t_f+2*nF] += w * x2; - #endif - #if nIC>=4 - x[*t_f+3*nF] += w * x3; - #endif - #if nIC>=5 - x[*t_f+4*nF] += w * x4; - #endif - #if nIC>=6 - x[*t_f+5*nF] += w * x5; - #endif - #if nIC>=7 - x[*t_f+6*nF] += w * x6; - #endif - #if nIC>=8 - x[*t_f+7*nF] += w * x7; - #endif - #if nIC>=9 - x[*t_f+8*nF] += w * x8; - #endif - #if nIC>=10 - x[*t_f+9*nF] += w * x9; - #endif - #if nIC>=11 - x[*t_f+10*nF] += w 
* x10; - #endif - #if nIC>=12 - x[*t_f+11*nF] += w * x11; - #endif - #if nIC>=13 - x[*t_f+12*nF] += w * x12; - #endif - #if nIC>=14 - x[*t_f+13*nF] += w * x13; - #endif - #if nIC>=15 - x[*t_f+14*nF] += w * x14; - #endif - #if nIC>=16 - x[*t_f+15*nF] += w * x15; - #endif - #if nIC>=17 - x[*t_f+16*nF] += w * x16; - #endif - #if nIC>=18 - x[*t_f+17*nF] += w * x17; - #endif - #if nIC>=19 - x[*t_f+18*nF] += w * x18; - #endif - #if nIC>=20 - x[*t_f+19*nF] += w * x19; - #endif - } - - t_f++; - t_v++; - t_o++; - t_l++; - t_t++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreadsT[id]; - t_vEnd = ECv + ECthreadsT[id+1]; - t_o = ECo + ECthreadsT[id]; - - x_Ptr0 = x + nIC*nF + ECthreadsT[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - offset = nS * (*t_o++); - - Y_tmp = *Yptr; - SFP0ptr = wmhSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - x2 
+= (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - (*x_Ptr0++) += x0; - #if nEC>=2 - (*x_Ptr1++) += x1; - #endif - #if nEC>=3 - (*x_Ptr2++) += x2; - #endif - #if nEC>=4 - (*x_Ptr3++) += x3; - #endif - #if nEC>=5 - (*x_Ptr4++) += x4; - #endif - #if nEC>=6 - (*x_Ptr5++) += x5; - #endif - #if nEC>=7 - (*x_Ptr6++) += x6; - #endif - #if nEC>=8 - (*x_Ptr7++) += x7; - #endif - #if nEC>=9 - (*x_Ptr8++) += x8; - #endif - #if nEC>=10 - (*x_Ptr9++) += x9; - #endif - #if nEC>=11 - (*x_Ptr10++) += x10; - #endif - #if nEC>=12 - (*x_Ptr11++) += x11; - #endif - #if nEC>=13 - (*x_Ptr12++) += x12; - #endif - #if nEC>=14 - (*x_Ptr13++) += x13; - #endif - #if nEC>=15 - (*x_Ptr14++) += x14; - #endif - #if nEC>=16 - (*x_Ptr15++) += x15; - #endif - #if nEC>=17 - (*x_Ptr16++) += x16; - #endif - #if nEC>=18 - (*x_Ptr17++) += x17; - #endif - #if nEC>=19 - (*x_Ptr18++) += x18; - #endif - #if nEC>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; 
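For the transpose product the same segment lists are walked and the roles of x and Y are swapped: each IC segment scatters the dot product between its voxel's input signal and the rotated response back onto its fiber's coefficients, scaled by the segment length. A compact NumPy sketch of that intra-cellular adjoint accumulation, with toy sizes and a random lookup table standing in for the real SFPs:

    import numpy as np

    rng = np.random.default_rng(1)
    nF, nV, nS, ndirs, nIC = 3, 4, 5, 8, 2
    wmrSFP = rng.random((nIC, ndirs, nS))            # IC response LUT, one block per kernel
    Y = rng.random((nV, nS))                         # input vector, seen as one signal per voxel

    v      = np.array([0, 1, 1, 3])                  # per-segment voxel, fiber, orientation, length
    fiber  = np.array([0, 2, 1, 0])
    o      = np.array([5, 0, 7, 2])
    length = np.array([0.4, 1.0, 0.3, 0.8])

    x = np.zeros(nIC * nF)                           # output: one coefficient per fiber and IC kernel
    for s in range(len(v)):
        for k in range(nIC):
            x[fiber[s] + k * nF] += length[s] * np.dot(wmrSFP[k, o[s], :], Y[v[s], :])
    print(x.round(3))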
- #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - Y_tmp = *Yptr; - x0 = (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nISO>=7 - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nISO>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - (*x_Ptr0++) += x0; - #if nISO>=2 - (*x_Ptr1++) += x1; - #endif - #if nISO>=3 - (*x_Ptr2++) += x2; - #endif - #if nISO>=4 - (*x_Ptr3++) += x3; - #endif - #if nISO>=5 - (*x_Ptr4++) += x4; - #endif - #if nISO>=6 - (*x_Ptr5++) += x5; - #endif - #if nISO>=7 - (*x_Ptr6++) += x6; - #endif - #if nISO>=8 - (*x_Ptr7++) += x7; - #endif - #if nISO>=9 - (*x_Ptr8++) += x8; - #endif - #if nISO>=10 - (*x_Ptr9++) += x9; - #endif - #if nISO>=11 - (*x_Ptr10++) += x10; - #endif - #if nISO>=12 - (*x_Ptr11++) += x11; - #endif - #if nISO>=13 - (*x_Ptr12++) += x12; - #endif - #if nISO>=14 - (*x_Ptr13++) += x13; - #endif - #if nISO>=15 - (*x_Ptr14++) += x14; - #endif - #if nISO>=16 - (*x_Ptr15++) += x15; - #endif - #if nISO>=17 - (*x_Ptr16++) += x16; - #endif - #if nISO>=18 - (*x_Ptr17++) += x17; - #endif - #if nISO>=19 - (*x_Ptr18++) += x18; - #endif - #if nISO>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= 
-void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + _ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + _ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if 
nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreadsT = _ICthreadsT; - ECthreadsT = _ECthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<0 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n, nE, nV, nS, ndirs; +double *x, *Y; +uint32_t *ICthreads, *ECthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ECthreadsT, *ISOthreadsT; +uint32_t *ICf, *ICv, *ECv, *ISOv; +uint16_t *ICo, *ECo; +float *ICl; +float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; +float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; +float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; + + + +// ==================================================== +// Compute a sub-block of the A*x MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + int offset; + double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; + double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; + double *Yptr, *YptrEnd; + float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; + uint32_t *t_v, *t_vEnd, *t_f; + uint16_t *t_o; + float *t_l; + +#if nIC>=1 + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_o = ICo + ICthreads[id]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x_Ptr0 = x + *t_f; + x0 = *x_Ptr0; + #if nIC>=2 + x_Ptr1 = x_Ptr0 + nF; + x1 = *x_Ptr1; + #endif + #if nIC>=3 + x_Ptr2 = x_Ptr1 + nF; + x2 = *x_Ptr2; + #endif + #if nIC>=4 + x_Ptr3 = x_Ptr2 + nF; + x3 = *x_Ptr3; + #endif + #if nIC>=5 + x_Ptr4 = x_Ptr3 + nF; + x4 = *x_Ptr4; + #endif + #if nIC>=6 + x_Ptr5 = x_Ptr4 + nF; + x5 = *x_Ptr5; + #endif + #if nIC>=7 + x_Ptr6 = x_Ptr5 + nF; + x6 = *x_Ptr6; + #endif + #if nIC>=8 + x_Ptr7 = x_Ptr6 + nF; + x7 = *x_Ptr7; + #endif + #if nIC>=9 + x_Ptr8 = x_Ptr7 + nF; + x8 = *x_Ptr8; + #endif + #if nIC>=10 + x_Ptr9 = x_Ptr8 + nF; + x9 = *x_Ptr9; + #endif + #if nIC>=11 + x_Ptr10 = x_Ptr9 + nF; + x10 = *x_Ptr10; + #endif + #if nIC>=12 + x_Ptr11 = x_Ptr10 + nF; + x11 = *x_Ptr11; + #endif + #if nIC>=13 + x_Ptr12 = x_Ptr11 + nF; + x12 = *x_Ptr12; + #endif + #if 
nIC>=14 + x_Ptr13 = x_Ptr12 + nF; + x13 = *x_Ptr13; + #endif + #if nIC>=15 + x_Ptr14 = x_Ptr13 + nF; + x14 = *x_Ptr14; + #endif + #if nIC>=16 + x_Ptr15 = x_Ptr14 + nF; + x15 = *x_Ptr15; + #endif + #if nIC>=17 + x_Ptr16 = x_Ptr15 + nF; + x16 = *x_Ptr16; + #endif + #if nIC>=18 + x_Ptr17 = x_Ptr16 + nF; + x17 = *x_Ptr17; + #endif + #if nIC>=19 + x_Ptr18 = x_Ptr17 + nF; + x18 = *x_Ptr18; + #endif + #if nIC>=20 + x_Ptr19 = x_Ptr18 + nF; + x19 = *x_Ptr19; + #endif + + if ( x0 != 0 + #if nIC>=2 + || x1 != 0 + #endif + #if nIC>=3 + || x2 != 0 + #endif + #if nIC>=4 + || x3 != 0 + #endif + #if nIC>=5 + || x4 != 0 + #endif + #if nIC>=6 + || x5 != 0 + #endif + #if nIC>=7 + || x6 != 0 + #endif + #if nIC>=8 + || x7 != 0 + #endif + #if nIC>=9 + || x8 != 0 + #endif + #if nIC>=10 + || x9 != 0 + #endif + #if nIC>=11 + || x10 != 0 + #endif + #if nIC>=12 + || x11 != 0 + #endif + #if nIC>=13 + || x12 != 0 + #endif + #if nIC>=14 + || x13 != 0 + #endif + #if nIC>=15 + || x14 != 0 + #endif + #if nIC>=16 + || x15 != 0 + #endif + #if nIC>=17 + || x16 != 0 + #endif + #if nIC>=18 + || x17 != 0 + #endif + #if nIC>=19 + || x18 != 0 + #endif + #if nIC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + w = (double)(*t_l); + offset = nS * (*t_o); + SFP0ptr = wmrSFP0 + offset; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + #endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += w * ( + x0 * (*SFP0ptr++) + #if nIC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nIC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nIC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nIC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nIC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nIC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nIC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nIC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nIC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nIC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nIC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nIC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nIC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nIC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nIC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nIC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nIC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nIC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nIC>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + + t_f++; + t_v++; + t_o++; + t_l++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreads[id]; + t_vEnd = ECv + ECthreads[id+1]; + t_o = ECo + ECthreads[id]; + + x_Ptr0 = x + nIC*nF + ECthreads[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + 
x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nEC>=2 + x1 = *x_Ptr1++; + #endif + #if nEC>=3 + x2 = *x_Ptr2++; + #endif + #if nEC>=4 + x3 = *x_Ptr3++; + #endif + #if nEC>=5 + x4 = *x_Ptr4++; + #endif + #if nEC>=6 + x5 = *x_Ptr5++; + #endif + #if nEC>=7 + x6 = *x_Ptr6++; + #endif + #if nEC>=8 + x7 = *x_Ptr7++; + #endif + #if nEC>=9 + x8 = *x_Ptr8++; + #endif + #if nEC>=10 + x9 = *x_Ptr9++; + #endif + #if nEC>=11 + x10 = *x_Ptr10++; + #endif + #if nEC>=12 + x11 = *x_Ptr11++; + #endif + #if nEC>=13 + x12 = *x_Ptr12++; + #endif + #if nEC>=14 + x13 = *x_Ptr13++; + #endif + #if nEC>=15 + x14 = *x_Ptr14++; + #endif + #if nEC>=16 + x15 = *x_Ptr15++; + #endif + #if nEC>=17 + x16 = *x_Ptr16++; + #endif + #if nEC>=18 + x17 = *x_Ptr17++; + #endif + #if nEC>=19 + x18 = *x_Ptr18++; + #endif + #if nEC>=20 + x19 = *x_Ptr19++; + #endif + if ( + x0 != 0 + #if nEC>=2 + || x1 != 0 + #endif + #if nEC>=3 + || x2 != 0 + #endif + #if nEC>=4 + || x3 != 0 + #endif + #if nEC>=5 + || x4 != 0 + #endif + #if nEC>=6 + || x5 != 0 + #endif + #if nEC>=7 + || x6 != 0 + #endif + #if nEC>=8 + || x7 != 0 + #endif + #if nEC>=9 + || x8 != 0 + #endif + #if nEC>=10 + || x9 != 0 + #endif + #if nEC>=11 + || x10 != 0 + #endif + #if nEC>=12 + || x11 != 0 + #endif + #if nEC>=13 + || x12 != 0 + #endif + #if nEC>=14 + || x13 != 0 + #endif + #if nEC>=15 + || x14 != 0 + #endif + #if nEC>=16 + || x15 != 0 + #endif + #if nEC>=17 + || x16 != 0 + #endif + #if nEC>=18 + || x17 != 0 + #endif + #if nEC>=19 + || x18 != 0 + #endif + #if nEC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + SFP0ptr = wmhSFP0 + offset; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + #endif + #if nEC>=12 + SFP11ptr = wmhSFP11 + offset; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + 
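// accumulate this peak's extra-cellular contribution: for every sample of this
// voxel add sum_j xj * wmhSFPj, with the SFP pointers already offset to the
// LUT row matching the peak's orientation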
(*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nEC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nEC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nEC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nEC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nEC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nEC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nEC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nEC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nEC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nEC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nEC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nEC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nEC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nEC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nEC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nEC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nEC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nEC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nEC>=20 + + x19 * (*SFP19ptr++) + #endif + + ); + } + t_v++; + t_o++; + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + x_Ptr14 = x_Ptr13 + nV; + #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nISO>=2 + x1 = *x_Ptr1++; + #endif + #if nISO>=3 + x2 = *x_Ptr2++; + #endif + #if nISO>=4 + x3 = *x_Ptr3++; + #endif + #if nISO>=5 + x4 = *x_Ptr4++; + #endif + #if nISO>=6 + x5 = *x_Ptr5++; + #endif + #if nISO>=7 + x6 = *x_Ptr6++; + #endif + #if nISO>=8 + x7 = *x_Ptr7++; + #endif + #if nISO>=9 + x8 = *x_Ptr8++; + #endif + #if nISO>=10 + x9 = *x_Ptr9++; + #endif + #if nISO>=11 + x10 = *x_Ptr10++; + #endif + #if nISO>=12 + x11 = *x_Ptr11++; + #endif + #if nISO>=13 + x12 = *x_Ptr12++; + #endif + #if nISO>=14 + x13 = *x_Ptr13++; + #endif + #if nISO>=15 + x14 = *x_Ptr14++; + #endif + #if nISO>=16 + x15 = *x_Ptr15++; + #endif + #if nISO>=17 + x16 = *x_Ptr16++; + #endif + #if nISO>=18 + x17 = *x_Ptr17++; + #endif + #if nISO>=19 + x18 = *x_Ptr18++; + #endif + #if nISO>=20 + x19 = *x_Ptr19++; + #endif + + if ( + x0 != 0 + #if nISO>=2 + || x1 != 0 + #endif + #if nISO>=3 + || x2 != 0 + #endif + #if nISO>=4 + || x3 != 0 + #endif + #if nISO>=5 + || x4 != 0 + #endif + #if nISO>=6 + || x5 != 0 + #endif + #if nISO>=7 + || x6 != 0 + #endif + #if nISO>=8 + || x7 != 0 + #endif + #if nISO>=9 + || x8 != 0 + #endif + #if nISO>=10 + || x9 != 0 + #endif + #if nISO>=11 + || x10 != 0 + #endif + #if nISO>=12 + || x11 != 0 + #endif + #if nISO>=13 + || x12 != 0 + #endif + #if nISO>=14 + || x13 != 0 + #endif + #if nISO>=15 + || x14 != 0 + #endif + #if nISO>=16 + || x15 != 0 + #endif + #if nISO>=17 + || x16 != 0 + #endif + #if nISO>=18 + || x17 != 0 + #endif + #if nISO>=19 + || 
x18 != 0 + #endif + #if nISO>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nISO>=2 + + x1 * (*SFP1ptr++) + #endif + #if nISO>=3 + + x2 * (*SFP2ptr++) + #endif + #if nISO>=4 + + x3 * (*SFP3ptr++) + #endif + #if nISO>=5 + + x4 * (*SFP4ptr++) + #endif + #if nISO>=6 + + x5 * (*SFP5ptr++) + #endif + #if nISO>=7 + + x6 * (*SFP6ptr++) + #endif + #if nISO>=8 + + x7 * (*SFP7ptr++) + #endif + #if nISO>=9 + + x8 * (*SFP8ptr++) + #endif + #if nISO>=10 + + x9 * (*SFP9ptr++) + #endif + #if nISO>=11 + + x10 * (*SFP10ptr++) + #endif + #if nISO>=12 + + x11 * (*SFP11ptr++) + #endif + #if nISO>=13 + + x12 * (*SFP12ptr++) + #endif + #if nISO>=14 + + x13 * (*SFP13ptr++) + #endif + #if nISO>=15 + + x14 * (*SFP14ptr++) + #endif + #if nISO>=16 + + x15 * (*SFP15ptr++) + #endif + #if nISO>=17 + + x16 * (*SFP16ptr++) + #endif + #if nISO>=18 + + x17 * (*SFP17ptr++) + #endif + #if nISO>=19 + + x18 * (*SFP18ptr++) + #endif + #if nISO>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + 
_ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreads = _ICthreads; + ECthreads = _ECthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // intra-cellular compartments + t_v = ICv; + t_vEnd = ICv + n; + t_o = ICo; + t_l = ICl; + t_f = ICf; + t_t = ICthreadsT; + + while( t_v != t_vEnd ) + { + // in this case, I need to walk throug because the segments are ordered in "voxel order" + if ( *t_t == id ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + + Y_tmp = *Yptr; + SFP0ptr = wmrSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + x7 
= (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nIC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + w = (double)(*t_l); + x[*t_f] += w * x0; + #if nIC>=2 + x[*t_f+nF] += w * x1; + #endif + #if nIC>=3 + x[*t_f+2*nF] += w * x2; + #endif + #if nIC>=4 + x[*t_f+3*nF] += w * x3; + #endif + #if nIC>=5 + x[*t_f+4*nF] += w * x4; + #endif + #if nIC>=6 + x[*t_f+5*nF] += w * x5; + #endif + #if nIC>=7 + x[*t_f+6*nF] += w * x6; + #endif + #if nIC>=8 + x[*t_f+7*nF] += w * x7; + #endif + #if nIC>=9 + x[*t_f+8*nF] += w * x8; + #endif + #if nIC>=10 + x[*t_f+9*nF] += w * x9; + #endif + #if nIC>=11 + x[*t_f+10*nF] += w * x10; + #endif + #if nIC>=12 + x[*t_f+11*nF] += w * x11; + #endif + #if nIC>=13 + x[*t_f+12*nF] += w * x12; + #endif + #if nIC>=14 + x[*t_f+13*nF] += w * x13; + #endif + #if nIC>=15 + x[*t_f+14*nF] += w * x14; + #endif + #if nIC>=16 + x[*t_f+15*nF] += w * x15; + #endif + #if nIC>=17 + x[*t_f+16*nF] += w * x16; + #endif + #if nIC>=18 + x[*t_f+17*nF] += w * x17; + #endif + #if nIC>=19 + x[*t_f+18*nF] += w * x18; + #endif + #if nIC>=20 + x[*t_f+19*nF] += w * x19; + #endif + } + + t_f++; + t_v++; + t_o++; + t_l++; + t_t++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreadsT[id]; + t_vEnd = ECv + ECthreadsT[id+1]; + t_o = ECo + ECthreadsT[id]; + + x_Ptr0 = x + nIC*nF + ECthreadsT[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 
= x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + offset = nS * (*t_o++); + + Y_tmp = *Yptr; + SFP0ptr = wmhSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + SFP11ptr = wmhSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + x18 += (*SFP18ptr++) * 
Y_tmp; + #endif + #if nEC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + (*x_Ptr0++) += x0; + #if nEC>=2 + (*x_Ptr1++) += x1; + #endif + #if nEC>=3 + (*x_Ptr2++) += x2; + #endif + #if nEC>=4 + (*x_Ptr3++) += x3; + #endif + #if nEC>=5 + (*x_Ptr4++) += x4; + #endif + #if nEC>=6 + (*x_Ptr5++) += x5; + #endif + #if nEC>=7 + (*x_Ptr6++) += x6; + #endif + #if nEC>=8 + (*x_Ptr7++) += x7; + #endif + #if nEC>=9 + (*x_Ptr8++) += x8; + #endif + #if nEC>=10 + (*x_Ptr9++) += x9; + #endif + #if nEC>=11 + (*x_Ptr10++) += x10; + #endif + #if nEC>=12 + (*x_Ptr11++) += x11; + #endif + #if nEC>=13 + (*x_Ptr12++) += x12; + #endif + #if nEC>=14 + (*x_Ptr13++) += x13; + #endif + #if nEC>=15 + (*x_Ptr14++) += x14; + #endif + #if nEC>=16 + (*x_Ptr15++) += x15; + #endif + #if nEC>=17 + (*x_Ptr16++) += x16; + #endif + #if nEC>=18 + (*x_Ptr17++) += x17; + #endif + #if nEC>=19 + (*x_Ptr18++) += x18; + #endif + #if nEC>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + x_Ptr14 = x_Ptr13 + nV; + #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + Y_tmp = *Yptr; + x0 = (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 = 
(*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + (*x_Ptr0++) += x0; + #if nISO>=2 + (*x_Ptr1++) += x1; + #endif + #if nISO>=3 + (*x_Ptr2++) += x2; + #endif + #if nISO>=4 + (*x_Ptr3++) += x3; + #endif + #if nISO>=5 + (*x_Ptr4++) += x4; + #endif + #if nISO>=6 + (*x_Ptr5++) += x5; + #endif + #if nISO>=7 + (*x_Ptr6++) += x6; + #endif + #if nISO>=8 + (*x_Ptr7++) += x7; + #endif + #if nISO>=9 + (*x_Ptr8++) += x8; + #endif + #if nISO>=10 + (*x_Ptr9++) += x9; + #endif + #if nISO>=11 + (*x_Ptr10++) += x10; + #endif + #if nISO>=12 + (*x_Ptr11++) += x11; + #endif + #if nISO>=13 + (*x_Ptr12++) += x12; + #endif + #if nISO>=14 + (*x_Ptr13++) += x13; + #endif + #if nISO>=15 + (*x_Ptr14++) += x14; + #endif + #if nISO>=16 + (*x_Ptr15++) += x15; + #endif + #if nISO>=17 + (*x_Ptr16++) += x16; + #endif + #if nISO>=18 + (*x_Ptr17++) += x17; + #endif + #if nISO>=19 + (*x_Ptr18++) += x18; + #endif + #if nISO>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if 
nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreadsT = _ICthreadsT; + ECthreadsT = _ECthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t tex_lutIC; -texture tex_lutEC; -texture tex_lutISO; - -bool cudaCheck(cudaError_t cudaStatus){ - return cudaStatus == cudaSuccess; -} - -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ - - // fill arrays with zeros - memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - 
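// counting pass + exclusive prefix sum: compartmentsPerBlock[b] ends up with the
// number of entries of 'data' that fall in block b, and offsetPerBlock[b] with the
// start of that block's slice, so each CUDA block can locate the segments of its
// own voxel (or fiber) in the flat arrays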
memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - - // count compartments per block - for(int i = 0; i < NUM_COMPARTMENTS; i++) - compartmentsPerBlock[data[i]]++; - - // calculate offset per block - offsetPerBlock[0] = 0; - for(int i = 1; i < NUM_BLOCKS; i++) - offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; -} - -bool checkCompatibility(size_t required_mem, int gpu_id) { - int num_gpus; - cudaError_t cudaStatus; - - cudaStatus = cudaGetDeviceCount(&num_gpus); - - if (num_gpus <= 0 || num_gpus <= gpu_id) { - printf("\t* the selected GPU does not exist or is not detected \n"); - return false; - } - - if(cudaStatus == cudaSuccess){ - cudaDeviceProp gpu_properties; - cudaGetDeviceProperties(&gpu_properties, gpu_id); - - printf("\t* checking availability of CUDA ... [ OK ]\n"); - printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); - printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name); - - if (required_mem <= gpu_properties.totalGlobalMem) { - printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - else { - printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); - } - - if(gpu_properties.major >= 5){ - printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); - } - else{ - printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); - return false; - } - - return true; - } - else{ - printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); - return false; - } -} - -CudaLinearOperator::CudaLinearOperator( - // pointers to IC data in CPU memory - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float* lengthIC, - float* lutIC, - // pointers to EC data in CPU memory - uint32_t* voxelEC, - uint16_t* orienEC, - float* lutEC, - // pointer to ISO data in CPU memory - float* lutISO, - // dataset constant values - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int ndiameters, - int nzeppelins, - int nballs, - - int fcall) -{ - this->nsegments = nsegments; - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; - - if (fcall == 1) { - int size_lutic = ndiameters*norientations*nsamples; - int size_lutec = nzeppelins*norientations*nsamples; - int size_lutiso = nballs*nsamples; - - size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); - checkCompatibility(required_mem, 0); - - // transfer constant values to the GPU - printf("\t* constant values ... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc memory in GPU for vectors x and y - printf("\t* vectors x&y ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // pre-process data for GPU - printf("\t* pre-processing ... "); - cudaStatus = true; - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(segmentsPerBlock); - free(offsetPerBlock); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc and transfer LUTs - printf("\t* loading LUTs ... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // configure texture for LUTs - tex_lutIC.addressMode[0] = cudaAddressModeBorder; - tex_lutIC.addressMode[1] = cudaAddressModeBorder; - tex_lutIC.filterMode = cudaFilterModePoint; - tex_lutIC.normalized = false; - - tex_lutEC.addressMode[0] = cudaAddressModeBorder; - tex_lutEC.addressMode[1] = cudaAddressModeBorder; - tex_lutEC.filterMode = cudaFilterModePoint; - tex_lutEC.normalized = false; - - tex_lutISO.addressMode[0] = cudaAddressModeBorder; - tex_lutISO.addressMode[1] = cudaAddressModeBorder; - tex_lutISO.filterMode = cudaFilterModePoint; - tex_lutISO.normalized = false; - - printf("\t* linking LUTs to a texture memory ... "); - cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic * sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec * sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso * sizeof(float32_t)) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - // alloc and transfer operator A - printf("\t* A operator... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - } - -} - -CudaLinearOperator::~CudaLinearOperator() {} - -void CudaLinearOperator::destroy(){ - bool cudaStatus; - - printf("\n-> Deleting GPU memory:\n"); - - printf("\t* deleting A... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* deleting A'... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TorienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TlengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* deleting x&y... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* deleting LUT... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); - cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutISO) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - - printf("\t* reseting GPU... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); -} - -void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths) -{ - printf("\t* A' operator... "); - cudaStatus = true; - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - if(fibersPerBlock == NULL || offsetPerBlock == NULL) printf("problemas\n"); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(fibersPerBlock); - free(offsetPerBlock); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); -} - -void cudaCheckKernel(){ - cudaError_t cudaStatus; - - cudaStatus = cudaGetLastError(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); - else - printf("\t* kernel launch... [ OK ]\n"); - - cudaStatus = cudaDeviceSynchronize(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); - else - printf("\t* cudaDeviceSynchronize() after launching kernel... 
[ OK ]\n");
-}
-
-void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){
-    //cudaError_t cudaStatus;
-
-    // Copy vector x to the GPU
-    cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice);
-    /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
-    else printf("\t* tranfering x to GPU ... [ OK ]\n");//*/
-
-    // Multiply IC part in the GPU
-    multiply_Ax_ICpart<<<nvoxels, 1024>>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
-
-    //cudaCheckKernel();
-
-    // Multiply EC part in the GPU
-    multiply_Ax_ECpart<<<nvoxels, 512>>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y);
-
-    //cudaCheckKernel();
-
-    // Multiply ISO part in the GPU
-    multiply_Ax_ISOpart<<<nvoxels, 512>>>(gpu_lutISO, gpu_x, gpu_y);
-
-    //cudaCheckKernel();
-
-    // Copy back result to CPU
-    cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost);
-    /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
-    else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/
-}
-
-void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){
-
-    //cudaError_t cudaStatus;
-    // Copy vector y to the GPU
-    //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) );
-    //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) );
-    cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice);
-    /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
-    else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/
-
-    // Multiply IC part in the GPU
-    multiply_Aty_ICpart<<<nfibers, 512>>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
-
-    //cudaCheckKernel();
-
-    // Multiply EC part in the GPU
-    multiply_Aty_ECpart<<<nvoxels, 512>>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y);
-
-    //cudaCheckKernel();
-
-    // Multiply ISO part in the GPU
-    multiply_Aty_ISOpart<<<nvoxels, 512>>>(gpu_lutISO, gpu_x, gpu_y);
-
-    //cudaCheckKernel();
-
-    // Copy back result to CPU
-    cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost);
-    /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
-    else printf("\t* tranfering x to CPU ... 
[ OK ]\n");//*/ - - /*printf("\n\n VECTOR X EC PART:\n"); - for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) - printf("%lf ", x[i]); - printf("\n\n");//*/ -} - -// ------------------------------------------------------- KERNELS ------------------------------------------------------- // -__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[1024]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - uint32_t gid = threadIdx.x / 512; - uint32_t sid = threadIdx.x - 512*gid; - - shmem[tid] = 0.0; - - if(sid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; - uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; - - uint32_t* voxel = voxelIDs + offset; - uint32_t* fiber = fiberIDs + offset; - uint16_t* orien = orienIDs + offset; - float32_t* length = lengths + offset; - - float64_t sum = 0.0; - - for(int i = 0; i < nsegments; i++){ - int offset_lut = (*orien)*NUM_SAMPLES + sid; - - float64_t aux = 0.0; - for(int j = 0; j < NUM_DIAMETERS; j++){ - aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; - //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; - } - - sum += aux * (*length); - - fiber++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < NUM_SAMPLES) - y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; -} - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = segmentsPerBlock[bid]; - - uint32_t* voxel = voxelIDs + offset; - uint16_t* orien = orienIDs + offset; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; - - float64_t sum = 0.0; - for(int i = 0; i < nsegments; i++){ - uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; - - for(int j = 0; j < NUM_ZEPPELINS; j++) - sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; - //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; - - orien++; - } - - y[(*voxel)*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - float64_t sum = 0.0; - for(int j = 0; j < NUM_BALLS; j++) - sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; - //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; - - - y[bid*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Aty_ICpart( - uint32_t* voxelICt, - uint32_t* fiberICt, - uint16_t* orienICt, - float32_t* lengthICt, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t 
offset = offsetPerBlock[bid]; - uint32_t nsegments = offset + compartmentsPerBlock[bid]; - - uint32_t* voxel = voxelICt + offset; - uint32_t* fiber = fiberICt + offset; - uint16_t* orien = orienICt + offset; - float32_t* length = lengthICt + offset; - - for(int j = 0; j < NUM_DIAMETERS; j++){ - int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - float64_t sum = 0.0; - voxel = voxelICt + offset; - orien = orienICt + offset; - length = lengthICt + offset; - for(int i = offset; i < nsegments; i++){ - sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; - //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; - - voxel++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - - if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - - __syncthreads(); - } -} - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t ncompartments = segmentsPerBlock[bid] + offset; - - uint32_t* voxel = voxelEC + offset; - uint16_t* orien = orienEC + offset; - - for(int j = 0; j < NUM_ZEPPELINS; j++){ - uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - voxel = voxelEC + offset; - orien = orienEC + offset; - for(int i = offset; i < ncompartments; i++){ - shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); - - if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; - - voxel++; - orien++; - __syncthreads(); - } - } -} - -__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ - __shared__ double shmem[512]; - - uint bid = blockIdx.x; - uint tid = threadIdx.x; - uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - for(int j = 0; j < NUM_BALLS; j++){ - shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; - 
__syncthreads();
-
-        if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads();
-        if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads();
-        if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads();
-        if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads();
-        if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads();
-        if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads();
-        if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads();
-
-        if(tid == 0)
-            x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3];
-    }
-}
-
+#include "operator_withCUDA.cuh"
+
+// textures in GPU
+texture<float32_t, 1, cudaReadModeElementType> tex_lutIC;
+texture<float32_t, 1, cudaReadModeElementType> tex_lutEC;
+texture<float32_t, 1, cudaReadModeElementType> tex_lutISO;
+
+bool cudaCheck(cudaError_t cudaStatus){
+    return cudaStatus == cudaSuccess;
+}
+
+bool checkCompatibility(size_t required_mem, int gpu_id) {
+    int num_gpus;
+    cudaError_t cudaStatus;
+
+    cudaStatus = cudaGetDeviceCount(&num_gpus);
+
+    if (num_gpus <= 0 || num_gpus <= gpu_id) {
+        printf("\t* the selected GPU does not exist or it is not detected \n");
+        return false;
+    }
+
+    if(cudaStatus == cudaSuccess){
+        cudaDeviceProp gpu_properties;
+        cudaGetDeviceProperties(&gpu_properties, gpu_id);
+
+        printf("\t* checking availability of CUDA ... [ OK ]\n");
+        printf("\t* number of CUDA GPUs detected: %d\n", num_gpus);
+        printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name);
+
+        if (required_mem <= gpu_properties.totalGlobalMem) {
+            printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9);
+        }
+        else {
+            printf("\t* using %.2f GB of total %.2f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9);
+            return false;
+        }
+
+        if(gpu_properties.major >= 5){
+            printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor);
+        }
+        else{
+            printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor);
+            return false;
+        }
+
+        return true;
+    }
+    else{
+        printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n");
+        return false;
+    }
+}
+
+void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){
+
+    // fill arrays with zeros
+    memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t));
+    memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t));
+
+    // count compartments per block
+    for(int i = 0; i < NUM_COMPARTMENTS; i++)
+        compartmentsPerBlock[data[i]]++;
+
+    // calculate offset per block
+    offsetPerBlock[0] = 0;
+    for(int i = 1; i < NUM_BLOCKS; i++)
+        offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1];
+}
+
+CudaLinearOperator::CudaLinearOperator(
+    // pointers to IC data in CPU memory
+    uint32_t* voxelIC,
+    uint32_t* fiberIC,
+    uint16_t* orienIC,
+    float* lengthIC,
+    float* lutIC,
+    // pointers to EC data in CPU memory
+    uint32_t* voxelEC,
+    uint16_t* orienEC,
+    float* lutEC,
+    // pointer to ISO data in CPU memory
+    float* lutISO,
+    // dataset constant values
+    int nsegments,
+    int nvoxels,
+    int nfibers,
+    int npeaks,
+    int norientations,
+    int nsamples,
+    int ndiameters,
+    int nzeppelins,
+    int nballs,
+    // flag to ensure we create the operator only one time
+    int fcall,
+    // id of the selected CUDA gpu
+    int gpu_id)
+{
+    this->nsegments = nsegments;
+    this->nvoxels = nvoxels;
+    this->nfibers = nfibers;
+    this->nrows = nvoxels * nsamples;
+    // columns of A: [ IC: nfibers*ndiameters | EC: npeaks*nzeppelins | ISO: nvoxels*nballs ]
+    this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs;
+
+    if (fcall == 1) {
+        int size_lutic = ndiameters*norientations*nsamples;
+        int size_lutec = nzeppelins*norientations*nsamples;
+        int size_lutiso = nballs*nsamples;
+
+        // rough estimate (in bytes) of the GPU memory needed for the dictionary, the LUTs and the x/y vectors
+        size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)npeaks + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols);
+        checkCompatibility(required_mem, gpu_id);
+
+        // transfer constant values to the GPU
+        printf("\t* constant values ... ");
+        cudaStatus = true;
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) );
+        cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) );
+        if (cudaStatus) printf("[ OK ]\n");
+        else printf("[ CUDA ERROR ]\n");
+
+        // alloc memory in GPU for vectors x and y
+        printf("\t* vectors x&y ... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // pre-process data for GPU + printf("\t* pre-processing ... "); + cudaStatus = true; + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + if (npeaks > 0){ + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + } + + free(segmentsPerBlock); + free(offsetPerBlock); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + // alloc and transfer LUTs + printf("\t* loading LUTs ... 
"); + cudaStatus = true; + + if (ndiameters > 0){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + tex_lutIC.addressMode[0] = cudaAddressModeBorder; + tex_lutIC.addressMode[1] = cudaAddressModeBorder; + tex_lutIC.filterMode = cudaFilterModePoint; + tex_lutIC.normalized = false; + + cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic * sizeof(float32_t)) ); + } + + if (nzeppelins > 0){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + tex_lutEC.addressMode[0] = cudaAddressModeBorder; + tex_lutEC.addressMode[1] = cudaAddressModeBorder; + tex_lutEC.filterMode = cudaFilterModePoint; + tex_lutEC.normalized = false; + + cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec * sizeof(float32_t)) ); + } + + if (nballs > 0){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); + + tex_lutISO.addressMode[0] = cudaAddressModeBorder; + tex_lutISO.addressMode[1] = cudaAddressModeBorder; + tex_lutISO.filterMode = cudaFilterModePoint; + tex_lutISO.normalized = false; + + cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso * sizeof(float32_t)) ); + } + + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + + // alloc and transfer operator A + printf("\t* A operator... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); + if (npeaks > 0){ + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); + } + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (npeaks > 0){ + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + } + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + } + +} + +CudaLinearOperator::~CudaLinearOperator() {} + +void CudaLinearOperator::destroy(){ + bool cudaStatus; + + printf("\n-> Deleting GPU memory:\n"); + + printf("\t* deleting A... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting A'... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TorienIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TlengthIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting x&y... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* deleting LUT... 
"); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); + cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutIC) ); + cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutEC) ); + cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutISO) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); + + printf("\t* reseting GPU... "); + cudaStatus = true; + cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); +} + +void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths) +{ + printf("\t* A' operator... "); + cudaStatus = true; + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + + if(fibersPerBlock == NULL || offsetPerBlock == NULL) printf("problemas\n"); + + preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + + free(fibersPerBlock); + free(offsetPerBlock); + + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); + cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); + + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); + cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); + if (cudaStatus) printf("[ OK ]\n"); + else printf("[ CUDA ERROR ]\n"); +} + +void cudaCheckKernel(){ + cudaError_t cudaStatus; + + cudaStatus = cudaGetLastError(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); + else + printf("\t* kernel launch... [ OK ]\n"); + + cudaStatus = cudaDeviceSynchronize(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); + else + printf("\t* cudaDeviceSynchronize() after launching kernel... 
[ OK ]\n");
+}
+
+void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){
+    //cudaError_t cudaStatus;
+
+    // Copy vector x to the GPU
+    cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice);
+    /*if (cudaStatus != cudaSuccess) printf("\t* transferring x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
+    else printf("\t* transferring x to GPU ... [ OK ]\n");//*/
+
+    // Multiply IC part in the GPU (one block per voxel, 1024 threads)
+    multiply_Ax_ICpart<<<nvoxels, 1024>>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
+
+    //cudaCheckKernel();
+
+    // Multiply EC part in the GPU (one block per voxel, 512 threads)
+    multiply_Ax_ECpart<<<nvoxels, 512>>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y);
+
+    //cudaCheckKernel();
+
+    // Multiply ISO part in the GPU (one block per voxel, 512 threads)
+    multiply_Ax_ISOpart<<<nvoxels, 512>>>(gpu_lutISO, gpu_x, gpu_y);
+
+    //cudaCheckKernel();
+
+    // Copy back result to CPU
+    cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost);
+    /*if (cudaStatus != cudaSuccess) printf("\t* transferring y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
+    else printf("\t* transferring y to CPU ... [ OK ]\n");//*/
+}
+
+void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){
+
+    //cudaError_t cudaStatus;
+    // Copy vector y to the GPU
+    //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) );
+    //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) );
+    cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice);
+    /*if (cudaStatus != cudaSuccess) printf("\t* transferring y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
+    else printf("\t* transferring y to GPU ... [ OK ]\n");//*/
+
+    // Multiply IC part in the GPU (one block per fiber, 512 threads)
+    multiply_Aty_ICpart<<<nfibers, 512>>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
+
+    //cudaCheckKernel();
+
+    // Multiply EC part in the GPU (one block per voxel, 512 threads)
+    multiply_Aty_ECpart<<<nvoxels, 512>>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y);
+
+    //cudaCheckKernel();
+
+    // Multiply ISO part in the GPU (one block per voxel, 512 threads)
+    multiply_Aty_ISOpart<<<nvoxels, 512>>>(gpu_lutISO, gpu_x, gpu_y);
+
+    //cudaCheckKernel();
+
+    // Copy back result to CPU
+    cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost);
+    /*if (cudaStatus != cudaSuccess) printf("\t* transferring x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus));
+    else printf("\t* transferring x to CPU ... 
[ OK ]\n");//*/ + + /*printf("\n\n VECTOR X EC PART:\n"); + for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) + printf("%lf ", x[i]); + printf("\n\n");//*/ +} + +// ------------------------------------------------------- KERNELS ------------------------------------------------------- // +__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[1024]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + uint32_t gid = threadIdx.x / 512; + uint32_t sid = threadIdx.x - 512*gid; + + shmem[tid] = 0.0; + + if(sid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; + uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; + + uint32_t* voxel = voxelIDs + offset; + uint32_t* fiber = fiberIDs + offset; + uint16_t* orien = orienIDs + offset; + float32_t* length = lengths + offset; + + float64_t sum = 0.0; + + for(int i = 0; i < nsegments; i++){ + int offset_lut = (*orien)*NUM_SAMPLES + sid; + + float64_t aux = 0.0; + for(int j = 0; j < NUM_DIAMETERS; j++){ + aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; + //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; + } + + sum += aux * (*length); + + fiber++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < NUM_SAMPLES) + y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; +} + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = segmentsPerBlock[bid]; + + uint32_t* voxel = voxelIDs + offset; + uint16_t* orien = orienIDs + offset; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; + + float64_t sum = 0.0; + for(int i = 0; i < nsegments; i++){ + uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; + + for(int j = 0; j < NUM_ZEPPELINS; j++) + sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; + + orien++; + } + + y[(*voxel)*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + float64_t sum = 0.0; + for(int j = 0; j < NUM_BALLS; j++) + sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; + + + y[bid*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Aty_ICpart( + uint32_t* voxelICt, + uint32_t* fiberICt, + uint16_t* orienICt, + float32_t* lengthICt, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t 
offset = offsetPerBlock[bid]; + uint32_t nsegments = offset + compartmentsPerBlock[bid]; + + uint32_t* voxel = voxelICt + offset; + uint32_t* fiber = fiberICt + offset; + uint16_t* orien = orienICt + offset; + float32_t* length = lengthICt + offset; + + for(int j = 0; j < NUM_DIAMETERS; j++){ + int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + float64_t sum = 0.0; + voxel = voxelICt + offset; + orien = orienICt + offset; + length = lengthICt + offset; + for(int i = offset; i < nsegments; i++){ + sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; + + voxel++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + + __syncthreads(); + } +} + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t ncompartments = segmentsPerBlock[bid] + offset; + + uint32_t* voxel = voxelEC + offset; + uint16_t* orien = orienEC + offset; + + for(int j = 0; j < NUM_ZEPPELINS; j++){ + uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + voxel = voxelEC + offset; + orien = orienEC + offset; + for(int i = offset; i < ncompartments; i++){ + shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); + + if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; + + voxel++; + orien++; + __syncthreads(); + } + } +} + +__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ + __shared__ double shmem[512]; + + uint bid = blockIdx.x; + uint tid = threadIdx.x; + uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + for(int j = 0; j < NUM_BALLS; j++){ + shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; + 
__syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) + x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + } +} + diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 231a4f77..faa8b6f8 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -14,10 +14,20 @@ typedef unsigned short int uint16_t; typedef float float32_t; typedef double float64_t; +// ==================================================== +// Util functions to check CUDA GPU compatibility +// ==================================================== bool cudaCheck(cudaError_t cudaStatus); -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); bool checkCompatibility(size_t required_mem, int gpu_id); +// ==================================================== +// Function to preprocess data for GPU +// ==================================================== +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +// ==================================================== +// CUDA Kernels for Ax operation +// ==================================================== __global__ void multiply_Ax_ICpart( uint32_t* voxelIDs, uint32_t* fiberIDs, @@ -43,6 +53,9 @@ __global__ void multiply_Ax_ISOpart( float64_t* x, float64_t* y); +// ==================================================== +// CUDA Kernels for A'y operation +// ==================================================== __global__ void multiply_Aty_ICpart( uint32_t* TvoxelIC, uint32_t* TfiberIC, @@ -68,7 +81,9 @@ __global__ void multiply_Aty_ISOpart( double* x, double* y); -// constant values in GPU +// ==================================================== +// Constant global values in the GPU +// ==================================================== __constant__ int NUM_VOXELS; __constant__ int NUM_FIBERS; __constant__ int NUM_PEAKS; @@ -83,7 +98,9 @@ __constant__ int SIZE_LUTIC; __constant__ int SIZE_LUTEC; __constant__ int SIZE_LUTISO; -// pointers to IC data in GPU memory +// ==================================================== +// Pointers to A (IC part) in the GPU +// ==================================================== static uint32_t* gpu_voxelIC; static uint32_t* gpu_fiberIC; static uint16_t* gpu_orienIC; @@ -91,7 +108,9 @@ static float32_t* gpu_lengthIC; static uint32_t* gpu_segmentsPerBlockIC; static uint32_t* gpu_offsetPerBlockIC; -// pointers to IC data (transpose) in GPU memory +// ==================================================== +// Pointers to A' (IC part) in the GPU +// ==================================================== static uint32_t* gpu_TvoxelIC; static uint32_t* gpu_TfiberIC; static uint16_t* gpu_TorienIC; @@ -99,21 +118,30 @@ static float32_t* gpu_TlengthIC; static uint32_t* gpu_TfibersPerBlockIC; static uint32_t* gpu_ToffsetPerBlockIC; -// pointers to EC data in GPU memory +// ==================================================== +// Pointers to A (EC part) in the GPU +// 
==================================================== static uint32_t* gpu_voxelEC; static uint16_t* gpu_orienEC; static uint32_t* gpu_segmentsPerBlockEC; static uint32_t* gpu_offsetPerBlockEC; -// pointers to LUTs in GPU memory +// ==================================================== +// Pointers to LUTs in the GPU +// ==================================================== static float32_t* gpu_lutIC; static float32_t* gpu_lutEC; static float32_t* gpu_lutISO; -// pointers to vector x and y +// ==================================================== +// Pointers to x and y in the GPU +// ==================================================== static float64_t* gpu_x; static float64_t* gpu_y; +// ============================================================================ +// This class creates an instance of the LinearOperator in GPU memory +// ============================================================================ class CudaLinearOperator { // constant values in CPU @@ -128,29 +156,32 @@ class CudaLinearOperator { public: CudaLinearOperator( + // pointers to IC data in CPU memory uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float* lengthIC, float* lutIC, - + // pointers to EC data in CPU memory uint32_t* voxelEC, uint16_t* orienEC, float* lutEC, - + // pointer to ISO data in CPU memory float* lutISO, - + // dataset constant values int nsegments, int nvoxels, int nfibers, - int npeaks, + int nzeppelins, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs, - - int fcall); + // flag to ensure we create the operator only one time + int fcall, + // id of the selected CUDA gpu + int gpu_id); ~CudaLinearOperator(); diff --git a/commit/solvers.py b/commit/solvers.py index 8e86d5c5..ce4325fb 100755 --- a/commit/solvers.py +++ b/commit/solvers.py @@ -1,403 +1,403 @@ -""" -Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona - -This structure is based on the previous work of Rafael Carrillo and was -supported by the LTS5 laboratory at EPFL, Lausanne. -""" -from __future__ import print_function -import numpy as np -from math import sqrt -import sys -import warnings -eps = np.finfo(float).eps - -from commit.proximals import (non_negativity, - omega_group_sparsity, - prox_group_sparsity, - soft_thresholding, - projection_onto_l2_ball) -group_sparsity = -1 -non_negative = 0 -norm1 = 1 -norm2 = 2 -norminf = np.inf -list_regnorms = [group_sparsity, non_negative, norm1, norm2] -list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 - - -def init_regularisation(commit_evaluation, - regnorms = (non_negative, non_negative, non_negative), - structureIC = None, weightsIC = None, group_norm = 2, - lambdas = (.0,.0,.0) ): - """ - Initialise the data structure that defines Omega in - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - - Input - ----- - commit_evaluation - commit.Evaluation object : - dictionary and model have to be loaded beforehand. - - - regnorms - tuple : - this sets the penalty term to be used for each compartment. - Default = (non_negative,non_negative,non_negative). - - regnorms[0] corresponds to the Intracellular compartment - regnorms[1] corresponds to the Extracellular compartment - regnorms[2] corresponds to the Isotropic compartment - - Each regnorms[k] must be one of commit.solvers. - {group_sparsity, non_negative, norm1, norm2}. - - commit.solvers.group_sparsity considers both the non-overlapping - and the hierarchical group sparsity (see [1]). This option is - allowed only in the IC compartment. 
The mathematical formulation - of this term is - $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| - - commit.solvers.non_negative puts a non negativity constraint on the - coefficients corresponding to the compartment. This is the - default option for each compartment - - commit.solvers.norm1 penalises with the 1-norm of the coefficients - corresponding to the compartment. - - commit.solvers.norm2 penalises with the 2-norm of the coefficients - corresponding to the compartment. - - - structureIC - np.array(list(list)) : - group structure for the IC compartment. - This field is necessary only if regterm[0]=commit.solver.group_sparsity. - Example: - structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) - - that is equivalent to - [0,1,2,3,4,5] [6] - / \ - [0,2,5] [1,3,4] - which has two non overlapping groups, one of which is the union - of two other non-overlapping groups. - - - weightsIC - np.array(np.float64) : - this defines the weights associated to each group of structure IC. - - - group_norm - number : - norm type for the commit.solver.group_sparsity penalisation of the IC compartment. - Default: group_norm = commit.solver.norm2 - To be chosen among commit.solver.{norm2,norminf}. - - lambdas - tuple : - regularisation parameter for each compartment. - Default: lambdas = (0.0, 0.0, 0.0) - The lambdas correspond to the onse described in the mathematical - formulation of the regularisation term - $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ - - - References: - [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' - """ - regularisation = {} - - regularisation['startIC'] = 0 - regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) - regularisation['startEC'] = int( regularisation['sizeIC'] ) - regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) - regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) - regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) - - regularisation['normIC'] = regnorms[0] - regularisation['normEC'] = regnorms[1] - regularisation['normISO'] = regnorms[2] - - regularisation['lambdaIC'] = float( lambdas[0] ) - regularisation['lambdaEC'] = float( lambdas[1] ) - regularisation['lambdaISO'] = float( lambdas[2] ) - - # Solver-specific fields - regularisation['structureIC'] = structureIC - regularisation['weightsIC'] = weightsIC - regularisation['group_norm'] = group_norm - - return regularisation - - -def regularisation2omegaprox(regularisation): - lambdaIC = float(regularisation.get('lambdaIC')) - lambdaEC = float(regularisation.get('lambdaEC')) - lambdaISO = float(regularisation.get('lambdaISO')) - if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: - raise ValueError('Negative regularisation parameters are not allowed') - - normIC = regularisation.get('normIC') - normEC = regularisation.get('normEC') - normISO = regularisation.get('normISO') - if not normIC in list_regnorms: - raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normEC in list_regnorms: - raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normISO in list_regnorms: - raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - - ## NNLS case - if (lambdaIC == 0.0 
and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - return omega, prox - - ## All other cases - # Intracellular Compartment - startIC = regularisation.get('startIC') - sizeIC = regularisation.get('sizeIC') - if lambdaIC == 0.0: - omegaIC = lambda x: 0.0 - proxIC = lambda x: x - elif normIC == norm2: - omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) - proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) - elif normIC == norm1: - omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) - proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) - elif normIC == non_negative: - omegaIC = lambda x: 0.0 - proxIC = lambda x: non_negativity(x, startIC, sizeIC) - elif normIC == group_sparsity: - structureIC = regularisation.get('structureIC') - groupWeightIC = regularisation.get('weightsIC') - if not len(structureIC) == len(groupWeightIC): - raise ValueError('Number of groups and weights do not coincide.') - group_norm = regularisation.get('group_norm') - if not group_norm in list_group_sparsity_norms: - raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) - - # convert to new data structure (needed for faster access) - N = np.sum([g.size for g in structureIC]) - groupIdxIC = np.zeros( (N,), dtype=np.int32 ) - groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) - pos = 0 - for i, g in enumerate(structureIC) : - groupSizeIC[i] = g.size - groupIdxIC[pos:(pos+g.size)] = g[:] - pos += g.size - - omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - else: - raise ValueError('Type of regularisation for IC compartment not recognized.') - - - # Extracellular Compartment - startEC = regularisation.get('startEC') - sizeEC = regularisation.get('sizeEC') - if lambdaEC == 0.0: - omegaEC = lambda x: 0.0 - proxEC = lambda x: x - elif normEC == norm2: - omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) - proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) - elif normEC == norm1: - omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) - proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) - elif normEC == non_negative: - omegaEC = lambda x: 0.0 - proxEC = lambda x: non_negativity(x, startEC, sizeEC) - else: - raise ValueError('Type of regularisation for EC compartment not recognized.') - - # Isotropic Compartment - startISO = regularisation.get('startISO') - sizeISO = regularisation.get('sizeISO') - if lambdaISO == 0.0: - omegaISO = lambda x: 0.0 - proxISO = lambda x: x - elif normISO == norm2: - omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) - proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) - elif normISO == norm1: - omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) - proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) - elif normISO == non_negative: - omegaISO = lambda x: 0.0 - proxISO = lambda x: non_negativity(x, startISO, sizeISO) - else: - raise ValueError('Type of regularisation for ISO compartment not recognized.') - - omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) - prox = lambda x: 
non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced - - return omega, prox - - -def evaluate_model(y, A, x, regularisation = None): - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - else: - omega, _ = regularisation2omegaprox(regularisation) - - return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) - - -def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the Omega described by 'regularisation'. - - Check the documentation of commit.solvers.init_regularisation to see how to - solve a specific problem. - """ - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, x.size) - else: - omega, prox = regularisation2omegaprox(regularisation) - - if x0 is None: - x0 = np.zeros(A.shape[1]) - - return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) - - -def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the FISTA algorithm described in [1]. - - The penalty term and its proximal operator must be defined in such a way - that they already contain the regularisation parameter. - - References: - [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding - Algorithm for Linear Inverse Problems` - """ - - # Initialization - res = -y.copy() - xhat = x0.copy() - x = np.zeros_like(xhat) - res += A.dot(xhat) - proximal( xhat ) - reg_term = omega( xhat ) - prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term - - told = 1 - beta = 0.9 - prev_x = xhat.copy() - grad = np.asarray(At.dot(res)) - qfval = prev_obj - - # Step size computation - L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 - mu = 1.9 / L - - # Main loop - if verbose >= 1 : - print() - print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x" ) - print( "------|---------------------------------|-----------------------------------------------|------------------------------" ) - iter = 1 - while True : - if verbose >= 1 : - print( "%4d |" % iter, end="" ) - sys.stdout.flush() - - # Smooth step - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Backtracking - while curr_obj > q : - # Smooth step - mu = beta*mu - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Global stopping criterion - abs_obj = abs(curr_obj - prev_obj) - rel_obj = abs_obj / curr_obj - abs_x = np.linalg.norm(x - prev_x) - rel_x = abs_x / ( np.linalg.norm(x) + eps ) - if verbose >= 1 : - print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) - - if abs_obj < eps : - criterion = "Absolute tolerance on the objective" - break - elif rel_obj < tol_fun : - criterion = "Relative tolerance on the objective" - break - 
elif abs_x < eps : - criterion = "Absolute tolerance on the unknown" - break - elif rel_x < tol_x : - criterion = "Relative tolerance on the unknown" - break - elif iter >= max_iter : - criterion = "Maximum number of iterations" - break - - # FISTA update - t = 0.5 * ( 1 + sqrt(1+4*told**2) ) - xhat = x + (told-1)/t * (x - prev_x) - - # Gradient computation - res = A.dot(xhat) - y - xarr = np.asarray(x) - - grad = np.asarray(At.dot(res)) - - # Update variables - iter += 1 - prev_obj = curr_obj - prev_x = x.copy() - told = t - qfval = 0.5 * np.linalg.norm(res)**2 - - - if verbose >= 1 : - print( "< Stopping criterion: %s >" % criterion ) - - opt_details = {} - opt_details['residual'] = 0.5*res_norm**2 - opt_details['regterm'] = reg_term_x - opt_details['cost_function'] = curr_obj - opt_details['abs_cost'] = abs_obj - opt_details['rel_cost'] = rel_obj - opt_details['abs_x'] = abs_x - opt_details['rel _x'] = rel_x - opt_details['iterations'] = iter - opt_details['stopping_criterion'] = criterion - - return x, opt_details +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +from __future__ import print_function +import numpy as np +from math import sqrt +import sys +import warnings +eps = np.finfo(float).eps + +from commit.proximals import (non_negativity, + omega_group_sparsity, + prox_group_sparsity, + soft_thresholding, + projection_onto_l2_ball) +group_sparsity = -1 +non_negative = 0 +norm1 = 1 +norm2 = 2 +norminf = np.inf +list_regnorms = [group_sparsity, non_negative, norm1, norm2] +list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 + + +def init_regularisation(commit_evaluation, + regnorms = (non_negative, non_negative, non_negative), + structureIC = None, weightsIC = None, group_norm = 2, + lambdas = (.0,.0,.0) ): + """ + Initialise the data structure that defines Omega in + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + + Input + ----- + commit_evaluation - commit.Evaluation object : + dictionary and model have to be loaded beforehand. + + + regnorms - tuple : + this sets the penalty term to be used for each compartment. + Default = (non_negative,non_negative,non_negative). + + regnorms[0] corresponds to the Intracellular compartment + regnorms[1] corresponds to the Extracellular compartment + regnorms[2] corresponds to the Isotropic compartment + + Each regnorms[k] must be one of commit.solvers. + {group_sparsity, non_negative, norm1, norm2}. + + commit.solvers.group_sparsity considers both the non-overlapping + and the hierarchical group sparsity (see [1]). This option is + allowed only in the IC compartment. The mathematical formulation + of this term is + $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| + + commit.solvers.non_negative puts a non negativity constraint on the + coefficients corresponding to the compartment. This is the + default option for each compartment + + commit.solvers.norm1 penalises with the 1-norm of the coefficients + corresponding to the compartment. + + commit.solvers.norm2 penalises with the 2-norm of the coefficients + corresponding to the compartment. + + + structureIC - np.array(list(list)) : + group structure for the IC compartment. + This field is necessary only if regterm[0]=commit.solver.group_sparsity. 
+ Example: + structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) + + that is equivalent to + [0,1,2,3,4,5] [6] + / \ + [0,2,5] [1,3,4] + which has two non overlapping groups, one of which is the union + of two other non-overlapping groups. + + + weightsIC - np.array(np.float64) : + this defines the weights associated to each group of structure IC. + + + group_norm - number : + norm type for the commit.solver.group_sparsity penalisation of the IC compartment. + Default: group_norm = commit.solver.norm2 + To be chosen among commit.solver.{norm2,norminf}. + + lambdas - tuple : + regularisation parameter for each compartment. + Default: lambdas = (0.0, 0.0, 0.0) + The lambdas correspond to the onse described in the mathematical + formulation of the regularisation term + $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ + + + References: + [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' + """ + regularisation = {} + + regularisation['startIC'] = 0 + regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) + regularisation['startEC'] = int( regularisation['sizeIC'] ) + regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) + regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) + regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) + + regularisation['normIC'] = regnorms[0] + regularisation['normEC'] = regnorms[1] + regularisation['normISO'] = regnorms[2] + + regularisation['lambdaIC'] = float( lambdas[0] ) + regularisation['lambdaEC'] = float( lambdas[1] ) + regularisation['lambdaISO'] = float( lambdas[2] ) + + # Solver-specific fields + regularisation['structureIC'] = structureIC + regularisation['weightsIC'] = weightsIC + regularisation['group_norm'] = group_norm + + return regularisation + + +def regularisation2omegaprox(regularisation): + lambdaIC = float(regularisation.get('lambdaIC')) + lambdaEC = float(regularisation.get('lambdaEC')) + lambdaISO = float(regularisation.get('lambdaISO')) + if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: + raise ValueError('Negative regularisation parameters are not allowed') + + normIC = regularisation.get('normIC') + normEC = regularisation.get('normEC') + normISO = regularisation.get('normISO') + if not normIC in list_regnorms: + raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normEC in list_regnorms: + raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normISO in list_regnorms: + raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + + ## NNLS case + if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + return omega, prox + + ## All other cases + # Intracellular Compartment + startIC = regularisation.get('startIC') + sizeIC = regularisation.get('sizeIC') + if lambdaIC == 0.0: + omegaIC = lambda x: 0.0 + proxIC = lambda x: x + elif normIC == norm2: + omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) + proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) + elif normIC == norm1: + omegaIC = lambda x: 
lambdaIC * sum( x[startIC:sizeIC] ) + proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) + elif normIC == non_negative: + omegaIC = lambda x: 0.0 + proxIC = lambda x: non_negativity(x, startIC, sizeIC) + elif normIC == group_sparsity: + structureIC = regularisation.get('structureIC') + groupWeightIC = regularisation.get('weightsIC') + if not len(structureIC) == len(groupWeightIC): + raise ValueError('Number of groups and weights do not coincide.') + group_norm = regularisation.get('group_norm') + if not group_norm in list_group_sparsity_norms: + raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) + + # convert to new data structure (needed for faster access) + N = np.sum([g.size for g in structureIC]) + groupIdxIC = np.zeros( (N,), dtype=np.int32 ) + groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) + pos = 0 + for i, g in enumerate(structureIC) : + groupSizeIC[i] = g.size + groupIdxIC[pos:(pos+g.size)] = g[:] + pos += g.size + + omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + else: + raise ValueError('Type of regularisation for IC compartment not recognized.') + + + # Extracellular Compartment + startEC = regularisation.get('startEC') + sizeEC = regularisation.get('sizeEC') + if lambdaEC == 0.0: + omegaEC = lambda x: 0.0 + proxEC = lambda x: x + elif normEC == norm2: + omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) + proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) + elif normEC == norm1: + omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) + proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) + elif normEC == non_negative: + omegaEC = lambda x: 0.0 + proxEC = lambda x: non_negativity(x, startEC, sizeEC) + else: + raise ValueError('Type of regularisation for EC compartment not recognized.') + + # Isotropic Compartment + startISO = regularisation.get('startISO') + sizeISO = regularisation.get('sizeISO') + if lambdaISO == 0.0: + omegaISO = lambda x: 0.0 + proxISO = lambda x: x + elif normISO == norm2: + omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) + proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) + elif normISO == norm1: + omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) + proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) + elif normISO == non_negative: + omegaISO = lambda x: 0.0 + proxISO = lambda x: non_negativity(x, startISO, sizeISO) + else: + raise ValueError('Type of regularisation for ISO compartment not recognized.') + + omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) + prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced + + return omega, prox + + +def evaluate_model(y, A, x, regularisation = None): + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + else: + omega, _ = regularisation2omegaprox(regularisation) + + return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) + + +def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the Omega described by 'regularisation'. 
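A quick numeric illustration of the group-sparsity penalty handled above may help: with commit.solvers.group_sparsity and group_norm = norm2, the term being evaluated is Omega(x) = lambda * sum_g w_g * ||x_g||_2 over the groups listed in structureIC. The following self-contained NumPy sketch reproduces that sum for the structureIC example given in the init_regularisation docstring; the coefficient vector, the weights and lambda are made up for illustration, and this is not the compiled omega_group_sparsity routine from commit.proximals.

    import numpy as np

    # Group structure from the init_regularisation docstring example
    # (ragged groups need dtype=object on recent NumPy versions)
    structureIC = np.array([np.array([0, 2, 5]), np.array([1, 3, 4]),
                            np.array([0, 1, 2, 3, 4, 5]), np.array([6])],
                           dtype=object)
    weightsIC = np.ones(structureIC.size)   # illustrative: one weight per group
    lambdaIC  = 0.5                         # illustrative regularisation parameter
    x = np.array([0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 3.0])   # toy coefficient vector

    # Omega(x) = lambda * sum_g w_g * ||x_g||_2   (group_norm = norm2)
    omega = lambdaIC * sum(w * np.linalg.norm(x[g])
                           for g, w in zip(structureIC, weightsIC))
    print(omega)   # 0.5 * (0 + sqrt(5) + sqrt(5) + 3)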
+ + Check the documentation of commit.solvers.init_regularisation to see how to + solve a specific problem. + """ + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, x.size) + else: + omega, prox = regularisation2omegaprox(regularisation) + + if x0 is None: + x0 = np.zeros(A.shape[1]) + + return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) + + +def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the FISTA algorithm described in [1]. + + The penalty term and its proximal operator must be defined in such a way + that they already contain the regularisation parameter. + + References: + [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding + Algorithm for Linear Inverse Problems` + """ + + # Initialization + res = -y.copy() + xhat = x0.copy() + x = np.zeros_like(xhat) + res += A.dot(xhat) + proximal( xhat ) + reg_term = omega( xhat ) + prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term + + told = 1 + beta = 0.9 + prev_x = xhat.copy() + grad = np.asarray(At.dot(res)) + qfval = prev_obj + + # Step size computation + L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 + mu = 1.9 / L + + # Main loop + if verbose >= 1 : + print() + print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x" ) + print( "------|---------------------------------|-----------------------------------------------|------------------------------" ) + iter = 1 + while True : + if verbose >= 1 : + print( "%4d |" % iter, end="" ) + sys.stdout.flush() + + # Smooth step + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Backtracking + while curr_obj > q : + # Smooth step + mu = beta*mu + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Global stopping criterion + abs_obj = abs(curr_obj - prev_obj) + rel_obj = abs_obj / curr_obj + abs_x = np.linalg.norm(x - prev_x) + rel_x = abs_x / ( np.linalg.norm(x) + eps ) + if verbose >= 1 : + print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) + + if abs_obj < eps : + criterion = "Absolute tolerance on the objective" + break + elif rel_obj < tol_fun : + criterion = "Relative tolerance on the objective" + break + elif abs_x < eps : + criterion = "Absolute tolerance on the unknown" + break + elif rel_x < tol_x : + criterion = "Relative tolerance on the unknown" + break + elif iter >= max_iter : + criterion = "Maximum number of iterations" + break + + # FISTA update + t = 0.5 * ( 1 + sqrt(1+4*told**2) ) + xhat = x + (told-1)/t * (x - prev_x) + + # Gradient computation + res = A.dot(xhat) - y + xarr = np.asarray(x) + + grad = np.asarray(At.dot(res)) + + # Update variables + iter += 1 + prev_obj = curr_obj + prev_x = x.copy() + told = t + qfval = 0.5 * np.linalg.norm(res)**2 + + + if verbose >= 1 : + print( "< Stopping criterion: %s >" % criterion ) + + 
opt_details = {} + opt_details['residual'] = 0.5*res_norm**2 + opt_details['regterm'] = reg_term_x + opt_details['cost_function'] = curr_obj + opt_details['abs_cost'] = abs_obj + opt_details['rel_cost'] = rel_obj + opt_details['abs_x'] = abs_x + opt_details['rel _x'] = rel_x + opt_details['iterations'] = iter + opt_details['stopping_criterion'] = criterion + + return x, opt_details diff --git a/setup.py b/setup.py index ea11a4e9..0093ede9 100644 --- a/setup.py +++ b/setup.py @@ -1,242 +1,238 @@ -from distutils.core import setup, Extension -from Cython.Distutils import build_ext -from Cython.Build import cythonize -import numpy -import amico -import os -from os.path import join as pjoin - -amico_version = amico.__version__.split('.') -amico_version = [int(version_val) for version_val in amico_version] -if amico_version[0] == 1 and amico_version[1] < 1: - raise RuntimeError( 'COMMIT requires AMICO v1.1.0 or above. Current AMICO version is %s' % amico.__version__ ) - - -# taken from npcuda -def find_in_path(name, path): - """Find a file in a search path""" - - # Adapted fom http://code.activestate.com/recipes/52224 - for dir in path.split(os.pathsep): - binpath = pjoin(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - -def locate_cuda(): - """Locate the CUDA environment on the system - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - Starts by looking for the CUDAHOME env variable. If not found, - everything is based on finding 'nvcc' in the PATH. - """ - - # First check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = pjoin(home, 'bin', 'nvcc') - else: - # Otherwise, search the PATH for NVCC - nvcc = find_in_path('nvcc', os.environ['PATH']) - if nvcc is None: - return None - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home': home, 'nvcc': nvcc, - 'include': pjoin(home, 'include'), - 'lib64': pjoin(home, 'lib64')} - for k, v in iter(cudaconfig.items()): - if not os.path.exists(v): - return None - - return cudaconfig - -def customize_compiler_for_nvcc(self): - """Inject deep into distutils to customize how the dispatch - to gcc/nvcc works. - If you subclass UnixCCompiler, it's not trivial to get your subclass - injected in, and still have the right customizations (i.e. - distutils.sysconfig.customize_compiler) run on it. So instead of going - the OO route, I have this. Note, it's kindof like a wierd functional - subclassing going on. - """ - - # Tell the compiler it can processes .cu - self.src_extensions.append('.cu') - - # Save references to the default compiler_so and _comple methods - default_compiler_so = self.compiler_so - super = self._compile - - # Now redefine the _compile method. This gets executed for each - # object but distutils doesn't have the ability to change compilers - # based on source extension: we add it. 
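For context, the helpers above (find_in_path and locate_cuda) drive the whole build selection: locate_cuda() returns None when no usable toolkit is found, in which case only the CPU extensions are built, and otherwise a dict with the 'home', 'nvcc', 'include' and 'lib64' paths that the nvcc-aware extensions need. A condensed, self-contained sketch of the same lookup logic follows; locate_cuda_sketch is a made-up name and any CUDAHOME value is illustrative, so this is not the function used by setup.py itself.

    import os
    from shutil import which

    def locate_cuda_sketch():
        """Condensed stand-in for locate_cuda(): prefer CUDAHOME, else search PATH."""
        home = os.environ.get('CUDAHOME')
        nvcc = os.path.join(home, 'bin', 'nvcc') if home else which('nvcc')
        if nvcc is None or not os.path.exists(nvcc):
            return None
        home = home or os.path.dirname(os.path.dirname(nvcc))
        cfg = {'home': home, 'nvcc': nvcc,
               'include': os.path.join(home, 'include'),
               'lib64': os.path.join(home, 'lib64')}
        # Mirror the original behaviour: give up if any expected path is missing
        return cfg if all(os.path.exists(p) for p in cfg.values()) else None

    CUDA = locate_cuda_sketch()
    if CUDA is None:
        print('nvcc not found: only the CPU extensions would be built')
    else:
        print('GPU build enabled, nvcc at', CUDA['nvcc'])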
- def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): - if os.path.splitext(src)[1] == '.cu': - # use the cuda for .cu files - self.set_executable('compiler_so', CUDA['nvcc']) - # use only a subset of the extra_postargs, which are 1-1 - # translated from the extra_compile_args in the Extension class - print('\n--------nvcc aqui--------') - print(type(extra_postargs)) - print(extra_postargs) - print('--------------------\n') - postargs = extra_postargs['nvcc'] - else: - print('\n--------gcc aqui--------') - print(type(extra_postargs)) - print(extra_postargs) - print('--------------------\n') - postargs = extra_postargs['gcc'] - - super(obj, src, ext, cc_args, postargs, pp_opts) - # Reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - # Inject our redefined _compile method into the class - self._compile = _compile - -# Obtain the numpy include directory. This logic works across numpy versions. -try: - numpy_include = numpy.get_include() -except AttributeError: - numpy_include = numpy.get_numpy_include() - -# Try to locate CUDA -CUDA = locate_cuda() - -if CUDA != None: - # Run the customize_compiler - class cuda_build_ext(build_ext): - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) - - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - trk2dictionary_ext = Extension( - name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - extra_link_args=[], - language='c++', - ) - - core_ext = Extension( - name='commit.core', - sources=['commit/core.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - extra_link_args=[], - language='c++', - ) - - proximals_ext = Extension( - name='commit.proximals', - sources=['commit/proximals.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - extra_link_args=[], - language='c++', - ) - - cudaoperator_ext = Extension( - name='commit.cudaoperator', - sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], - library_dirs = [CUDA['lib64']], - libraries = ['cudart'], - language = 'c++', - runtime_library_dirs = [CUDA['lib64']], - # This syntax is specific to this build system - # we're only going to use certain compiler args with nvcc - # and not with gcc the implementation of this trick is in - # customize_compiler() - extra_compile_args= { - 'gcc': ['-w'], - 'nvcc': [ - '-arch=sm_30', '--ptxas-options=-v', '-c', - '--compiler-options', "'-fPIC'" - ] - }, - include_dirs = [numpy_include, CUDA['include']] - ) - - setup( - name='commit', - version='1.4.0', - description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', - author='Alessandro Daducci', - author_email='alessandro.daducci@gmail.com', - url='https://github.com/daducci/COMMIT', - cmdclass = {'build_ext':cuda_build_ext}, - ext_modules = [ trk2dictionary_ext, core_ext, proximals_ext, cudaoperator_ext ], - packages=['commit','commit.operator'], - package_data={ - 'commit.operator':["*.*"], # 
needed by pyximport to compile at runtime - }, - ) -else: - print('Installing CPU version') - - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - ext1 = Extension( - name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', - ) - - ext2 = Extension( - name='commit.core', - sources=['commit/core.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', - ) - - ext3 = Extension( - name='commit.proximals', - sources=['commit/proximals.pyx'], - include_dirs=[numpy.get_include()], - extra_compile_args=['-w'], - extra_link_args=[], - language='c++', - ) - - setup( - name='commit', - version='1.3.0', - description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', - author='Alessandro Daducci', - author_email='alessandro.daducci@gmail.com', - url='https://github.com/daducci/COMMIT', - cmdclass = {'build_ext':build_ext}, - ext_modules = [ ext1, ext2, ext3 ], - packages=['commit','commit.operator'], - package_data={ - 'commit.operator':["*.*"], # needed by pyximport to compile at runtime - }, - ) +from distutils.core import setup, Extension +from Cython.Distutils import build_ext +from Cython.Build import cythonize +import numpy +import amico +import os +from os.path import join as pjoin + +amico_version = amico.__version__.split('.') +amico_version = [int(version_val) for version_val in amico_version] +if amico_version[0] == 1 and amico_version[1] < 1: + raise RuntimeError( 'COMMIT requires AMICO v1.1.0 or above. Current AMICO version is %s' % amico.__version__ ) + + +# taken from npcuda +def find_in_path(name, path): + """Find a file in a search path""" + + # Adapted fom http://code.activestate.com/recipes/52224 + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + +def locate_cuda(): + """Locate the CUDA environment on the system + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + Starts by looking for the CUDAHOME env variable. If not found, + everything is based on finding 'nvcc' in the PATH. + """ + + # First check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # Otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + return None + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = {'home': home, 'nvcc': nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64')} + for k, v in iter(cudaconfig.items()): + if not os.path.exists(v): + return None + + return cudaconfig + +def customize_compiler_for_nvcc(self): + """Inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. Note, it's kindof like a wierd functional + subclassing going on. 
+ """ + + # Tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # Save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # Now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. + def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 + # translated from the extra_compile_args in the Extension class + print(type(extra_postargs)) + print(extra_postargs) + postargs = extra_postargs['nvcc'] + else: + print(type(extra_postargs)) + print(extra_postargs) + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # Reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # Inject our redefined _compile method into the class + self._compile = _compile + +# Obtain the numpy include directory. This logic works across numpy versions. +try: + numpy_include = numpy.get_include() +except AttributeError: + numpy_include = numpy.get_numpy_include() + +# Try to locate CUDA +CUDA = locate_cuda() + +if CUDA != None: + # Run the customize_compiler + class cuda_build_ext(build_ext): + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + trk2dictionary_ext = Extension( + name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + core_ext = Extension( + name='commit.core', + sources=['commit/core.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + proximals_ext = Extension( + name='commit.proximals', + sources=['commit/proximals.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + extra_link_args=[], + language='c++', + ) + + cudaoperator_ext = Extension( + name='commit.cudaoperator', + sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], + library_dirs = [CUDA['lib64']], + libraries = ['cudart'], + language = 'c++', + runtime_library_dirs = [CUDA['lib64']], + # This syntax is specific to this build system + # we're only going to use certain compiler args with nvcc + # and not with gcc the implementation of this trick is in + # customize_compiler() + extra_compile_args= { + 'gcc': ['-w'], + 'nvcc': [ + '-arch=sm_30', '--ptxas-options=-v', '-c', + '--compiler-options', "'-fPIC'" + ] + }, + include_dirs = [numpy_include, CUDA['include']] + ) + + setup( + name='commit', + version='1.4.0', + description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', + author='Alessandro Daducci', + author_email='alessandro.daducci@gmail.com', + 
url='https://github.com/daducci/COMMIT', + cmdclass = {'build_ext':cuda_build_ext}, + ext_modules = [ trk2dictionary_ext, core_ext, proximals_ext, cudaoperator_ext ], + packages=['commit','commit.operator'], + package_data={ + 'commit.operator':["*.*"], # needed by pyximport to compile at runtime + }, + ) +else: + print('Installing CPU version') + + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + ext1 = Extension( + name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + ext2 = Extension( + name='commit.core', + sources=['commit/core.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + ext3 = Extension( + name='commit.proximals', + sources=['commit/proximals.pyx'], + include_dirs=[numpy.get_include()], + extra_compile_args=['-w'], + extra_link_args=[], + language='c++', + ) + + setup( + name='commit', + version='1.3.0', + description='Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)', + author='Alessandro Daducci', + author_email='alessandro.daducci@gmail.com', + url='https://github.com/daducci/COMMIT', + cmdclass = {'build_ext':build_ext}, + ext_modules = [ ext1, ext2, ext3 ], + packages=['commit','commit.operator'], + package_data={ + 'commit.operator':["*.*"], # needed by pyximport to compile at runtime + }, + ) From b948fdea230aadbcc90ecc432b4d5fabf7a1d2d2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 21:44:07 -0500 Subject: [PATCH 115/190] Adding option to choose GPU --- commit/core.pyx | 7 +++---- commit/cudaoperator.pyx | 7 ++++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index f88fe4fe..5641d89a 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -505,6 +505,7 @@ cdef class Evaluation : self.THREADS['ICt'] = None self.THREADS['ECt'] = None self.THREADS['ISOt'] = None + self.THREADS['GPUID'] = select_gpu cdef : long [:] C @@ -611,12 +612,10 @@ cdef class Evaluation : if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : self.THREADS = None raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - else: - self.THREADS['GPUID'] = select_gpu print( '[ OK ]' ) - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) def build_operator( self ) : @@ -651,7 +650,7 @@ cdef class Evaluation : self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) else: import commit.cudaoperator - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1, self.THREADS['GPUID'] ) + self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index e1fb9d43..85eb3778 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -44,7 +44,7 @@ cdef class CudaLinearOperator : with the COMMIT linear operator A in a CUDA GPU. The multiplications are done using CUDA C++ code that uses information from the DICTIONARY and KERNELS data structures. 
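The solver only touches a narrow, duck-typed surface of this operator: A.shape[1] to size the initial guess, A.dot(x) for the forward product, and At.dot(r) for the adjoint product used in the gradient step. A toy, dense NumPy stand-in of that interface is sketched below; it is purely illustrative, ToyLinearOperator is a made-up name, and it is not the CUDA-backed class defined here.

    import numpy as np

    class ToyLinearOperator:
        """Dense stand-in exposing the interface commit.solvers relies on:
        .shape, .dot(v), and an adjoint counterpart whose .dot(v) computes A^T v."""
        def __init__(self, M, adjoint=False):
            self._M = M
            self.adjoint = adjoint
            self.shape = M.shape[::-1] if adjoint else M.shape

        def dot(self, v):
            return (self._M.T if self.adjoint else self._M).dot(v)

        def transpose(self):
            return ToyLinearOperator(self._M, adjoint=not self.adjoint)

    A  = ToyLinearOperator(np.random.rand(10, 4))
    At = A.transpose()
    x  = np.random.rand(A.shape[1])
    r  = A.dot(x)          # forward product  A x
    g  = At.dot(r)         # adjoint product  A^T r (the gradient direction in fista)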
""" - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs, gpu_id cdef public int adjoint, n1, n2 cdef DICTIONARY @@ -75,7 +75,7 @@ cdef class CudaLinearOperator : cdef unsigned int* ISOthreadsT - def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0, gpu_id = 0 ) : + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : """Set the pointers to the data structures used by the C code.""" self.DICTIONARY = DICTIONARY self.KERNELS = KERNELS @@ -89,6 +89,7 @@ cdef class CudaLinearOperator : self.nI = KERNELS['iso'].shape[0] # number of ISO contributions self.n = DICTIONARY['IC']['n'] # numbner of IC segments self.ndirs = KERNELS['wmr'].shape[1] # number of directions + self.gpu_id = THREADS['GPUID'] # id of the CUDA GPU if KERNELS['wmr'].size > 0 : self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES @@ -151,7 +152,7 @@ cdef class CudaLinearOperator : self.nI, fcall, - gpu_id) + self.gpu_id) # create the transpose of the operator in GPU memory if fcall == 1: From 0f1de29d02693251f6df0864b9c1f7949556f9ca Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 21:57:00 -0500 Subject: [PATCH 116/190] Adding option to choose CUDA GPU --- .gitattributes | 3 +++ commit/operator_withCUDA.cu | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index fe756e6d..6b4d6de6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,4 @@ extras/* linguist-vendored + +# Never modify line endings of our bash scripts +*.sh -lf \ No newline at end of file diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 9e3e1ac3..f2e364ee 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -84,7 +84,7 @@ CudaLinearOperator::CudaLinearOperator( int nsegments, int nvoxels, int nfibers, - int nzeppelins, + int npeaks, int norientations, int nsamples, int ndiameters, From 2a5b4939072e7286c2c1dcd8d93a66de9541539c Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 22:01:39 -0500 Subject: [PATCH 117/190] Adding option to choose CUDA GPU --- commit/operator_withCUDA.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index faa8b6f8..d2cf61fe 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -172,7 +172,7 @@ class CudaLinearOperator { int nsegments, int nvoxels, int nfibers, - int nzeppelins, + int npeaks, int norientations, int nsamples, int ndiameters, From a99c955ee8a1b7abb1a5e27ec12599b0ff29eede Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:09:25 -0500 Subject: [PATCH 118/190] Adding option to choose CUDA GPU --- commit/core.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 5641d89a..b04c12eb 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -512,12 +512,12 @@ cdef class Evaluation : long t, tot, i1, i2, N, c int i + tic = time.time() + if nthreads > 0: print( '\n-> Distributing workload to different threads:' ) print( '\t* number of threads : %d' % nthreads ) - tic = time.time() - # Distribute load for the computation of A*x product print( '\t* A operator... 
', end="" ) sys.stdout.flush() From 20824914065408107eb3ce0399fa341093e96d20 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:12:38 -0500 Subject: [PATCH 119/190] Adding option to choose CUDA GPU --- commit/cudaoperator.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 85eb3778..c61e8fbb 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -91,6 +91,8 @@ cdef class CudaLinearOperator : self.ndirs = KERNELS['wmr'].shape[1] # number of directions self.gpu_id = THREADS['GPUID'] # id of the CUDA GPU + print('cuda gpu id: %d' % gpu_id) + if KERNELS['wmr'].size > 0 : self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES elif KERNELS['wmh'].size > 0 : From b0641ffe3b30d4435e1cda89c5160b80c605ee80 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:15:00 -0500 Subject: [PATCH 120/190] Adding option to choose CUDA GPU --- commit/cudaoperator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index c61e8fbb..2a99a035 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -91,7 +91,7 @@ cdef class CudaLinearOperator : self.ndirs = KERNELS['wmr'].shape[1] # number of directions self.gpu_id = THREADS['GPUID'] # id of the CUDA GPU - print('cuda gpu id: %d' % gpu_id) + print('cuda gpu id: %d' % self.gpu_id) if KERNELS['wmr'].size > 0 : self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES From 9c1963109143cf9ae38fd9bab02520c15ca7844a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:17:29 -0500 Subject: [PATCH 121/190] Adding option to choose CUDA GPU --- commit/operator_withCUDA.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index f2e364ee..d41b5ba8 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -20,6 +20,8 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { return false; } + cudaStatus = cudSetDevice(gpu_id); + if(cudaStatus == cudaSuccess){ cudaDeviceProp gpu_properties; cudaGetDeviceProperties(&gpu_properties, gpu_id); From b89bed8c035713aa7787c3d5acc7e53a9f466d5d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:18:43 -0500 Subject: [PATCH 122/190] Adding option to choose CUDA GPU --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index d41b5ba8..0d801212 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -15,7 +15,7 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { cudaStatus = cudaGetDeviceCount(&num_gpus); - if (num_gpus <= 0 || num_gpus <= gpu_id) { + if (num_gpus <= 0 || num_gpus <= gpu_id || cudaStatus != cudaSuccess) { printf("\t* the selected GPU does not exist or it is not detected \n"); return false; } From 8fb324b404d12e6ad4f627e26cb5f9cd7ece1a46 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:20:50 -0500 Subject: [PATCH 123/190] Adding option to choose CUDA GPU --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 0d801212..df10e76c 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -20,7 +20,7 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { return false; } - cudaStatus = 
cudSetDevice(gpu_id); + cudaStatus = cudaSetDevice(gpu_id); if(cudaStatus == cudaSuccess){ cudaDeviceProp gpu_properties; From 8c8fa752c8221d4c2bf6ff190482f72908ad877c Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:24:45 -0500 Subject: [PATCH 124/190] Adding option to choose CUDA GPU --- commit/operator_withCUDA.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index df10e76c..adbd453c 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -14,6 +14,8 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { cudaError_t cudaStatus; cudaStatus = cudaGetDeviceCount(&num_gpus); + printf("num gpus %d\n", num_gpus); + printf("id gpu %d\n", gpu_id); if (num_gpus <= 0 || num_gpus <= gpu_id || cudaStatus != cudaSuccess) { printf("\t* the selected GPU does not exist or it is not detected \n"); From 8ce9b4173d331e72578a63ffdc7f3250fb369305 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 17 Jul 2020 23:57:05 -0500 Subject: [PATCH 125/190] Adding option to choose CUDA GPU --- commit/cudaoperator.pyx | 2 -- commit/operator_withCUDA.cu | 2 -- 2 files changed, 4 deletions(-) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator.pyx index 2a99a035..85eb3778 100644 --- a/commit/cudaoperator.pyx +++ b/commit/cudaoperator.pyx @@ -91,8 +91,6 @@ cdef class CudaLinearOperator : self.ndirs = KERNELS['wmr'].shape[1] # number of directions self.gpu_id = THREADS['GPUID'] # id of the CUDA GPU - print('cuda gpu id: %d' % self.gpu_id) - if KERNELS['wmr'].size > 0 : self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES elif KERNELS['wmh'].size > 0 : diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index adbd453c..df10e76c 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -14,8 +14,6 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { cudaError_t cudaStatus; cudaStatus = cudaGetDeviceCount(&num_gpus); - printf("num gpus %d\n", num_gpus); - printf("id gpu %d\n", gpu_id); if (num_gpus <= 0 || num_gpus <= gpu_id || cudaStatus != cudaSuccess) { printf("\t* the selected GPU does not exist or it is not detected \n"); From 0c8007e6f2dbd4f614c4342d24a6c6730ab3506e Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:42:00 -0500 Subject: [PATCH 126/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b164fe80..f4a1605f 100644 --- a/setup.py +++ b/setup.py @@ -149,7 +149,7 @@ def get_extensions_with_cuda(): print('CUDA detected. Installing COMMIT with GPU acceleration.') class CustomCudaBuildExtCommand(build_ext): - """ build_ext command to use when CUDA is detected and numpy headers are needed. """ + """ build_ext command to use when CUDA is detected and numpy headers are needed. 
""" def build_extensions(self): customize_compiler_for_nvcc(self.compiler) @@ -172,7 +172,7 @@ def run(self): description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' opts = dict(name='dmri-commit', - version='1.3.9.2', + version='1.3.9.2-cuda', description=description, long_description=description, author='Alessandro Daducci', From ad35e09d9983d542231821785bb3ec993ea0f213 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:44:07 -0500 Subject: [PATCH 127/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f4a1605f..2dae2e6c 100644 --- a/setup.py +++ b/setup.py @@ -149,7 +149,7 @@ def get_extensions_with_cuda(): print('CUDA detected. Installing COMMIT with GPU acceleration.') class CustomCudaBuildExtCommand(build_ext): - """ build_ext command to use when CUDA is detected and numpy headers are needed. """ + """ build_ext command to use when CUDA is detected and numpy headers are needed. """ def build_extensions(self): customize_compiler_for_nvcc(self.compiler) @@ -189,7 +189,7 @@ def run(self): print('CUDA not detected. Installing COMMIT without GPU acceleration.') class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. """ + """ build_ext command to use when numpy headers are needed. """ def run(self): # Now that the requirements are installed, get everything from numpy From 26ab0ac0e294461686337d01d5ee444438e3a806 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:48:15 -0500 Subject: [PATCH 128/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 2dae2e6c..95ecca9a 100644 --- a/setup.py +++ b/setup.py @@ -111,26 +111,31 @@ def get_extensions(): def get_extensions_with_cuda(): # Cython extension to create the sparse data structure from a tractogram # for the computation of matrix-vector multiplications + from numpy import get_include + ext1 = Extension(name='commit.trk2dictionary', sources=['commit/trk2dictionary/trk2dictionary.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], - language='c++') + language='c++', + include_dirs = [numpy_include]) ext2 = Extension(name='commit.core', sources=['commit/core.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], - language='c++') + language='c++', + include_dirs = [numpy_include]) ext3 = Extension(name='commit.proximals', sources=['commit/proximals.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], - language='c++') + language='c++', + include_dirs = [numpy_include]) ext4 = Extension(name='commit.cudaoperator', sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], @@ -139,8 +144,8 @@ def get_extensions_with_cuda(): language = 'c++', library_dirs = [CUDA['lib64']], libraries = ['cudart'], - runtime_library_dirs = [CUDA['lib64']]) - #include_dirs = [numpy_include, CUDA['include']] + runtime_library_dirs = [CUDA['lib64']] + include_dirs = [numpy_include, CUDA['include']]) # Locate CUDA CUDA = locate_cuda() @@ -155,7 +160,7 @@ def build_extensions(self): 
customize_compiler_for_nvcc(self.compiler) build_ext.build_extensions(self) - def run(self): + """def run(self): # Now that the requirements are installed, get everything from numpy from Cython.Build import cythonize from numpy import get_include @@ -167,7 +172,7 @@ def run(self): # Call original build_ext command build_ext.finalize_options(self) - build_ext.run(self) + build_ext.run(self)""" description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' From 14bb55d7ce18039df8a2f5844c9e35f79930a1d2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:49:19 -0500 Subject: [PATCH 129/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 95ecca9a..eafd6a14 100644 --- a/setup.py +++ b/setup.py @@ -144,7 +144,7 @@ def get_extensions_with_cuda(): language = 'c++', library_dirs = [CUDA['lib64']], libraries = ['cudart'], - runtime_library_dirs = [CUDA['lib64']] + runtime_library_dirs = [CUDA['lib64']], include_dirs = [numpy_include, CUDA['include']]) # Locate CUDA From 0067039c604ff6e9af64b9b4c48a99dcd52f282a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:53:34 -0500 Subject: [PATCH 130/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index eafd6a14..504e0bcb 100644 --- a/setup.py +++ b/setup.py @@ -151,7 +151,9 @@ def get_extensions_with_cuda(): CUDA = locate_cuda() if CUDA != None: + print('\n=====================================================') print('CUDA detected. Installing COMMIT with GPU acceleration.') + print('=====================================================\n') class CustomCudaBuildExtCommand(build_ext): """ build_ext command to use when CUDA is detected and numpy headers are needed. 
""" @@ -177,7 +179,7 @@ def build_extensions(self): description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' opts = dict(name='dmri-commit', - version='1.3.9.2-cuda', + version='1.4.0.0', description=description, long_description=description, author='Alessandro Daducci', @@ -185,7 +187,7 @@ def build_extensions(self): url='https://github.com/daducci/COMMIT', packages=['commit', 'commit.operator'], cmdclass={'build_ext': CustomCudaBuildExtCommand}, - ext_modules=get_extensions(), + ext_modules=get_extensions_with_cuda(), setup_requires=['Cython>=0.29', 'numpy>=1.12'], install_requires=['Cython>=0.29', 'dmri-amico>=1.2.3', 'dipy>=1.0', 'numpy>=1.12'], From bee7b0e5ce3a6ac8096581646f4f76d7d751e403 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:55:54 -0500 Subject: [PATCH 131/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 504e0bcb..79bea226 100644 --- a/setup.py +++ b/setup.py @@ -119,7 +119,7 @@ def get_extensions_with_cuda(): 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++', - include_dirs = [numpy_include]) + include_dirs = [get_include]) ext2 = Extension(name='commit.core', sources=['commit/core.pyx'], @@ -127,7 +127,7 @@ def get_extensions_with_cuda(): 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++', - include_dirs = [numpy_include]) + include_dirs = [get_include]) ext3 = Extension(name='commit.proximals', sources=['commit/proximals.pyx'], @@ -135,7 +135,7 @@ def get_extensions_with_cuda(): 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++', - include_dirs = [numpy_include]) + include_dirs = [get_include]) ext4 = Extension(name='commit.cudaoperator', sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], @@ -145,15 +145,15 @@ def get_extensions_with_cuda(): library_dirs = [CUDA['lib64']], libraries = ['cudart'], runtime_library_dirs = [CUDA['lib64']], - include_dirs = [numpy_include, CUDA['include']]) + include_dirs = [get_include, CUDA['include']]) # Locate CUDA CUDA = locate_cuda() if CUDA != None: - print('\n=====================================================') + print('\n==========================================================') print('CUDA detected. Installing COMMIT with GPU acceleration.') - print('=====================================================\n') + print('==========================================================\n') class CustomCudaBuildExtCommand(build_ext): """ build_ext command to use when CUDA is detected and numpy headers are needed. 
""" From bd2b4673ce5f842a009821594e7a39a9312a692e Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 00:59:37 -0500 Subject: [PATCH 132/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 79bea226..09a838aa 100644 --- a/setup.py +++ b/setup.py @@ -118,34 +118,34 @@ def get_extensions_with_cuda(): extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], - language='c++', - include_dirs = [get_include]) + language='c++') + #include_dirs = [get_include]) ext2 = Extension(name='commit.core', sources=['commit/core.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], - language='c++', - include_dirs = [get_include]) + language='c++') + #include_dirs = [get_include]) ext3 = Extension(name='commit.proximals', sources=['commit/proximals.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], - language='c++', - include_dirs = [get_include]) + language='c++') + #include_dirs = [get_include]) ext4 = Extension(name='commit.cudaoperator', sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, language = 'c++', + #include_dirs = [get_include, CUDA['include']], library_dirs = [CUDA['lib64']], libraries = ['cudart'], - runtime_library_dirs = [CUDA['lib64']], - include_dirs = [get_include, CUDA['include']]) + runtime_library_dirs = [CUDA['lib64']]) # Locate CUDA CUDA = locate_cuda() @@ -162,7 +162,7 @@ def build_extensions(self): customize_compiler_for_nvcc(self.compiler) build_ext.build_extensions(self) - """def run(self): + def run(self): # Now that the requirements are installed, get everything from numpy from Cython.Build import cythonize from numpy import get_include @@ -174,7 +174,7 @@ def build_extensions(self): # Call original build_ext command build_ext.finalize_options(self) - build_ext.run(self)""" + build_ext.run(self) description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' From 666407d7d980fe9b3f27fbc3fa604594e6473b1d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 01:41:37 -0500 Subject: [PATCH 133/190] Merging CUDA version with the lastest COMMIT version --- LICENSE | 66 +- MANIFEST.in | 12 +- README.md | 76 +- commit/__init__.py | 10 +- commit/operator/config.py | 12 +- commit/operator_withCUDA.cuh | 386 ++-- commit/solvers.py | 806 +++---- docs/COMMIT_debugger.md | 142 +- docs/README.md | 20 +- docs/conventions.md | 84 +- docs/faq.md | 28 +- docs/install.md | 92 +- docs/models.md | 114 +- docs/tutorials/AdvancedSolvers/README.md | 302 +-- docs/tutorials/GettingStarted/README.md | 502 ++-- docs/tutorials/LiFE_STN96/README.md | 640 +++--- docs/tutorials/README.md | 10 +- extras/COMMIT_debugger/OPENGL_callbacks.cxx | 2280 +++++++++---------- extras/COMMIT_debugger/OPENGL_utils.h | 190 +- extras/COMMIT_debugger/main.cxx | 1300 +++++------ requirements.txt | 10 +- setup.cfg | 10 +- setup.py | 124 +- 23 files changed, 3597 insertions(+), 3619 deletions(-) diff --git a/LICENSE b/LICENSE index 70808f61..04e0c652 100644 --- a/LICENSE +++ b/LICENSE @@ -1,33 +1,33 @@ 
-Unless otherwise specified by LICENSE.txt files in individual -directories, or within individual files or functions, all code is: - -Copyright (c) 2008-2020, COMMIT developers -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the COMMIT developers nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Unless otherwise specified by LICENSE.txt files in individual +directories, or within individual files or functions, all code is: + +Copyright (c) 2008-2020, COMMIT developers +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the COMMIT developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/MANIFEST.in b/MANIFEST.in index d3b5c5b7..fa48479d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ -include README.md -include LICENSE - -recursive-include commit *.h -recursive-include commit *.cpp -recursive-include commit *.pyx +include README.md +include LICENSE + +recursive-include commit *.h +recursive-include commit *.cpp +recursive-include commit *.pyx recursive-include commit *.c \ No newline at end of file diff --git a/README.md b/README.md index ff70b6d3..b2be65ac 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,38 @@ -# COMMIT - -The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. - -Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. - -These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. - - -## Main features - -- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). -- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. -- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. -- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). -- **Soon**: **GPU implementation** for even faster model fitting. - -## How to cite COMMIT - -**COMMIT: Convex Optimization Modeling for Microstructure Informed Tractography** -Alessandro Daducci, Alessandro Dal Palú, Alia Lemkaddem, Jean-Philippe Thiran -*IEEE Transactions on Medical Imaging* 34(1) 246-257, 2015 -[Link to publisher](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6884830) - -**A convex optimization framework for global tractography** -Alessandro Daducci, Alessandro Dal Palú, Alia Lemkaddem, Jean-Philippe Thiran -*IEEE 10th International Symposium on Biomedical Imaging (ISBI)* 524-527, 2013 -[Link to publisher](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6556527) - -## Installation -To install COMMIT, please refer to the [installation guide](docs/install.md). - -More information/documentation can be found in the [`docs`](docs/) folder. 
- -## Getting started - -Tutorials are provided in the [`docs/tutorials`](docs/tutorials/) folder to help you get started with the COMMIT framework. - +# COMMIT + +The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. + +Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. + +These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. + + +## Main features + +- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). +- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. +- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. +- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). +- **Soon**: **GPU implementation** for even faster model fitting. + +## How to cite COMMIT + +**COMMIT: Convex Optimization Modeling for Microstructure Informed Tractography** +Alessandro Daducci, Alessandro Dal Palú, Alia Lemkaddem, Jean-Philippe Thiran +*IEEE Transactions on Medical Imaging* 34(1) 246-257, 2015 +[Link to publisher](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6884830) + +**A convex optimization framework for global tractography** +Alessandro Daducci, Alessandro Dal Palú, Alia Lemkaddem, Jean-Philippe Thiran +*IEEE 10th International Symposium on Biomedical Imaging (ISBI)* 524-527, 2013 +[Link to publisher](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6556527) + +## Installation +To install COMMIT, please refer to the [installation guide](docs/install.md). + +More information/documentation can be found in the [`docs`](docs/) folder. + +## Getting started + +Tutorials are provided in the [`docs/tutorials`](docs/tutorials/) folder to help you get started with the COMMIT framework. 
+ diff --git a/commit/__init__.py b/commit/__init__.py index 3ab179d3..e7e71d6c 100755 --- a/commit/__init__.py +++ b/commit/__init__.py @@ -1,5 +1,5 @@ -from .core import Evaluation -__all__ = ['core','models','solvers','trk2dictionary'] - -from pkg_resources import get_distribution -__version__ = get_distribution('dmri-commit').version +from .core import Evaluation +__all__ = ['core','models','solvers','trk2dictionary'] + +from pkg_resources import get_distribution +__version__ = get_distribution('dmri-commit').version diff --git a/commit/operator/config.py b/commit/operator/config.py index 8d782f49..8192419b 100755 --- a/commit/operator/config.py +++ b/commit/operator/config.py @@ -1,6 +1,6 @@ -nTHREADS = None -model = None -nIC = None -nEC = None -nISO = None -build_locally = False +nTHREADS = None +model = None +nIC = None +nEC = None +nISO = None +build_locally = False diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index d2cf61fe..2894bb9c 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -1,194 +1,194 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -typedef unsigned int uint32_t; -typedef unsigned short int uint16_t; -typedef float float32_t; -typedef double float64_t; - -// ==================================================== -// Util functions to check CUDA GPU compatibility -// ==================================================== -bool cudaCheck(cudaError_t cudaStatus); -bool checkCompatibility(size_t required_mem, int gpu_id); - -// ==================================================== -// Function to preprocess data for GPU -// ==================================================== -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); - -// ==================================================== -// CUDA Kernels for Ax operation -// ==================================================== -__global__ void multiply_Ax_ICpart( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y); - -// ==================================================== -// CUDA Kernels for A'y operation -// ==================================================== -__global__ void multiply_Aty_ICpart( - uint32_t* TvoxelIC, - uint32_t* TfiberIC, - uint16_t* TorienIC, - float32_t* TlengthIC, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ISOpart( - float* lut, - double* x, - double* y); - -// ==================================================== -// Constant global values in the GPU -// ==================================================== -__constant__ int NUM_VOXELS; -__constant__ int NUM_FIBERS; -__constant__ int NUM_PEAKS; -__constant__ int NUM_ORIENTATIONS; -__constant__ int NUM_SAMPLES; -__constant__ int NUM_DIAMETERS; 
-__constant__ int NUM_ZEPPELINS; -__constant__ int NUM_BALLS; -__constant__ int NUM_ROWS; -__constant__ int NUM_COLS; -__constant__ int SIZE_LUTIC; -__constant__ int SIZE_LUTEC; -__constant__ int SIZE_LUTISO; - -// ==================================================== -// Pointers to A (IC part) in the GPU -// ==================================================== -static uint32_t* gpu_voxelIC; -static uint32_t* gpu_fiberIC; -static uint16_t* gpu_orienIC; -static float32_t* gpu_lengthIC; -static uint32_t* gpu_segmentsPerBlockIC; -static uint32_t* gpu_offsetPerBlockIC; - -// ==================================================== -// Pointers to A' (IC part) in the GPU -// ==================================================== -static uint32_t* gpu_TvoxelIC; -static uint32_t* gpu_TfiberIC; -static uint16_t* gpu_TorienIC; -static float32_t* gpu_TlengthIC; -static uint32_t* gpu_TfibersPerBlockIC; -static uint32_t* gpu_ToffsetPerBlockIC; - -// ==================================================== -// Pointers to A (EC part) in the GPU -// ==================================================== -static uint32_t* gpu_voxelEC; -static uint16_t* gpu_orienEC; -static uint32_t* gpu_segmentsPerBlockEC; -static uint32_t* gpu_offsetPerBlockEC; - -// ==================================================== -// Pointers to LUTs in the GPU -// ==================================================== -static float32_t* gpu_lutIC; -static float32_t* gpu_lutEC; -static float32_t* gpu_lutISO; - -// ==================================================== -// Pointers to x and y in the GPU -// ==================================================== -static float64_t* gpu_x; -static float64_t* gpu_y; - -// ============================================================================ -// This class creates an instance of the LinearOperator in GPU memory -// ============================================================================ -class CudaLinearOperator { - - // constant values in CPU - int nrows; - int ncols; - int nvoxels; - int nfibers; - int nsegments; - - // CUDA GPU status - bool cudaStatus; - - public: - CudaLinearOperator( - // pointers to IC data in CPU memory - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float* lengthIC, - float* lutIC, - // pointers to EC data in CPU memory - uint32_t* voxelEC, - uint16_t* orienEC, - float* lutEC, - // pointer to ISO data in CPU memory - float* lutISO, - // dataset constant values - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int ndiameters, - int nzeppelins, - int nballs, - // flag to ensure we create the operator only one time - int fcall, - // id of the selected CUDA gpu - int gpu_id); - - ~CudaLinearOperator(); - - int getCudaStatus() { return (int)cudaStatus; } - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); - void destroy(); - - void dot(float64_t* v_in, float64_t* v_out); - void Tdot(float64_t* v_in, float64_t* v_out); +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef unsigned int uint32_t; +typedef unsigned short int uint16_t; +typedef float float32_t; +typedef double float64_t; + +// ==================================================== +// Util functions to check CUDA GPU compatibility +// ==================================================== +bool cudaCheck(cudaError_t cudaStatus); +bool checkCompatibility(size_t required_mem, int gpu_id); + +// 
==================================================== +// Function to preprocess data for GPU +// ==================================================== +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +// ==================================================== +// CUDA Kernels for Ax operation +// ==================================================== +__global__ void multiply_Ax_ICpart( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y); + +// ==================================================== +// CUDA Kernels for A'y operation +// ==================================================== +__global__ void multiply_Aty_ICpart( + uint32_t* TvoxelIC, + uint32_t* TfiberIC, + uint16_t* TorienIC, + float32_t* TlengthIC, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ISOpart( + float* lut, + double* x, + double* y); + +// ==================================================== +// Constant global values in the GPU +// ==================================================== +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + +// ==================================================== +// Pointers to A (IC part) in the GPU +// ==================================================== +static uint32_t* gpu_voxelIC; +static uint32_t* gpu_fiberIC; +static uint16_t* gpu_orienIC; +static float32_t* gpu_lengthIC; +static uint32_t* gpu_segmentsPerBlockIC; +static uint32_t* gpu_offsetPerBlockIC; + +// ==================================================== +// Pointers to A' (IC part) in the GPU +// ==================================================== +static uint32_t* gpu_TvoxelIC; +static uint32_t* gpu_TfiberIC; +static uint16_t* gpu_TorienIC; +static float32_t* gpu_TlengthIC; +static uint32_t* gpu_TfibersPerBlockIC; +static uint32_t* gpu_ToffsetPerBlockIC; + +// ==================================================== +// Pointers to A (EC part) in the GPU +// ==================================================== +static uint32_t* gpu_voxelEC; +static uint16_t* gpu_orienEC; +static uint32_t* gpu_segmentsPerBlockEC; +static uint32_t* gpu_offsetPerBlockEC; + +// ==================================================== +// Pointers to LUTs in the GPU +// ==================================================== +static float32_t* gpu_lutIC; +static float32_t* gpu_lutEC; +static float32_t* gpu_lutISO; + +// ==================================================== +// Pointers to x and y in the GPU +// 
==================================================== +static float64_t* gpu_x; +static float64_t* gpu_y; + +// ============================================================================ +// This class creates an instance of the LinearOperator in GPU memory +// ============================================================================ +class CudaLinearOperator { + + // constant values in CPU + int nrows; + int ncols; + int nvoxels; + int nfibers; + int nsegments; + + // CUDA GPU status + bool cudaStatus; + + public: + CudaLinearOperator( + // pointers to IC data in CPU memory + uint32_t* voxelIC, + uint32_t* fiberIC, + uint16_t* orienIC, + float* lengthIC, + float* lutIC, + // pointers to EC data in CPU memory + uint32_t* voxelEC, + uint16_t* orienEC, + float* lutEC, + // pointer to ISO data in CPU memory + float* lutISO, + // dataset constant values + int nsegments, + int nvoxels, + int nfibers, + int npeaks, + int norientations, + int nsamples, + int ndiameters, + int nzeppelins, + int nballs, + // flag to ensure we create the operator only one time + int fcall, + // id of the selected CUDA gpu + int gpu_id); + + ~CudaLinearOperator(); + + int getCudaStatus() { return (int)cudaStatus; } + void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); + void destroy(); + + void dot(float64_t* v_in, float64_t* v_out); + void Tdot(float64_t* v_in, float64_t* v_out); }; \ No newline at end of file diff --git a/commit/solvers.py b/commit/solvers.py index dc7767ce..29bc8374 100755 --- a/commit/solvers.py +++ b/commit/solvers.py @@ -1,403 +1,403 @@ -""" -Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona - -This structure is based on the previous work of Rafael Carrillo and was -supported by the LTS5 laboratory at EPFL, Lausanne. -""" -from __future__ import print_function -import numpy as np -from math import sqrt -import sys -import warnings -eps = np.finfo(float).eps - -from commit.proximals import (non_negativity, - omega_group_sparsity, - prox_group_sparsity, - soft_thresholding, - projection_onto_l2_ball) -group_sparsity = -1 -non_negative = 0 -norm1 = 1 -norm2 = 2 -norminf = np.inf -list_regnorms = [group_sparsity, non_negative, norm1, norm2] -list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 - - -def init_regularisation(commit_evaluation, - regnorms = (non_negative, non_negative, non_negative), - structureIC = None, weightsIC = None, group_norm = 2, - lambdas = (.0,.0,.0) ): - """ - Initialise the data structure that defines Omega in - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - - Input - ----- - commit_evaluation - commit.Evaluation object : - dictionary and model have to be loaded beforehand. - - - regnorms - tuple : - this sets the penalty term to be used for each compartment. - Default = (non_negative,non_negative,non_negative). - - regnorms[0] corresponds to the Intracellular compartment - regnorms[1] corresponds to the Extracellular compartment - regnorms[2] corresponds to the Isotropic compartment - - Each regnorms[k] must be one of commit.solvers. - {group_sparsity, non_negative, norm1, norm2}. - - commit.solvers.group_sparsity considers both the non-overlapping - and the hierarchical group sparsity (see [1]). This option is - allowed only in the IC compartment. The mathematical formulation - of this term is - $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| - - commit.solvers.non_negative puts a non negativity constraint on the - coefficients corresponding to the compartment. 
This is the - default option for each compartment - - commit.solvers.norm1 penalises with the 1-norm of the coefficients - corresponding to the compartment. - - commit.solvers.norm2 penalises with the 2-norm of the coefficients - corresponding to the compartment. - - - structureIC - np.array(list(list)) : - group structure for the IC compartment. - This field is necessary only if regterm[0]=commit.solver.group_sparsity. - Example: - structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) - - that is equivalent to - [0,1,2,3,4,5] [6] - / \ - [0,2,5] [1,3,4] - which has two non overlapping groups, one of which is the union - of two other non-overlapping groups. - - - weightsIC - np.array(np.float64) : - this defines the weights associated to each group of structure IC. - - - group_norm - number : - norm type for the commit.solver.group_sparsity penalisation of the IC compartment. - Default: group_norm = commit.solver.norm2 - To be chosen among commit.solver.{norm2,norminf}. - - lambdas - tuple : - regularisation parameter for each compartment. - Default: lambdas = (0.0, 0.0, 0.0) - The lambdas correspond to the onse described in the mathematical - formulation of the regularisation term - $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ - - - References: - [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' - """ - regularisation = {} - - regularisation['startIC'] = 0 - regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) - regularisation['startEC'] = int( regularisation['sizeIC'] ) - regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) - regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) - regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) - - regularisation['normIC'] = regnorms[0] - regularisation['normEC'] = regnorms[1] - regularisation['normISO'] = regnorms[2] - - regularisation['lambdaIC'] = float( lambdas[0] ) - regularisation['lambdaEC'] = float( lambdas[1] ) - regularisation['lambdaISO'] = float( lambdas[2] ) - - # Solver-specific fields - regularisation['structureIC'] = structureIC - regularisation['weightsIC'] = weightsIC - regularisation['group_norm'] = group_norm - - return regularisation - - -def regularisation2omegaprox(regularisation): - lambdaIC = float(regularisation.get('lambdaIC')) - lambdaEC = float(regularisation.get('lambdaEC')) - lambdaISO = float(regularisation.get('lambdaISO')) - if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: - raise ValueError('Negative regularisation parameters are not allowed') - - normIC = regularisation.get('normIC') - normEC = regularisation.get('normEC') - normISO = regularisation.get('normISO') - if not normIC in list_regnorms: - raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normEC in list_regnorms: - raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normISO in list_regnorms: - raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - - ## NNLS case - if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - return omega, 
prox - - ## All other cases - # Intracellular Compartment - startIC = regularisation.get('startIC') - sizeIC = regularisation.get('sizeIC') - if lambdaIC == 0.0: - omegaIC = lambda x: 0.0 - proxIC = lambda x: x - elif normIC == norm2: - omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) - proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) - elif normIC == norm1: - omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) - proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) - elif normIC == non_negative: - omegaIC = lambda x: 0.0 - proxIC = lambda x: non_negativity(x, startIC, sizeIC) - elif normIC == group_sparsity: - structureIC = regularisation.get('structureIC') - groupWeightIC = regularisation.get('weightsIC') - if not len(structureIC) == len(groupWeightIC): - raise ValueError('Number of groups and weights do not coincide.') - group_norm = regularisation.get('group_norm') - if not group_norm in list_group_sparsity_norms: - raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) - - # convert to new data structure (needed for faster access) - N = np.sum([g.size for g in structureIC]) - groupIdxIC = np.zeros( (N,), dtype=np.int32 ) - groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) - pos = 0 - for i, g in enumerate(structureIC) : - groupSizeIC[i] = g.size - groupIdxIC[pos:(pos+g.size)] = g[:] - pos += g.size - - omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - else: - raise ValueError('Type of regularisation for IC compartment not recognized.') - - - # Extracellular Compartment - startEC = regularisation.get('startEC') - sizeEC = regularisation.get('sizeEC') - if lambdaEC == 0.0: - omegaEC = lambda x: 0.0 - proxEC = lambda x: x - elif normEC == norm2: - omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) - proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) - elif normEC == norm1: - omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) - proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) - elif normEC == non_negative: - omegaEC = lambda x: 0.0 - proxEC = lambda x: non_negativity(x, startEC, sizeEC) - else: - raise ValueError('Type of regularisation for EC compartment not recognized.') - - # Isotropic Compartment - startISO = regularisation.get('startISO') - sizeISO = regularisation.get('sizeISO') - if lambdaISO == 0.0: - omegaISO = lambda x: 0.0 - proxISO = lambda x: x - elif normISO == norm2: - omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) - proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) - elif normISO == norm1: - omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) - proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) - elif normISO == non_negative: - omegaISO = lambda x: 0.0 - proxISO = lambda x: non_negativity(x, startISO, sizeISO) - else: - raise ValueError('Type of regularisation for ISO compartment not recognized.') - - omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) - prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced - - return omega, prox - - -def evaluate_model(y, A, x, regularisation = None): - if regularisation is None: - omega = lambda x: 0.0 - prox = 
lambda x: non_negativity(x, 0, len(x)) - else: - omega, _ = regularisation2omegaprox(regularisation) - - return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) - - -def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the Omega described by 'regularisation'. - - Check the documentation of commit.solvers.init_regularisation to see how to - solve a specific problem. - """ - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, x.size) - else: - omega, prox = regularisation2omegaprox(regularisation) - - if x0 is None: - x0 = np.zeros(A.shape[1]) - - return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) - - -def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the FISTA algorithm described in [1]. - - The penalty term and its proximal operator must be defined in such a way - that they already contain the regularisation parameter. - - References: - [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding - Algorithm for Linear Inverse Problems` - """ - - # Initialization - res = -y.copy() - xhat = x0.copy() - x = np.zeros_like(xhat) - res += A.dot(xhat) - proximal( xhat ) - reg_term = omega( xhat ) - prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term - - told = 1 - beta = 0.9 - prev_x = xhat.copy() - grad = np.asarray(At.dot(res)) - qfval = prev_obj - - # Step size computation - L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 - mu = 1.9 / L - - # Main loop - if verbose >= 1 : - print() - print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) - print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) - iter = 1 - while True : - if verbose >= 1 : - print( "%4d |" % iter, end="" ) - sys.stdout.flush() - - # Smooth step - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Backtracking - while curr_obj > q : - # Smooth step - mu = beta*mu - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Global stopping criterion - abs_obj = abs(curr_obj - prev_obj) - rel_obj = abs_obj / curr_obj - abs_x = np.linalg.norm(x - prev_x) - rel_x = abs_x / ( np.linalg.norm(x) + eps ) - if verbose >= 1 : - print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) - - if abs_obj < eps : - criterion = "Absolute tolerance on the objective" - break - elif rel_obj < tol_fun : - criterion = "Relative tolerance on the objective" - break - elif abs_x < eps : - criterion = "Absolute tolerance on the unknown" - break - elif rel_x < tol_x : - criterion = "Relative tolerance on the unknown" - break - elif iter >= max_iter : - criterion = "Maximum number of iterations" - 
break - - # FISTA update - t = 0.5 * ( 1 + sqrt(1+4*told**2) ) - xhat = x + (told-1)/t * (x - prev_x) - - # Gradient computation - res = A.dot(xhat) - y - xarr = np.asarray(x) - - grad = np.asarray(At.dot(res)) - - # Update variables - iter += 1 - prev_obj = curr_obj - prev_x = x.copy() - told = t - qfval = 0.5 * np.linalg.norm(res)**2 - - - if verbose >= 1 : - print( "< Stopping criterion: %s >" % criterion ) - - opt_details = {} - opt_details['residual'] = 0.5*res_norm**2 - opt_details['regterm'] = reg_term_x - opt_details['cost_function'] = curr_obj - opt_details['abs_cost'] = abs_obj - opt_details['rel_cost'] = rel_obj - opt_details['abs_x'] = abs_x - opt_details['rel _x'] = rel_x - opt_details['iterations'] = iter - opt_details['stopping_criterion'] = criterion - - return x, opt_details +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +from __future__ import print_function +import numpy as np +from math import sqrt +import sys +import warnings +eps = np.finfo(float).eps + +from commit.proximals import (non_negativity, + omega_group_sparsity, + prox_group_sparsity, + soft_thresholding, + projection_onto_l2_ball) +group_sparsity = -1 +non_negative = 0 +norm1 = 1 +norm2 = 2 +norminf = np.inf +list_regnorms = [group_sparsity, non_negative, norm1, norm2] +list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 + + +def init_regularisation(commit_evaluation, + regnorms = (non_negative, non_negative, non_negative), + structureIC = None, weightsIC = None, group_norm = 2, + lambdas = (.0,.0,.0) ): + """ + Initialise the data structure that defines Omega in + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + + Input + ----- + commit_evaluation - commit.Evaluation object : + dictionary and model have to be loaded beforehand. + + + regnorms - tuple : + this sets the penalty term to be used for each compartment. + Default = (non_negative,non_negative,non_negative). + + regnorms[0] corresponds to the Intracellular compartment + regnorms[1] corresponds to the Extracellular compartment + regnorms[2] corresponds to the Isotropic compartment + + Each regnorms[k] must be one of commit.solvers. + {group_sparsity, non_negative, norm1, norm2}. + + commit.solvers.group_sparsity considers both the non-overlapping + and the hierarchical group sparsity (see [1]). This option is + allowed only in the IC compartment. The mathematical formulation + of this term is + $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| + + commit.solvers.non_negative puts a non negativity constraint on the + coefficients corresponding to the compartment. This is the + default option for each compartment + + commit.solvers.norm1 penalises with the 1-norm of the coefficients + corresponding to the compartment. + + commit.solvers.norm2 penalises with the 2-norm of the coefficients + corresponding to the compartment. + + + structureIC - np.array(list(list)) : + group structure for the IC compartment. + This field is necessary only if regterm[0]=commit.solver.group_sparsity. + Example: + structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) + + that is equivalent to + [0,1,2,3,4,5] [6] + / \ + [0,2,5] [1,3,4] + which has two non overlapping groups, one of which is the union + of two other non-overlapping groups. + + + weightsIC - np.array(np.float64) : + this defines the weights associated to each group of structure IC. 
+ + + group_norm - number : + norm type for the commit.solver.group_sparsity penalisation of the IC compartment. + Default: group_norm = commit.solver.norm2 + To be chosen among commit.solver.{norm2,norminf}. + + lambdas - tuple : + regularisation parameter for each compartment. + Default: lambdas = (0.0, 0.0, 0.0) + The lambdas correspond to the onse described in the mathematical + formulation of the regularisation term + $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ + + + References: + [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' + """ + regularisation = {} + + regularisation['startIC'] = 0 + regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) + regularisation['startEC'] = int( regularisation['sizeIC'] ) + regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) + regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) + regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) + + regularisation['normIC'] = regnorms[0] + regularisation['normEC'] = regnorms[1] + regularisation['normISO'] = regnorms[2] + + regularisation['lambdaIC'] = float( lambdas[0] ) + regularisation['lambdaEC'] = float( lambdas[1] ) + regularisation['lambdaISO'] = float( lambdas[2] ) + + # Solver-specific fields + regularisation['structureIC'] = structureIC + regularisation['weightsIC'] = weightsIC + regularisation['group_norm'] = group_norm + + return regularisation + + +def regularisation2omegaprox(regularisation): + lambdaIC = float(regularisation.get('lambdaIC')) + lambdaEC = float(regularisation.get('lambdaEC')) + lambdaISO = float(regularisation.get('lambdaISO')) + if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: + raise ValueError('Negative regularisation parameters are not allowed') + + normIC = regularisation.get('normIC') + normEC = regularisation.get('normEC') + normISO = regularisation.get('normISO') + if not normIC in list_regnorms: + raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normEC in list_regnorms: + raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normISO in list_regnorms: + raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + + ## NNLS case + if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + return omega, prox + + ## All other cases + # Intracellular Compartment + startIC = regularisation.get('startIC') + sizeIC = regularisation.get('sizeIC') + if lambdaIC == 0.0: + omegaIC = lambda x: 0.0 + proxIC = lambda x: x + elif normIC == norm2: + omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) + proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) + elif normIC == norm1: + omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) + proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) + elif normIC == non_negative: + omegaIC = lambda x: 0.0 + proxIC = lambda x: non_negativity(x, startIC, sizeIC) + elif normIC == group_sparsity: + structureIC = regularisation.get('structureIC') + groupWeightIC = regularisation.get('weightsIC') + if not 
len(structureIC) == len(groupWeightIC): + raise ValueError('Number of groups and weights do not coincide.') + group_norm = regularisation.get('group_norm') + if not group_norm in list_group_sparsity_norms: + raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) + + # convert to new data structure (needed for faster access) + N = np.sum([g.size for g in structureIC]) + groupIdxIC = np.zeros( (N,), dtype=np.int32 ) + groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) + pos = 0 + for i, g in enumerate(structureIC) : + groupSizeIC[i] = g.size + groupIdxIC[pos:(pos+g.size)] = g[:] + pos += g.size + + omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + else: + raise ValueError('Type of regularisation for IC compartment not recognized.') + + + # Extracellular Compartment + startEC = regularisation.get('startEC') + sizeEC = regularisation.get('sizeEC') + if lambdaEC == 0.0: + omegaEC = lambda x: 0.0 + proxEC = lambda x: x + elif normEC == norm2: + omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) + proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) + elif normEC == norm1: + omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) + proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) + elif normEC == non_negative: + omegaEC = lambda x: 0.0 + proxEC = lambda x: non_negativity(x, startEC, sizeEC) + else: + raise ValueError('Type of regularisation for EC compartment not recognized.') + + # Isotropic Compartment + startISO = regularisation.get('startISO') + sizeISO = regularisation.get('sizeISO') + if lambdaISO == 0.0: + omegaISO = lambda x: 0.0 + proxISO = lambda x: x + elif normISO == norm2: + omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) + proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) + elif normISO == norm1: + omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) + proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) + elif normISO == non_negative: + omegaISO = lambda x: 0.0 + proxISO = lambda x: non_negativity(x, startISO, sizeISO) + else: + raise ValueError('Type of regularisation for ISO compartment not recognized.') + + omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) + prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced + + return omega, prox + + +def evaluate_model(y, A, x, regularisation = None): + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + else: + omega, _ = regularisation2omegaprox(regularisation) + + return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) + + +def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the Omega described by 'regularisation'. + + Check the documentation of commit.solvers.init_regularisation to see how to + solve a specific problem. 
+ """ + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, x.size) + else: + omega, prox = regularisation2omegaprox(regularisation) + + if x0 is None: + x0 = np.zeros(A.shape[1]) + + return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) + + +def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the FISTA algorithm described in [1]. + + The penalty term and its proximal operator must be defined in such a way + that they already contain the regularisation parameter. + + References: + [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding + Algorithm for Linear Inverse Problems` + """ + + # Initialization + res = -y.copy() + xhat = x0.copy() + x = np.zeros_like(xhat) + res += A.dot(xhat) + proximal( xhat ) + reg_term = omega( xhat ) + prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term + + told = 1 + beta = 0.9 + prev_x = xhat.copy() + grad = np.asarray(At.dot(res)) + qfval = prev_obj + + # Step size computation + L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 + mu = 1.9 / L + + # Main loop + if verbose >= 1 : + print() + print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) + print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) + iter = 1 + while True : + if verbose >= 1 : + print( "%4d |" % iter, end="" ) + sys.stdout.flush() + + # Smooth step + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Backtracking + while curr_obj > q : + # Smooth step + mu = beta*mu + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Global stopping criterion + abs_obj = abs(curr_obj - prev_obj) + rel_obj = abs_obj / curr_obj + abs_x = np.linalg.norm(x - prev_x) + rel_x = abs_x / ( np.linalg.norm(x) + eps ) + if verbose >= 1 : + print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) + + if abs_obj < eps : + criterion = "Absolute tolerance on the objective" + break + elif rel_obj < tol_fun : + criterion = "Relative tolerance on the objective" + break + elif abs_x < eps : + criterion = "Absolute tolerance on the unknown" + break + elif rel_x < tol_x : + criterion = "Relative tolerance on the unknown" + break + elif iter >= max_iter : + criterion = "Maximum number of iterations" + break + + # FISTA update + t = 0.5 * ( 1 + sqrt(1+4*told**2) ) + xhat = x + (told-1)/t * (x - prev_x) + + # Gradient computation + res = A.dot(xhat) - y + xarr = np.asarray(x) + + grad = np.asarray(At.dot(res)) + + # Update variables + iter += 1 + prev_obj = curr_obj + prev_x = x.copy() + told = t + qfval = 0.5 * np.linalg.norm(res)**2 + + + if verbose >= 1 : + print( "< Stopping criterion: %s >" % criterion ) + + opt_details = {} + opt_details['residual'] = 0.5*res_norm**2 + opt_details['regterm'] = reg_term_x + 
opt_details['cost_function'] = curr_obj + opt_details['abs_cost'] = abs_obj + opt_details['rel_cost'] = rel_obj + opt_details['abs_x'] = abs_x + opt_details['rel _x'] = rel_x + opt_details['iterations'] = iter + opt_details['stopping_criterion'] = criterion + + return x, opt_details diff --git a/docs/COMMIT_debugger.md b/docs/COMMIT_debugger.md index 58e7c7ad..ca1badf3 100644 --- a/docs/COMMIT_debugger.md +++ b/docs/COMMIT_debugger.md @@ -1,71 +1,71 @@ -# How to "debug" your data - -This tool allows one to display in a common 3D space all the objects used by COMMIT (DWI data, streamlines etc...) in order to **spot possible incosistencies between the conventions** of COMMIT and the software that generated the data, e.g. flip in some axes in the DWI data or in the peaks, spatial shift of the streamlines, whether the affine transformation was already applied to the data etc. - -**NB**: please note that this tool is very rudimental and is released only for debugging purposes. - -![Application screenshot](https://github.com/daducci/COMMIT/blob/master/docs/COMMIT_debugger.jpg) - -## Synopsis - -```bash -COMMIT_debugger \ - \ - \ - [-p ] \ - [-f ] \ - [-m ] -``` - -- `dwi`: DWI data (4D NIFTI file); -- `scheme`: corresponding acquisition scheme (Camino format); -- `peaks`: major directions of the *hindered* water pools in each voxel (4D NIFTI file); -- `tracts`: tractogram to generate the *restricted* contributions of the tracts in each voxel (.TRK or .TCK file); -- `map`: background map; default is a b0 computed from the DWI data. - -A **dropdown menu** will appear with right-click of the mouse. - -## Install dependencies - -You need to install the following libraries: - -- [CMake](http://www.cmake.org/) to allow cross-platform compilation; -- [Niftilib](https://sourceforge.net/projects/niftilib/) for reading/writing NIFTI files; -- [Blitz++](http://sourceforge.net/projects/blitz/) for efficient manipulation of multi-dimensional arrays; -- [OpenGL](https://www.opengl.org/) and [GLUT](https://www.opengl.org/resources/libraries/glut/) for 3D visualization. - -Please follow the corresponding documentation to install these libraries on your platform. Our code was successfully tested on Linux (Ubuntu 14.04) and OSX (10.9 up to 10.15) systems. - -### OSX with homebrew - -If your're using [homebrew](https://brew.sh), then the following should work: - -```bash -brew tap brewsci/science -brew install cmake -brew install blitz -brew install niftilib -``` - -The `OpenGL` and `GLUT` libraries are already provided by the operating system. - -## Compile, build and install - -Open the terminal and type: - -```bash -cd extras -mkdir build -cd build -ccmake .. -``` - -Hit `c` (twice) and then `g`. This will create the required makefiles for the compilation. -Once back to the terminal, type: - -```bash -make -make install -``` - -This will install the binaries into the `/usr/local/bin` folder. This installation path can be changed by rerunning `ccmake ..` and then modifying the `CMAKE_INSTALL_PREFIX` parameter to suit your custom needs. +# How to "debug" your data + +This tool allows one to display in a common 3D space all the objects used by COMMIT (DWI data, streamlines etc...) in order to **spot possible incosistencies between the conventions** of COMMIT and the software that generated the data, e.g. flip in some axes in the DWI data or in the peaks, spatial shift of the streamlines, whether the affine transformation was already applied to the data etc. 
+ +**NB**: please note that this tool is very rudimental and is released only for debugging purposes. + +![Application screenshot](https://github.com/daducci/COMMIT/blob/master/docs/COMMIT_debugger.jpg) + +## Synopsis + +```bash +COMMIT_debugger \ + \ + \ + [-p ] \ + [-f ] \ + [-m ] +``` + +- `dwi`: DWI data (4D NIFTI file); +- `scheme`: corresponding acquisition scheme (Camino format); +- `peaks`: major directions of the *hindered* water pools in each voxel (4D NIFTI file); +- `tracts`: tractogram to generate the *restricted* contributions of the tracts in each voxel (.TRK or .TCK file); +- `map`: background map; default is a b0 computed from the DWI data. + +A **dropdown menu** will appear with right-click of the mouse. + +## Install dependencies + +You need to install the following libraries: + +- [CMake](http://www.cmake.org/) to allow cross-platform compilation; +- [Niftilib](https://sourceforge.net/projects/niftilib/) for reading/writing NIFTI files; +- [Blitz++](http://sourceforge.net/projects/blitz/) for efficient manipulation of multi-dimensional arrays; +- [OpenGL](https://www.opengl.org/) and [GLUT](https://www.opengl.org/resources/libraries/glut/) for 3D visualization. + +Please follow the corresponding documentation to install these libraries on your platform. Our code was successfully tested on Linux (Ubuntu 14.04) and OSX (10.9 up to 10.15) systems. + +### OSX with homebrew + +If your're using [homebrew](https://brew.sh), then the following should work: + +```bash +brew tap brewsci/science +brew install cmake +brew install blitz +brew install niftilib +``` + +The `OpenGL` and `GLUT` libraries are already provided by the operating system. + +## Compile, build and install + +Open the terminal and type: + +```bash +cd extras +mkdir build +cd build +ccmake .. +``` + +Hit `c` (twice) and then `g`. This will create the required makefiles for the compilation. +Once back to the terminal, type: + +```bash +make +make install +``` + +This will install the binaries into the `/usr/local/bin` folder. This installation path can be changed by rerunning `ccmake ..` and then modifying the `CMAKE_INSTALL_PREFIX` parameter to suit your custom needs. diff --git a/docs/README.md b/docs/README.md index c8f7b556..5db9b226 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,10 +1,10 @@ -# Table of Content - -1. [Installation](install.md) -2. [Tutorials](tutorials) -3. [Conventions](conventions.md) -4. [Forward models](models.md) -5. [How to "debug" your data](COMMIT_debugger.md) -6. [Frequently asked questions (FAQ)](faq.md) - - +# Table of Content + +1. [Installation](install.md) +2. [Tutorials](tutorials) +3. [Conventions](conventions.md) +4. [Forward models](models.md) +5. [How to "debug" your data](COMMIT_debugger.md) +6. [Frequently asked questions (FAQ)](faq.md) + + diff --git a/docs/conventions.md b/docs/conventions.md index db233665..9dc6cb3d 100644 --- a/docs/conventions.md +++ b/docs/conventions.md @@ -1,42 +1,42 @@ -# DWI data - -The **diffusion MRI signal** is stored as a 4D [NIFTI](http://nifti.nimh.nih.gov/) file, where: - -- the *first three dimensions* define the spatial locations of voxels, ie *x*, *y* and *z*; -- the *fourth dimension* contains the diffusion signal for each voxel (*x*,*y*,*z*). - -A **companion scheme file** defines all the information about the diffusion acquisition protocol. 
The scheme is a a text file and can be specified in two formats: - -- as a *Nx4 matrix*, where the first three columns are the gradient directions and the fourth contains their b-value (*s/mm^2*). -- as a *Nx7 matrix*, where the first three columns are the gradient directions and the remaining four define the gradient strength (*T/m*), big delta (*s*), small delta (*s*) and echo time (*s*), respectively. - -The scheme files follow the [Camino file structure](http://cmic.cs.ucl.ac.uk/camino/index.php?n=Docs.SchemeFiles); here, the header line (eg. `VERSION: BVECTOR`) can be omitted. The gradient directions are assumed to be in voxel-coordinates and the affine transformation of the DWI data will not be taken into account. - -# Peak orientations - -If an extra-cellular compartment is used in the forward-model, a peak-file has to be provided containing the orientations of the additional compartment. By default, the vectors will be interpreted in voxel-coordinate system. However, if the peak orientation was stored in world-coordinate system (e.g. using `sh2peaks` in [MRtrix](http://www.mrtrix.org)), an additional flag (`peaks_use_affine = True`) in `trk2dictionary` in order to apply the affine transformation from the header of the peak file. The correct orientation of the peaks can be determined in the [COMMIT_debugger](extras.md) switching between the two possibilities pressing `a`. - -# Acquisition protocol - -At the moment, COMMIT assumes that the data is acquired with a **multi-shell acquisition protocol** (in addition to an arbitrary number of b0 images). The reason for this is that COMMIT creates internal lookup-tables (LUT) for efficiently computing the matrix-vector multiplications with the linear operator **A**. These LUT are created by performing the rotations in spherical harmonics (SH) space, as this procedure is much faster than generating the single response-functions in all possible orientations. As a consequence, the current version of COMMIT works only with data acquired on (multiple) shells. - -For other non shell-like acquisition schemes, e.g. DSI, this procedure is not possible. At the moment, the software threats all the diffusion gradients as belonging to distinct shells (very inefficient). - -# Folder structure - -Usually, all subjects belonging to the same study are acquired with the *same acquisition scheme*. For this reason, the software implicitly assumes a folder structure as follows: - -``` - ├── data - ├── Study_01 --> subjects acquired with protocol "Study_01" - ├── Subject_01 - ├── Subject_02 - ├── ... - ├── Study_02 --> subjects acquired with protocol "Study_02" - ├── Subject_01 - ├── Subject_02 - ├── ... - ├── ... -``` - -For data following this convention, the internal LUT are generated only once for the acquisition scheme of the study and then, for each subject, they are simply projected back very efficiently from the SH space to the specific subject space. In all other cases, the LUT must be regenerated each time. +# DWI data + +The **diffusion MRI signal** is stored as a 4D [NIFTI](http://nifti.nimh.nih.gov/) file, where: + +- the *first three dimensions* define the spatial locations of voxels, ie *x*, *y* and *z*; +- the *fourth dimension* contains the diffusion signal for each voxel (*x*,*y*,*z*). + +A **companion scheme file** defines all the information about the diffusion acquisition protocol. 
The scheme is a a text file and can be specified in two formats: + +- as a *Nx4 matrix*, where the first three columns are the gradient directions and the fourth contains their b-value (*s/mm^2*). +- as a *Nx7 matrix*, where the first three columns are the gradient directions and the remaining four define the gradient strength (*T/m*), big delta (*s*), small delta (*s*) and echo time (*s*), respectively. + +The scheme files follow the [Camino file structure](http://cmic.cs.ucl.ac.uk/camino/index.php?n=Docs.SchemeFiles); here, the header line (eg. `VERSION: BVECTOR`) can be omitted. The gradient directions are assumed to be in voxel-coordinates and the affine transformation of the DWI data will not be taken into account. + +# Peak orientations + +If an extra-cellular compartment is used in the forward-model, a peak-file has to be provided containing the orientations of the additional compartment. By default, the vectors will be interpreted in voxel-coordinate system. However, if the peak orientation was stored in world-coordinate system (e.g. using `sh2peaks` in [MRtrix](http://www.mrtrix.org)), an additional flag (`peaks_use_affine = True`) in `trk2dictionary` in order to apply the affine transformation from the header of the peak file. The correct orientation of the peaks can be determined in the [COMMIT_debugger](extras.md) switching between the two possibilities pressing `a`. + +# Acquisition protocol + +At the moment, COMMIT assumes that the data is acquired with a **multi-shell acquisition protocol** (in addition to an arbitrary number of b0 images). The reason for this is that COMMIT creates internal lookup-tables (LUT) for efficiently computing the matrix-vector multiplications with the linear operator **A**. These LUT are created by performing the rotations in spherical harmonics (SH) space, as this procedure is much faster than generating the single response-functions in all possible orientations. As a consequence, the current version of COMMIT works only with data acquired on (multiple) shells. + +For other non shell-like acquisition schemes, e.g. DSI, this procedure is not possible. At the moment, the software threats all the diffusion gradients as belonging to distinct shells (very inefficient). + +# Folder structure + +Usually, all subjects belonging to the same study are acquired with the *same acquisition scheme*. For this reason, the software implicitly assumes a folder structure as follows: + +``` + ├── data + ├── Study_01 --> subjects acquired with protocol "Study_01" + ├── Subject_01 + ├── Subject_02 + ├── ... + ├── Study_02 --> subjects acquired with protocol "Study_02" + ├── Subject_01 + ├── Subject_02 + ├── ... + ├── ... +``` + +For data following this convention, the internal LUT are generated only once for the acquisition scheme of the study and then, for each subject, they are simply projected back very efficiently from the SH space to the specific subject space. In all other cases, the LUT must be regenerated each time. diff --git a/docs/faq.md b/docs/faq.md index 7bc7eb45..8630f7b7 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -1,14 +1,14 @@ -# Frequently asked questions (FAQ) - - -### Which tractograms can I use with COMMIT? - -Any tractogram can be fed to the COMMIT framework, as long as each tract is represented as a polyline, i.e. sequence of consecutive segments. 
At the moment, however, COMMIT only reads tractograms in the [.TRK](http://www.trackvis.org/docs/?subsect=fileformat) and [.TCK](https://mrtrix.readthedocs.io/en/latest/getting_started/image_data.html#tracks-file-format-tck) file formats. For other formats, several converters are available, e.g. [DIPY](http://dipy.org). - -### Where is the old MATLAB version? - -The old MATLAB version is still available [as a tag in the repository](https://github.com/daducci/COMMIT/releases/tag/MATLAB). Please note, however, that this version is no longer mantained. - -### Why this transition to Python? - -We decided to re-implement our tool in Python with the aim to be more compatible with existing tools and libraries in the field, in particular [DIPY](http://dipy.org), and allow an easier integration in existing pipelines. +# Frequently asked questions (FAQ) + + +### Which tractograms can I use with COMMIT? + +Any tractogram can be fed to the COMMIT framework, as long as each tract is represented as a polyline, i.e. sequence of consecutive segments. At the moment, however, COMMIT only reads tractograms in the [.TRK](http://www.trackvis.org/docs/?subsect=fileformat) and [.TCK](https://mrtrix.readthedocs.io/en/latest/getting_started/image_data.html#tracks-file-format-tck) file formats. For other formats, several converters are available, e.g. [DIPY](http://dipy.org). + +### Where is the old MATLAB version? + +The old MATLAB version is still available [as a tag in the repository](https://github.com/daducci/COMMIT/releases/tag/MATLAB). Please note, however, that this version is no longer mantained. + +### Why this transition to Python? + +We decided to re-implement our tool in Python with the aim to be more compatible with existing tools and libraries in the field, in particular [DIPY](http://dipy.org), and allow an easier integration in existing pipelines. diff --git a/docs/install.md b/docs/install.md index fbe27902..2d070b8c 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,46 +1,46 @@ -# Installation - - -## Install dependencies - -### Python and DIPY - -COMMIT is written in [Python](https://www.python.org/) and it internally makes use of the [DIPY](http://dipy.org) library. Also, some parts of the code require to be compiled and this is done via the [Cython](http://cython.org/) module. -Please install and configure all these packages by following the guidelines on the corresponding websites. - -> COMMIT was **succesfully tested** on: - - OSX 10.10, [Anaconda](http://docs.continuum.io/anaconda/) Python distribution and DIPY 0.9.0dev. - -### AMICO - -COMMIT shares the code for the generation/rotation of the response-function lookup tables with [AMICO](https://github.com/daducci/AMICO). Please install AMICO following the instructions [here](https://github.com/daducci/AMICO). - -> NB: in order to use COMMIT, it is only necessary to install the Python code; no additional modules (e.g. SPAMS and NODDI) are required. - -### Camino toolkit - -Depending on the forward-model employed, COMMIT can require the [Camino](http://camino.org.uk) toolkit to generate the response functions, e.g. in case of the `Cylinder-Zeppelin-Ball` model. - -Please follow the corresponding [documentation](http://cmic.cs.ucl.ac.uk/camino//index.php?n=Main.Installation) to install Camino and make sure to include the folder containing the script `datasynth` in your system path. - -## Install COMMIT - -Open the system shell, go to the folder where you downloaded this repository and run: - -```bash -pip install . 
-``` - -COMMIT is now available in your Python interpreter and can be imported as usual: - -```python -import commit -``` - -### Uninstall COMMIT - -Open the system shell and run: - -```bash -pip uninstall commit -``` +# Installation + + +## Install dependencies + +### Python and DIPY + +COMMIT is written in [Python](https://www.python.org/) and it internally makes use of the [DIPY](http://dipy.org) library. Also, some parts of the code require to be compiled and this is done via the [Cython](http://cython.org/) module. +Please install and configure all these packages by following the guidelines on the corresponding websites. + +> COMMIT was **succesfully tested** on: + - OSX 10.10, [Anaconda](http://docs.continuum.io/anaconda/) Python distribution and DIPY 0.9.0dev. + +### AMICO + +COMMIT shares the code for the generation/rotation of the response-function lookup tables with [AMICO](https://github.com/daducci/AMICO). Please install AMICO following the instructions [here](https://github.com/daducci/AMICO). + +> NB: in order to use COMMIT, it is only necessary to install the Python code; no additional modules (e.g. SPAMS and NODDI) are required. + +### Camino toolkit + +Depending on the forward-model employed, COMMIT can require the [Camino](http://camino.org.uk) toolkit to generate the response functions, e.g. in case of the `Cylinder-Zeppelin-Ball` model. + +Please follow the corresponding [documentation](http://cmic.cs.ucl.ac.uk/camino//index.php?n=Main.Installation) to install Camino and make sure to include the folder containing the script `datasynth` in your system path. + +## Install COMMIT + +Open the system shell, go to the folder where you downloaded this repository and run: + +```bash +pip install . +``` + +COMMIT is now available in your Python interpreter and can be imported as usual: + +```python +import commit +``` + +### Uninstall COMMIT + +Open the system shell and run: + +```bash +pip uninstall commit +``` diff --git a/docs/models.md b/docs/models.md index 515c7a4f..73ab168a 100644 --- a/docs/models.md +++ b/docs/models.md @@ -1,58 +1,58 @@ -# Forward models - -COMMIT is *not* a model, but a *framework*: it allows the combination of a tractogram with any generic multi-compartment model, accounting for possible signal contributions arising from **restricted**, **hindered** and **isotropic** water pools. - -Two classical models are already included in the current version of the software, i.e. `Stick-Zeppelin-Ball` and `Cylinder-Zeppelin-Ball` defined in [(Panagiotaki et al., Neuroimage, 2012)](http://www.sciencedirect.com/science/article/pii/S1053811911011566). Each compartment can be selectively enabled/disabled; this means, for example, that many other models are already implicitly included, such as the `Ball&Stick` that can be obtained from the `Stick-Zeppelin-Ball` model by disabling the `Zeppelin` contributions. - -Additional multi-compartment models can be easily added. A model is defined as a *class* in the file `models.py` and must expose (at least) the following methods for: - -1) Setting the specific **parameters of the model**; the method must be named `set`, but the actual signature is model-dependent: - -```python -def set( self, ... ) : -``` - -2) **Generating high-resolution response functions** and rotate them (in SH space), with the following signature: - -```python -def generate( self, out_path, scheme, aux, idx_in, idx_out ) : - -Parameters ----------- -out_path : string - The path where to store the rotated kernels. 
- -scheme : Scheme class - The original acquisition scheme. - -aux : dictionary - Auxiliary data structures needed to rotate functions in SH space. - -idx_in : list of list - Index of samples in input kernel belonging to each shell. - -idx_out : list of list - Index of samples in output kernel belonging to each shell. -``` - -3) **Projecting the response functions** from SH space to signal space of the subject, with the following signature: - -```python -def resample( self, in_path, idx_out, Ylm_out ) : - -Parameters ----------- -in_path : string - The path where the rotated kernels in SH space are stored - -idx_out : list of list - Index of samples in output kernel belonging to each shell - -Ylm_out : numpy.array - Matrix to project back all shells from SH space to signal space (of the subject) - -Returns -------- -KERNELS : dict - Contains all the response functions projected to the signal space of the subject +# Forward models + +COMMIT is *not* a model, but a *framework*: it allows the combination of a tractogram with any generic multi-compartment model, accounting for possible signal contributions arising from **restricted**, **hindered** and **isotropic** water pools. + +Two classical models are already included in the current version of the software, i.e. `Stick-Zeppelin-Ball` and `Cylinder-Zeppelin-Ball` defined in [(Panagiotaki et al., Neuroimage, 2012)](http://www.sciencedirect.com/science/article/pii/S1053811911011566). Each compartment can be selectively enabled/disabled; this means, for example, that many other models are already implicitly included, such as the `Ball&Stick` that can be obtained from the `Stick-Zeppelin-Ball` model by disabling the `Zeppelin` contributions. + +Additional multi-compartment models can be easily added. A model is defined as a *class* in the file `models.py` and must expose (at least) the following methods for: + +1) Setting the specific **parameters of the model**; the method must be named `set`, but the actual signature is model-dependent: + +```python +def set( self, ... ) : +``` + +2) **Generating high-resolution response functions** and rotate them (in SH space), with the following signature: + +```python +def generate( self, out_path, scheme, aux, idx_in, idx_out ) : + +Parameters +---------- +out_path : string + The path where to store the rotated kernels. + +scheme : Scheme class + The original acquisition scheme. + +aux : dictionary + Auxiliary data structures needed to rotate functions in SH space. + +idx_in : list of list + Index of samples in input kernel belonging to each shell. + +idx_out : list of list + Index of samples in output kernel belonging to each shell. 
+``` + +3) **Projecting the response functions** from SH space to signal space of the subject, with the following signature: + +```python +def resample( self, in_path, idx_out, Ylm_out ) : + +Parameters +---------- +in_path : string + The path where the rotated kernels in SH space are stored + +idx_out : list of list + Index of samples in output kernel belonging to each shell + +Ylm_out : numpy.array + Matrix to project back all shells from SH space to signal space (of the subject) + +Returns +------- +KERNELS : dict + Contains all the response functions projected to the signal space of the subject ``` \ No newline at end of file diff --git a/docs/tutorials/AdvancedSolvers/README.md b/docs/tutorials/AdvancedSolvers/README.md index 0b04539d..9294cd17 100644 --- a/docs/tutorials/AdvancedSolvers/README.md +++ b/docs/tutorials/AdvancedSolvers/README.md @@ -1,151 +1,151 @@ -# Advanced solvers - -This tutorial shows how to exploit the advanced features of the COMMIT framework from the side of the **optimisation problem**. The general formulation is the following: -\begin{equation} -x^* = \arg\min_{x\in R^n_+} \frac12 \|Ax-y\|_2^2 + \lambda_{IC}\Omega_{IC}(x) + \lambda_{EC}\Omega_{EC}(x) + \lambda_{ISO}\Omega_{ISO}(x), -\end{equation} -where $A$ is the COMMIT dictionary, $n$ is defined in such a way that the product $Ax$ makes sense and $y$ is the datum that we want to fit. The three regularisation terms allow us to exploit ***distinct penalties for each compartment***. - -*Note*: before exploring this tutorial, you should follow the [Getting Started](https://github.com/daducci/COMMIT/tree/master/docs/tutorials/GettingStarted) tutorial. - - -### Download and unpack the data - -Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files: - -- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2; -- `DWI.scheme`: its corresponding acquisition scheme; -- `peaks.nii.gz`: main diffusion orientations estimated with CSD; -- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm; -- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image. - - -**Make sure that your working directory is the folder where you unzipped the downloaded archive.** - - -```python -path_to_the_directory_with_the_unzipped_archive = '.' 
# edit this -cd path_to_the_directory_with_the_unzipped_archive -``` - -### Load the usual COMMIT structure - - -```python -from commit import trk2dictionary - -trk2dictionary.run( - filename_tractogram = 'LausanneTwoShell/fibers.trk', - path_out = 'LausanneTwoShell/CommitOutput', - filename_peaks = 'LausanneTwoShell/peaks.nii.gz', - filename_mask = 'LausanneTwoShell/WM.nii.gz', - fiber_shift = 0.5, - peaks_use_affine = True -) - -import commit -commit.core.setup() -mit = commit.Evaluation( '.', 'LausanneTwoShell' ) -mit.load_data( 'DWI.nii', 'DWI.scheme' ) - -mit.set_model( 'StickZeppelinBall' ) - -d_par = 1.7E-3 # Parallel diffusivity [mm^2/s] -d_perps = [ 0.51E-3 ] # Perpendicular diffusivitis [mm^2/s] -d_isos = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s] - -mit.model.set( d_par, d_perps, d_isos ) -mit.generate_kernels( regenerate=True ) -mit.load_kernels() - -mit.load_dictionary( 'CommitOutput' ) -mit.set_threads() -mit.build_operator() -``` - -### Perform clustering of the streamlines - -You will need `dipy`, which is among the requirements of COMMIT, hence there should be no problem. - -The `threshold` parameter has to be tuned for each brain. Do not consider our choice as a standard one. - - -```python -from nibabel import trackvis as tv -fname='LausanneTwoShell/fibers.trk' -streams, hdr = tv.read(fname) -streamlines = [i[0] for i in streams] - -from dipy.segment.clustering import QuickBundles -threshold = 15.0 -qb = QuickBundles(threshold=threshold) -clusters = qb.cluster(streamlines) - -import numpy as np -structureIC = np.array([np.array(c.indices) for c in clusters]) -weightsIC = np.array([1.0/np.sqrt(len(c)) for c in structureIC]) -``` - -### Define the regularisation term -Each compartment must be regularised separately. The user can choose among the following penalties: - -- $\sum_{g\in G}w_g\|x_g\|_k$ : `commit.solvers.group_sparsity` with $k\in \{2, \infty\}$ (only for IC compartment) - -- $\|x\|_1$ : `commit.solvers.norm1` - -- $\|x\|_2$ : `commit.solvers.norm2` - -- $\iota_{\ge 0}(x)$ : `commit.solvers.non_negative` (Default for all compartments) - -If the chosen regularisation for the IC compartment is $\sum_{g\in G}\|x_g\|_k$, we can define $k$ via the `group_norm` field, which must be - -- $\|x\|_2$ : `commit.solvers.norm2` - -In this example we consider the following penalties: - -- Intracellular: group sparsity with 2-norm of each group - -- Extracellular: 2-norm - -- Isotropic: 1-norm - - -```python -regnorms = [commit.solvers.group_sparsity, commit.solvers.norm2, commit.solvers.norm1] - -group_norm = 2 # each group is penalised with its 2-norm -``` - -The regularisation parameters are specified within the lambdas field. Again, do not consider our choice as a standard one. - - -```python -lambdas = [10.,10.,10.] -``` - -### Call the constructor of the data structure - - -```python -regterm = commit.solvers.init_regularisation(mit, - regnorms = regnorms, - structureIC = structureIC, - weightsIC = weightsIC, - group_norm = group_norm, - lambdas = lambdas) -``` - -### Call the fit function to perform the optimisation - - -```python -mit.fit(regularisation=regterm, max_iter=1000) -``` - -### Save the results - - -```python -suffix = '_AdvancedSolvers' -mit.save_results(path_suffix=suffix) -``` +# Advanced solvers + +This tutorial shows how to exploit the advanced features of the COMMIT framework from the side of the **optimisation problem**. 
The general formulation is the following: +\begin{equation} +x^* = \arg\min_{x\in R^n_+} \frac12 \|Ax-y\|_2^2 + \lambda_{IC}\Omega_{IC}(x) + \lambda_{EC}\Omega_{EC}(x) + \lambda_{ISO}\Omega_{ISO}(x), +\end{equation} +where $A$ is the COMMIT dictionary, $n$ is defined in such a way that the product $Ax$ makes sense and $y$ is the datum that we want to fit. The three regularisation terms allow us to exploit ***distinct penalties for each compartment***. + +*Note*: before exploring this tutorial, you should follow the [Getting Started](https://github.com/daducci/COMMIT/tree/master/docs/tutorials/GettingStarted) tutorial. + + +### Download and unpack the data + +Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files: + +- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2; +- `DWI.scheme`: its corresponding acquisition scheme; +- `peaks.nii.gz`: main diffusion orientations estimated with CSD; +- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm; +- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image. + + +**Make sure that your working directory is the folder where you unzipped the downloaded archive.** + + +```python +path_to_the_directory_with_the_unzipped_archive = '.' # edit this +cd path_to_the_directory_with_the_unzipped_archive +``` + +### Load the usual COMMIT structure + + +```python +from commit import trk2dictionary + +trk2dictionary.run( + filename_tractogram = 'LausanneTwoShell/fibers.trk', + path_out = 'LausanneTwoShell/CommitOutput', + filename_peaks = 'LausanneTwoShell/peaks.nii.gz', + filename_mask = 'LausanneTwoShell/WM.nii.gz', + fiber_shift = 0.5, + peaks_use_affine = True +) + +import commit +commit.core.setup() +mit = commit.Evaluation( '.', 'LausanneTwoShell' ) +mit.load_data( 'DWI.nii', 'DWI.scheme' ) + +mit.set_model( 'StickZeppelinBall' ) + +d_par = 1.7E-3 # Parallel diffusivity [mm^2/s] +d_perps = [ 0.51E-3 ] # Perpendicular diffusivitis [mm^2/s] +d_isos = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s] + +mit.model.set( d_par, d_perps, d_isos ) +mit.generate_kernels( regenerate=True ) +mit.load_kernels() + +mit.load_dictionary( 'CommitOutput' ) +mit.set_threads() +mit.build_operator() +``` + +### Perform clustering of the streamlines + +You will need `dipy`, which is among the requirements of COMMIT, hence there should be no problem. + +The `threshold` parameter has to be tuned for each brain. Do not consider our choice as a standard one. + + +```python +from nibabel import trackvis as tv +fname='LausanneTwoShell/fibers.trk' +streams, hdr = tv.read(fname) +streamlines = [i[0] for i in streams] + +from dipy.segment.clustering import QuickBundles +threshold = 15.0 +qb = QuickBundles(threshold=threshold) +clusters = qb.cluster(streamlines) + +import numpy as np +structureIC = np.array([np.array(c.indices) for c in clusters]) +weightsIC = np.array([1.0/np.sqrt(len(c)) for c in structureIC]) +``` + +### Define the regularisation term +Each compartment must be regularised separately. 
The user can choose among the following penalties: + +- $\sum_{g\in G}w_g\|x_g\|_k$ : `commit.solvers.group_sparsity` with $k\in \{2, \infty\}$ (only for IC compartment) + +- $\|x\|_1$ : `commit.solvers.norm1` + +- $\|x\|_2$ : `commit.solvers.norm2` + +- $\iota_{\ge 0}(x)$ : `commit.solvers.non_negative` (Default for all compartments) + +If the chosen regularisation for the IC compartment is $\sum_{g\in G}\|x_g\|_k$, we can define $k$ via the `group_norm` field, which must be + +- $\|x\|_2$ : `commit.solvers.norm2` + +In this example we consider the following penalties: + +- Intracellular: group sparsity with 2-norm of each group + +- Extracellular: 2-norm + +- Isotropic: 1-norm + + +```python +regnorms = [commit.solvers.group_sparsity, commit.solvers.norm2, commit.solvers.norm1] + +group_norm = 2 # each group is penalised with its 2-norm +``` + +The regularisation parameters are specified within the lambdas field. Again, do not consider our choice as a standard one. + + +```python +lambdas = [10.,10.,10.] +``` + +### Call the constructor of the data structure + + +```python +regterm = commit.solvers.init_regularisation(mit, + regnorms = regnorms, + structureIC = structureIC, + weightsIC = weightsIC, + group_norm = group_norm, + lambdas = lambdas) +``` + +### Call the fit function to perform the optimisation + + +```python +mit.fit(regularisation=regterm, max_iter=1000) +``` + +### Save the results + + +```python +suffix = '_AdvancedSolvers' +mit.save_results(path_suffix=suffix) +``` diff --git a/docs/tutorials/GettingStarted/README.md b/docs/tutorials/GettingStarted/README.md index fda7092d..5503475f 100644 --- a/docs/tutorials/GettingStarted/README.md +++ b/docs/tutorials/GettingStarted/README.md @@ -1,251 +1,251 @@ -# Getting started - -This tutorial illustrates the basics for using the COMMIT framework to **evaluate the evidence of a tractogram**. - -## Download data - -Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files: - -- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2; -- `DWI.scheme`: its corresponding acquisition scheme; -- `peaks.nii.gz`: main diffusion orientations estimated with CSD; -- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm; -- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image. - - -## Convert the tractogram - -Open the *Python interpreter* and go to the folder where you downloaded/unzipped the archive. 
Then run the following commands: - -```python -from commit import trk2dictionary - -trk2dictionary.run( - filename_tractogram = 'LausanneTwoShell/fibers.trk', - path_out = 'LausanneTwoShell/CommitOutput', - filename_peaks = 'LausanneTwoShell/peaks.nii.gz', - filename_mask = 'LausanneTwoShell/WM.nii.gz', - fiber_shift = 0.5, - peaks_use_affine = True -) -``` - -The output should be something like this: - -``` --> Creating the dictionary from tractogram: - * Segment position = COMPUTE INTERSECTIONS - * Fiber shift X = 0.500 (voxel-size units) - * Fiber shift Y = 0.500 (voxel-size units) - * Fiber shift Z = 0.500 (voxel-size units) - * Points to skip = 0 - * Loading data: - * tractogram - - 106 x 106 x 60 - - 2.0000 x 2.0000 x 2.0000 - - 283522 fibers - * filtering mask - - 106 x 106 x 60 - - 2.0000 x 2.0000 x 2.0000 - * EC orientations - - 106 x 106 x 60 x 9 - - 2.0000 x 2.0000 x 2.0000 - - ignoring peaks < 0.10 * MaxPeak - - flipping axes : [ x=True, y=True, z=False ] - * output written to "LausanneTwoShell/CommitOutput" - * Exporting IC compartments: - [ 283522 fibers, 24388967 segments ] - * Exporting EC compartments: - [ 53021 voxels, 145472 segments ] - [ 44.6 seconds ] -``` - -Please note that, in this particular example, in order to have all the data in the same reference system we had to: - -- apply a translation of half voxel to the fibers. - -![Flipping in the data](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/debugger_screenshot2.jpg) - -## Load the diffusion data - -Precompute the rotation matrices used internally by COMMIT to create the lookup-tables for the response functions: - -```python -import commit -commit.core.setup() -``` - -Now, load the data: - -```python -mit = commit.Evaluation( '.', 'LausanneTwoShell' ) -mit.load_data( 'DWI.nii', 'DWI.scheme' ) -``` - -The output should be something like: - -``` --> Loading data: - * DWI signal... - - dim = 106 x 106 x 60 x 100 - - pixdim = 2.000 x 2.000 x 2.000 - * Acquisition scheme... - - 100 samples, 2 shells - - 10 @ b=0 , 30 @ b=700.0 , 60 @ b=2000.0 - [ 0.2 seconds ] - --> Preprocessing: - * Normalizing to b0... [ min=0.00, mean=0.64, max=36.15 ] - * Merging multiple b0 volume(s)... [ 106 x 106 x 60 x 91 ] - [ 0.5 seconds ] -``` - -## Set the forward-model - -For this example we made use of the `Stick-Zeppelin-Ball` model described in [(Panagiotaki et al., NeuroImage, 2012)](http://www.sciencedirect.com/science/article/pii/S1053811911011566): - -- the contributions of the tracts are modeled as "sticks", i.e. tensors with a given axial diffusivity (`1.7*10^-3 mm^2/s`) but null perpendicular diffusivity; -- extra-cellular contributions are modeled as tensors with the same axial diffusivity as the sticks (1.7*10^-3 mm^2/s) and whose perpendicular diffusivities are calculated with a tortuosity model as a function of the intra-cellular volume fractions (`0.7`); -- isotropic contributions are modeled as tensors with isotropic diffusivities (`1.7*10^-3 mm^2/s` and `3.0*10^-3 mm^2/s`). 
- -Setup the parameters of the model and **generate the lookup-tables**: - -```python -mit.set_model( 'StickZeppelinBall' ) - -d_par = 1.7E-3 # Parallel diffusivity [mm^2/s] -d_perps = [ 0.51E-3 ] # Perpendicular diffusivitis [mm^2/s] -d_isos = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s] - -mit.model.set( d_par, d_perps, d_isos ) -mit.generate_kernels( regenerate=True ) -mit.load_kernels() -``` - -and the output should look like: - -``` --> Simulating with "Stick-Zeppelin-Ball" model: - * 1 stick, 1 extra-cellular and 2 isotropic - * A_001... [ OK ] - * A_002... [ OK ] - * A_003... [ OK ] - * A_004... [ OK ] - [ 1.5 seconds ] - --> Resampling kernels for subject "LausanneTwoShell": - * A_001... [ OK ] - * A_002... [ OK ] - * A_003... [ OK ] - * A_004... [ OK ] - * Merging multiple b0 volume(s)... [ OK ] - * Normalizing... [ OK ] - [ 1.0 seconds ] -``` - -## Load the sparse data-structure - -Load in memory the sparse data-structure previously created with `trk2dicitonary.run()`: - -```python -mit.load_dictionary( 'CommitOutput' ) -``` - -The output should show that around 280K fibers have been loaded, in addition to 145K segments for the extra-cellular contributions in the 53K voxels of the white matter: - -``` --> Loading the dictionary: - * segments from the tracts... [ 283522 fibers and 24388967 segments ] - * segments from the peaks... [ 145472 segments ] - * isotropic contributions... [ 53021 voxels ] - * post-processing... [ OK ] - [ 14.8 seconds ] -``` - -## Build the linear operator A - -Now it's time to build the linear operator **A** to compute the matrix-vector multiplications for solving the linear system. This operator uses information from the segments loaded in the previous step and the lookup-tables for the response functions; it also needs to know the workload to be assigned to each thread durint the multiplications. To this aim, run the following commands: - -```python -mit.set_threads() -mit.build_operator() -``` - -The output should be something similar to this: - -``` --> Distributing workload to different threads: - * number of threads : 4 - * A operator... [ OK ] - * A' operator... [ OK ] - [ 3.5 seconds ] - --> Building linear operator A: - [ 2.1 seconds ] -``` - -NB: the *number of threads* is automatically set to the maximum number of cores in the system (4 in this example), but this setting can be manually set. - -## Fit the model to the data - -To fit the model (`Stick-Zeppelin-Ball` in this case) to the data, simply run: - -```python -mit.fit( tol_fun = 1e-3, max_iter = 200 ) -``` - -The optimization progress is displayed by default: - -``` --> Fit model using "nnls": -| ||Ax-y|| | Cost function Abs error Rel error | Abs x Rel x -------|------------------|-----------------------------------------------|------------------------------ -1 | 7.5552614e+02 | 2.8540987e+05 4.0602923e+05 1.4226180e+00 | 5.4262515e+01 1.0000000e+00 -2 | 6.7997468e+02 | 2.3118278e+05 5.4227093e+04 2.3456372e-01 | 1.6229691e+01 2.6520302e-01 -3 | 6.2490484e+02 | 1.9525303e+05 3.5929749e+04 1.8401635e-01 | 1.4457099e+01 2.0335528e-01 -... -... -... 
-137 | 1.4197542e+02 | 1.0078510e+04 1.0588051e+01 1.0505571e-03 | 1.5019784e+00 4.0796383e-03 -138 | 1.4190279e+02 | 1.0068201e+04 1.0309090e+01 1.0239257e-03 | 1.4936457e+00 4.0495040e-03 -139 | 1.4183213e+02 | 1.0058177e+04 1.0024696e+01 9.9667126e-04 | 1.4848343e+00 4.0182480e-03 -< Stopping criterion: REL_OBJ > -[ 00h 07m 04s ] -``` - -where the columns report, respectively, the *iteration number*, the *cost function* and its *relative change*. - -## Storing the results - -The results and the output maps can be stored to files as follows: - -```python -mit.save_results() -``` - -As shown in the output, the results are saved in the folder `Results_StickZeppelinBall`: - -``` --> Saving results to "Results_StickZeppelinBall/*": - * configuration and results... [ OK ] - * fitting errors: - - RMSE... [ 0.059 +/- 0.018 ] - - NRMSE... [ 0.117 +/- 0.037 ] - * voxelwise contributions: - - intra-axonal [ OK ] - - extra-axonal [ OK ] - - isotropic [ OK ] - [ 2.0 seconds ] -``` - -The following figure shows the **density of the tracts** [(Calamante et al., NeuroImage, 2010)](http://www.sciencedirect.com/science/article/pii/S1053811910009766) of the original tractogram (left) and of its optimized version (right): - -![Track-density](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/density.png) - -It is also possible to visualize voxelwise maps of the corresponding contributions of the **extra-cellular space** (left) and other **isotropic contaminations** (right): - -![Compartments](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/compartments.png) - -Finally, the **fitting error** in each voxel can also be inspected: - -![fitting error](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/NRMSE.png) +# Getting started + +This tutorial illustrates the basics for using the COMMIT framework to **evaluate the evidence of a tractogram**. + +## Download data + +Download and extract the **example dataset** from the following [ZIP archive](http://hardi.epfl.ch/static/data/COMMIT_demos/LausanneTwoShell.zip), which contains the following files: + +- `DWI.nii`: a diffusion MRI dataset with 100 measurements distributed on 2 shells, respectively at b=700 s/mm^2 and b=2000 s/mm^2; +- `DWI.scheme`: its corresponding acquisition scheme; +- `peaks.nii.gz`: main diffusion orientations estimated with CSD; +- `fibers.trk`: tractogram with about 280K fibers estimated using a streamline-based algorithm; +- `WM.nii.gz`: white-matter mask extracted from an anatomical T1w image. + + +## Convert the tractogram + +Open the *Python interpreter* and go to the folder where you downloaded/unzipped the archive. 
Then run the following commands: + +```python +from commit import trk2dictionary + +trk2dictionary.run( + filename_tractogram = 'LausanneTwoShell/fibers.trk', + path_out = 'LausanneTwoShell/CommitOutput', + filename_peaks = 'LausanneTwoShell/peaks.nii.gz', + filename_mask = 'LausanneTwoShell/WM.nii.gz', + fiber_shift = 0.5, + peaks_use_affine = True +) +``` + +The output should be something like this: + +``` +-> Creating the dictionary from tractogram: + * Segment position = COMPUTE INTERSECTIONS + * Fiber shift X = 0.500 (voxel-size units) + * Fiber shift Y = 0.500 (voxel-size units) + * Fiber shift Z = 0.500 (voxel-size units) + * Points to skip = 0 + * Loading data: + * tractogram + - 106 x 106 x 60 + - 2.0000 x 2.0000 x 2.0000 + - 283522 fibers + * filtering mask + - 106 x 106 x 60 + - 2.0000 x 2.0000 x 2.0000 + * EC orientations + - 106 x 106 x 60 x 9 + - 2.0000 x 2.0000 x 2.0000 + - ignoring peaks < 0.10 * MaxPeak + - flipping axes : [ x=True, y=True, z=False ] + * output written to "LausanneTwoShell/CommitOutput" + * Exporting IC compartments: + [ 283522 fibers, 24388967 segments ] + * Exporting EC compartments: + [ 53021 voxels, 145472 segments ] + [ 44.6 seconds ] +``` + +Please note that, in this particular example, in order to have all the data in the same reference system we had to: + +- apply a translation of half voxel to the fibers. + +![Flipping in the data](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/debugger_screenshot2.jpg) + +## Load the diffusion data + +Precompute the rotation matrices used internally by COMMIT to create the lookup-tables for the response functions: + +```python +import commit +commit.core.setup() +``` + +Now, load the data: + +```python +mit = commit.Evaluation( '.', 'LausanneTwoShell' ) +mit.load_data( 'DWI.nii', 'DWI.scheme' ) +``` + +The output should be something like: + +``` +-> Loading data: + * DWI signal... + - dim = 106 x 106 x 60 x 100 + - pixdim = 2.000 x 2.000 x 2.000 + * Acquisition scheme... + - 100 samples, 2 shells + - 10 @ b=0 , 30 @ b=700.0 , 60 @ b=2000.0 + [ 0.2 seconds ] + +-> Preprocessing: + * Normalizing to b0... [ min=0.00, mean=0.64, max=36.15 ] + * Merging multiple b0 volume(s)... [ 106 x 106 x 60 x 91 ] + [ 0.5 seconds ] +``` + +## Set the forward-model + +For this example we made use of the `Stick-Zeppelin-Ball` model described in [(Panagiotaki et al., NeuroImage, 2012)](http://www.sciencedirect.com/science/article/pii/S1053811911011566): + +- the contributions of the tracts are modeled as "sticks", i.e. tensors with a given axial diffusivity (`1.7*10^-3 mm^2/s`) but null perpendicular diffusivity; +- extra-cellular contributions are modeled as tensors with the same axial diffusivity as the sticks (1.7*10^-3 mm^2/s) and whose perpendicular diffusivities are calculated with a tortuosity model as a function of the intra-cellular volume fractions (`0.7`); +- isotropic contributions are modeled as tensors with isotropic diffusivities (`1.7*10^-3 mm^2/s` and `3.0*10^-3 mm^2/s`). 
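+
+For the extra-cellular compartment, the perpendicular diffusivity used in the next code block can be recovered from the intra-cellular volume fraction with a common tortuosity approximation, `d_perp = d_par * (1 - ICVF)`. The short check below is only an illustrative sketch under that assumption; it is not part of the COMMIT API, which generates the lookup-tables internally:
+
+```python
+# Illustrative sanity check (assumes the tortuosity approximation d_perp = d_par * (1 - ICVF))
+d_par  = 1.7E-3                   # axial diffusivity of the sticks [mm^2/s]
+icvf   = 0.7                      # intra-cellular volume fraction
+d_perp = d_par * ( 1.0 - icvf )   # 0.51E-3 (up to rounding), i.e. the value set below as d_perps
+print( d_perp )
+```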
+ +Setup the parameters of the model and **generate the lookup-tables**: + +```python +mit.set_model( 'StickZeppelinBall' ) + +d_par = 1.7E-3 # Parallel diffusivity [mm^2/s] +d_perps = [ 0.51E-3 ] # Perpendicular diffusivitis [mm^2/s] +d_isos = [ 1.7E-3, 3.0E-3 ] # Isotropic diffusivitie(s) [mm^2/s] + +mit.model.set( d_par, d_perps, d_isos ) +mit.generate_kernels( regenerate=True ) +mit.load_kernels() +``` + +and the output should look like: + +``` +-> Simulating with "Stick-Zeppelin-Ball" model: + * 1 stick, 1 extra-cellular and 2 isotropic + * A_001... [ OK ] + * A_002... [ OK ] + * A_003... [ OK ] + * A_004... [ OK ] + [ 1.5 seconds ] + +-> Resampling kernels for subject "LausanneTwoShell": + * A_001... [ OK ] + * A_002... [ OK ] + * A_003... [ OK ] + * A_004... [ OK ] + * Merging multiple b0 volume(s)... [ OK ] + * Normalizing... [ OK ] + [ 1.0 seconds ] +``` + +## Load the sparse data-structure + +Load in memory the sparse data-structure previously created with `trk2dicitonary.run()`: + +```python +mit.load_dictionary( 'CommitOutput' ) +``` + +The output should show that around 280K fibers have been loaded, in addition to 145K segments for the extra-cellular contributions in the 53K voxels of the white matter: + +``` +-> Loading the dictionary: + * segments from the tracts... [ 283522 fibers and 24388967 segments ] + * segments from the peaks... [ 145472 segments ] + * isotropic contributions... [ 53021 voxels ] + * post-processing... [ OK ] + [ 14.8 seconds ] +``` + +## Build the linear operator A + +Now it's time to build the linear operator **A** to compute the matrix-vector multiplications for solving the linear system. This operator uses information from the segments loaded in the previous step and the lookup-tables for the response functions; it also needs to know the workload to be assigned to each thread durint the multiplications. To this aim, run the following commands: + +```python +mit.set_threads() +mit.build_operator() +``` + +The output should be something similar to this: + +``` +-> Distributing workload to different threads: + * number of threads : 4 + * A operator... [ OK ] + * A' operator... [ OK ] + [ 3.5 seconds ] + +-> Building linear operator A: + [ 2.1 seconds ] +``` + +NB: the *number of threads* is automatically set to the maximum number of cores in the system (4 in this example), but this setting can be manually set. + +## Fit the model to the data + +To fit the model (`Stick-Zeppelin-Ball` in this case) to the data, simply run: + +```python +mit.fit( tol_fun = 1e-3, max_iter = 200 ) +``` + +The optimization progress is displayed by default: + +``` +-> Fit model using "nnls": +| ||Ax-y|| | Cost function Abs error Rel error | Abs x Rel x +------|------------------|-----------------------------------------------|------------------------------ +1 | 7.5552614e+02 | 2.8540987e+05 4.0602923e+05 1.4226180e+00 | 5.4262515e+01 1.0000000e+00 +2 | 6.7997468e+02 | 2.3118278e+05 5.4227093e+04 2.3456372e-01 | 1.6229691e+01 2.6520302e-01 +3 | 6.2490484e+02 | 1.9525303e+05 3.5929749e+04 1.8401635e-01 | 1.4457099e+01 2.0335528e-01 +... +... +... 
+137 | 1.4197542e+02 | 1.0078510e+04 1.0588051e+01 1.0505571e-03 | 1.5019784e+00 4.0796383e-03 +138 | 1.4190279e+02 | 1.0068201e+04 1.0309090e+01 1.0239257e-03 | 1.4936457e+00 4.0495040e-03 +139 | 1.4183213e+02 | 1.0058177e+04 1.0024696e+01 9.9667126e-04 | 1.4848343e+00 4.0182480e-03 +< Stopping criterion: REL_OBJ > +[ 00h 07m 04s ] +``` + +where the columns report, respectively, the *iteration number*, the *cost function* and its *relative change*. + +## Storing the results + +The results and the output maps can be stored to files as follows: + +```python +mit.save_results() +``` + +As shown in the output, the results are saved in the folder `Results_StickZeppelinBall`: + +``` +-> Saving results to "Results_StickZeppelinBall/*": + * configuration and results... [ OK ] + * fitting errors: + - RMSE... [ 0.059 +/- 0.018 ] + - NRMSE... [ 0.117 +/- 0.037 ] + * voxelwise contributions: + - intra-axonal [ OK ] + - extra-axonal [ OK ] + - isotropic [ OK ] + [ 2.0 seconds ] +``` + +The following figure shows the **density of the tracts** [(Calamante et al., NeuroImage, 2010)](http://www.sciencedirect.com/science/article/pii/S1053811910009766) of the original tractogram (left) and of its optimized version (right): + +![Track-density](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/density.png) + +It is also possible to visualize voxelwise maps of the corresponding contributions of the **extra-cellular space** (left) and other **isotropic contaminations** (right): + +![Compartments](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/compartments.png) + +Finally, the **fitting error** in each voxel can also be inspected: + +![fitting error](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/GettingStarted/NRMSE.png) diff --git a/docs/tutorials/LiFE_STN96/README.md b/docs/tutorials/LiFE_STN96/README.md index fabe6172..83f43af0 100644 --- a/docs/tutorials/LiFE_STN96/README.md +++ b/docs/tutorials/LiFE_STN96/README.md @@ -1,320 +1,320 @@ -# Comparison between COMMIT and LIFE (on STN96 data from the LiFE original publication) - -In this example, we show the **importance of using adequate multi-compartment models** to be able to effectively evaluate the evidence of a tractogram, i.e. set of fiber tracts. For more information, please refer to the following abstract (#3148): - -> **On evaluating the accuracy and biological plausibility of diffusion MRI tractograms** -> *David Romascano, Alessandro Dal Palú, Jean-Philippe Thiran, and Alessandro Daducci* - -that recently has been **specially selected for a power pitch presentation** (less than 3% of submitted papers) at the annual *International Society for Magnetic Resonance in Medicine* (ISMRM) meeting in Toronto (30/05-05/06 2015)! - -To this aim, we evaluate the performance of the the *LiFE* model that was recently described in [(Pestilli et al, Nat Methods, Sep 2014)](http://www.nature.com/nmeth/journal/v11/n10/abs/nmeth.3098.html). To model the diffusion MR signal in each voxel, **LiFE considers only contributions arising from the tracts** crossing a particular voxel (i.e. restricted diffusion). Notably, *LiFE* does not consider the extra-cellular space around the fibers (i.e. hindered diffusion) and all the partial volume that can occur with gray matter and CSF. On the other hand, *COMMIT* can account for all possible compartments that contribute to the signal in a voxel. 
- -As a matter of fact, *LiFE* can be considered a **special case** of our *COMMIT* framework - [(Daducci et al, IEEE Trans Med Imaging, Aug 2014)](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6884830); in fact, it corresponds to the preliminary formulation we had proposed in [(Daducci et al, ISBI, Apr 2013)](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6556527). Hence, the *COMMIT* framework can be used to evaluate both approaches. - - -## Download the data - -1. Create the folder `STN96/scan1` in your data directory. - -2. Download the original DWI data from [here](https://stacks.stanford.edu/file/druid:cs392kv3054/life_demo_data.tar.gz). - -3. Extract the file `life_demo_scan1_subject1_b2000_150dirs_stanford.nii.gz` from the archive, unzip it and move it to the `scan1` folder with the name `DWI.nii`, i.e. - - ```bash - gunzip life_demo_scan1_subject1_b2000_150dirs_stanford.nii.gz - mv life_demo_scan1_subject1_b2000_150dirs_stanford.nii STN96/scan1/DWI.nii - ``` - -4. Download precomputed reconstructions from [here](http://hardi.epfl.ch/static/data/COMMIT_demos/STN96_scan1.zip). This archive contains a CSD reconstruction + probabilistic tracking performed according to the experimental setting used in the corresponding publication (e.g. CSD implemented in *MrTrix* and probabilistic tracking with 500000 tracts). - -5. Unzip the file content into the `STN96/scan1` folder in your data directory. - -## Convert the tracts to the internal data structure - -Use the module `commit.trk2dictionary` to convert the tracts contained in the file `STN96/scan1/Tracking/PROB/fibers.trk` to the internal sparse data structure used by the COMMIT framework. In the Python shell run: - -```python -from commit import trk2dictionary -trk2dictionary.run( - filename_tractogram = 'STN96/scan1/Tracking/PROB/fibers.trk', - path_out = 'STN96/scan1/Tracking/PROB', - filename_peaks = 'STN96/scan1/CSD/CSD_FODsh_peaks.nii', - filename_mask = 'STN96/scan1/WM.nii' -) -``` -This will create the necessary data structure (`STN96/scan1/Tracking/PROB/dictionary_*`) containing all the details of the tracts. - -NB: the output tractogram from *MrTrix* has been already converted to the format accepted by *COMMIT*, i.e. [TrackVis format](http://www.trackvis.org/docs/?subsect=fileformat) with fiber coordinates (in mm) in image space, where the coordinate (0,0,0) corresponds to the corner of first voxel. - -## Process data with COMMIT - -Setup a *Microstructure Informed Tractography (mit)* experiment and load the data: - -```python -import commit -commit.core.setup() - -mit = commit.Evaluation( 'STN96', 'scan1' ) -mit.CONFIG['doNormalizeSignal'] = False -mit.CONFIG['doDemean'] = False -mit.load_data( 'DWI.nii', 'DWI.scheme' ) -``` - -Calculate the **kernels** corresponding to the different compartments. In this example, we use 1 kernel for intra-axonal compartment (i.e. Stick), 1 for extra-axonal space (i.e. 
Zeppelin) and 2 to model partial volume with gray matter and CSF: - -```python -mit.set_model( 'StickZeppelinBall' ) -mit.model.set( 1.7E-3, [ 0.7 ], [ 1.7E-3, 3.0E-3 ] ) -mit.generate_kernels( regenerate=True ) -mit.load_kernels() -``` - -Load the *sparse data structure* that represents the linear operators **A** and **A'**: - -```python -mit.load_dictionary( 'Tracking/PROB' ) -``` - -**Solve** the inverse problem according to the *COMMIT* model: - -```python -mit.set_threads() -mit.build_operator() -mit.fit() -mit.save_results( 'COMMIT' ) -``` - -Result will be stored in `STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_COMMIT/`. - - -## Process data with LiFE - -Setup and load the data; this time, however, we will apply the *demeaning procedure* used in *LiFE* to both data and kernels: - -```python -mit = commit.Evaluation( 'STN96', 'scan1' ) -mit.CONFIG['doNormalizeSignal'] = False -mit.CONFIG['doDemean'] = True -mit.load_data( 'DWI.nii', 'DWI.scheme' ) -``` - -Calculate the **kernel** corresponding to the intra-cellular compartment (the only one considered in *LiFE*); in this example, thus, we use only 1 kernel for intra-axonal compartment (i.e. Stick): - -```python -mit.set_model( 'StickZeppelinBall' ) -mit.model.set( 1.7E-3, [], [] ) -mit.generate_kernels( regenerate=True ) -mit.load_kernels() -``` - -Load the *sparse data structure* that represents the linear operators **A** and **At**: - -```python -mit.load_dictionary( 'Tracking/PROB' ) -``` - -**Solve** the inverse problem according to the *LiFE* model: - -```python -mit.set_threads() -mit.build_operator() -mit.fit() -mit.save_results( 'LIFE' ) -``` - -Result will be stored in `STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_LIFE/`. - - -## Compare the two models - -Let's first analyze the performance of the two approaches in the **native space in which the two models perform the fitting**. In fact, *LiFE* does not fit the model to the acquired diffusion MR signal, but rather to the signal after removing the mean value in each voxel, i.e. demeaned signal. - -It is important to note that as the two models actually work in different spaces (different values), a normalization of the error metrics is required in order to compare their accuracy in explaining the measured diffusion MR data. To this aim, we use the *Normalized RMSE (NRMSE)* as quality measure. Please note that the normalization constant used in each voxel quantifies the magnitude of the data in that voxel, hence the values are expressed as *percentage error* with respect to the actual measurements considered in the voxel, i.e. measured diffusion MR signal for *COMMIT* and demeaned signal for *LiFE*. 
- -We then load the *NRMSE* fit error of the two models, as follows: - -```python -import nibabel, numpy, pylab - -# load error maps -niiERR_L = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_LIFE/fit_NRMSE.nii.gz' ); -niiERR_L_img = 100.0 * niiERR_L.get_data() -niiERR_C = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_COMMIT/fit_NRMSE.nii.gz' ); -niiERR_C_img = 100.0 * niiERR_C.get_data() - -# load mask -niiMASK = nibabel.load( 'STN96/scan1/Tracking/PROB/dictionary_tdi.nii.gz' ); -niiMASK_img = niiMASK.get_data() -``` - -Then we plot the fitting error with *LiFE* in a representative slice of the brain where two important fiber bundles cross (CST and CC): - -```python -# plot the NRMSE with LiFE -pylab.figure(1,facecolor='white') -h = pylab.imshow( numpy.rot90(niiERR_L_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) -h.set_clim(0.0,100.0) -pylab.colorbar() -pylab.axis('off') -h.axes.get_xaxis().set_visible(False) -h.axes.get_yaxis().set_visible(False) -yL = niiERR_L_img[ niiMASK_img>0 ] -pylab.title( 'LiFE : %.1f%% +/- %.1f%%' % ( yL.mean(), yL.std() ) ) -``` - -![NRMSE for LiFE](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig1.png) - -The average fitting error is, in this case, pretty high, i.e. **69.7% ± 16.5%**. Also, we see that *LiFE* shows the highest errors in regions with crossing fibers and close to gray matter, as expected (see [this abstract](ISMRM_3148.pdf)). - -We plot now the fitting error with *COMMIT*: - -```python -# plot the NRMSE with COMMIT -pylab.figure(2,facecolor='white') -h = pylab.imshow( numpy.rot90(niiERR_C_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) -h.set_clim(0.0,100.0) -pylab.colorbar() -pylab.axis('off') -h.axes.get_xaxis().set_visible(False) -h.axes.get_yaxis().set_visible(False) -yC = niiERR_C_img[ niiMASK_img>0 ] -pylab.title( 'COMMIT : %.1f%% +/- %.1f%%' % ( yC.mean(), yC.std() ) ) -``` - -![NRMSE for COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig2.png) - -The average fitting error is drastically reduced with *COMMIT*, i.e. (**19.3% ± 4.7%**). Also, a more homogeneous distribution of the errors can be observed, notably in crossing regions and in proximity to gray matter. 
- -Now we can directly compare the *fitting error distributions* of the two models: - -```python -# direct comparison of the NRMSE of LiFE and COMMIT -pylab.figure(3,facecolor='white') -pylab.clf() -pylab.hold(True) -N = numpy.count_nonzero(niiMASK_img>0) -hL, _ = numpy.histogram( yL, bins=60, range=(0,100), density=True ) -hC, _ = numpy.histogram( yC, bins=60, range=(0,100), density=True ) -pylab.plot( hL, '-', color=[.8,0,0], linewidth=3, label='LiFE' ) -pylab.plot( hC, '-', color=[0,.8,0], linewidth=3, label='COMMIT' ) -pylab.xticks( numpy.linspace(0,60,11,dtype=numpy.uint8), numpy.linspace(0,100,11,dtype=numpy.uint8) ) -pylab.grid(True) -pylab.xlabel( 'NRMSE [%]' ) -pylab.ylabel( 'percentage of voxels' ) -pylab.legend() -pylab.title( 'Error distributions' ) -``` - -![Histograms comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig3.png) - -Also, we can directly compare their fitting errors *voxel-by-voxel* with the following scatter-plot: - -```python -# voxelwise comparison of the NRMSE of LiFE and COMMIT -pylab.figure(4,facecolor='white') -pylab.clf() -pylab.hold(True) -pylab.plot( yL, yC, 'bx' ) -pylab.plot( [0,100], [0,100], 'k--', linewidth=2 ) -pylab.grid(True) -pylab.axis([0,100,0,100]) -pylab.xlabel( 'NRMSE [%] with LiFE' ) -pylab.ylabel( 'NRMSE [%] with COMMIT' ) -pylab.title( 'Error scatterplot' ) -``` - -![Scatterplot comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig4.png) - -As we can see, in all voxels the *COMMIT* model **always explains the data much better** than the *LiFE* model. - - -## Compare the two models (continued) - -One might also want to **evaluate how well both models explain the measured diffusion MRI signal** acquired with the scanner. -To this end, we need to *add back the mean* to the data used by the *LiFE* model and utilize the previously estimated fiber weights. By doing this we can directly compare the two models with respect to the same common quantity, i.e. the acquired diffusion MRI signal. -No normalization is needed in this case and we can then use the *RMSE* (expressed in raw signal units) to compare **the accuracy of the fit** of the two approaches. - -To this aim, it is simply necessary to perform the following operations after processing the data with *LiFE*: - -```python -# reload the DWI data and KERNELS (LUT) and DO NOT remove the mean -mit.CONFIG['doDemean'] = False -mit.load_data( 'DWI.nii', 'DWI.scheme' ) -mit.load_kernels() -mit.build_operator() - -# recompute the error metrics -mit.save_results( 'LIFE_2' ) -``` - -By doing this, both the measurements **y** and the signal **Ax** predicted by the *LiFE* model will be compared using the *NMSE* error metric to evaluate how well the *LiFE* model actually explains the measured diffusion MRI signal. 
-We then load the *RMSE* errors and compare the accuracy of the two models, as follows: - -```python -# load error maps -niiERR_L = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_LIFE_2/fit_RMSE.nii.gz' ); -niiERR_L_img = niiERR_L.get_data() -niiERR_C = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_COMMIT/fit_RMSE.nii.gz' ); -niiERR_C_img = niiERR_C.get_data() - -# plot the RMSE with LiFE -pylab.figure(5,facecolor='white') -h = pylab.imshow( numpy.rot90(niiERR_L_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) -h.set_clim(0.0,200.0) -pylab.colorbar() -pylab.axis('off') -h.axes.get_xaxis().set_visible(False) -h.axes.get_yaxis().set_visible(False) -yL = niiERR_L_img[ niiMASK_img>0 ] -pylab.title( 'LiFE : %.1f +/- %.1f' % ( yL.mean(), yL.std() ) ) - -# plot the RMSE with COMMIT -pylab.figure(6,facecolor='white') -h = pylab.imshow( numpy.rot90(niiERR_C_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) -h.set_clim(0.0,200.0) -pylab.colorbar() -pylab.axis('off') -h.axes.get_xaxis().set_visible(False) -h.axes.get_yaxis().set_visible(False) -yC = niiERR_C_img[ niiMASK_img>0 ] -pylab.title( 'COMMIT : %.1f +/- %.1f' % ( yC.mean(), yC.std() ) ) - -# direct comparison of the RMSE of LiFE and COMMIT -pylab.figure(7,facecolor='white') -pylab.clf() -pylab.hold(True) -N = numpy.count_nonzero(niiMASK_img>0) -hL, _ = numpy.histogram( yL, bins=100, range=(0,300), density=True ) -hC, _ = numpy.histogram( yC, bins=100, range=(0,300), density=True ) -pylab.plot( hL, '-', color=[.8,0,0], linewidth=3, label='LiFE' ) -pylab.plot( hC, '-', color=[0,.8,0], linewidth=3, label='COMMIT' ) -pylab.xticks( numpy.linspace(0,100,7,dtype=numpy.uint16), numpy.linspace(0,300,7,dtype=numpy.uint16) ) -pylab.grid(True) -pylab.xlabel( 'RMSE [raw signal units]' ) -pylab.ylabel( 'percentage of voxels' ) -pylab.legend() -pylab.title( 'Error distributions' ) - -# voxelwise comparison of the NRMSE of LiFE and COMMIT -pylab.figure(8,facecolor='white') -pylab.clf() -pylab.hold(True) -pylab.plot( yL, yC, 'bx' ) -pylab.plot( [0,350], [0,350], 'k--', linewidth=2 ) -pylab.grid(True) -pylab.axis([0,350,0,350]) -pylab.xlabel( 'RMSE [raw signal units] with LiFE' ) -pylab.ylabel( 'RMSE [raw signal units] with COMMIT' ) -pylab.title( 'Error scatterplot' ) -``` - -![RMSE for LiFE](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig5.png) - -![RMSE for COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig6.png) - -![Histogram comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig7.png) - -![Scatterplot comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig8.png) - -As we can see, the results essentially lead to the the same results, as previously highlighted using the *NRMSE* metric, de facto showing the **superiority of the *COMMIT* model in explaining the measured diffusion MRI signal** with respect to *LiFE*. +# Comparison between COMMIT and LIFE (on STN96 data from the LiFE original publication) + +In this example, we show the **importance of using adequate multi-compartment models** to be able to effectively evaluate the evidence of a tractogram, i.e. set of fiber tracts. 
For more information, please refer to the following abstract (#3148):
+
+> **On evaluating the accuracy and biological plausibility of diffusion MRI tractograms**
+> *David Romascano, Alessandro Dal Palú, Jean-Philippe Thiran, and Alessandro Daducci*
+
+which has recently been **specially selected for a power pitch presentation** (less than 3% of submitted papers) at the annual *International Society for Magnetic Resonance in Medicine* (ISMRM) meeting in Toronto (30/05-05/06 2015)!
+
+To this aim, we evaluate the performance of the *LiFE* model recently described in [(Pestilli et al, Nat Methods, Sep 2014)](http://www.nature.com/nmeth/journal/v11/n10/abs/nmeth.3098.html). To model the diffusion MR signal in each voxel, **LiFE considers only contributions arising from the tracts** crossing that voxel (i.e. restricted diffusion). Notably, *LiFE* does not consider the extra-cellular space around the fibers (i.e. hindered diffusion), nor the partial volume effects that can occur with gray matter and CSF. On the other hand, *COMMIT* can account for all possible compartments that contribute to the signal in a voxel.
+
+As a matter of fact, *LiFE* can be considered a **special case** of our *COMMIT* framework [(Daducci et al, IEEE Trans Med Imaging, Aug 2014)](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6884830); in fact, it corresponds to the preliminary formulation we had proposed in [(Daducci et al, ISBI, Apr 2013)](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6556527). Hence, the *COMMIT* framework can be used to evaluate both approaches.
+
+## Download the data
+
+1. Create the folder `STN96/scan1` in your data directory.
+
+2. Download the original DWI data from [here](https://stacks.stanford.edu/file/druid:cs392kv3054/life_demo_data.tar.gz).
+
+3. Extract the file `life_demo_scan1_subject1_b2000_150dirs_stanford.nii.gz` from the archive, unzip it and move it to the `scan1` folder with the name `DWI.nii`, i.e.
+
+   ```bash
+   gunzip life_demo_scan1_subject1_b2000_150dirs_stanford.nii.gz
+   mv life_demo_scan1_subject1_b2000_150dirs_stanford.nii STN96/scan1/DWI.nii
+   ```
+
+4. Download the precomputed reconstructions from [here](http://hardi.epfl.ch/static/data/COMMIT_demos/STN96_scan1.zip). This archive contains a CSD reconstruction + probabilistic tracking performed according to the experimental setting used in the corresponding publication (e.g. CSD implemented in *MrTrix* and probabilistic tracking with 500000 tracts).
+
+5. Unzip the file content into the `STN96/scan1` folder in your data directory.
+
+## Convert the tracts to the internal data structure
+
+Use the module `commit.trk2dictionary` to convert the tracts contained in the file `STN96/scan1/Tracking/PROB/fibers.trk` to the internal sparse data structure used by the COMMIT framework. In the Python shell run:
+
+```python
+from commit import trk2dictionary
+trk2dictionary.run(
+    filename_tractogram = 'STN96/scan1/Tracking/PROB/fibers.trk',
+    path_out = 'STN96/scan1/Tracking/PROB',
+    filename_peaks = 'STN96/scan1/CSD/CSD_FODsh_peaks.nii',
+    filename_mask = 'STN96/scan1/WM.nii'
+)
+```
+This will create the necessary data structure (`STN96/scan1/Tracking/PROB/dictionary_*`) containing all the details of the tracts.
+
+NB: the output tractogram from *MrTrix* has already been converted to the format accepted by *COMMIT*, i.e.
[TrackVis format](http://www.trackvis.org/docs/?subsect=fileformat) with fiber coordinates (in mm) in image space, where the coordinate (0,0,0) corresponds to the corner of first voxel. + +## Process data with COMMIT + +Setup a *Microstructure Informed Tractography (mit)* experiment and load the data: + +```python +import commit +commit.core.setup() + +mit = commit.Evaluation( 'STN96', 'scan1' ) +mit.CONFIG['doNormalizeSignal'] = False +mit.CONFIG['doDemean'] = False +mit.load_data( 'DWI.nii', 'DWI.scheme' ) +``` + +Calculate the **kernels** corresponding to the different compartments. In this example, we use 1 kernel for intra-axonal compartment (i.e. Stick), 1 for extra-axonal space (i.e. Zeppelin) and 2 to model partial volume with gray matter and CSF: + +```python +mit.set_model( 'StickZeppelinBall' ) +mit.model.set( 1.7E-3, [ 0.7 ], [ 1.7E-3, 3.0E-3 ] ) +mit.generate_kernels( regenerate=True ) +mit.load_kernels() +``` + +Load the *sparse data structure* that represents the linear operators **A** and **A'**: + +```python +mit.load_dictionary( 'Tracking/PROB' ) +``` + +**Solve** the inverse problem according to the *COMMIT* model: + +```python +mit.set_threads() +mit.build_operator() +mit.fit() +mit.save_results( 'COMMIT' ) +``` + +Result will be stored in `STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_COMMIT/`. + + +## Process data with LiFE + +Setup and load the data; this time, however, we will apply the *demeaning procedure* used in *LiFE* to both data and kernels: + +```python +mit = commit.Evaluation( 'STN96', 'scan1' ) +mit.CONFIG['doNormalizeSignal'] = False +mit.CONFIG['doDemean'] = True +mit.load_data( 'DWI.nii', 'DWI.scheme' ) +``` + +Calculate the **kernel** corresponding to the intra-cellular compartment (the only one considered in *LiFE*); in this example, thus, we use only 1 kernel for intra-axonal compartment (i.e. Stick): + +```python +mit.set_model( 'StickZeppelinBall' ) +mit.model.set( 1.7E-3, [], [] ) +mit.generate_kernels( regenerate=True ) +mit.load_kernels() +``` + +Load the *sparse data structure* that represents the linear operators **A** and **At**: + +```python +mit.load_dictionary( 'Tracking/PROB' ) +``` + +**Solve** the inverse problem according to the *LiFE* model: + +```python +mit.set_threads() +mit.build_operator() +mit.fit() +mit.save_results( 'LIFE' ) +``` + +Result will be stored in `STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_LIFE/`. + + +## Compare the two models + +Let's first analyze the performance of the two approaches in the **native space in which the two models perform the fitting**. In fact, *LiFE* does not fit the model to the acquired diffusion MR signal, but rather to the signal after removing the mean value in each voxel, i.e. demeaned signal. + +It is important to note that as the two models actually work in different spaces (different values), a normalization of the error metrics is required in order to compare their accuracy in explaining the measured diffusion MR data. To this aim, we use the *Normalized RMSE (NRMSE)* as quality measure. Please note that the normalization constant used in each voxel quantifies the magnitude of the data in that voxel, hence the values are expressed as *percentage error* with respect to the actual measurements considered in the voxel, i.e. measured diffusion MR signal for *COMMIT* and demeaned signal for *LiFE*. 
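+
+As a reference for interpreting the maps below, here is a minimal, illustrative sketch of one possible voxel-wise *NRMSE*, assuming the normalisation constant is simply the norm of the measurements in the voxel (the actual values are those computed by COMMIT when saving the results; the helper name is hypothetical):
+
+```python
+import numpy as np
+
+def nrmse( y, y_pred ):
+    # y      : data fitted in one voxel (demeaned signal for LiFE, measured signal for COMMIT)
+    # y_pred : signal predicted by the model in the same voxel (the corresponding entries of A*x)
+    # Dividing by the magnitude of the data makes the error comparable across
+    # the two models and interpretable as a percentage of the measurements.
+    return np.linalg.norm( y - y_pred ) / np.linalg.norm( y )
+```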
+ +We then load the *NRMSE* fit error of the two models, as follows: + +```python +import nibabel, numpy, pylab + +# load error maps +niiERR_L = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_LIFE/fit_NRMSE.nii.gz' ); +niiERR_L_img = 100.0 * niiERR_L.get_data() +niiERR_C = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_COMMIT/fit_NRMSE.nii.gz' ); +niiERR_C_img = 100.0 * niiERR_C.get_data() + +# load mask +niiMASK = nibabel.load( 'STN96/scan1/Tracking/PROB/dictionary_tdi.nii.gz' ); +niiMASK_img = niiMASK.get_data() +``` + +Then we plot the fitting error with *LiFE* in a representative slice of the brain where two important fiber bundles cross (CST and CC): + +```python +# plot the NRMSE with LiFE +pylab.figure(1,facecolor='white') +h = pylab.imshow( numpy.rot90(niiERR_L_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) +h.set_clim(0.0,100.0) +pylab.colorbar() +pylab.axis('off') +h.axes.get_xaxis().set_visible(False) +h.axes.get_yaxis().set_visible(False) +yL = niiERR_L_img[ niiMASK_img>0 ] +pylab.title( 'LiFE : %.1f%% +/- %.1f%%' % ( yL.mean(), yL.std() ) ) +``` + +![NRMSE for LiFE](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig1.png) + +The average fitting error is, in this case, pretty high, i.e. **69.7% ± 16.5%**. Also, we see that *LiFE* shows the highest errors in regions with crossing fibers and close to gray matter, as expected (see [this abstract](ISMRM_3148.pdf)). + +We plot now the fitting error with *COMMIT*: + +```python +# plot the NRMSE with COMMIT +pylab.figure(2,facecolor='white') +h = pylab.imshow( numpy.rot90(niiERR_C_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) +h.set_clim(0.0,100.0) +pylab.colorbar() +pylab.axis('off') +h.axes.get_xaxis().set_visible(False) +h.axes.get_yaxis().set_visible(False) +yC = niiERR_C_img[ niiMASK_img>0 ] +pylab.title( 'COMMIT : %.1f%% +/- %.1f%%' % ( yC.mean(), yC.std() ) ) +``` + +![NRMSE for COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig2.png) + +The average fitting error is drastically reduced with *COMMIT*, i.e. (**19.3% ± 4.7%**). Also, a more homogeneous distribution of the errors can be observed, notably in crossing regions and in proximity to gray matter. 
+ +Now we can directly compare the *fitting error distributions* of the two models: + +```python +# direct comparison of the NRMSE of LiFE and COMMIT +pylab.figure(3,facecolor='white') +pylab.clf() +pylab.hold(True) +N = numpy.count_nonzero(niiMASK_img>0) +hL, _ = numpy.histogram( yL, bins=60, range=(0,100), density=True ) +hC, _ = numpy.histogram( yC, bins=60, range=(0,100), density=True ) +pylab.plot( hL, '-', color=[.8,0,0], linewidth=3, label='LiFE' ) +pylab.plot( hC, '-', color=[0,.8,0], linewidth=3, label='COMMIT' ) +pylab.xticks( numpy.linspace(0,60,11,dtype=numpy.uint8), numpy.linspace(0,100,11,dtype=numpy.uint8) ) +pylab.grid(True) +pylab.xlabel( 'NRMSE [%]' ) +pylab.ylabel( 'percentage of voxels' ) +pylab.legend() +pylab.title( 'Error distributions' ) +``` + +![Histograms comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig3.png) + +Also, we can directly compare their fitting errors *voxel-by-voxel* with the following scatter-plot: + +```python +# voxelwise comparison of the NRMSE of LiFE and COMMIT +pylab.figure(4,facecolor='white') +pylab.clf() +pylab.hold(True) +pylab.plot( yL, yC, 'bx' ) +pylab.plot( [0,100], [0,100], 'k--', linewidth=2 ) +pylab.grid(True) +pylab.axis([0,100,0,100]) +pylab.xlabel( 'NRMSE [%] with LiFE' ) +pylab.ylabel( 'NRMSE [%] with COMMIT' ) +pylab.title( 'Error scatterplot' ) +``` + +![Scatterplot comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig4.png) + +As we can see, in all voxels the *COMMIT* model **always explains the data much better** than the *LiFE* model. + + +## Compare the two models (continued) + +One might also want to **evaluate how well both models explain the measured diffusion MRI signal** acquired with the scanner. +To this end, we need to *add back the mean* to the data used by the *LiFE* model and utilize the previously estimated fiber weights. By doing this we can directly compare the two models with respect to the same common quantity, i.e. the acquired diffusion MRI signal. +No normalization is needed in this case and we can then use the *RMSE* (expressed in raw signal units) to compare **the accuracy of the fit** of the two approaches. + +To this aim, it is simply necessary to perform the following operations after processing the data with *LiFE*: + +```python +# reload the DWI data and KERNELS (LUT) and DO NOT remove the mean +mit.CONFIG['doDemean'] = False +mit.load_data( 'DWI.nii', 'DWI.scheme' ) +mit.load_kernels() +mit.build_operator() + +# recompute the error metrics +mit.save_results( 'LIFE_2' ) +``` + +By doing this, both the measurements **y** and the signal **Ax** predicted by the *LiFE* model will be compared using the *NMSE* error metric to evaluate how well the *LiFE* model actually explains the measured diffusion MRI signal. 
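+
+For completeness, a minimal sketch of a voxel-wise *RMSE* in raw signal units, assuming the standard definition (the values reported below are those computed by COMMIT when saving the results; the helper name is hypothetical):
+
+```python
+import numpy as np
+
+def rmse( y, y_pred ):
+    # y      : measured diffusion MRI signal in one voxel (raw units, mean included)
+    # y_pred : signal predicted by the fitted model in the same voxel
+    return np.sqrt( np.mean( ( y - y_pred )**2 ) )
+```
+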
+We then load the *RMSE* errors and compare the accuracy of the two models, as follows: + +```python +# load error maps +niiERR_L = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_LIFE_2/fit_RMSE.nii.gz' ); +niiERR_L_img = niiERR_L.get_data() +niiERR_C = nibabel.load( 'STN96/scan1/Tracking/PROB/Results_StickZeppelinBall_COMMIT/fit_RMSE.nii.gz' ); +niiERR_C_img = niiERR_C.get_data() + +# plot the RMSE with LiFE +pylab.figure(5,facecolor='white') +h = pylab.imshow( numpy.rot90(niiERR_L_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) +h.set_clim(0.0,200.0) +pylab.colorbar() +pylab.axis('off') +h.axes.get_xaxis().set_visible(False) +h.axes.get_yaxis().set_visible(False) +yL = niiERR_L_img[ niiMASK_img>0 ] +pylab.title( 'LiFE : %.1f +/- %.1f' % ( yL.mean(), yL.std() ) ) + +# plot the RMSE with COMMIT +pylab.figure(6,facecolor='white') +h = pylab.imshow( numpy.rot90(niiERR_C_img[:,69,:].squeeze()), interpolation='nearest', cmap='hot' ) +h.set_clim(0.0,200.0) +pylab.colorbar() +pylab.axis('off') +h.axes.get_xaxis().set_visible(False) +h.axes.get_yaxis().set_visible(False) +yC = niiERR_C_img[ niiMASK_img>0 ] +pylab.title( 'COMMIT : %.1f +/- %.1f' % ( yC.mean(), yC.std() ) ) + +# direct comparison of the RMSE of LiFE and COMMIT +pylab.figure(7,facecolor='white') +pylab.clf() +pylab.hold(True) +N = numpy.count_nonzero(niiMASK_img>0) +hL, _ = numpy.histogram( yL, bins=100, range=(0,300), density=True ) +hC, _ = numpy.histogram( yC, bins=100, range=(0,300), density=True ) +pylab.plot( hL, '-', color=[.8,0,0], linewidth=3, label='LiFE' ) +pylab.plot( hC, '-', color=[0,.8,0], linewidth=3, label='COMMIT' ) +pylab.xticks( numpy.linspace(0,100,7,dtype=numpy.uint16), numpy.linspace(0,300,7,dtype=numpy.uint16) ) +pylab.grid(True) +pylab.xlabel( 'RMSE [raw signal units]' ) +pylab.ylabel( 'percentage of voxels' ) +pylab.legend() +pylab.title( 'Error distributions' ) + +# voxelwise comparison of the NRMSE of LiFE and COMMIT +pylab.figure(8,facecolor='white') +pylab.clf() +pylab.hold(True) +pylab.plot( yL, yC, 'bx' ) +pylab.plot( [0,350], [0,350], 'k--', linewidth=2 ) +pylab.grid(True) +pylab.axis([0,350,0,350]) +pylab.xlabel( 'RMSE [raw signal units] with LiFE' ) +pylab.ylabel( 'RMSE [raw signal units] with COMMIT' ) +pylab.title( 'Error scatterplot' ) +``` + +![RMSE for LiFE](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig5.png) + +![RMSE for COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig6.png) + +![Histogram comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig7.png) + +![Scatterplot comparison LiFE vs COMMIT](https://github.com/daducci/COMMIT/blob/master/docs/tutorials/LiFE_STN96/RESULTS_Fig8.png) + +As we can see, the results essentially lead to the the same results, as previously highlighted using the *NRMSE* metric, de facto showing the **superiority of the *COMMIT* model in explaining the measured diffusion MRI signal** with respect to *LiFE*. 
diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md index 2c61c5c2..e6438a1a 100644 --- a/docs/tutorials/README.md +++ b/docs/tutorials/README.md @@ -1,5 +1,5 @@ -Tutorials/demos using the COMMIT framework: - -* [Getting started](GettingStarted) -* [Comparison to LiFE on STN96 data](LiFE_STN96) -* [Advanced Solvers](AdvancedSolvers) +Tutorials/demos using the COMMIT framework: + +* [Getting started](GettingStarted) +* [Comparison to LiFE on STN96 data](LiFE_STN96) +* [Advanced Solvers](AdvancedSolvers) diff --git a/extras/COMMIT_debugger/OPENGL_callbacks.cxx b/extras/COMMIT_debugger/OPENGL_callbacks.cxx index 60b097ee..b0dbf510 100755 --- a/extras/COMMIT_debugger/OPENGL_callbacks.cxx +++ b/extras/COMMIT_debugger/OPENGL_callbacks.cxx @@ -1,1140 +1,1140 @@ -#define GL_GLEXT_PROTOTYPES 1 -#ifdef __APPLE__ - #include - #include - #include -#else - #include - #include - #include -#endif - -#include "OPENGL_utils.h" -using namespace OPENGL_utils; - -/* global variables */ -GLfloat id[16], rot[16], rot1[16], rot2[16], rot3[16]; -Vec3Df translation; -Vec3Di start; -GLint moving; -GLfloat zoom; - -float ScreenX, ScreenY; - -void drawString( const char *string ) -{ - static int y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; - if ( string=="" ) - y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; - else - { - glRasterPos2i(10, y); - for (const char* c=string; *c != '\0'; c++) - glutBitmapCharacter(GLUT_BITMAP_9_BY_15, *c); - y -= 18; - } -} - -void PrintConfig() -{ - if ( !showConfig ) - return; - - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadIdentity(); - glMatrixMode( GL_MODELVIEW ) ; - glPushMatrix() ; - glLoadIdentity() ; - int w = glutGet( GLUT_WINDOW_WIDTH ); - int h = glutGet( GLUT_WINDOW_HEIGHT ); - glOrtho( 0, w, 0, h, -1, 1 ); - glDisable( GL_DEPTH_TEST ); - - char s[1024]; - glColor3f(1, 1, 0); - drawString( "" ); // reset initial position - - drawString( "MAP" ); - sprintf( s, " - value(%d,%d,%d) = %.2f", VOXEL.x, VOXEL.y, VOXEL.z, MAP(VOXEL.x, VOXEL.y, VOXEL.z) ); - drawString( s ); - sprintf( s, " - range = [ %.1f ... 
%.1f ]", MAP_min_view, MAP_max_view ); - drawString( s ); - sprintf( s, " - opacity = %.1f", MAP_opacity ); - drawString( s ); - - drawString( "SIGNAL" ); - sprintf( s, " - shell = %d/%d (b=%.1f)", GLYPHS_shell+1, SCHEME_shells_b.size(), SCHEME_shells_b[GLYPHS_shell] ); - drawString( s ); - sprintf( s, " - use affine = %s", GLYPHS_use_affine?"true":"false" ); - drawString( s ); - sprintf( s, " - flip = [ %d, %d, %d ]", GLYPHS_flip[0], GLYPHS_flip[1], GLYPHS_flip[2] ); - drawString( s ); - sprintf( s, " - b0 thr = %.1f", GLYPHS_b0_thr ); - drawString( s ); - - if ( PEAKS_n>0 ) - { - drawString( "PEAKS" ); - sprintf( s, " - use affine = %s", PEAKS_use_affine?"true":"false" ); - drawString( s ); - sprintf( s, " - flip = [ %d, %d, %d ]", PEAKS_flip[0], PEAKS_flip[1], PEAKS_flip[2] ); - drawString( s ); - sprintf( s, " - thr = %.1f", PEAKS_thr ); - drawString( s ); - sprintf( s, " - normalize = %s", PEAKS_doNormalize?"true":"false" ); - drawString( s ); - } - - if ( TRK_nTractsPlotted>0 ) - { - drawString( "FIBERS" ); - sprintf( s, " - shift = [ %.1f %.1f %.1f ] (voxels)", TRK_offset.x, TRK_offset.y, TRK_offset.z ); - drawString( s ); - sprintf( s, " - slab thickness = %.1f (voxels)", TRK_crop ); - drawString( s ); - } - - glEnable (GL_DEPTH_TEST); - glMatrixMode(GL_PROJECTION); - glPopMatrix(); - glMatrixMode(GL_MODELVIEW); - glPopMatrix(); -} - - -// KEYBOARD callback -// ----------------- -void GLUT__keyboard( unsigned char key, GLint x=0, GLint y=0 ) -{ - bool doRedraw = true; - - switch( key ) - { - case 'l': showConfig = 1 - showConfig; break; - - case '1': showPlane[0] = 1 - showPlane[0]; break; - case '2': showPlane[1] = 1 - showPlane[1]; break; - case '3': showPlane[2] = 1 - showPlane[2]; break; - case '4': - showPlane[0] = 1; - showPlane[1] = 0; - showPlane[2] = 0; - translation.x = translation.y = 0; - OPENGL_utils::identity(rot1); - OPENGL_utils::rotateX(rot1, 90.0, rot2); - OPENGL_utils::rotateZ(rot2, 90.0, rot); - break; - case '5': - showPlane[0] = 0; - showPlane[1] = 1; - showPlane[2] = 0; - translation.x = translation.y = 0; - OPENGL_utils::identity(rot1); - OPENGL_utils::rotateX(rot1, 90.0, rot); - break; - case '6': - showPlane[0] = 0; - showPlane[1] = 0; - showPlane[2] = 1; - translation.x = translation.y = 0; - OPENGL_utils::identity( rot ); - break; - - case '0': showAxes = 1 - showAxes; break; - case '-': zoom += 10.0; break; - case '+': zoom -= 10.0; break; - case 'm': MAP_max_view = fmaxf(0.0,MAP_max_view-MAP_max*0.05); break; - case 'M': MAP_max_view = fminf(MAP_max,MAP_max_view+MAP_max*0.05); break; - case 'o': MAP_opacity = fmaxf(0.0,MAP_opacity-0.1); break; - case 'O': MAP_opacity = fminf(1.0,MAP_opacity+0.1); break; - case 'w': LINE_width = fmaxf( 1,LINE_width-1); break; - case 'W': LINE_width = fminf(10,LINE_width+1); break; - case 'r': - showPlane[0] = showPlane[1] = showPlane[2] = 1; - translation.x = translation.y = 0; - zoom = 0; - OPENGL_utils::identity( rot ); - break; - - case 's': GLYPHS_show = 1 - GLYPHS_show; break; - case 'S': GLYPHS_shell = (GLYPHS_shell+1) % SCHEME_shells_idx.size(); break; - case 'a': GLYPHS_use_affine = 1 - GLYPHS_use_affine; break; - case 'x': GLYPHS_flip[0] = 1 - GLYPHS_flip[0]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].x *= -1; break; - case 'y': GLYPHS_flip[1] = 1 - GLYPHS_flip[1]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].y *= -1; break; - case 'z': GLYPHS_flip[2] = 1 - GLYPHS_flip[2]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].z *= -1; break; - case 'b': GLYPHS_b0_thr = 
fmaxf(0.0,GLYPHS_b0_thr-10.0); break; - case 'B': GLYPHS_b0_thr = fminf(MAP_max,GLYPHS_b0_thr+10.0); break; - - case 'p': if ( PEAKS_n>0 ) PEAKS_show = 1 - PEAKS_show; break; - case 'A': PEAKS_use_affine = 1 - PEAKS_use_affine; break; - case 'X': PEAKS_flip[0] = 1 - PEAKS_flip[0]; break; - case 'Y': PEAKS_flip[1] = 1 - PEAKS_flip[1]; break; - case 'Z': PEAKS_flip[2] = 1 - PEAKS_flip[2]; break; - case 't': PEAKS_thr = fmaxf(PEAKS_thr - 0.1, 0.0); break; - case 'T': PEAKS_thr = fminf(PEAKS_thr + 0.1, 1.0); break; - case 'n': PEAKS_doNormalize = 1 - PEAKS_doNormalize; break; - - case 'f': if ( TRK_nTractsPlotted>0 ) TRK_show = 1 - TRK_show; break; - case 'c': TRK_crop = fmaxf( 0.0,TRK_crop-0.5); break; - case 'C': TRK_crop = fminf(max(dim.x,max(dim.y,dim.z)),TRK_crop+0.5); break; - case ' ': TRK_crop_mode = 1 - TRK_crop_mode; break; - - case 'q': - case 27 : exit(0); break; - - default: doRedraw = false; - } - - if ( doRedraw ) - glutPostRedisplay(); -} - - -// MENU callback -// ------------- -void GLUT__menu( int id ) -{ - switch( id ) - { - case 0: GLUT__keyboard('q'); break; - - case 101: GLUT__keyboard('s'); break; - case 102: GLUT__keyboard('S'); break; - case 103: GLUT__keyboard('a'); break; - case 104: GLUT__keyboard('x'); break; - case 105: GLUT__keyboard('y'); break; - case 106: GLUT__keyboard('z'); break; - case 107: GLUT__keyboard('b'); break; - case 108: GLUT__keyboard('B'); break; - - case 201: GLUT__keyboard('p'); break; - case 202: GLUT__keyboard('A'); break; - case 203: GLUT__keyboard('X'); break; - case 204: GLUT__keyboard('Y'); break; - case 205: GLUT__keyboard('Z'); break; - case 206: GLUT__keyboard('t'); break; - case 207: GLUT__keyboard('T'); break; - case 208: GLUT__keyboard('n'); break; - - case 301: GLUT__keyboard('f'); break; - case 302: GLUT__keyboard('c'); break; - case 303: GLUT__keyboard('C'); break; - case 304: GLUT__keyboard(' '); break; - - case 401: GLUT__keyboard('1'); break; - case 402: GLUT__keyboard('2'); break; - case 403: GLUT__keyboard('3'); break; - case 404: GLUT__keyboard('4'); break; - case 405: GLUT__keyboard('5'); break; - case 406: GLUT__keyboard('6'); break; - case 407: GLUT__keyboard('0'); break; - case 408: GLUT__keyboard('-'); break; - case 409: GLUT__keyboard('+'); break; - case 410: GLUT__keyboard('m'); break; - case 411: GLUT__keyboard('M'); break; - case 412: GLUT__keyboard('o'); break; - case 413: GLUT__keyboard('O'); break; - case 414: GLUT__keyboard('w'); break; - case 415: GLUT__keyboard('W'); break; - case 416: GLUT__keyboard('r'); break; - case 417: GLUT__keyboard('l'); break; - } -} - - -// Create the dropdown MENU -// ------------------------ -void GLUT__createMenu() -{ - int submenu_SIGNAL_id, submenu_PEAKS_id, submenu_FIBERS_id, submenu_VIEW_id; - - submenu_SIGNAL_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[s] Show/hide", 101); - glutAddMenuEntry("[S] Change shell", 102); - glutAddMenuEntry("[a] Use affine", 103); - glutAddMenuEntry("[x] Flip X axis", 104); - glutAddMenuEntry("[y] Flip Y axis", 105); - glutAddMenuEntry("[z] Flip Z axis", 106); - glutAddMenuEntry("[b] Decrease b0 thr", 107); - glutAddMenuEntry("[B] Increase b0 thr", 108); - - if ( PEAKS_n>0 ) - { - submenu_PEAKS_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[p] Show/hide", 201); - glutAddMenuEntry("[A] Use affine", 202); - glutAddMenuEntry("[X] Flip X axis", 203); - glutAddMenuEntry("[Y] Flip Y axis", 204); - glutAddMenuEntry("[Z] Flip Z axis", 205); - glutAddMenuEntry("[t] Decrease threshold",206); - glutAddMenuEntry("[T] Increase 
threshold",207); - glutAddMenuEntry("[n] Normalize length", 208); - } - - if ( TRK_nTractsPlotted>0 ) - { - submenu_FIBERS_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[f] Show/hide", 301); - glutAddMenuEntry("[c] Decrease crop size",302); - glutAddMenuEntry("[C] Increase crop size",303); - glutAddMenuEntry("[ ] Change crop mode", 304); - } - - submenu_VIEW_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[1] Show/hide YZ plane", 401); - glutAddMenuEntry("[2] Show/hide XZ plane", 402); - glutAddMenuEntry("[3] Show/hide XY plane", 403); - glutAddMenuEntry("[4] Reset to YZ plane", 404); - glutAddMenuEntry("[5] Reset to XZ plane", 405); - glutAddMenuEntry("[6] Reset to XY plane", 406); - glutAddMenuEntry("[0] Show/hide axes", 407); - glutAddMenuEntry("[-] Decrease zoom", 408); - glutAddMenuEntry("[+] Increase zoom", 409); - glutAddMenuEntry("[m] Decrease max value", 410); - glutAddMenuEntry("[M] Increase max value", 411); - glutAddMenuEntry("[o] Decrease opacity", 412); - glutAddMenuEntry("[O] Increase opacity", 413); - glutAddMenuEntry("[t] Decrease line width",414); - glutAddMenuEntry("[T] Increase line width",415); - glutAddMenuEntry("[r] Reset view", 416); - glutAddMenuEntry("[l] Show/hide log", 417); - - int menu_id = glutCreateMenu( GLUT__menu ); - glutAddSubMenu("Signal", submenu_SIGNAL_id); - if ( PEAKS_n>0 ) - glutAddSubMenu("Peaks", submenu_PEAKS_id); - if ( TRK_nTractsPlotted>0 ) - glutAddSubMenu("Fibers", submenu_FIBERS_id); - glutAddSubMenu("View options", submenu_VIEW_id); - glutAddMenuEntry("Quit", 0); - glutAttachMenu(GLUT_RIGHT_BUTTON); -} - - -// RESHAPE callback -// ---------------- -void GLUT__reshape( GLint w, GLint h ) -{ - ScreenX = w; - ScreenY = h; - - glMatrixMode( GL_PROJECTION ); - glLoadIdentity(); - gluPerspective( 45.0f, (GLfloat)w / (GLfloat)h, 1.0f, 5000.0f ); - - glMatrixMode( GL_MODELVIEW ); - glLoadIdentity(); - gluLookAt( - 0.0, 0.0, 2.0 * max(pixdim.x*dim.x,pixdim.y*dim.y) * (GLfloat)ScreenY/(GLfloat)ScreenX, // eye point - 0.0, 0.0, 0.0, // reference point - 0.0, 1.0, 0.0 // up vector - ); -} - - -// SPECIALKEY callback -// ------------------- -void GLUT__specialkey( GLint key, GLint x, GLint y ) -{ - bool doRedraw = true; - GLint modif = glutGetModifiers(); - GLint ALT = modif & GLUT_ACTIVE_ALT; - GLint CTRL = modif & GLUT_ACTIVE_CTRL; - - switch( key ) - { - case GLUT_KEY_LEFT: - if ( ALT ) - TRK_offset.x -= 0.5; - else if ( CTRL ) - translation.x -= 2.0; - else - VOXEL.x--; - break; - case GLUT_KEY_RIGHT: - if ( ALT ) - TRK_offset.x += 0.5; - else if ( CTRL ) - translation.x += 2.0; - else - VOXEL.x++; - break; - case GLUT_KEY_DOWN: - if ( ALT ) - TRK_offset.y -= 0.5; - else if ( CTRL ) - translation.y -= 2.0; - else - VOXEL.y--; - break; - case GLUT_KEY_UP: - if ( ALT ) - TRK_offset.y += 0.5; - else if ( CTRL ) - translation.y += 2.0; - else - VOXEL.y++; - break; - case GLUT_KEY_PAGE_DOWN: - if ( ALT ) - TRK_offset.z -= 0.5; - else - VOXEL.z--; - break; - case GLUT_KEY_PAGE_UP: - if ( ALT ) - TRK_offset.z += 0.5; - else - VOXEL.z++; - break; - - default: - doRedraw = false; - } - - // check the bounds - VOXEL.x = max( VOXEL.x, 0 ); - VOXEL.y = max( VOXEL.y, 0 ); - VOXEL.z = max( VOXEL.z, 0 ); - VOXEL.x = min( VOXEL.x, dim.x-1 ); - VOXEL.y = min( VOXEL.y, dim.y-1 ); - VOXEL.z = min( VOXEL.z, dim.z-1 ); - - if ( doRedraw ) - glutPostRedisplay(); -} - - - -// MOUSE callback -// -------------- -void GLUT__mouse( GLint button, GLint state, GLint x, GLint y ) -{ - if (state == GLUT_DOWN) - { - if ( button == GLUT_LEFT_BUTTON && 
glutGetModifiers() != GLUT_ACTIVE_CTRL ) - { - moving = 1; - start.x = x; - start.y = y; - } - // NOTE: does not work, issue with glutGetModifiers not getting CTRL - // else if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_CTRL ) - // { - // moving = 2; - // start.x = x; - // start.y = y; - // } - else if ( (button == GLUT_MIDDLE_BUTTON) || (button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_ALT) ) - { - moving = 3; - start.x = x; - start.y = y; - } - } - else if (state == GLUT_UP) - { - moving = 0; - } -} - - -// MOTION callback -// --------------- -void GLUT__motion( GLint x, GLint y ) -{ - if (moving==1) - { - OPENGL_utils::translate(id, 0,0,0, rot1); - - OPENGL_utils::rotateY(id,start.x-x,rot3); - OPENGL_utils::matXMat(rot,rot1,rot2); - OPENGL_utils::rotateX(id,start.y-y,rot1); - OPENGL_utils::matXMat(rot2,rot1,rot); - OPENGL_utils::matXMat(rot,rot3,rot2); - - OPENGL_utils::translate(id, 0,0,0, rot1); - OPENGL_utils::matXMat(rot2,rot1,rot); - - start.x = x; - start.y = y; - } - - else if (moving==2) - { - zoom = zoom + (y-start.y)/2.0; - start.y = y; - } - - else if (moving==3) - { - translation.x = translation.x - (start.x-x)/3.0; - translation.y = translation.y + (start.y-y)/3.0; - start.x = x; - start.y = y; - } - - glutPostRedisplay(); -} - - -// DISPLAY callback -// ---------------- -void GLUT__display( void ) -{ - glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); - - glPushMatrix(); - glTranslatef(translation.x, translation.y, -zoom); // mouse translation + zoom - glMultMatrixf(rot); // mouse rotation - glTranslatef( -pixdim.x*dim.x/2.0, -pixdim.y*dim.y/2.0, -pixdim.z*dim.z/2.0 ); // center the FOV - glScalef( pixdim.x, pixdim.y, pixdim.z ); // account for voxel size - - /* ============= */ - /* Draw the AXES */ - /* ============= */ - if ( showAxes ) - { - glLineWidth(2); - glBegin(GL_LINES); - glColor4f( 1,0,0,1); glVertex3f( 0,0,0 ); glVertex3f( 10, 0, 0 ); - glColor4f( 0,1,0,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 10, 0 ); - glColor4f( 0,0,1,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 0, 10 ); - glEnd(); - } - - /* =============== */ - /* Draw the TRACTS */ - /* =============== */ - if ( TRK_show ) - { - glPushMatrix(); - glTranslatef(TRK_offset.x, TRK_offset.y, TRK_offset.z); - - glLineWidth(1.0f); - - float *ptr = TRK_coords, *ptrc = TRK_colors; - VECTOR Vc( VOXEL.x+0.5, VOXEL.y+0.5, VOXEL.z+0.5 ); // voxel center - float thr = 0.5*TRK_crop; - for(int f=0; f < TRK_nTractsPlotted; f++) - { - glBegin(GL_LINE_STRIP); - for(int i=0; i < TRK_nPoints[f]; i++) - { - // plot segment only if it's close to center of VOXEL - if ( - ( - TRK_crop_mode && ( - ( showPlane[0] && abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) || - ( showPlane[1] && abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) || - ( showPlane[2] && abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) - ) - || - ( - !TRK_crop_mode && ( - ( abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) && - ( abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) && - ( abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) - ) - ) - { - glColor3f( *ptrc++, *ptrc++, *ptrc++ ); - glVertex3f( *ptr++, *ptr++, *ptr++ ); - } - else - { - glEnd(); - glBegin(GL_LINE_STRIP); - ptr += 3; - ptrc += 3; - } - } - glEnd(); - } - - glPopMatrix(); - } - - /* ============== */ - /* Draw the PEAKS */ - /* ============== */ - if ( PEAKS_show || GLYPHS_show ) - { - glDisable( GL_BLEND ); - glLineWidth( LINE_width ); - glPointSize( LINE_width ); - - glPushMatrix(); - glTranslatef(.5,.5,.5); - - Vec3Df dir, col; - int x,y,z,d,idx; - float norms[PEAKS_n], 
normMax, b0, w; - - // plane YZ - if ( showPlane[0] ) - { - x = (int)VOXEL.x; - for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - if ( GLYPHS_show ) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - // plane XZ - if ( showPlane[1] ) - { - y = (int)VOXEL.y; - for(x=0; ximg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + 
col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - - if ( GLYPHS_show ) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - // plane XY - if ( showPlane[2] ) - { - z = (int)VOXEL.z; - for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = 
(*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - - if( GLYPHS_show) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - glPopMatrix(); - } - - /* =================== */ - /* Draw the SCALAR MAP */ - /* =================== */ - if ( showPlane[0] || showPlane[1] || showPlane[2] ) - { - glDisable( GL_CULL_FACE ); - glEnable( GL_BLEND ); - glBlendFunc( GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA ); - - // to avoid z-fighting - glPolygonOffset( 1.0, 1.0 ); - glEnable(GL_POLYGON_OFFSET_FILL); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - - glLineWidth( 3 ); - - int x, y, z; // voxel coordinates NB: (0,0,0) -> corner of voxel - float color; - - // plane YZ - if ( showPlane[0] ) - { - glPushMatrix(); - glTranslatef(0.5,0,0); - - x = (int)VOXEL.x; - for(y=0; y + #include + #include +#else + #include + #include + #include +#endif + +#include "OPENGL_utils.h" +using namespace OPENGL_utils; + +/* global variables */ +GLfloat id[16], rot[16], rot1[16], rot2[16], rot3[16]; +Vec3Df translation; +Vec3Di start; +GLint moving; +GLfloat zoom; + +float ScreenX, ScreenY; + +void drawString( const char *string ) +{ + static int y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; + if ( string=="" ) + y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; + else + { + glRasterPos2i(10, y); + for (const char* c=string; *c != '\0'; c++) + 
glutBitmapCharacter(GLUT_BITMAP_9_BY_15, *c); + y -= 18; + } +} + +void PrintConfig() +{ + if ( !showConfig ) + return; + + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + glMatrixMode( GL_MODELVIEW ) ; + glPushMatrix() ; + glLoadIdentity() ; + int w = glutGet( GLUT_WINDOW_WIDTH ); + int h = glutGet( GLUT_WINDOW_HEIGHT ); + glOrtho( 0, w, 0, h, -1, 1 ); + glDisable( GL_DEPTH_TEST ); + + char s[1024]; + glColor3f(1, 1, 0); + drawString( "" ); // reset initial position + + drawString( "MAP" ); + sprintf( s, " - value(%d,%d,%d) = %.2f", VOXEL.x, VOXEL.y, VOXEL.z, MAP(VOXEL.x, VOXEL.y, VOXEL.z) ); + drawString( s ); + sprintf( s, " - range = [ %.1f ... %.1f ]", MAP_min_view, MAP_max_view ); + drawString( s ); + sprintf( s, " - opacity = %.1f", MAP_opacity ); + drawString( s ); + + drawString( "SIGNAL" ); + sprintf( s, " - shell = %d/%d (b=%.1f)", GLYPHS_shell+1, SCHEME_shells_b.size(), SCHEME_shells_b[GLYPHS_shell] ); + drawString( s ); + sprintf( s, " - use affine = %s", GLYPHS_use_affine?"true":"false" ); + drawString( s ); + sprintf( s, " - flip = [ %d, %d, %d ]", GLYPHS_flip[0], GLYPHS_flip[1], GLYPHS_flip[2] ); + drawString( s ); + sprintf( s, " - b0 thr = %.1f", GLYPHS_b0_thr ); + drawString( s ); + + if ( PEAKS_n>0 ) + { + drawString( "PEAKS" ); + sprintf( s, " - use affine = %s", PEAKS_use_affine?"true":"false" ); + drawString( s ); + sprintf( s, " - flip = [ %d, %d, %d ]", PEAKS_flip[0], PEAKS_flip[1], PEAKS_flip[2] ); + drawString( s ); + sprintf( s, " - thr = %.1f", PEAKS_thr ); + drawString( s ); + sprintf( s, " - normalize = %s", PEAKS_doNormalize?"true":"false" ); + drawString( s ); + } + + if ( TRK_nTractsPlotted>0 ) + { + drawString( "FIBERS" ); + sprintf( s, " - shift = [ %.1f %.1f %.1f ] (voxels)", TRK_offset.x, TRK_offset.y, TRK_offset.z ); + drawString( s ); + sprintf( s, " - slab thickness = %.1f (voxels)", TRK_crop ); + drawString( s ); + } + + glEnable (GL_DEPTH_TEST); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); + glMatrixMode(GL_MODELVIEW); + glPopMatrix(); +} + + +// KEYBOARD callback +// ----------------- +void GLUT__keyboard( unsigned char key, GLint x=0, GLint y=0 ) +{ + bool doRedraw = true; + + switch( key ) + { + case 'l': showConfig = 1 - showConfig; break; + + case '1': showPlane[0] = 1 - showPlane[0]; break; + case '2': showPlane[1] = 1 - showPlane[1]; break; + case '3': showPlane[2] = 1 - showPlane[2]; break; + case '4': + showPlane[0] = 1; + showPlane[1] = 0; + showPlane[2] = 0; + translation.x = translation.y = 0; + OPENGL_utils::identity(rot1); + OPENGL_utils::rotateX(rot1, 90.0, rot2); + OPENGL_utils::rotateZ(rot2, 90.0, rot); + break; + case '5': + showPlane[0] = 0; + showPlane[1] = 1; + showPlane[2] = 0; + translation.x = translation.y = 0; + OPENGL_utils::identity(rot1); + OPENGL_utils::rotateX(rot1, 90.0, rot); + break; + case '6': + showPlane[0] = 0; + showPlane[1] = 0; + showPlane[2] = 1; + translation.x = translation.y = 0; + OPENGL_utils::identity( rot ); + break; + + case '0': showAxes = 1 - showAxes; break; + case '-': zoom += 10.0; break; + case '+': zoom -= 10.0; break; + case 'm': MAP_max_view = fmaxf(0.0,MAP_max_view-MAP_max*0.05); break; + case 'M': MAP_max_view = fminf(MAP_max,MAP_max_view+MAP_max*0.05); break; + case 'o': MAP_opacity = fmaxf(0.0,MAP_opacity-0.1); break; + case 'O': MAP_opacity = fminf(1.0,MAP_opacity+0.1); break; + case 'w': LINE_width = fmaxf( 1,LINE_width-1); break; + case 'W': LINE_width = fminf(10,LINE_width+1); break; + case 'r': + showPlane[0] = showPlane[1] = showPlane[2] = 1; + 
translation.x = translation.y = 0; + zoom = 0; + OPENGL_utils::identity( rot ); + break; + + case 's': GLYPHS_show = 1 - GLYPHS_show; break; + case 'S': GLYPHS_shell = (GLYPHS_shell+1) % SCHEME_shells_idx.size(); break; + case 'a': GLYPHS_use_affine = 1 - GLYPHS_use_affine; break; + case 'x': GLYPHS_flip[0] = 1 - GLYPHS_flip[0]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].x *= -1; break; + case 'y': GLYPHS_flip[1] = 1 - GLYPHS_flip[1]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].y *= -1; break; + case 'z': GLYPHS_flip[2] = 1 - GLYPHS_flip[2]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].z *= -1; break; + case 'b': GLYPHS_b0_thr = fmaxf(0.0,GLYPHS_b0_thr-10.0); break; + case 'B': GLYPHS_b0_thr = fminf(MAP_max,GLYPHS_b0_thr+10.0); break; + + case 'p': if ( PEAKS_n>0 ) PEAKS_show = 1 - PEAKS_show; break; + case 'A': PEAKS_use_affine = 1 - PEAKS_use_affine; break; + case 'X': PEAKS_flip[0] = 1 - PEAKS_flip[0]; break; + case 'Y': PEAKS_flip[1] = 1 - PEAKS_flip[1]; break; + case 'Z': PEAKS_flip[2] = 1 - PEAKS_flip[2]; break; + case 't': PEAKS_thr = fmaxf(PEAKS_thr - 0.1, 0.0); break; + case 'T': PEAKS_thr = fminf(PEAKS_thr + 0.1, 1.0); break; + case 'n': PEAKS_doNormalize = 1 - PEAKS_doNormalize; break; + + case 'f': if ( TRK_nTractsPlotted>0 ) TRK_show = 1 - TRK_show; break; + case 'c': TRK_crop = fmaxf( 0.0,TRK_crop-0.5); break; + case 'C': TRK_crop = fminf(max(dim.x,max(dim.y,dim.z)),TRK_crop+0.5); break; + case ' ': TRK_crop_mode = 1 - TRK_crop_mode; break; + + case 'q': + case 27 : exit(0); break; + + default: doRedraw = false; + } + + if ( doRedraw ) + glutPostRedisplay(); +} + + +// MENU callback +// ------------- +void GLUT__menu( int id ) +{ + switch( id ) + { + case 0: GLUT__keyboard('q'); break; + + case 101: GLUT__keyboard('s'); break; + case 102: GLUT__keyboard('S'); break; + case 103: GLUT__keyboard('a'); break; + case 104: GLUT__keyboard('x'); break; + case 105: GLUT__keyboard('y'); break; + case 106: GLUT__keyboard('z'); break; + case 107: GLUT__keyboard('b'); break; + case 108: GLUT__keyboard('B'); break; + + case 201: GLUT__keyboard('p'); break; + case 202: GLUT__keyboard('A'); break; + case 203: GLUT__keyboard('X'); break; + case 204: GLUT__keyboard('Y'); break; + case 205: GLUT__keyboard('Z'); break; + case 206: GLUT__keyboard('t'); break; + case 207: GLUT__keyboard('T'); break; + case 208: GLUT__keyboard('n'); break; + + case 301: GLUT__keyboard('f'); break; + case 302: GLUT__keyboard('c'); break; + case 303: GLUT__keyboard('C'); break; + case 304: GLUT__keyboard(' '); break; + + case 401: GLUT__keyboard('1'); break; + case 402: GLUT__keyboard('2'); break; + case 403: GLUT__keyboard('3'); break; + case 404: GLUT__keyboard('4'); break; + case 405: GLUT__keyboard('5'); break; + case 406: GLUT__keyboard('6'); break; + case 407: GLUT__keyboard('0'); break; + case 408: GLUT__keyboard('-'); break; + case 409: GLUT__keyboard('+'); break; + case 410: GLUT__keyboard('m'); break; + case 411: GLUT__keyboard('M'); break; + case 412: GLUT__keyboard('o'); break; + case 413: GLUT__keyboard('O'); break; + case 414: GLUT__keyboard('w'); break; + case 415: GLUT__keyboard('W'); break; + case 416: GLUT__keyboard('r'); break; + case 417: GLUT__keyboard('l'); break; + } +} + + +// Create the dropdown MENU +// ------------------------ +void GLUT__createMenu() +{ + int submenu_SIGNAL_id, submenu_PEAKS_id, submenu_FIBERS_id, submenu_VIEW_id; + + submenu_SIGNAL_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[s] Show/hide", 101); + glutAddMenuEntry("[S] 
Change shell", 102); + glutAddMenuEntry("[a] Use affine", 103); + glutAddMenuEntry("[x] Flip X axis", 104); + glutAddMenuEntry("[y] Flip Y axis", 105); + glutAddMenuEntry("[z] Flip Z axis", 106); + glutAddMenuEntry("[b] Decrease b0 thr", 107); + glutAddMenuEntry("[B] Increase b0 thr", 108); + + if ( PEAKS_n>0 ) + { + submenu_PEAKS_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[p] Show/hide", 201); + glutAddMenuEntry("[A] Use affine", 202); + glutAddMenuEntry("[X] Flip X axis", 203); + glutAddMenuEntry("[Y] Flip Y axis", 204); + glutAddMenuEntry("[Z] Flip Z axis", 205); + glutAddMenuEntry("[t] Decrease threshold",206); + glutAddMenuEntry("[T] Increase threshold",207); + glutAddMenuEntry("[n] Normalize length", 208); + } + + if ( TRK_nTractsPlotted>0 ) + { + submenu_FIBERS_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[f] Show/hide", 301); + glutAddMenuEntry("[c] Decrease crop size",302); + glutAddMenuEntry("[C] Increase crop size",303); + glutAddMenuEntry("[ ] Change crop mode", 304); + } + + submenu_VIEW_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[1] Show/hide YZ plane", 401); + glutAddMenuEntry("[2] Show/hide XZ plane", 402); + glutAddMenuEntry("[3] Show/hide XY plane", 403); + glutAddMenuEntry("[4] Reset to YZ plane", 404); + glutAddMenuEntry("[5] Reset to XZ plane", 405); + glutAddMenuEntry("[6] Reset to XY plane", 406); + glutAddMenuEntry("[0] Show/hide axes", 407); + glutAddMenuEntry("[-] Decrease zoom", 408); + glutAddMenuEntry("[+] Increase zoom", 409); + glutAddMenuEntry("[m] Decrease max value", 410); + glutAddMenuEntry("[M] Increase max value", 411); + glutAddMenuEntry("[o] Decrease opacity", 412); + glutAddMenuEntry("[O] Increase opacity", 413); + glutAddMenuEntry("[t] Decrease line width",414); + glutAddMenuEntry("[T] Increase line width",415); + glutAddMenuEntry("[r] Reset view", 416); + glutAddMenuEntry("[l] Show/hide log", 417); + + int menu_id = glutCreateMenu( GLUT__menu ); + glutAddSubMenu("Signal", submenu_SIGNAL_id); + if ( PEAKS_n>0 ) + glutAddSubMenu("Peaks", submenu_PEAKS_id); + if ( TRK_nTractsPlotted>0 ) + glutAddSubMenu("Fibers", submenu_FIBERS_id); + glutAddSubMenu("View options", submenu_VIEW_id); + glutAddMenuEntry("Quit", 0); + glutAttachMenu(GLUT_RIGHT_BUTTON); +} + + +// RESHAPE callback +// ---------------- +void GLUT__reshape( GLint w, GLint h ) +{ + ScreenX = w; + ScreenY = h; + + glMatrixMode( GL_PROJECTION ); + glLoadIdentity(); + gluPerspective( 45.0f, (GLfloat)w / (GLfloat)h, 1.0f, 5000.0f ); + + glMatrixMode( GL_MODELVIEW ); + glLoadIdentity(); + gluLookAt( + 0.0, 0.0, 2.0 * max(pixdim.x*dim.x,pixdim.y*dim.y) * (GLfloat)ScreenY/(GLfloat)ScreenX, // eye point + 0.0, 0.0, 0.0, // reference point + 0.0, 1.0, 0.0 // up vector + ); +} + + +// SPECIALKEY callback +// ------------------- +void GLUT__specialkey( GLint key, GLint x, GLint y ) +{ + bool doRedraw = true; + GLint modif = glutGetModifiers(); + GLint ALT = modif & GLUT_ACTIVE_ALT; + GLint CTRL = modif & GLUT_ACTIVE_CTRL; + + switch( key ) + { + case GLUT_KEY_LEFT: + if ( ALT ) + TRK_offset.x -= 0.5; + else if ( CTRL ) + translation.x -= 2.0; + else + VOXEL.x--; + break; + case GLUT_KEY_RIGHT: + if ( ALT ) + TRK_offset.x += 0.5; + else if ( CTRL ) + translation.x += 2.0; + else + VOXEL.x++; + break; + case GLUT_KEY_DOWN: + if ( ALT ) + TRK_offset.y -= 0.5; + else if ( CTRL ) + translation.y -= 2.0; + else + VOXEL.y--; + break; + case GLUT_KEY_UP: + if ( ALT ) + TRK_offset.y += 0.5; + else if ( CTRL ) + translation.y += 2.0; + else + VOXEL.y++; + break; + case 
GLUT_KEY_PAGE_DOWN: + if ( ALT ) + TRK_offset.z -= 0.5; + else + VOXEL.z--; + break; + case GLUT_KEY_PAGE_UP: + if ( ALT ) + TRK_offset.z += 0.5; + else + VOXEL.z++; + break; + + default: + doRedraw = false; + } + + // check the bounds + VOXEL.x = max( VOXEL.x, 0 ); + VOXEL.y = max( VOXEL.y, 0 ); + VOXEL.z = max( VOXEL.z, 0 ); + VOXEL.x = min( VOXEL.x, dim.x-1 ); + VOXEL.y = min( VOXEL.y, dim.y-1 ); + VOXEL.z = min( VOXEL.z, dim.z-1 ); + + if ( doRedraw ) + glutPostRedisplay(); +} + + + +// MOUSE callback +// -------------- +void GLUT__mouse( GLint button, GLint state, GLint x, GLint y ) +{ + if (state == GLUT_DOWN) + { + if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() != GLUT_ACTIVE_CTRL ) + { + moving = 1; + start.x = x; + start.y = y; + } + // NOTE: does not work, issue with glutGetModifiers not getting CTRL + // else if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_CTRL ) + // { + // moving = 2; + // start.x = x; + // start.y = y; + // } + else if ( (button == GLUT_MIDDLE_BUTTON) || (button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_ALT) ) + { + moving = 3; + start.x = x; + start.y = y; + } + } + else if (state == GLUT_UP) + { + moving = 0; + } +} + + +// MOTION callback +// --------------- +void GLUT__motion( GLint x, GLint y ) +{ + if (moving==1) + { + OPENGL_utils::translate(id, 0,0,0, rot1); + + OPENGL_utils::rotateY(id,start.x-x,rot3); + OPENGL_utils::matXMat(rot,rot1,rot2); + OPENGL_utils::rotateX(id,start.y-y,rot1); + OPENGL_utils::matXMat(rot2,rot1,rot); + OPENGL_utils::matXMat(rot,rot3,rot2); + + OPENGL_utils::translate(id, 0,0,0, rot1); + OPENGL_utils::matXMat(rot2,rot1,rot); + + start.x = x; + start.y = y; + } + + else if (moving==2) + { + zoom = zoom + (y-start.y)/2.0; + start.y = y; + } + + else if (moving==3) + { + translation.x = translation.x - (start.x-x)/3.0; + translation.y = translation.y + (start.y-y)/3.0; + start.x = x; + start.y = y; + } + + glutPostRedisplay(); +} + + +// DISPLAY callback +// ---------------- +void GLUT__display( void ) +{ + glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); + + glPushMatrix(); + glTranslatef(translation.x, translation.y, -zoom); // mouse translation + zoom + glMultMatrixf(rot); // mouse rotation + glTranslatef( -pixdim.x*dim.x/2.0, -pixdim.y*dim.y/2.0, -pixdim.z*dim.z/2.0 ); // center the FOV + glScalef( pixdim.x, pixdim.y, pixdim.z ); // account for voxel size + + /* ============= */ + /* Draw the AXES */ + /* ============= */ + if ( showAxes ) + { + glLineWidth(2); + glBegin(GL_LINES); + glColor4f( 1,0,0,1); glVertex3f( 0,0,0 ); glVertex3f( 10, 0, 0 ); + glColor4f( 0,1,0,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 10, 0 ); + glColor4f( 0,0,1,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 0, 10 ); + glEnd(); + } + + /* =============== */ + /* Draw the TRACTS */ + /* =============== */ + if ( TRK_show ) + { + glPushMatrix(); + glTranslatef(TRK_offset.x, TRK_offset.y, TRK_offset.z); + + glLineWidth(1.0f); + + float *ptr = TRK_coords, *ptrc = TRK_colors; + VECTOR Vc( VOXEL.x+0.5, VOXEL.y+0.5, VOXEL.z+0.5 ); // voxel center + float thr = 0.5*TRK_crop; + for(int f=0; f < TRK_nTractsPlotted; f++) + { + glBegin(GL_LINE_STRIP); + for(int i=0; i < TRK_nPoints[f]; i++) + { + // plot segment only if it's close to center of VOXEL + if ( + ( + TRK_crop_mode && ( + ( showPlane[0] && abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) || + ( showPlane[1] && abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) || + ( showPlane[2] && abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) + ) + || + ( + !TRK_crop_mode && ( + ( abs( 
(ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) && + ( abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) && + ( abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) + ) + ) + { + glColor3f( *ptrc++, *ptrc++, *ptrc++ ); + glVertex3f( *ptr++, *ptr++, *ptr++ ); + } + else + { + glEnd(); + glBegin(GL_LINE_STRIP); + ptr += 3; + ptrc += 3; + } + } + glEnd(); + } + + glPopMatrix(); + } + + /* ============== */ + /* Draw the PEAKS */ + /* ============== */ + if ( PEAKS_show || GLYPHS_show ) + { + glDisable( GL_BLEND ); + glLineWidth( LINE_width ); + glPointSize( LINE_width ); + + glPushMatrix(); + glTranslatef(.5,.5,.5); + + Vec3Df dir, col; + int x,y,z,d,idx; + float norms[PEAKS_n], normMax, b0, w; + + // plane YZ + if ( showPlane[0] ) + { + x = (int)VOXEL.x; + for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + if ( GLYPHS_show ) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * 
SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + // plane XZ + if ( showPlane[1] ) + { + y = (int)VOXEL.y; + for(x=0; ximg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + + if ( GLYPHS_show ) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + // plane XY + if ( showPlane[2] ) + { + z = (int)VOXEL.z; + for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = 
(*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + + if( GLYPHS_show) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + glPopMatrix(); + } + + /* =================== */ + /* Draw the SCALAR MAP */ + /* =================== */ + if ( showPlane[0] || showPlane[1] || showPlane[2] ) + { + glDisable( GL_CULL_FACE ); + glEnable( GL_BLEND ); + glBlendFunc( GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA ); + + // to avoid z-fighting + glPolygonOffset( 1.0, 1.0 ); + glEnable(GL_POLYGON_OFFSET_FILL); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + + glLineWidth( 3 ); + + int x, y, z; // voxel coordinates NB: (0,0,0) -> corner of voxel + float color; + + // plane YZ + if ( 
showPlane[0] ) + { + glPushMatrix(); + glTranslatef(0.5,0,0); + + x = (int)VOXEL.x; + for(y=0; y - -#include "VECTOR.h" -typedef VECTOR Vec3Di; -typedef VECTOR Vec3Df; - - -namespace OPENGL_utils -{ - -void identity(GLfloat* result) -{ - for (int i=0; i<4; i++) - for (int j=0; j<4; j++) - if (i==j) result[4*i+j]=1; else result[4*i+j]=0; -} - - -void matXMat(GLfloat* m, GLfloat* m1, GLfloat* result) -{ - for (int i=0; i<4; i++) - for (int j=0; j<4; j++) - { - result[4*i+j]=0; - for (int t=0; t<4; t++) - result[4*i+j]=result[4*i+j]+m[4*i+t]*m1[4*t+j]; - } -} - - -void rotateZ(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = cos(ang/180*3.1415); - matrix[5] = cos(ang/180*3.1415); - matrix[1] = -sin(ang/180*3.1415); - matrix[4] = sin(ang/180*3.1415); - matrix[10] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void rotateY(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = cos(ang/180*3.1415); - matrix[10] = cos(ang/180*3.1415); - matrix[8] = -sin(ang/180*3.1415); - matrix[2] = sin(ang/180*3.1415); - matrix[5] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void rotateX(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[5] = cos(ang/180*3.1415); - matrix[10] = cos(ang/180*3.1415); - matrix[6] = -sin(ang/180*3.1415); - matrix[9] = sin(ang/180*3.1415); - matrix[0] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void translate(GLfloat* m, GLfloat x,GLfloat y,GLfloat z, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = 1; - matrix[5] = 1; - matrix[10] = 1; - matrix[15] = 1; - matrix[12] = x; - matrix[13] = y; - matrix[14] = z; - matXMat(matrix,m,result); -} - -} -#endif +#ifndef __OPENGL_UTILS_H__ +#define __OPENGL_UTILS_H__ + +#include + +#include "VECTOR.h" +typedef VECTOR Vec3Di; +typedef VECTOR Vec3Df; + + +namespace OPENGL_utils +{ + +void identity(GLfloat* result) +{ + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) + if (i==j) result[4*i+j]=1; else result[4*i+j]=0; +} + + +void matXMat(GLfloat* m, GLfloat* m1, GLfloat* result) +{ + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) + { + result[4*i+j]=0; + for (int t=0; t<4; t++) + result[4*i+j]=result[4*i+j]+m[4*i+t]*m1[4*t+j]; + } +} + + +void rotateZ(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = cos(ang/180*3.1415); + matrix[5] = cos(ang/180*3.1415); + matrix[1] = -sin(ang/180*3.1415); + matrix[4] = sin(ang/180*3.1415); + matrix[10] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void rotateY(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = cos(ang/180*3.1415); + matrix[10] = cos(ang/180*3.1415); + matrix[8] = -sin(ang/180*3.1415); + matrix[2] = sin(ang/180*3.1415); + matrix[5] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void rotateX(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[5] = cos(ang/180*3.1415); + matrix[10] = cos(ang/180*3.1415); + matrix[6] = -sin(ang/180*3.1415); + matrix[9] = sin(ang/180*3.1415); + matrix[0] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void translate(GLfloat* m, GLfloat x,GLfloat y,GLfloat z, 
GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = 1; + matrix[5] = 1; + matrix[10] = 1; + matrix[15] = 1; + matrix[12] = x; + matrix[13] = y; + matrix[14] = z; + matXMat(matrix,m,result); +} + +} +#endif diff --git a/extras/COMMIT_debugger/main.cxx b/extras/COMMIT_debugger/main.cxx index 88793e62..46c94717 100755 --- a/extras/COMMIT_debugger/main.cxx +++ b/extras/COMMIT_debugger/main.cxx @@ -1,651 +1,651 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "tclap/CmdLine.h" -#include -using namespace std; - -#include "colormaps.h" - -NIFTI* niiDWI; -VECTOR dim; -VECTOR pixdim; - -int SCHEME_version; -vector< VECTOR > SCHEME_dirs; -vector SCHEME_b; -vector SCHEME_idxB0; -vector SCHEME_idxDWI; -vector SCHEME_shells_b; -vector< vector > SCHEME_shells_idx; - -blitz::Array MAP; -VECTOR VOXEL; -float MAP_min, MAP_min_view, MAP_max, MAP_max_view; -float MAP_opacity = 0.5; -bool showPlane[3] = { true, true, true }; -bool showAxes = true; -bool showConfig = true; -float LINE_width = 2.0; - -NIFTI* niiPEAKS; -int PEAKS_n; -bool PEAKS_show = false; -float PEAKS_thr = 0.0; -bool PEAKS_doNormalize = false; -bool PEAKS_flip[3] = {false, false, false}; -bool PEAKS_use_affine = false; -float PEAKS_affine[3][3]; - -TrackVis TRK_file; -int TRK_skip; -int TRK_nTractsPlotted; -int* TRK_nPoints; -float* TRK_coords; -float* TRK_colors; -float TRK_crop = 1.0; -bool TRK_crop_mode = true; -bool TRK_show = false; -VECTOR TRK_offset; - -bool GLYPHS_show = false; -int GLYPHS_shell = 0; -bool GLYPHS_flip[3] = {false, false, false}; -float GLYPHS_b0_thr = 50.0; -bool GLYPHS_use_affine = false; -float GLYPHS_affine[3][3]; - -#include "OPENGL_callbacks.cxx" - - -/*----------------------------------------------------------------------------------------------------------------------------------*/ -int main(int argc, char** argv) -{ - TCLAP::CmdLine cmd("This tool allows one to display in a common 3D space all the objects (DWI data, streamlines etc...) used by COMMIT in order to spot possible incosistencies between the conventions of COMMIT and the software that generated the data, e.g. 
flip in some axes in the DWI data or in the peaks, spatial shift in the streamlines, whether the affine transformation was already applied to the data etc..", ' ', "1.1"); - - TCLAP::UnlabeledValueArg argDWI( "dwi","Filename of the DWI dataset [4D NIFTI]", true, "", "DWI", cmd ); - TCLAP::ValueArg argMAP( "m", "map", "Background map [3D NIFTI]", false, "", "map", cmd ); - TCLAP::ValueArg argPEAKS( "p", "peaks", "Main diffusion directions for the extra-axonal part in each voxel [4D NIFTI]", false, "", "peaks", cmd ); - TCLAP::ValueArg argTRK( "f", "fibers", "Streamlines for the intra-axonal part [.TRK format]", false, "", "fibers", cmd ); - TCLAP::UnlabeledValueArg argSCHEME( "scheme","Acquisition scheme [text]", true, "", "scheme", cmd ); - - try { cmd.parse( argc, argv ); } - catch (TCLAP::ArgException &e) { cerr << "error: " << e.error() << " for arg " << e.argId() << endl; } - - string DWI_filename( argDWI.getValue() ); - string SCHEME_filename( argSCHEME.getValue() ); - string PEAKS_filename( argPEAKS.getValue() ); - string TRK_filename( argTRK.getValue() ); - string MAP_filename( argMAP.getValue() ); - - - // =================== - // Reading DWI dataset - // =================== - COLOR_msg( "-> Reading 'DWI' dataset:", "\n" ); - - niiDWI = new NIFTI; - niiDWI->open( DWI_filename, true ); - if ( !niiDWI->isValid() ) - { - COLOR_error( "Unable to open file", "\t" ); - return EXIT_FAILURE; - } - dim.x = niiDWI->hdr->dim[1]; - dim.y = niiDWI->hdr->dim[2]; - dim.z = niiDWI->hdr->dim[3]; - pixdim.x = niiDWI->hdr->pixdim[1]; - pixdim.y = niiDWI->hdr->pixdim[2]; - pixdim.z = niiDWI->hdr->pixdim[3]; - printf( "\tdim : %d x %d x %d x %d\n", dim.x, dim.y, dim.z, niiDWI->hdr->dim[4] ); - printf( "\tpixdim : %.4f x %.4f x %.4f\n", pixdim.x, pixdim.y, pixdim.z ); - printf( "\tqform : %d\n", niiDWI->hdr->qform_code ); - mat44 DWI_qform = niiDWI->hdr->qto_xyz; - if ( niiDWI->hdr->qform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", DWI_qform.m[i][j] ); - printf( "|\n" ); - } - } - else - { - COLOR_warning( "This should never happen!", "\t\t" ); - } - printf( "\tsform : %d\n", niiDWI->hdr->sform_code ); - mat44 DWI_sform = niiDWI->hdr->sto_xyz; - if ( niiDWI->hdr->sform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", DWI_sform.m[i][j] ); - printf( "|\n" ); - } - } - - // Read the affine matrix to rotate the vectors - // NB: we need the inverse, but in this case inv=transpose - if ( niiDWI->hdr->qform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - GLYPHS_affine[i][j] = DWI_qform.m[j][i]; - } - else if ( niiDWI->hdr->sform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - GLYPHS_affine[i][j] = DWI_sform.m[j][i]; - } - else { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - GLYPHS_affine[i][j] = 0; - for(int i=0; i<3 ;i++) - GLYPHS_affine[i][i] = 1; - } - - mat33 tmp; - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - tmp.m[i][j] = GLYPHS_affine[i][j]; - printf( "\tAffine used (%s):\n", nifti_mat33_determ(tmp)<0?"RADIOLOGICAL":"NEUROLOGICAL" ); - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<3 ;j++) - printf( "%9.4f ", GLYPHS_affine[i][j] ); - printf( "|\n" ); - } - - COLOR_msg( " [OK]" ); - - - // =================== - // Reading SCHEME file - // =================== - COLOR_msg( "-> Reading 'SCHEME' file:", "\n" ); - - char line[1000]; - FILE* pFile = fopen( SCHEME_filename.c_str(), "rt" ); - - // read the version - 
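The affine handling above copies DWI_qform.m[j][i], i.e. the transpose, and prints RADIOLOGICAL or NEUROLOGICAL from the sign of the 3x3 determinant. A minimal numpy sketch of the two facts this relies on, using a made-up rotation matrix rather than any real NIFTI header (illustration only, not part of the patch):

    import numpy as np

    # Hypothetical voxel-to-world rotation block (90 degrees about z).
    R = np.array([[0., -1., 0.],
                  [1.,  0., 0.],
                  [0.,  0., 1.]])

    # For a pure rotation the inverse equals the transpose, which is why
    # the code can copy m[j][i] instead of inverting the matrix.
    assert np.allclose(np.linalg.inv(R), R.T)

    # The determinant sign distinguishes the two handedness conventions.
    print('RADIOLOGICAL' if np.linalg.det(R) < 0 else 'NEUROLOGICAL')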
// ---------------- - try - { - while( fgets(line, 1000, pFile) ) - if ( line[0]!='#' ) - break; - - std::regex reVersion("^VERSION: (.*)\\s*$"); - std::smatch reMatches; - - if ( !std::regex_match(string(line), reMatches, reVersion) ) - { - // no header found, assume standards BVECTOR format - SCHEME_version = 0; - fseek(pFile, -strlen(line), SEEK_CUR); - } - else - { - if( strcmp(reMatches[1].str().c_str(),"0")==0 || strcmp(reMatches[1].str().c_str(),"BVECTOR")==0 ) - SCHEME_version = 0; - else if( strcmp(reMatches[1].str().c_str(),"1")==0 || strcmp(reMatches[1].str().c_str(),"STEJSKALTANNER")==0 ) - SCHEME_version = 1; - else - throw "Version not recognized"; - } - } - catch( const char* msg ) - { - COLOR_error( msg, "\t" ); - return EXIT_FAILURE; - } - printf( "\tversion : %s\n", SCHEME_version==0?"BVECTOR":"STEJSKALTANNER" ); - - // read the data - // ------------- - try - { - string reFLOAT( "[-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?" ); - std::regex reVERSION0( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); - std::regex reVERSION1( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); - std::regex reEMPTY( "^\\s*$" ); - std::smatch reMatches; - int Ns = 0; - float x, y, z, b, G, D, d; - while( fgets(line, 1000, pFile) ) - { - if( std::regex_match(string(line), reMatches, reEMPTY) ) - continue; // skip empty lines - - if( SCHEME_version == 0 ) - { - if ( !std::regex_match(string(line), reMatches, reVERSION0) ) - throw "Wrong row format"; - x = std::atof( reMatches[1].str().c_str() ); - y = std::atof( reMatches[2].str().c_str() ); - z = std::atof( reMatches[3].str().c_str() ); - b = std::atof( reMatches[4].str().c_str() ); // in mm^2/s - VECTOR tmp( x, y, z ); - tmp.Normalize(); - SCHEME_dirs.push_back( tmp ); - SCHEME_b.push_back( b ); - } - else - { - if ( !std::regex_match(string(line), reMatches, reVERSION1) ) - throw "Wrong row format"; - x = std::atof( reMatches[1].str().c_str() ); - y = std::atof( reMatches[2].str().c_str() ); - z = std::atof( reMatches[3].str().c_str() ); - G = std::atof( reMatches[4].str().c_str() ); - D = std::atof( reMatches[5].str().c_str() ); - d = std::atof( reMatches[6].str().c_str() ); - VECTOR tmp( x, y, z ); - tmp.Normalize(); - SCHEME_dirs.push_back( tmp ); - b = std::pow( 267.513e6*G*d, 2 ) * (D-d/3.0) * 1e-6; // in mm^2/s - SCHEME_b.push_back( b ); - } - - if ( b<5.0 ) - { - SCHEME_idxB0.push_back( Ns ); - } - else - { - SCHEME_idxDWI.push_back( Ns ); - if ( std::find(SCHEME_shells_b.begin(), SCHEME_shells_b.end(), b) == SCHEME_shells_b.end() ) - { - SCHEME_shells_b.push_back( b ) ; - vector tmp; - SCHEME_shells_idx.push_back( tmp ) ; - } - } - Ns++; - } - } - catch( const char* msg ) - { - COLOR_error( msg, "\t" ); - return EXIT_FAILURE; - } - fclose(pFile); - - printf( "\tgradients : %d\n", SCHEME_b.size() ); - if ( niiDWI->hdr->dim[4] != SCHEME_b.size() ) - { - COLOR_error( "The scheme does not match the DWI dataset", "\t" ); - return EXIT_FAILURE; - } - - // fill data structure about the SCHEME - // ------------------------------------ - for(int i=0; i < SCHEME_b.size() ;i++) - { - if ( SCHEME_b[i] < 5 ) - continue; - int s = std::find( SCHEME_shells_b.begin(), SCHEME_shells_b.end(), SCHEME_b[i] ) - SCHEME_shells_b.begin(); - SCHEME_shells_idx[s].push_back( i ); - } - - printf( "\tscheme : %d b0 and %d shells (", SCHEME_idxB0.size(), SCHEME_shells_idx.size() ); - for(int i=0; i < SCHEME_shells_b.size() ;i++) - printf( " [%d @ b=%.1f]", 
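For STEJSKALTANNER rows the parser above derives the b-value as b = (gamma*G*delta)^2 * (Delta - delta/3), with gamma = 267.513e6 rad/(s*T), and scales it by 1e-6 to the per-mm^2 units stored in SCHEME_b. A stand-alone numeric check of that formula; the G, Delta and delta values here are illustrative, not taken from any dataset:

    gamma = 267.513e6    # 1H gyromagnetic ratio in rad/(s*T), as in the parser above
    G     = 0.04         # gradient strength in T/m (illustrative)
    Delta = 0.030        # gradient separation in s (illustrative)
    delta = 0.020        # gradient duration in s (illustrative)

    b = (gamma * G * delta)**2 * (Delta - delta / 3.0) * 1e-6
    print(b)             # ~1068.7, i.e. roughly a b=1000 shell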
SCHEME_shells_idx[i].size(), SCHEME_shells_b[i] ); - printf( " )\n" ); - - COLOR_msg( " [OK]" ); - - - - // ======================= - // Creating BACKGROUND map - // ======================= - COLOR_msg( "-> Preparing 'BACKGROUND' map:", "\n" ); - MAP.resize(dim.x,dim.y,dim.z); - if ( !MAP_filename.empty() ) - { - printf( "\tdata : reading from file\n" ); - NIFTI* niiMAP = new NIFTI; - niiMAP->open( MAP_filename, true ); - if ( !niiMAP->isValid() ) - { - COLOR_error( "Unable to open the file", "\t" ); - return EXIT_FAILURE; - } - - printf( "\tdim : %d x %d x %d x %d\n" , niiMAP->hdr->dim[1], niiMAP->hdr->dim[2], niiMAP->hdr->dim[3], niiMAP->hdr->dim[4] ); - printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiMAP->hdr->pixdim[1], niiMAP->hdr->pixdim[2], niiMAP->hdr->pixdim[3] ); - - if ( niiMAP->hdr->dim[1] != dim.x || niiMAP->hdr->dim[2] != dim.y || niiMAP->hdr->dim[3] != dim.z ) - { - COLOR_error( "The DIMENSIONS do not match those of DWI images", "\t" ); - return EXIT_FAILURE; - } - if ( abs(niiMAP->hdr->pixdim[1]-pixdim.x) > 1e-4 || abs(niiMAP->hdr->pixdim[2]-pixdim.y) > 1e-4 || abs(niiMAP->hdr->pixdim[3]-pixdim.z) > 1e-4 ) - { - COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); - } - - FLOAT32 MIN = 0;//(*niiMAP->img)(0,0,0); - FLOAT32 MAX = MIN; - - for(int i=0; iimg)(i,j,k); - if ( MAP(i,j,k) > MAX ) - MAX = MAP(i,j,k); - if ( MAP(i,j,k) < MIN ) - MIN = MAP(i,j,k); - } - if ( MAX - MIN <= 0 ) - { - COLOR_error( "The dynamic range is zero", "\t" ); - return EXIT_FAILURE; - } - MAP_min = MIN; - MAP_min_view = 0; - MAP_max = MAP_max_view = MAX; - - printf( "\tvalues : [%.2e ... %.2e]\n", MAP_min, MAP_max ); - COLOR_msg( " [OK]" ); - } - else - { - printf( "\tdata : " ); - - if ( SCHEME_idxB0.size() > 0 ) - { - printf( "taking first b0 image\n" ); - FLOAT32 MIN = (*niiDWI->img)(0,0,0,SCHEME_idxB0[0]); - FLOAT32 MAX = MIN; - - for(int i=0; iimg)(i,j,k,SCHEME_idxB0[0]); - if ( MAP(i,j,k) > MAX ) - MAX = MAP(i,j,k); - if ( MAP(i,j,k) < MIN ) - MIN = MAP(i,j,k); - } - if ( MAX - MIN <= 0 ) - { - COLOR_error( "The dynamic range is zero", "\t" ); - return EXIT_FAILURE; - } - MAP_min = MIN; - MAP_min_view = 0; - MAP_max = MAP_max_view = MAX; - } - else - { - printf( "no b0 found\n" ); - MAP = 0; - MAP_min = MAP_min_view = 0; - MAP_max = MAP_max_view = 1; - } - printf( "\tvalues : [%.2e ... 
%.2e]\n", MAP_min, MAP_max ); - COLOR_msg( " [OK]" ); - } - - - // ================== - // Reading PEAKS file - // ================== - COLOR_msg( "-> Reading 'PEAKS' dataset:", "\n" ); - - if ( !PEAKS_filename.empty() ) - { - niiPEAKS = new NIFTI; - niiPEAKS->open( PEAKS_filename, true ); - if ( !niiPEAKS->isValid() ) - { - COLOR_error( "Unable to open the file", "\t" ); - return false; - } - - if ( niiPEAKS->hdr->dim[0] != 4 || niiPEAKS->hdr->dim[4]%3 != 0 ) - { - COLOR_error( "The size must be (*,*,*,3*k)", "\t" ); - return EXIT_FAILURE; - } - PEAKS_n = niiPEAKS->hdr->dim[4]/3; - - printf( "\tdim : %d x %d x %d (%d peaks per voxel)\n" , niiPEAKS->hdr->dim[1], niiPEAKS->hdr->dim[2], niiPEAKS->hdr->dim[3], PEAKS_n ); - printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiPEAKS->hdr->pixdim[1], niiPEAKS->hdr->pixdim[2], niiPEAKS->hdr->pixdim[3] ); - - printf( "\tqform : %d\n", niiPEAKS->hdr->qform_code ); - mat44 PEAKS_qform = niiPEAKS->hdr->qto_xyz; - if ( niiPEAKS->hdr->qform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", PEAKS_qform.m[i][j] ); - printf( "|\n" ); - } - } - else - { - COLOR_warning( "This should never happen!", "\t\t" ); - } - - printf( "\tsform : %d\n", niiPEAKS->hdr->sform_code ); - mat44 PEAKS_sform = niiPEAKS->hdr->sto_xyz; - if ( niiPEAKS->hdr->sform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", PEAKS_sform.m[i][j] ); - printf( "|\n" ); - } - } - - if ( niiPEAKS->hdr->dim[1] != dim.x || niiPEAKS->hdr->dim[2] != dim.y || niiPEAKS->hdr->dim[3] != dim.z ) - { - COLOR_error( "The DIMENSIONS do not match those of DWI images", "\t" ); - return EXIT_FAILURE; - } - if ( abs(niiPEAKS->hdr->pixdim[1]-pixdim.x) > 1e-3 || abs(niiPEAKS->hdr->pixdim[2]-pixdim.y) > 1e-3 || abs(niiPEAKS->hdr->pixdim[3]-pixdim.z) > 1e-3 ) - { - COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); - } - if ( - niiPEAKS->hdr->sform_code != niiDWI->hdr->sform_code || niiPEAKS->hdr->qform_code != niiDWI->hdr->qform_code || niiPEAKS->hdr->pixdim[0] != niiDWI->hdr->pixdim[0] || - niiPEAKS->hdr->quatern_b != niiDWI->hdr->quatern_b || niiPEAKS->hdr->quatern_c != niiDWI->hdr->quatern_c || niiPEAKS->hdr->quatern_d != niiDWI->hdr->quatern_d || - niiPEAKS->hdr->qoffset_x != niiDWI->hdr->qoffset_x || niiPEAKS->hdr->qoffset_y != niiDWI->hdr->qoffset_y || niiPEAKS->hdr->qoffset_z != niiDWI->hdr->qoffset_z - ) - { - COLOR_warning( "The GEOMETRY does not match that of DWI images", "\t" ); - } - - // Read the affine matrix to rotate the vectors - // NB: we need the inverse, but in this case inv=transpose - if ( niiPEAKS->hdr->qform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - PEAKS_affine[i][j] = PEAKS_qform.m[j][i]; - } - else if ( niiPEAKS->hdr->sform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - PEAKS_affine[i][j] = PEAKS_sform.m[j][i]; - } - else { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - PEAKS_affine[i][j] = 0; - for(int i=0; i<3 ;i++) - PEAKS_affine[i][i] = 1; - } - - printf( "\tAffine used :\n" ); - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<3 ;j++) - printf( "%9.4f ", PEAKS_affine[i][j] ); - printf( "|\n" ); - } - - COLOR_msg( " [OK]" ); - } - else { - // no peaks are passed and won't be showed - COLOR_msg( " [no peaks specified]" ); - PEAKS_n = 0; - } - - - // =================== - // Reading TRACTS file - // =================== - COLOR_msg( "-> Reading 'TRACTOGRAM':", "\n" ); - - if ( 
!TRK_filename.empty() ) - { - TRK_file = TrackVis(); - if ( !TRK_file.open( TRK_filename ) ) - { - COLOR_error( "Unable to open the file", "\t" ); - return false; - } - - printf("\tcount : %d\n" , TRK_file.hdr.n_count ); - printf("\tdim : %d x %d x %d\n" , TRK_file.hdr.dim[0], TRK_file.hdr.dim[1], TRK_file.hdr.dim[2] ); - printf("\tpixdim : %.4f x %.4f x %.4f\n", TRK_file.hdr.voxel_size[0], TRK_file.hdr.voxel_size[1], TRK_file.hdr.voxel_size[2] ); - printf("\tscalars : %d\n" , TRK_file.hdr.n_scalars ); - printf("\tproperties : %d\n" , TRK_file.hdr.n_properties ); - - if ( TRK_file.hdr.dim[0] != dim.x || TRK_file.hdr.dim[1] != dim.y || TRK_file.hdr.dim[2] != dim.z || - abs(TRK_file.hdr.voxel_size[0]-pixdim.x) > 1e-4 || abs(TRK_file.hdr.voxel_size[1]-pixdim.y) > 1e-4 || abs(TRK_file.hdr.voxel_size[2]-pixdim.z) > 1e-4 ) - { - COLOR_error( "The GEOMETRY does not match those of DWI images", "\t" ); - return EXIT_FAILURE; - } - - TRK_skip = ceil( TRK_file.hdr.n_count / 25000.0 ); - int N, n_s = TRK_file.hdr.n_scalars, n_p = TRK_file.hdr.n_properties; - FILE* fp = TRK_file.getFilePtr(); - - // count how many points I need to store in memory - int TractsRead = 0, CoordsRead = 0; - fseek(fp, 1000, SEEK_SET); - for(int f=0; f < TRK_file.hdr.n_count ; f++) - { - fread( (char*)&N, 1, 4, fp ); - fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); - if ( f%TRK_skip==0 ) - { - TractsRead++; - CoordsRead += N; - } - } - printf("\tin memory : %d (%d points)\n" , TractsRead, CoordsRead ); - - // create data structure for drawing the tracts - TRK_nTractsPlotted = TractsRead; - TRK_nPoints = new int[TRK_nTractsPlotted]; - TRK_coords = new float[3*CoordsRead]; - TRK_colors = new float[3*CoordsRead]; - - float* ptr = TRK_coords; - float* ptrc = TRK_colors; - float norm; - VECTOR dir; - TractsRead = 0; - fseek(fp, 1000, SEEK_SET); - for(int f=0; f < TRK_file.hdr.n_count ; f++) - { - if ( f%TRK_skip==0 ) - { - fread( (char*)&N, 1, 4, fp ); - TRK_nPoints[TractsRead] = N; - - for(int i=0; i 0 ) - { - dir.x = *(ptr ) - *(ptr-3); - dir.y = *(ptr+1) - *(ptr-2); - dir.z = *(ptr+2) - *(ptr-1); - norm = dir.norm(); - ptrc[0] = abs( dir.x / norm ); - ptrc[1] = abs( dir.y / norm ); - ptrc[2] = abs( dir.z / norm ); - } - else - { - ptrc[0] = 0; - ptrc[1] = 0; - ptrc[2] = 0; - } - - ptr += 3; - ptrc += 3; - } - fseek( fp, n_p*4, SEEK_CUR ); - TractsRead++; - } - else - { - fread( (char*)&N, 1, 4, fp ); - fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); - } - } - - COLOR_msg( " [OK]" ); - printf( "\n\n" ); - } - else - { - // no fibers are passed and won't be showed - COLOR_msg( " [no streamlines specified]" ); - TRK_nTractsPlotted = 0; - } - - TRK_offset.x = 0; - TRK_offset.y = 0; - TRK_offset.z = 0; - - - // ============ - // SETUP OpenGL - // ============ - VOXEL.x = round( dim.x / 2.0 ); - VOXEL.y = round( dim.y / 2.0 ); - VOXEL.z = round( dim.z / 2.0 ); - OpenGL_init( argc, argv ); - - return EXIT_SUCCESS; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include "tclap/CmdLine.h" +#include +using namespace std; + +#include "colormaps.h" + +NIFTI* niiDWI; +VECTOR dim; +VECTOR pixdim; + +int SCHEME_version; +vector< VECTOR > SCHEME_dirs; +vector SCHEME_b; +vector SCHEME_idxB0; +vector SCHEME_idxDWI; +vector SCHEME_shells_b; +vector< vector > SCHEME_shells_idx; + +blitz::Array MAP; +VECTOR VOXEL; +float MAP_min, MAP_min_view, MAP_max, MAP_max_view; +float MAP_opacity = 0.5; +bool showPlane[3] = { true, true, true }; +bool showAxes = true; +bool showConfig = true; +float LINE_width = 2.0; + +NIFTI* 
niiPEAKS; +int PEAKS_n; +bool PEAKS_show = false; +float PEAKS_thr = 0.0; +bool PEAKS_doNormalize = false; +bool PEAKS_flip[3] = {false, false, false}; +bool PEAKS_use_affine = false; +float PEAKS_affine[3][3]; + +TrackVis TRK_file; +int TRK_skip; +int TRK_nTractsPlotted; +int* TRK_nPoints; +float* TRK_coords; +float* TRK_colors; +float TRK_crop = 1.0; +bool TRK_crop_mode = true; +bool TRK_show = false; +VECTOR TRK_offset; + +bool GLYPHS_show = false; +int GLYPHS_shell = 0; +bool GLYPHS_flip[3] = {false, false, false}; +float GLYPHS_b0_thr = 50.0; +bool GLYPHS_use_affine = false; +float GLYPHS_affine[3][3]; + +#include "OPENGL_callbacks.cxx" + + +/*----------------------------------------------------------------------------------------------------------------------------------*/ +int main(int argc, char** argv) +{ + TCLAP::CmdLine cmd("This tool allows one to display in a common 3D space all the objects (DWI data, streamlines etc...) used by COMMIT in order to spot possible incosistencies between the conventions of COMMIT and the software that generated the data, e.g. flip in some axes in the DWI data or in the peaks, spatial shift in the streamlines, whether the affine transformation was already applied to the data etc..", ' ', "1.1"); + + TCLAP::UnlabeledValueArg argDWI( "dwi","Filename of the DWI dataset [4D NIFTI]", true, "", "DWI", cmd ); + TCLAP::ValueArg argMAP( "m", "map", "Background map [3D NIFTI]", false, "", "map", cmd ); + TCLAP::ValueArg argPEAKS( "p", "peaks", "Main diffusion directions for the extra-axonal part in each voxel [4D NIFTI]", false, "", "peaks", cmd ); + TCLAP::ValueArg argTRK( "f", "fibers", "Streamlines for the intra-axonal part [.TRK format]", false, "", "fibers", cmd ); + TCLAP::UnlabeledValueArg argSCHEME( "scheme","Acquisition scheme [text]", true, "", "scheme", cmd ); + + try { cmd.parse( argc, argv ); } + catch (TCLAP::ArgException &e) { cerr << "error: " << e.error() << " for arg " << e.argId() << endl; } + + string DWI_filename( argDWI.getValue() ); + string SCHEME_filename( argSCHEME.getValue() ); + string PEAKS_filename( argPEAKS.getValue() ); + string TRK_filename( argTRK.getValue() ); + string MAP_filename( argMAP.getValue() ); + + + // =================== + // Reading DWI dataset + // =================== + COLOR_msg( "-> Reading 'DWI' dataset:", "\n" ); + + niiDWI = new NIFTI; + niiDWI->open( DWI_filename, true ); + if ( !niiDWI->isValid() ) + { + COLOR_error( "Unable to open file", "\t" ); + return EXIT_FAILURE; + } + dim.x = niiDWI->hdr->dim[1]; + dim.y = niiDWI->hdr->dim[2]; + dim.z = niiDWI->hdr->dim[3]; + pixdim.x = niiDWI->hdr->pixdim[1]; + pixdim.y = niiDWI->hdr->pixdim[2]; + pixdim.z = niiDWI->hdr->pixdim[3]; + printf( "\tdim : %d x %d x %d x %d\n", dim.x, dim.y, dim.z, niiDWI->hdr->dim[4] ); + printf( "\tpixdim : %.4f x %.4f x %.4f\n", pixdim.x, pixdim.y, pixdim.z ); + printf( "\tqform : %d\n", niiDWI->hdr->qform_code ); + mat44 DWI_qform = niiDWI->hdr->qto_xyz; + if ( niiDWI->hdr->qform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", DWI_qform.m[i][j] ); + printf( "|\n" ); + } + } + else + { + COLOR_warning( "This should never happen!", "\t\t" ); + } + printf( "\tsform : %d\n", niiDWI->hdr->sform_code ); + mat44 DWI_sform = niiDWI->hdr->sto_xyz; + if ( niiDWI->hdr->sform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", DWI_sform.m[i][j] ); + printf( "|\n" ); + } + } + + // Read the affine matrix to rotate the 
vectors + // NB: we need the inverse, but in this case inv=transpose + if ( niiDWI->hdr->qform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + GLYPHS_affine[i][j] = DWI_qform.m[j][i]; + } + else if ( niiDWI->hdr->sform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + GLYPHS_affine[i][j] = DWI_sform.m[j][i]; + } + else { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + GLYPHS_affine[i][j] = 0; + for(int i=0; i<3 ;i++) + GLYPHS_affine[i][i] = 1; + } + + mat33 tmp; + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + tmp.m[i][j] = GLYPHS_affine[i][j]; + printf( "\tAffine used (%s):\n", nifti_mat33_determ(tmp)<0?"RADIOLOGICAL":"NEUROLOGICAL" ); + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<3 ;j++) + printf( "%9.4f ", GLYPHS_affine[i][j] ); + printf( "|\n" ); + } + + COLOR_msg( " [OK]" ); + + + // =================== + // Reading SCHEME file + // =================== + COLOR_msg( "-> Reading 'SCHEME' file:", "\n" ); + + char line[1000]; + FILE* pFile = fopen( SCHEME_filename.c_str(), "rt" ); + + // read the version + // ---------------- + try + { + while( fgets(line, 1000, pFile) ) + if ( line[0]!='#' ) + break; + + std::regex reVersion("^VERSION: (.*)\\s*$"); + std::smatch reMatches; + + if ( !std::regex_match(string(line), reMatches, reVersion) ) + { + // no header found, assume standards BVECTOR format + SCHEME_version = 0; + fseek(pFile, -strlen(line), SEEK_CUR); + } + else + { + if( strcmp(reMatches[1].str().c_str(),"0")==0 || strcmp(reMatches[1].str().c_str(),"BVECTOR")==0 ) + SCHEME_version = 0; + else if( strcmp(reMatches[1].str().c_str(),"1")==0 || strcmp(reMatches[1].str().c_str(),"STEJSKALTANNER")==0 ) + SCHEME_version = 1; + else + throw "Version not recognized"; + } + } + catch( const char* msg ) + { + COLOR_error( msg, "\t" ); + return EXIT_FAILURE; + } + printf( "\tversion : %s\n", SCHEME_version==0?"BVECTOR":"STEJSKALTANNER" ); + + // read the data + // ------------- + try + { + string reFLOAT( "[-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?" 
); + std::regex reVERSION0( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); + std::regex reVERSION1( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); + std::regex reEMPTY( "^\\s*$" ); + std::smatch reMatches; + int Ns = 0; + float x, y, z, b, G, D, d; + while( fgets(line, 1000, pFile) ) + { + if( std::regex_match(string(line), reMatches, reEMPTY) ) + continue; // skip empty lines + + if( SCHEME_version == 0 ) + { + if ( !std::regex_match(string(line), reMatches, reVERSION0) ) + throw "Wrong row format"; + x = std::atof( reMatches[1].str().c_str() ); + y = std::atof( reMatches[2].str().c_str() ); + z = std::atof( reMatches[3].str().c_str() ); + b = std::atof( reMatches[4].str().c_str() ); // in mm^2/s + VECTOR tmp( x, y, z ); + tmp.Normalize(); + SCHEME_dirs.push_back( tmp ); + SCHEME_b.push_back( b ); + } + else + { + if ( !std::regex_match(string(line), reMatches, reVERSION1) ) + throw "Wrong row format"; + x = std::atof( reMatches[1].str().c_str() ); + y = std::atof( reMatches[2].str().c_str() ); + z = std::atof( reMatches[3].str().c_str() ); + G = std::atof( reMatches[4].str().c_str() ); + D = std::atof( reMatches[5].str().c_str() ); + d = std::atof( reMatches[6].str().c_str() ); + VECTOR tmp( x, y, z ); + tmp.Normalize(); + SCHEME_dirs.push_back( tmp ); + b = std::pow( 267.513e6*G*d, 2 ) * (D-d/3.0) * 1e-6; // in mm^2/s + SCHEME_b.push_back( b ); + } + + if ( b<5.0 ) + { + SCHEME_idxB0.push_back( Ns ); + } + else + { + SCHEME_idxDWI.push_back( Ns ); + if ( std::find(SCHEME_shells_b.begin(), SCHEME_shells_b.end(), b) == SCHEME_shells_b.end() ) + { + SCHEME_shells_b.push_back( b ) ; + vector tmp; + SCHEME_shells_idx.push_back( tmp ) ; + } + } + Ns++; + } + } + catch( const char* msg ) + { + COLOR_error( msg, "\t" ); + return EXIT_FAILURE; + } + fclose(pFile); + + printf( "\tgradients : %d\n", SCHEME_b.size() ); + if ( niiDWI->hdr->dim[4] != SCHEME_b.size() ) + { + COLOR_error( "The scheme does not match the DWI dataset", "\t" ); + return EXIT_FAILURE; + } + + // fill data structure about the SCHEME + // ------------------------------------ + for(int i=0; i < SCHEME_b.size() ;i++) + { + if ( SCHEME_b[i] < 5 ) + continue; + int s = std::find( SCHEME_shells_b.begin(), SCHEME_shells_b.end(), SCHEME_b[i] ) - SCHEME_shells_b.begin(); + SCHEME_shells_idx[s].push_back( i ); + } + + printf( "\tscheme : %d b0 and %d shells (", SCHEME_idxB0.size(), SCHEME_shells_idx.size() ); + for(int i=0; i < SCHEME_shells_b.size() ;i++) + printf( " [%d @ b=%.1f]", SCHEME_shells_idx[i].size(), SCHEME_shells_b[i] ); + printf( " )\n" ); + + COLOR_msg( " [OK]" ); + + + + // ======================= + // Creating BACKGROUND map + // ======================= + COLOR_msg( "-> Preparing 'BACKGROUND' map:", "\n" ); + MAP.resize(dim.x,dim.y,dim.z); + if ( !MAP_filename.empty() ) + { + printf( "\tdata : reading from file\n" ); + NIFTI* niiMAP = new NIFTI; + niiMAP->open( MAP_filename, true ); + if ( !niiMAP->isValid() ) + { + COLOR_error( "Unable to open the file", "\t" ); + return EXIT_FAILURE; + } + + printf( "\tdim : %d x %d x %d x %d\n" , niiMAP->hdr->dim[1], niiMAP->hdr->dim[2], niiMAP->hdr->dim[3], niiMAP->hdr->dim[4] ); + printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiMAP->hdr->pixdim[1], niiMAP->hdr->pixdim[2], niiMAP->hdr->pixdim[3] ); + + if ( niiMAP->hdr->dim[1] != dim.x || niiMAP->hdr->dim[2] != dim.y || niiMAP->hdr->dim[3] != dim.z ) + { + COLOR_error( "The DIMENSIONS do not match those of DWI 
images", "\t" ); + return EXIT_FAILURE; + } + if ( abs(niiMAP->hdr->pixdim[1]-pixdim.x) > 1e-4 || abs(niiMAP->hdr->pixdim[2]-pixdim.y) > 1e-4 || abs(niiMAP->hdr->pixdim[3]-pixdim.z) > 1e-4 ) + { + COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); + } + + FLOAT32 MIN = 0;//(*niiMAP->img)(0,0,0); + FLOAT32 MAX = MIN; + + for(int i=0; iimg)(i,j,k); + if ( MAP(i,j,k) > MAX ) + MAX = MAP(i,j,k); + if ( MAP(i,j,k) < MIN ) + MIN = MAP(i,j,k); + } + if ( MAX - MIN <= 0 ) + { + COLOR_error( "The dynamic range is zero", "\t" ); + return EXIT_FAILURE; + } + MAP_min = MIN; + MAP_min_view = 0; + MAP_max = MAP_max_view = MAX; + + printf( "\tvalues : [%.2e ... %.2e]\n", MAP_min, MAP_max ); + COLOR_msg( " [OK]" ); + } + else + { + printf( "\tdata : " ); + + if ( SCHEME_idxB0.size() > 0 ) + { + printf( "taking first b0 image\n" ); + FLOAT32 MIN = (*niiDWI->img)(0,0,0,SCHEME_idxB0[0]); + FLOAT32 MAX = MIN; + + for(int i=0; iimg)(i,j,k,SCHEME_idxB0[0]); + if ( MAP(i,j,k) > MAX ) + MAX = MAP(i,j,k); + if ( MAP(i,j,k) < MIN ) + MIN = MAP(i,j,k); + } + if ( MAX - MIN <= 0 ) + { + COLOR_error( "The dynamic range is zero", "\t" ); + return EXIT_FAILURE; + } + MAP_min = MIN; + MAP_min_view = 0; + MAP_max = MAP_max_view = MAX; + } + else + { + printf( "no b0 found\n" ); + MAP = 0; + MAP_min = MAP_min_view = 0; + MAP_max = MAP_max_view = 1; + } + printf( "\tvalues : [%.2e ... %.2e]\n", MAP_min, MAP_max ); + COLOR_msg( " [OK]" ); + } + + + // ================== + // Reading PEAKS file + // ================== + COLOR_msg( "-> Reading 'PEAKS' dataset:", "\n" ); + + if ( !PEAKS_filename.empty() ) + { + niiPEAKS = new NIFTI; + niiPEAKS->open( PEAKS_filename, true ); + if ( !niiPEAKS->isValid() ) + { + COLOR_error( "Unable to open the file", "\t" ); + return false; + } + + if ( niiPEAKS->hdr->dim[0] != 4 || niiPEAKS->hdr->dim[4]%3 != 0 ) + { + COLOR_error( "The size must be (*,*,*,3*k)", "\t" ); + return EXIT_FAILURE; + } + PEAKS_n = niiPEAKS->hdr->dim[4]/3; + + printf( "\tdim : %d x %d x %d (%d peaks per voxel)\n" , niiPEAKS->hdr->dim[1], niiPEAKS->hdr->dim[2], niiPEAKS->hdr->dim[3], PEAKS_n ); + printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiPEAKS->hdr->pixdim[1], niiPEAKS->hdr->pixdim[2], niiPEAKS->hdr->pixdim[3] ); + + printf( "\tqform : %d\n", niiPEAKS->hdr->qform_code ); + mat44 PEAKS_qform = niiPEAKS->hdr->qto_xyz; + if ( niiPEAKS->hdr->qform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", PEAKS_qform.m[i][j] ); + printf( "|\n" ); + } + } + else + { + COLOR_warning( "This should never happen!", "\t\t" ); + } + + printf( "\tsform : %d\n", niiPEAKS->hdr->sform_code ); + mat44 PEAKS_sform = niiPEAKS->hdr->sto_xyz; + if ( niiPEAKS->hdr->sform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", PEAKS_sform.m[i][j] ); + printf( "|\n" ); + } + } + + if ( niiPEAKS->hdr->dim[1] != dim.x || niiPEAKS->hdr->dim[2] != dim.y || niiPEAKS->hdr->dim[3] != dim.z ) + { + COLOR_error( "The DIMENSIONS do not match those of DWI images", "\t" ); + return EXIT_FAILURE; + } + if ( abs(niiPEAKS->hdr->pixdim[1]-pixdim.x) > 1e-3 || abs(niiPEAKS->hdr->pixdim[2]-pixdim.y) > 1e-3 || abs(niiPEAKS->hdr->pixdim[3]-pixdim.z) > 1e-3 ) + { + COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); + } + if ( + niiPEAKS->hdr->sform_code != niiDWI->hdr->sform_code || niiPEAKS->hdr->qform_code != niiDWI->hdr->qform_code || niiPEAKS->hdr->pixdim[0] != niiDWI->hdr->pixdim[0] || + 
niiPEAKS->hdr->quatern_b != niiDWI->hdr->quatern_b || niiPEAKS->hdr->quatern_c != niiDWI->hdr->quatern_c || niiPEAKS->hdr->quatern_d != niiDWI->hdr->quatern_d || + niiPEAKS->hdr->qoffset_x != niiDWI->hdr->qoffset_x || niiPEAKS->hdr->qoffset_y != niiDWI->hdr->qoffset_y || niiPEAKS->hdr->qoffset_z != niiDWI->hdr->qoffset_z + ) + { + COLOR_warning( "The GEOMETRY does not match that of DWI images", "\t" ); + } + + // Read the affine matrix to rotate the vectors + // NB: we need the inverse, but in this case inv=transpose + if ( niiPEAKS->hdr->qform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + PEAKS_affine[i][j] = PEAKS_qform.m[j][i]; + } + else if ( niiPEAKS->hdr->sform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + PEAKS_affine[i][j] = PEAKS_sform.m[j][i]; + } + else { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + PEAKS_affine[i][j] = 0; + for(int i=0; i<3 ;i++) + PEAKS_affine[i][i] = 1; + } + + printf( "\tAffine used :\n" ); + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<3 ;j++) + printf( "%9.4f ", PEAKS_affine[i][j] ); + printf( "|\n" ); + } + + COLOR_msg( " [OK]" ); + } + else { + // no peaks are passed and won't be showed + COLOR_msg( " [no peaks specified]" ); + PEAKS_n = 0; + } + + + // =================== + // Reading TRACTS file + // =================== + COLOR_msg( "-> Reading 'TRACTOGRAM':", "\n" ); + + if ( !TRK_filename.empty() ) + { + TRK_file = TrackVis(); + if ( !TRK_file.open( TRK_filename ) ) + { + COLOR_error( "Unable to open the file", "\t" ); + return false; + } + + printf("\tcount : %d\n" , TRK_file.hdr.n_count ); + printf("\tdim : %d x %d x %d\n" , TRK_file.hdr.dim[0], TRK_file.hdr.dim[1], TRK_file.hdr.dim[2] ); + printf("\tpixdim : %.4f x %.4f x %.4f\n", TRK_file.hdr.voxel_size[0], TRK_file.hdr.voxel_size[1], TRK_file.hdr.voxel_size[2] ); + printf("\tscalars : %d\n" , TRK_file.hdr.n_scalars ); + printf("\tproperties : %d\n" , TRK_file.hdr.n_properties ); + + if ( TRK_file.hdr.dim[0] != dim.x || TRK_file.hdr.dim[1] != dim.y || TRK_file.hdr.dim[2] != dim.z || + abs(TRK_file.hdr.voxel_size[0]-pixdim.x) > 1e-4 || abs(TRK_file.hdr.voxel_size[1]-pixdim.y) > 1e-4 || abs(TRK_file.hdr.voxel_size[2]-pixdim.z) > 1e-4 ) + { + COLOR_error( "The GEOMETRY does not match those of DWI images", "\t" ); + return EXIT_FAILURE; + } + + TRK_skip = ceil( TRK_file.hdr.n_count / 25000.0 ); + int N, n_s = TRK_file.hdr.n_scalars, n_p = TRK_file.hdr.n_properties; + FILE* fp = TRK_file.getFilePtr(); + + // count how many points I need to store in memory + int TractsRead = 0, CoordsRead = 0; + fseek(fp, 1000, SEEK_SET); + for(int f=0; f < TRK_file.hdr.n_count ; f++) + { + fread( (char*)&N, 1, 4, fp ); + fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); + if ( f%TRK_skip==0 ) + { + TractsRead++; + CoordsRead += N; + } + } + printf("\tin memory : %d (%d points)\n" , TractsRead, CoordsRead ); + + // create data structure for drawing the tracts + TRK_nTractsPlotted = TractsRead; + TRK_nPoints = new int[TRK_nTractsPlotted]; + TRK_coords = new float[3*CoordsRead]; + TRK_colors = new float[3*CoordsRead]; + + float* ptr = TRK_coords; + float* ptrc = TRK_colors; + float norm; + VECTOR dir; + TractsRead = 0; + fseek(fp, 1000, SEEK_SET); + for(int f=0; f < TRK_file.hdr.n_count ; f++) + { + if ( f%TRK_skip==0 ) + { + fread( (char*)&N, 1, 4, fp ); + TRK_nPoints[TractsRead] = N; + + for(int i=0; i 0 ) + { + dir.x = *(ptr ) - *(ptr-3); + dir.y = *(ptr+1) - *(ptr-2); + dir.z = *(ptr+2) - *(ptr-1); + norm = dir.norm(); + ptrc[0] = abs( dir.x / norm ); 
+ ptrc[1] = abs( dir.y / norm ); + ptrc[2] = abs( dir.z / norm ); + } + else + { + ptrc[0] = 0; + ptrc[1] = 0; + ptrc[2] = 0; + } + + ptr += 3; + ptrc += 3; + } + fseek( fp, n_p*4, SEEK_CUR ); + TractsRead++; + } + else + { + fread( (char*)&N, 1, 4, fp ); + fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); + } + } + + COLOR_msg( " [OK]" ); + printf( "\n\n" ); + } + else + { + // no fibers are passed and won't be showed + COLOR_msg( " [no streamlines specified]" ); + TRK_nTractsPlotted = 0; + } + + TRK_offset.x = 0; + TRK_offset.y = 0; + TRK_offset.z = 0; + + + // ============ + // SETUP OpenGL + // ============ + VOXEL.x = round( dim.x / 2.0 ); + VOXEL.y = round( dim.y / 2.0 ); + VOXEL.z = round( dim.z / 2.0 ); + OpenGL_init( argc, argv ); + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1c03d182..9234880c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -Cython>=0.29 -dipy>=1.0 -dmri-amico>=1.2.3 -numpy>=1.12 -setuptools>=46.1 +Cython>=0.29 +dipy>=1.0 +dmri-amico>=1.2.3 +numpy>=1.12 +setuptools>=46.1 diff --git a/setup.cfg b/setup.cfg index 3463cc53..a96a1715 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ -# Inside of setup.cfg -[metadata] -description-file = README.md - -[bdist_wheel] +# Inside of setup.cfg +[metadata] +description-file = README.md + +[bdist_wheel] universal = 1 \ No newline at end of file diff --git a/setup.py b/setup.py index 09a838aa..46738346 100644 --- a/setup.py +++ b/setup.py @@ -88,6 +88,9 @@ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Inject our redefined _compile method into the class self._compile = _compile +# Locate CUDA +CUDA = locate_cuda() + def get_extensions(): # Cython extension to create the sparse data structure from a tractogram # for the computation of matrix-vector multiplications @@ -106,6 +109,18 @@ def get_extensions(): extra_compile_args=['-w'], language='c++') + if CUDA != None: + ext4 = Extension(name='commit.cudaoperator', + sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + language = 'c++', + library_dirs = [CUDA['lib64']], + libraries = ['cudart'], + runtime_library_dirs = [CUDA['lib64']]) + + return [ext1, ext2, ext3, ext4] + return [ext1, ext2, ext3] def get_extensions_with_cuda(): @@ -147,86 +162,49 @@ def get_extensions_with_cuda(): libraries = ['cudart'], runtime_library_dirs = [CUDA['lib64']]) -# Locate CUDA -CUDA = locate_cuda() -if CUDA != None: - print('\n==========================================================') - print('CUDA detected. Installing COMMIT with GPU acceleration.') - print('==========================================================\n') +#print('CUDA not detected. Installing COMMIT without GPU acceleration.') - class CustomCudaBuildExtCommand(build_ext): - """ build_ext command to use when CUDA is detected and numpy headers are needed. """ +class CustomBuildExtCommand(build_ext): + """ build_ext command to use when numpy headers are needed. 
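Everything in this build now hinges on whether locate_cuda() finds a toolkit: if it returns None the plain extensions are compiled, otherwise the nvcc-aware ones are. For readers who just want to predict which branch their machine will take, a rough proxy is to look for nvcc on the PATH; this is only an illustration, not the detection routine the patch actually uses:

    import shutil

    # If nvcc is not reachable, locate_cuda() will normally return None and
    # COMMIT is built without GPU acceleration.
    print('nvcc found:', shutil.which('nvcc') is not None)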
""" + if CUDA != None: def build_extensions(self): customize_compiler_for_nvcc(self.compiler) build_ext.build_extensions(self) - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None - self.include_dirs = [get_include(), CUDA['lib64']] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) - - description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' - - opts = dict(name='dmri-commit', - version='1.4.0.0', - description=description, - long_description=description, - author='Alessandro Daducci', - author_email='alessandro.daducci@univr.it', - url='https://github.com/daducci/COMMIT', - packages=['commit', 'commit.operator'], - cmdclass={'build_ext': CustomCudaBuildExtCommand}, - ext_modules=get_extensions_with_cuda(), - setup_requires=['Cython>=0.29', 'numpy>=1.12'], - install_requires=['Cython>=0.29', - 'dmri-amico>=1.2.3', 'dipy>=1.0', 'numpy>=1.12'], - package_data={'commit.operator': ["*.*"]}) -else: - print('CUDA not detected. Installing COMMIT without GPU acceleration.') - - class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. """ - - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + if CUDA == None: self.include_dirs = [get_include()] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) - - description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' - - opts = dict(name='dmri-commit', - version='1.3.9.2', - description=description, - long_description=description, - author='Alessandro Daducci', - author_email='alessandro.daducci@univr.it', - url='https://github.com/daducci/COMMIT', - packages=['commit', 'commit.operator'], - cmdclass={'build_ext': CustomBuildExtCommand}, - ext_modules=get_extensions(), - setup_requires=['Cython>=0.29', 'numpy>=1.12'], - install_requires=['Cython>=0.29', - 'dmri-amico>=1.2.3', 'dipy>=1.0', 'numpy>=1.12'], - package_data={'commit.operator': ["*.*"]}) + else: + self.include_dirs = [get_include(), CUDA['lib64']] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) + +description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' + +opts = dict(name='dmri-commit', + version='1.4.0.0', + description=description, + long_description=description, + author='Alessandro Daducci', + author_email='alessandro.daducci@univr.it', + url='https://github.com/daducci/COMMIT', + packages=['commit', 'commit.operator'], + cmdclass={'build_ext': CustomBuildExtCommand}, + ext_modules=get_extensions(), + setup_requires=['Cython>=0.29', 'numpy>=1.12'], + install_requires=['Cython>=0.29', + 'dmri-amico>=1.2.3', 'dipy>=1.0', 
'numpy>=1.12'], + package_data={'commit.operator': ["*.*"]}) setup(**opts) \ No newline at end of file From 2a15a0c3f981f2eb92a3e812461d38e9a6f0c179 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 01:47:01 -0500 Subject: [PATCH 134/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 46738346..40e93a0e 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def get_extensions(): extra_compile_args=['-w'], language='c++') - if CUDA != None: + """if CUDA != None: ext4 = Extension(name='commit.cudaoperator', sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], extra_compile_args= {'gcc': ['-w'], @@ -119,7 +119,7 @@ def get_extensions(): libraries = ['cudart'], runtime_library_dirs = [CUDA['lib64']]) - return [ext1, ext2, ext3, ext4] + return [ext1, ext2, ext3, ext4]""" return [ext1, ext2, ext3] @@ -162,8 +162,10 @@ def get_extensions_with_cuda(): libraries = ['cudart'], runtime_library_dirs = [CUDA['lib64']]) - -#print('CUDA not detected. Installing COMMIT without GPU acceleration.') +if CUDA == None: + extensions = get_extensions() +else: + extensions = get_extensions_with_cuda() class CustomBuildExtCommand(build_ext): """ build_ext command to use when numpy headers are needed. """ @@ -201,7 +203,7 @@ def run(self): url='https://github.com/daducci/COMMIT', packages=['commit', 'commit.operator'], cmdclass={'build_ext': CustomBuildExtCommand}, - ext_modules=get_extensions(), + ext_modules=extensions, setup_requires=['Cython>=0.29', 'numpy>=1.12'], install_requires=['Cython>=0.29', 'dmri-amico>=1.2.3', 'dipy>=1.0', 'numpy>=1.12'], From 90ac9b9d66bd2e414cffad3a8bf6c53fb0214ef5 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 02:16:59 -0500 Subject: [PATCH 135/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 40e93a0e..a37c1bf9 100644 --- a/setup.py +++ b/setup.py @@ -167,13 +167,30 @@ def get_extensions_with_cuda(): else: extensions = get_extensions_with_cuda() -class CustomBuildExtCommand(build_ext): +if CUDA == None: + class CustomBuildExtCommand(build_ext): + """ build_ext command to use when numpy headers are needed. """ + + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + self.include_dirs = [get_include()] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) +else: + class CustomBuildExtCommand(build_ext): """ build_ext command to use when numpy headers are needed. 
""" - if CUDA != None: - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) def run(self): # Now that the requirements are installed, get everything from numpy @@ -182,10 +199,7 @@ def run(self): # Add everything requires for build self.swig_opts = None - if CUDA == None: - self.include_dirs = [get_include()] - else: - self.include_dirs = [get_include(), CUDA['lib64']] + self.include_dirs = [get_include(), CUDA['lib64']] self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) # Call original build_ext command From c1d30ed3daaf240cfc2a148b2470e05060c231ae Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 02:18:03 -0500 Subject: [PATCH 136/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index a37c1bf9..89578eb0 100644 --- a/setup.py +++ b/setup.py @@ -186,25 +186,25 @@ def run(self): build_ext.run(self) else: class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. """ - - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) - - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None - self.include_dirs = [get_include(), CUDA['lib64']] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) + """ build_ext command to use when numpy headers are needed. 
""" + + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + self.include_dirs = [get_include(), CUDA['lib64']] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' From 382efb9cc0b4d4ea634931923d28a43ff0d9fe3b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 02:23:44 -0500 Subject: [PATCH 137/190] Merging CUDA version with the lastest COMMIT version --- commit/__init__.py | 10 +- commit/operator/config.py | 12 +- commit/operator/operator_noLUT.c | 374 +++++----- commit/solvers.py | 806 ++++++++++----------- commit/trk2dictionary/trk2dictionary.pyx | 459 ------------ commit/trk2dictionary/trk2dictionary_c.cpp | 611 ---------------- 6 files changed, 601 insertions(+), 1671 deletions(-) diff --git a/commit/__init__.py b/commit/__init__.py index e7e71d6c..3ab179d3 100755 --- a/commit/__init__.py +++ b/commit/__init__.py @@ -1,5 +1,5 @@ -from .core import Evaluation -__all__ = ['core','models','solvers','trk2dictionary'] - -from pkg_resources import get_distribution -__version__ = get_distribution('dmri-commit').version +from .core import Evaluation +__all__ = ['core','models','solvers','trk2dictionary'] + +from pkg_resources import get_distribution +__version__ = get_distribution('dmri-commit').version diff --git a/commit/operator/config.py b/commit/operator/config.py index 8192419b..8d782f49 100755 --- a/commit/operator/config.py +++ b/commit/operator/config.py @@ -1,6 +1,6 @@ -nTHREADS = None -model = None -nIC = None -nEC = None -nISO = None -build_locally = False +nTHREADS = None +model = None +nIC = None +nEC = None +nISO = None +build_locally = False diff --git a/commit/operator/operator_noLUT.c b/commit/operator/operator_noLUT.c index fe1269fd..d8b6706b 100644 --- a/commit/operator/operator_noLUT.c +++ b/commit/operator/operator_noLUT.c @@ -1,187 +1,187 @@ -#include -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<0 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n; -double *x, *Y; -uint32_t *ICthreads, *ISOthreads; -uint8_t *ICthreadsT; -uint32_t *ISOthreadsT; -uint32_t *ICf, *ICv, *ISOv; -float *ICl; - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - double x0; - double *xPtr; - uint32_t *t_v, *t_vEnd, *t_f; - float *t_l; - - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = x[*t_f]; - if ( x0 != 0 ) - Y[*t_v] += (double)(*t_l) * x0; - t_f++; - t_v++; - t_l++; - } - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - xPtr = x + nF + ISOthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = *xPtr++; - if ( x0 
!= 0 ) - Y[*t_v] += x0; - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreads = _ICthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - xPtr = x + nF + ISOthreadsT[id]; - - while( t_v != t_vEnd ) - (*xPtr++) += Y[*t_v++]; -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = _ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreadsT = _ICthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<0 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n; +double *x, *Y; +uint32_t *ICthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ISOthreadsT; +uint32_t *ICf, *ICv, *ISOv; +float *ICl; + + +// ==================================================== +// Compute a sub-block of the A*x MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + double x0; + double *xPtr; + uint32_t *t_v, *t_vEnd, *t_f; + float *t_l; + + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = x[*t_f]; + if ( x0 != 0 ) + Y[*t_v] += (double)(*t_l) * x0; + t_f++; + t_v++; + t_l++; + } + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + xPtr = x + nF + ISOthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = *xPtr++; + if ( x0 != 0 ) + Y[*t_v] += x0; + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICl = 
_ICl; + ISOv = _ISOv; + + ICthreads = _ICthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + xPtr = x + nF + ISOthreadsT[id]; + + while( t_v != t_vEnd ) + (*xPtr++) += Y[*t_v++]; +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreadsT = _ICthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t= 1 : - print() - print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) - print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) - iter = 1 - while True : - if verbose >= 1 : - print( "%4d |" % iter, end="" ) - sys.stdout.flush() - - # Smooth step - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Backtracking - while curr_obj > q : - # Smooth step - mu = beta*mu - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Global stopping criterion - abs_obj = abs(curr_obj - prev_obj) - rel_obj = abs_obj / curr_obj - abs_x = np.linalg.norm(x - prev_x) - rel_x = abs_x / ( np.linalg.norm(x) + eps ) - if verbose >= 1 : - print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) - - if abs_obj < eps : - criterion = "Absolute tolerance on the objective" - break - elif rel_obj < tol_fun : - criterion = "Relative tolerance on the objective" - break - elif abs_x < eps : - criterion = "Absolute tolerance on the unknown" - break - elif rel_x < tol_x : - criterion = "Relative tolerance on the unknown" - break - elif iter >= max_iter : - criterion = "Maximum number of iterations" - break - - # FISTA update - t = 0.5 * ( 1 + sqrt(1+4*told**2) ) - xhat = x + (told-1)/t * (x - prev_x) - - # Gradient computation - res = A.dot(xhat) - y - xarr = np.asarray(x) - - grad = np.asarray(At.dot(res)) - - # Update variables - iter += 1 - prev_obj = curr_obj - prev_x = x.copy() - told = t - qfval = 0.5 * np.linalg.norm(res)**2 - - - if verbose >= 1 : - print( "< Stopping criterion: %s >" % criterion ) - - opt_details = {} - opt_details['residual'] = 0.5*res_norm**2 - opt_details['regterm'] = reg_term_x - opt_details['cost_function'] = curr_obj - opt_details['abs_cost'] = abs_obj - opt_details['rel_cost'] = 
rel_obj - opt_details['abs_x'] = abs_x - opt_details['rel _x'] = rel_x - opt_details['iterations'] = iter - opt_details['stopping_criterion'] = criterion - - return x, opt_details +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +from __future__ import print_function +import numpy as np +from math import sqrt +import sys +import warnings +eps = np.finfo(float).eps + +from commit.proximals import (non_negativity, + omega_group_sparsity, + prox_group_sparsity, + soft_thresholding, + projection_onto_l2_ball) +group_sparsity = -1 +non_negative = 0 +norm1 = 1 +norm2 = 2 +norminf = np.inf +list_regnorms = [group_sparsity, non_negative, norm1, norm2] +list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 + + +def init_regularisation(commit_evaluation, + regnorms = (non_negative, non_negative, non_negative), + structureIC = None, weightsIC = None, group_norm = 2, + lambdas = (.0,.0,.0) ): + """ + Initialise the data structure that defines Omega in + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + + Input + ----- + commit_evaluation - commit.Evaluation object : + dictionary and model have to be loaded beforehand. + + + regnorms - tuple : + this sets the penalty term to be used for each compartment. + Default = (non_negative,non_negative,non_negative). + + regnorms[0] corresponds to the Intracellular compartment + regnorms[1] corresponds to the Extracellular compartment + regnorms[2] corresponds to the Isotropic compartment + + Each regnorms[k] must be one of commit.solvers. + {group_sparsity, non_negative, norm1, norm2}. + + commit.solvers.group_sparsity considers both the non-overlapping + and the hierarchical group sparsity (see [1]). This option is + allowed only in the IC compartment. The mathematical formulation + of this term is + $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| + + commit.solvers.non_negative puts a non negativity constraint on the + coefficients corresponding to the compartment. This is the + default option for each compartment + + commit.solvers.norm1 penalises with the 1-norm of the coefficients + corresponding to the compartment. + + commit.solvers.norm2 penalises with the 2-norm of the coefficients + corresponding to the compartment. + + + structureIC - np.array(list(list)) : + group structure for the IC compartment. + This field is necessary only if regterm[0]=commit.solver.group_sparsity. + Example: + structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) + + that is equivalent to + [0,1,2,3,4,5] [6] + / \ + [0,2,5] [1,3,4] + which has two non overlapping groups, one of which is the union + of two other non-overlapping groups. + + + weightsIC - np.array(np.float64) : + this defines the weights associated to each group of structure IC. + + + group_norm - number : + norm type for the commit.solver.group_sparsity penalisation of the IC compartment. + Default: group_norm = commit.solver.norm2 + To be chosen among commit.solver.{norm2,norminf}. + + lambdas - tuple : + regularisation parameter for each compartment. + Default: lambdas = (0.0, 0.0, 0.0) + The lambdas correspond to the onse described in the mathematical + formulation of the regularisation term + $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ + + + References: + [1] Jenatton et al. 
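To make the hierarchical example above concrete: with structureIC = [[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]], per-group weights w_g and regularisation parameter lambda, the group-sparsity term evaluates to Omega(x) = lambda * sum_g w_g * ||x_g||_2. A self-contained numpy check of that value; the weights, lambda and coefficient vector are made up for illustration and nothing here calls COMMIT itself:

    import numpy as np

    structureIC = np.array([np.array([0, 2, 5]),
                            np.array([1, 3, 4]),
                            np.array([0, 1, 2, 3, 4, 5]),
                            np.array([6])], dtype=object)
    weightsIC = np.array([1.0, 1.0, np.sqrt(2.0), 1.0])   # illustrative weights
    lambdaIC  = 0.5                                       # illustrative regularisation parameter

    x = np.array([0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 3.0])     # illustrative coefficients

    omega = lambdaIC * sum(w * np.linalg.norm(x[g])
                           for g, w in zip(structureIC, weightsIC))
    print(omega)      # 0.5 * (0 + sqrt(5) + sqrt(2)*sqrt(5) + 3) ~= 4.20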
- 'Proximal Methods for Hierarchical Sparse Coding' + """ + regularisation = {} + + regularisation['startIC'] = 0 + regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) + regularisation['startEC'] = int( regularisation['sizeIC'] ) + regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) + regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) + regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) + + regularisation['normIC'] = regnorms[0] + regularisation['normEC'] = regnorms[1] + regularisation['normISO'] = regnorms[2] + + regularisation['lambdaIC'] = float( lambdas[0] ) + regularisation['lambdaEC'] = float( lambdas[1] ) + regularisation['lambdaISO'] = float( lambdas[2] ) + + # Solver-specific fields + regularisation['structureIC'] = structureIC + regularisation['weightsIC'] = weightsIC + regularisation['group_norm'] = group_norm + + return regularisation + + +def regularisation2omegaprox(regularisation): + lambdaIC = float(regularisation.get('lambdaIC')) + lambdaEC = float(regularisation.get('lambdaEC')) + lambdaISO = float(regularisation.get('lambdaISO')) + if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: + raise ValueError('Negative regularisation parameters are not allowed') + + normIC = regularisation.get('normIC') + normEC = regularisation.get('normEC') + normISO = regularisation.get('normISO') + if not normIC in list_regnorms: + raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normEC in list_regnorms: + raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normISO in list_regnorms: + raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + + ## NNLS case + if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + return omega, prox + + ## All other cases + # Intracellular Compartment + startIC = regularisation.get('startIC') + sizeIC = regularisation.get('sizeIC') + if lambdaIC == 0.0: + omegaIC = lambda x: 0.0 + proxIC = lambda x: x + elif normIC == norm2: + omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) + proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) + elif normIC == norm1: + omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) + proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) + elif normIC == non_negative: + omegaIC = lambda x: 0.0 + proxIC = lambda x: non_negativity(x, startIC, sizeIC) + elif normIC == group_sparsity: + structureIC = regularisation.get('structureIC') + groupWeightIC = regularisation.get('weightsIC') + if not len(structureIC) == len(groupWeightIC): + raise ValueError('Number of groups and weights do not coincide.') + group_norm = regularisation.get('group_norm') + if not group_norm in list_group_sparsity_norms: + raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' 
% str(list_group_sparsity_norms)) + + # convert to new data structure (needed for faster access) + N = np.sum([g.size for g in structureIC]) + groupIdxIC = np.zeros( (N,), dtype=np.int32 ) + groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) + pos = 0 + for i, g in enumerate(structureIC) : + groupSizeIC[i] = g.size + groupIdxIC[pos:(pos+g.size)] = g[:] + pos += g.size + + omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + else: + raise ValueError('Type of regularisation for IC compartment not recognized.') + + + # Extracellular Compartment + startEC = regularisation.get('startEC') + sizeEC = regularisation.get('sizeEC') + if lambdaEC == 0.0: + omegaEC = lambda x: 0.0 + proxEC = lambda x: x + elif normEC == norm2: + omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) + proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) + elif normEC == norm1: + omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) + proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) + elif normEC == non_negative: + omegaEC = lambda x: 0.0 + proxEC = lambda x: non_negativity(x, startEC, sizeEC) + else: + raise ValueError('Type of regularisation for EC compartment not recognized.') + + # Isotropic Compartment + startISO = regularisation.get('startISO') + sizeISO = regularisation.get('sizeISO') + if lambdaISO == 0.0: + omegaISO = lambda x: 0.0 + proxISO = lambda x: x + elif normISO == norm2: + omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) + proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) + elif normISO == norm1: + omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) + proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) + elif normISO == non_negative: + omegaISO = lambda x: 0.0 + proxISO = lambda x: non_negativity(x, startISO, sizeISO) + else: + raise ValueError('Type of regularisation for ISO compartment not recognized.') + + omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) + prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced + + return omega, prox + + +def evaluate_model(y, A, x, regularisation = None): + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + else: + omega, _ = regularisation2omegaprox(regularisation) + + return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) + + +def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the Omega described by 'regularisation'. + + Check the documentation of commit.solvers.init_regularisation to see how to + solve a specific problem. + """ + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, x.size) + else: + omega, prox = regularisation2omegaprox(regularisation) + + if x0 is None: + x0 = np.zeros(A.shape[1]) + + return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) + + +def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the FISTA algorithm described in [1]. 
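For reference, here is a minimal sketch of how the solver interface above is meant to be used, i.e. `init_regularisation` followed by `solve`. It assumes a `commit.Evaluation` object `mit` with dictionary and model already loaded, plus pre-built `y`, `A` and `At` objects; these names and the lambda values are placeholders for illustration and are not part of this patch. The group structure reuses the toy example from the `structureIC` docstring.

```python
import numpy as np
from commit import solvers

# Assumed to exist already (see note above): an Evaluation with dictionary/model
# loaded, the data vector y and the linear operators A (forward) and At (adjoint).
# mit, y, A, At = ...

# Hierarchical group structure from the docstring example:
# [0,1,2,3,4,5] and [6] at the top level, with [0,2,5] and [1,3,4] nested inside.
structureIC = np.array([np.array([0, 2, 5]),
                        np.array([1, 3, 4]),
                        np.array([0, 1, 2, 3, 4, 5]),
                        np.array([6])], dtype=object)
weightsIC = np.ones(structureIC.size, dtype=np.float64)

regularisation = solvers.init_regularisation(
    mit,
    regnorms    = (solvers.group_sparsity, solvers.non_negative, solvers.non_negative),
    structureIC = structureIC,
    weightsIC   = weightsIC,
    group_norm  = solvers.norm2,
    lambdas     = (0.5, 0.0, 0.0)    # illustrative values, not from the patch
)

# argmin_x 0.5*||Ax-y||^2 + Omega(x), solved with FISTA
x, details = solvers.solve(y, A, At, regularisation=regularisation)
print(details['stopping_criterion'], details['iterations'])
```

Note that, as in the code above, the non-negativity constraint is always re-applied on top of the compartment-specific proximal operators, so the recovered `x` stays non-negative regardless of the chosen penalties.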
+ + The penalty term and its proximal operator must be defined in such a way + that they already contain the regularisation parameter. + + References: + [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding + Algorithm for Linear Inverse Problems` + """ + + # Initialization + res = -y.copy() + xhat = x0.copy() + x = np.zeros_like(xhat) + res += A.dot(xhat) + proximal( xhat ) + reg_term = omega( xhat ) + prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term + + told = 1 + beta = 0.9 + prev_x = xhat.copy() + grad = np.asarray(At.dot(res)) + qfval = prev_obj + + # Step size computation + L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 + mu = 1.9 / L + + # Main loop + if verbose >= 1 : + print() + print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) + print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) + iter = 1 + while True : + if verbose >= 1 : + print( "%4d |" % iter, end="" ) + sys.stdout.flush() + + # Smooth step + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Backtracking + while curr_obj > q : + # Smooth step + mu = beta*mu + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Global stopping criterion + abs_obj = abs(curr_obj - prev_obj) + rel_obj = abs_obj / curr_obj + abs_x = np.linalg.norm(x - prev_x) + rel_x = abs_x / ( np.linalg.norm(x) + eps ) + if verbose >= 1 : + print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) + + if abs_obj < eps : + criterion = "Absolute tolerance on the objective" + break + elif rel_obj < tol_fun : + criterion = "Relative tolerance on the objective" + break + elif abs_x < eps : + criterion = "Absolute tolerance on the unknown" + break + elif rel_x < tol_x : + criterion = "Relative tolerance on the unknown" + break + elif iter >= max_iter : + criterion = "Maximum number of iterations" + break + + # FISTA update + t = 0.5 * ( 1 + sqrt(1+4*told**2) ) + xhat = x + (told-1)/t * (x - prev_x) + + # Gradient computation + res = A.dot(xhat) - y + xarr = np.asarray(x) + + grad = np.asarray(At.dot(res)) + + # Update variables + iter += 1 + prev_obj = curr_obj + prev_x = x.copy() + told = t + qfval = 0.5 * np.linalg.norm(res)**2 + + + if verbose >= 1 : + print( "< Stopping criterion: %s >" % criterion ) + + opt_details = {} + opt_details['residual'] = 0.5*res_norm**2 + opt_details['regterm'] = reg_term_x + opt_details['cost_function'] = curr_obj + opt_details['abs_cost'] = abs_obj + opt_details['rel_cost'] = rel_obj + opt_details['abs_x'] = abs_x + opt_details['rel _x'] = rel_x + opt_details['iterations'] = iter + opt_details['stopping_criterion'] = criterion + + return x, opt_details diff --git a/commit/trk2dictionary/trk2dictionary.pyx b/commit/trk2dictionary/trk2dictionary.pyx index 21d85dc5..9f769cce 100755 --- a/commit/trk2dictionary/trk2dictionary.pyx +++ b/commit/trk2dictionary/trk2dictionary.pyx @@ -1,461 +1,3 @@ -<<<<<<< 
HEAD -#!python -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False -from __future__ import print_function -import cython -import numpy as np -cimport numpy as np -import nibabel -from os.path import join, exists, splitext -from os import makedirs, remove -import time -import amico -import pickle - - -# Interface to actual C code -cdef extern from "trk2dictionary_c.cpp": - int trk2dictionary( - char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, - int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrAFFINE, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrArrayInvM, unsigned short ndirs, short* prtHashTable - ) nogil - - -cpdef run( filename_tractogram = None, path_out = None, filename_peaks = None, filename_mask = None, do_intersect = True, - fiber_shift = 0, points_to_skip = 0, vf_THR = 0.1, peaks_use_affine = False, - flip_peaks = [False,False,False], min_seg_len = 1e-3, gen_trk = True, - blur_radii = [], blur_samples = [], blur_sigma = 1.0, filename_trk = None, TCK_ref_image = None, ndirs = 32761 - ): - """Perform the conversion of a tractoram to the sparse data-structure internally - used by COMMIT to perform the matrix-vector multiplications with the operator A - during the inversion of the linear system. - - Parameters - ---------- - filename_tractogram : string - Path to the .trk or .tck file containing the tractogram to load. - - filename_trk : string - DEPRECATED. Use filename_tractogram instead. - - path_out : string - Path to the folder where to store the sparse data structure. - - filename_peaks : string - Path to the NIFTI file containing the peaks to use as extra-cellular contributions. - The data matrix should be 4D with last dimension 3*N, where N is the number - of peaks in each voxel. (default : no extra-cellular contributions) - - filename_mask : string - Path to a binary mask to restrict the analysis to specific areas. Segments - outside this mask are discarded. If not specified (default), the mask is created from - all voxels intersected by the tracts. - - do_intersect : boolean - If True then fiber segments that intersect voxel boundaries are splitted (default). - If False then the centroid of the segment is used as its voxel position. - - fiber_shift : float or list of three float - If necessary, apply a translation to fiber coordinates (default : 0) to account - for differences between the reference system of the tracking algorithm and COMMIT. - The value is specified in voxel units, eg 0.5 translates by half voxel. - Do noth use if you are using fiber_shiftX or fiber_shiftY or fiber_shiftZ. - - points_to_skip : integer - If necessary, discard first points at beginning/end of a fiber (default : 0). - - vf_THR : float - Discard peaks smaller than vf_THR * max peak (default : 0.1). - - peaks_use_affine : boolean - Whether to rotate the peaks according to the affine matrix (default : False). - - flip_peaks : list of three boolean - If necessary, flips peak orientations along each axis (default : no flipping). 
- - min_seg_len : float - Discard segments <= than this length in mm (default : 1e-3) - - gen_trk : boolean - If True then generate a .trk file in the 'path_out' containing the fibers used in the dictionary (default : True) - - blur_radii : list of float - Translate each segment to given radii to assign a broader fiber contribution (default : []) - - blur_samples : list of integer - Segments are duplicated along a circle at a given radius; this parameter controls the number of samples to take over a given circle (defaut : []) - - blur_sigma: float - The contributions of the segments at different radii are damped as a Gaussian (default : 1.0) - - TCK_ref_image: string - Path to the NIFTI file containing the information about the geometry used for the tractogram .tck to load. - If it is not specified, it will try to use the information of filename_peaks or filename_mask. - - ndirs : int - Number of directions on the half of the sphere - """ - - filename = path_out + '/dictionary_info.pickle' - dictionary_info = {} - dictionary_info['filename_trk'] = filename_trk - dictionary_info['path_out'] = path_out - dictionary_info['filename_peaks'] = filename_peaks - dictionary_info['filename_mask'] = filename_mask - dictionary_info['do_intersect'] = do_intersect - dictionary_info['fiber_shift'] = fiber_shift - dictionary_info['points_to_skip'] = points_to_skip - dictionary_info['vf_THR'] = vf_THR - dictionary_info['peaks_use_affine'] = peaks_use_affine - dictionary_info['flip_peaks'] = flip_peaks - dictionary_info['min_seg_len'] = min_seg_len - dictionary_info['gen_trk'] = gen_trk - dictionary_info['blur_radii'] = blur_radii - dictionary_info['blur_samples'] = blur_samples - dictionary_info['blur_sigma'] = blur_sigma - dictionary_info['ndirs'] = ndirs - - # check the value of ndirs - if not amico.lut.is_valid(ndirs): - raise RuntimeError( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - # check conflicts of fiber_shift - if np.isscalar(fiber_shift) : - fiber_shiftX = fiber_shift - fiber_shiftY = fiber_shift - fiber_shiftZ = fiber_shift - elif len(fiber_shift) == 3 : - fiber_shiftX = fiber_shift[0] - fiber_shiftY = fiber_shift[1] - fiber_shiftZ = fiber_shift[2] - else : - raise RuntimeError( 'fiber_shift must be a scalar or a vector with 3 elements' ) - - tic = time.time() - print( '\n-> Creating the dictionary from tractogram:' ) - print( '\t* Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) - print( '\t* Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) - print( '\t* Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) - print( '\t* Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) - print( '\t* Points to skip = %d' % points_to_skip ) - print( '\t* Min segment len = %.2e' % min_seg_len ) - - # check blur params - cdef : - double [:] blurRadii - int [:] blurSamples - double [:] blurWeights - double* ptrBlurRadii - int* ptrBlurSamples - double* ptrBlurWeights - int nBlurRadii - float [:] ArrayInvM - float* ptrArrayInvM - - if len(blur_radii) != len(blur_samples) : - raise RuntimeError( 'number of radii and samples must match' ) - - # convert to numpy arrays (add fake radius for original segment) - nBlurRadii = len(blur_radii)+1 - blurRadii = np.array( [0.0]+blur_radii, np.double ) - blurSamples = np.array( [1]+blur_samples, np.int32 ) - - # compute weights for gaussian damping - 
blurWeights = np.empty_like( blurRadii ) - for i in xrange(nBlurRadii): - blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) - - if nBlurRadii == 1 : - print( '\t* Do not blur fibers' ) - else : - print( '\t* Blur fibers :' ) - print( '\t\t- sigma = %.3f' % blur_sigma ) - print( '\t\t- radii = [', end="" ) - for i in xrange( 1, blurRadii.size ) : - print( '%.3f' % blurRadii[i], end="" ) - print( ']' ) - print( '\t\t- samples = [', end="" ) - for i in xrange( 1, blurSamples.size ) : - print( '%5d' % blurSamples[i], end="" ) - print( ']' ) - print( '\t\t- weights = [', end="" ) - for i in xrange( 1, blurWeights.size ) : - print( '%.3f' % blurWeights[i], end="" ) - print( ']' ) - - ptrBlurRadii = &blurRadii[0] - ptrBlurSamples = &blurSamples[0] - ptrBlurWeights = &blurWeights[0] - - # minimum segment length - if min_seg_len < 0 : - raise RuntimeError( 'min_seg_len must be >= 0' ) - - - print( '\t* Loading data:' ) - - cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs) - cdef short* ptrHashTable = &htable[0] - - # fiber-tracts from .trk - print( '\t\t* tractogram' ) - - if (path_out is None): - raise RuntimeError( 'Path out not defined' ) - - if (filename_trk is None and filename_tractogram is None): - raise RuntimeError( 'Tractogram file not defined' ) - - if (filename_trk is not None and filename_tractogram is not None): - print('\t\t\t [WARNING] filename_tractogram will be used, filename_trk will not be considered') - - if (filename_trk is not None and filename_tractogram is None): - filename_tractogram = filename_trk - print('\t\t\t [WARNING] filename_trk parameter is deprecated, in the future use filename_tractogram ') - - extension = splitext(filename_tractogram)[1] #take extension of file - - if (extension != ".trk" and extension != ".tck") : - raise IOError( 'Invalid input file. Please enter tractogram file .trk or .tck' ) - try : #read the header of the file in the same way both in .trk and in .tck - hdr = nibabel.streamlines.load( filename_tractogram ).header - except : - raise IOError( 'Tractogram file not found' ) - - if (extension == ".trk"): #read header of .trk file - Nx = hdr['dimensions'][0] - Ny = hdr['dimensions'][1] - Nz = hdr['dimensions'][2] - Px = hdr['voxel_sizes'][0] - Py = hdr['voxel_sizes'][1] - Pz = hdr['voxel_sizes'][2] - - data_offset = 1000 - n_count = hdr['nb_streamlines'] - n_scalars = hdr['nb_scalars_per_point'] - n_properties = hdr['nb_properties_per_streamline'] - - if (extension == ".tck"): #read header of .tck file - #open file .nii and get header of this to get info on the structure - - if TCK_ref_image is None: - if filename_peaks is not None: - TCK_ref_image = filename_peaks - elif filename_mask is not None: - TCK_ref_image = filename_mask - else: - raise RuntimeError( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that.' 
) - - print ('\t\t\t- geometry taken from "%s"' %TCK_ref_image) - - #load the TCK_ref_image( .nii file ) with nibabel - nii_image = nibabel.load(TCK_ref_image) - #read the header of nii file - nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header() - - #set shape's of tractogram - Nx = nii_image.shape[0] - Ny = nii_image.shape[1] - Nz = nii_image.shape[2] - - #set distance's of control points - Px = nii_hdr['pixdim'][1] - Py = nii_hdr['pixdim'][2] - Pz = nii_hdr['pixdim'][3] - - #set offset and number of streamlines - data_offset = int(hdr['_offset_data']) #set offset - n_count = int(hdr['count']) #set number of fibers - - #set number of proprieties and number of scalar to zero, because there are not present in .tck file - n_scalars = 0 - n_properties = 0 - - print( '\t\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) ) - print( '\t\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) ) - print( '\t\t\t- %d fibers' % n_count ) - if Nx >= 2**16 or Nz >= 2**16 or Nz >= 2**16 : - raise RuntimeError( 'The max dim size is 2^16 voxels' ) - - # get the affine matrix - if (extension == ".tck"): - scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz])) - M = nii_hdr.get_best_affine() #get affine - - # Affine matrix without scaling, i.e. diagonal is 1 - M[:3, :3] = np.dot(scaleMat, M[:3, :3]) #delete scalar - - M = M.astype('= '2.0.0' else niiMASK.get_header() - print( '\t\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) ) - print( '\t\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) ) - if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or - abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) : - print( '\t\t [WARNING] dataset does not have the same geometry as the tractogram' ) - niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) ) - ptrMASK = &niiMASK_img[0,0,0] - else : - print( '\t\t* no mask specified to filter IC compartments' ) - ptrMASK = NULL - - # peaks file for EC contributions - cdef float* ptrPEAKS - cdef float [:, :, :, ::1] niiPEAKS_img - cdef int Np - cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) ) - cdef float* ptrTDI = &niiTDI_img[0,0,0] - cdef double [:, ::1] affine - cdef double* ptrAFFINE - if filename_peaks is not None : - print( '\t\t* EC orientations' ) - niiPEAKS = nibabel.load( filename_peaks ) - niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header() - print( '\t\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) ) - print( '\t\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) ) - print( '\t\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR ) - print( '\t\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) ) - print( '\t\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) ) - if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or - abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) : - print( "\t\t [WARNING] dataset does not have the same geometry as the tractogram" ) - if niiPEAKS.shape[3] % 3 : - raise RuntimeError( 'PEAKS dataset must have 3*k volumes' ) - if vf_THR < 0 or vf_THR > 1 : - raise RuntimeError( 'vf_THR must 
be between 0 and 1' ) - niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) ) - ptrPEAKS = &niiPEAKS_img[0,0,0,0] - Np = niiPEAKS.shape[3]/3 - - # affine matrix to rotate gradien directions (if required) - if peaks_use_affine : - affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T ) - else : - affine = np.ascontiguousarray( np.eye(3) ) - ptrAFFINE = &affine[0,0] - else : - print( '\t\t* no dataset specified for EC compartments' ) - Np = 0 - ptrPEAKS = NULL - ptrAFFINE = NULL - - # output path - print( '\t\t* output written to "%s"' % path_out ) - if not exists( path_out ): - makedirs( path_out ) - - # write dictionary info file - with open( filename, 'wb+' ) as dictionary_info_file: - pickle.dump(dictionary_info, dictionary_info_file, protocol=2) - - # calling actual C code - ret = trk2dictionary( filename_tractogram, data_offset, - Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, - fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, - ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, - ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, - nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); - if ret == 0 : - print( ' [ DICTIONARY not generated ]' ) - return None - - # create new TRK with only fibers in the WM mask - # create new dictionaty file (TRK or TCK) with only fibers in the WM mask - if gen_trk : - print ('\t* Generate tractogram matching the dictionary: ') - fib = nibabel.streamlines.load(filename_tractogram) - hdr = fib.header - - file_kept = np.fromfile( join(path_out,'dictionary_TRK_kept.dict'), dtype=np.bool_ ) - tractogram_out = fib.tractogram[ file_kept ] - hdr['count'] = len(tractogram_out) #set new number of fibers in the header - hdr['nb_streamlines'] = len(tractogram_out) - - #create a output dictionary file (TRK or TCK) in path_out - nibabel.streamlines.save( tractogram_out, join(path_out,'dictionary_TRK_fibers'+extension), header=hdr ) - print( '\t [ %d fibers kept ]' % np.count_nonzero( file_kept ) ) - print( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - # save TDI and MASK maps - if filename_mask is not None : - affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() - elif filename_peaks is not None : - affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() - else : - affine = np.diag( [Px, Py, Pz, 1] ) - - niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) - nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) - - if filename_mask is not None : - niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) - else : - niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) - nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) - - -cpdef convert_old_dictionary( path ): - """Perform the conversion of the files representing a dictionary, i.e. dictionary_*.dict, - from the old format to the new one, where the files *_{vx,vy,vz}.dict are replaced - by a single file *_v.dict (same for the files *_{ox,oy}.dict). - - Parameters - ---------- - path : string - Path to the folder containing the dictionary_*.dict files. 
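As a side note on what the conversion below does, this is a small self-contained sketch (plain NumPy on synthetic arrays, no dictionary files are read) of the index packing that merges the per-axis voxel files into a single `*_v.dict` and the two orientation files into a single `*_o.dict`; the grid sizes and index values are made up for illustration.

```python
import numpy as np

# Synthetic example: voxel grid dimensions and a few (x, y, z) voxel indices
Nx, Ny, Nz = 4, 4, 4
vx = np.array([0, 1, 3], dtype=np.uint32)
vy = np.array([2, 0, 1], dtype=np.uint32)
vz = np.array([1, 1, 2], dtype=np.uint32)

# Linearised voxel index, as stored in dictionary_*_v.dict
v = vx + Nx * (vy + Ny * vz)

# Orientation indices (colatitude ox, longitude oy on the 181x181 angle grid)
ox = np.array([10, 45], dtype=np.uint16)
oy = np.array([90, 0], dtype=np.uint16)

# Single orientation index, as stored in dictionary_*_o.dict
o = oy + 181 * ox

print(v, o)   # [24 17 39] [1900 8145]
```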
- """ - if not exists( join(path,'dictionary_IC_vx.dict') ): - raise RuntimeError( 'Folder does not contain dictionary files in the old format' ) - - niiTDI = nibabel.load( join(path,'dictionary_tdi.nii.gz') ) - Nx, Ny, Nz = niiTDI.shape[:3] - x = np.fromfile( join(path,'dictionary_IC_vx.dict'), dtype=np.uint16 ).astype(np.uint32) - y = np.fromfile( join(path,'dictionary_IC_vy.dict'), dtype=np.uint16 ).astype(np.uint32) - z = np.fromfile( join(path,'dictionary_IC_vz.dict'), dtype=np.uint16 ).astype(np.uint32) - v = x + Nx * ( y + Ny * z ) - v.tofile( join(path,'dictionary_IC_v.dict') ) - remove( join(path,'dictionary_IC_vx.dict') ) - remove( join(path,'dictionary_IC_vy.dict') ) - remove( join(path,'dictionary_IC_vz.dict') ) - - x = np.fromfile( join(path,'dictionary_EC_vx.dict'), dtype=np.uint8 ).astype(np.uint32) - y = np.fromfile( join(path,'dictionary_EC_vy.dict'), dtype=np.uint8 ).astype(np.uint32) - z = np.fromfile( join(path,'dictionary_EC_vz.dict'), dtype=np.uint8 ).astype(np.uint32) - v = x + Nx * ( y + Ny * z ) - v.tofile( join(path,'dictionary_EC_v.dict') ) - remove( join(path,'dictionary_EC_vx.dict') ) - remove( join(path,'dictionary_EC_vy.dict') ) - remove( join(path,'dictionary_EC_vz.dict') ) - - x = np.fromfile( join(path,'dictionary_IC_ox.dict'), dtype=np.uint8 ).astype(np.uint16) - y = np.fromfile( join(path,'dictionary_IC_oy.dict'), dtype=np.uint8 ).astype(np.uint16) - v = y + 181 * x - v.tofile( join(path,'dictionary_IC_o.dict') ) - remove( join(path,'dictionary_IC_ox.dict') ) - remove( join(path,'dictionary_IC_oy.dict') ) - - x = np.fromfile( join(path,'dictionary_EC_ox.dict'), dtype=np.uint8 ).astype(np.uint16) - y = np.fromfile( join(path,'dictionary_EC_oy.dict'), dtype=np.uint8 ).astype(np.uint16) - v = y + 181 * x - v.tofile( join(path,'dictionary_EC_o.dict') ) - remove( join(path,'dictionary_EC_ox.dict') ) - remove( join(path,'dictionary_EC_oy.dict') ) -======= #!python # cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False from __future__ import print_function @@ -911,4 +453,3 @@ cpdef convert_old_dictionary( path ): v.tofile( join(path,'dictionary_EC_o.dict') ) remove( join(path,'dictionary_EC_ox.dict') ) remove( join(path,'dictionary_EC_oy.dict') ) ->>>>>>> 3ec00e357fc859f6e9f8893a1b93e1d5fb53557c diff --git a/commit/trk2dictionary/trk2dictionary_c.cpp b/commit/trk2dictionary/trk2dictionary_c.cpp index dc062ab8..58a114dd 100644 --- a/commit/trk2dictionary/trk2dictionary_c.cpp +++ b/commit/trk2dictionary/trk2dictionary_c.cpp @@ -1,613 +1,3 @@ -<<<<<<< HEAD -#include -#include -#include -#include -#include -#include "Vector.h" -#include "ProgressBar.h" -#include -#include - -#define MAX_FIB_LEN 10000 - - -// CLASS to store the segments of one fiber -class segKey -{ - public: - unsigned short x, y, z; - unsigned short o; - segKey(){} - - void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) - { - x = _x; - y = _y; - z = _z; - o = _o; - } - - bool const operator <(const segKey& seg) const - { - return o < seg.o || (o==seg.o && z FiberSegments; - -Vector dim; -Vector pixdim; -float* ptrMASK; -unsigned int nPointsToSkip; -float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; -bool doIntersect; -float minSegLen; - -std::vector radii; // radii for the extrusion -std::vector weights; // damping weight -std::vector sectors; // number of duplicates across the extrusion circle -double radiusSigma; // modulates the impact of each segment as function of radius - - -bool 
rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); -void segmentForwardModel( const Vector& P1, const Vector& P2, double w, short* ptrHashTable ); -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); -unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); - - -// ========================= -// Function called by CYTHON -// ========================= -int trk2dictionary( - char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, - float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrAFFINE, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* VetAffine, unsigned short ndirs, short* ptrHashTable -) -{ - /*=========================*/ - /* IC compartments */ - /*=========================*/ - float fiber[3][MAX_FIB_LEN]; - float fiberNorm, fiberLen; - unsigned int N, totICSegments = 0, totFibers = 0, v; - unsigned short o; - unsigned char kept; - Vector P; - std::string filename; - std::string OUTPUT_path(path_out); - std::map::iterator it; - - std::map FiberNorm; - std::map::iterator itNorm; - segInVoxKey inVoxKey; - - printf( "\t* Exporting IC compartments:\n" ); - - int isTRK; // var to check - - char *ext = strrchr(str_filename, '.'); //get the extension of input file - - if (strcmp(ext,".trk")==0) //for .trk file - isTRK = 1; - else if (strcmp(ext,".tck")==0)// for .tck file - isTRK = 0; - else - return 0; - - FILE* fpTractogram = fopen(str_filename,"rb"); //open - if (fpTractogram == NULL) return 0; - - if ( isTRK ) { // SKIP header on .trk - fseek(fpTractogram,data_offset,SEEK_SET); //skip the first 1000 bytes in the .trk file - } - else { // SKIP header on .tck - fseek(fpTractogram,data_offset,SEEK_SET); //skip the first offset bytes in the .tck file - } - - // set global variables - dim.Set( Nx, Ny, Nz ); - pixdim.Set( Px, Py, Pz ); - nPointsToSkip = points_to_skip; - fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates - fiberShiftYmm = fiber_shiftY * pixdim.y; - fiberShiftZmm = fiber_shiftZ * pixdim.z; - ptrMASK = _ptrMASK; - doIntersect = c > 0; - minSegLen = min_seg_len; - - radii.clear(); - sectors.clear(); - weights.clear(); - for(int i=0; i 0 ) - { - // add segments to files - fiberNorm = 0; - fiberLen = 0; - for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) - { - // NB: plese note inverted ordering for 'v' - v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); - o = it->first.o; - fwrite( &totFibers, 4, 1, pDict_IC_f ); - fwrite( &v, 4, 1, pDict_IC_v ); - fwrite( &o, 2, 1, pDict_IC_o ); - fwrite( &(it->second), 4, 1, pDict_IC_len ); - ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; - inVoxKey.set( it->first.x, it->first.y, it->first.z ); - FiberNorm[inVoxKey] += it->second; - fiberLen += it->second; - } - for (itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) - { - fiberNorm += pow(itNorm->second,2); - } - fiberNorm = sqrt(fiberNorm); - FiberNorm.clear(); - fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // 
actual length considered in optimization - fwrite( &fiberLen, 1, 4, pDict_TRK_len ); - totICSegments += FiberSegments.size(); - totFibers++; - kept = 1; - } - fwrite( &kept, 1, 1, pDict_TRK_kept ); - } - PROGRESS.close(); - - // write dictionary ndirs value - fwrite(&ndirs, 1, sizeof(unsigned short), pDict_ndirs); - fclose( fpTractogram ); - fclose( pDict_TRK_norm ); - fclose( pDict_IC_f ); - fclose( pDict_IC_v ); - fclose( pDict_IC_o ); - fclose( pDict_IC_len ); - fclose( pDict_TRK_len ); - fclose( pDict_TRK_kept ); - fclose( pDict_ndirs ); - - printf("\t [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); - - - /*=========================*/ - /* EC compartments */ - /*=========================*/ - unsigned int totECSegments = 0, totECVoxels = 0; - - printf( "\t* Exporting EC compartments:\n" ); - - filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); - filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); - - if ( ptrPEAKS != NULL ) - { - Vector dir; - double longitude, colatitude; - segKey ec_seg; - int ix, iy, iz, id, atLeastOne; - float peakMax; - float norms[ Np ]; - float *ptr; - int ox, oy; - - PROGRESS.reset( dim.z ); - for(iz=0; iz peakMax ) - peakMax = norms[id]; - } - - if ( peakMax > 0 ) - { - ec_seg.x = ix; - ec_seg.y = iy; - ec_seg.z = iz; - atLeastOne = 0; - for(id=0; id0 ) - totECVoxels++; - } - } - } - PROGRESS.close(); - } - - fclose( pDict_EC_v ); - fclose( pDict_EC_o ); - - printf("\t [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); - - return 1; -} - - -/********************************************************************************************************************/ -/* fiberForwardModel */ -/********************************************************************************************************************/ -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) -{ - static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; - static Vector vox, vmin, vmax, dir; - static double len, t, alpha, w, R; - static int i, j, k; - - FiberSegments.clear(); - //printf("RANGO -----------------------------> from %d to %d\n", nPointsToSkip, pts-1-nPointsToSkip); - for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, double w, short* ptrHashTable ) -{ - static Vector vox; - static Vector dir, dirTrue; - static double longitude, colatitude, len; - static segKey key; - static int ox, oy; - - // direction of the segment - dir.y = P2.y-P1.y; - if ( dir.y >= 0 ) - { - dir.x = P2.x-P1.x; - dir.z = P2.z-P1.z; - } - else - { - dir.x = P1.x-P2.x; - dir.y = P1.y-P2.y; - dir.z = P1.z-P2.z; - } - - // length of segment - len = dir.norm(); - if ( len <= minSegLen ) - return; - dir.Normalize(); - - // voxel of the segment is the centroid - vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); - vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); - vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); - if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) - return; - if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) - return; - - // add the segment to the data structure - longitude = atan2(dir.y, dir.x); - colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); - ox = (int)round(colatitude/M_PI*180.0); // theta // i1 - oy = (int)round(longitude/M_PI*180.0); // phi // i2 - key.set( vox.x, vox.y, vox.z, (unsigned short) 
ptrHashTable[ox*181 + oy] ); - FiberSegments[key] += w * len; -} - - -/********************************************************************************************************************/ -/* rayBoxIntersection */ -/********************************************************************************************************************/ -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t) -{ - static double tmin, tmax, tymin, tymax, tzmin, tzmax; - static Vector invrd; - - // inverse direction to catch float problems - invrd.x = 1.0 / direction.x; - invrd.y = 1.0 / direction.y; - invrd.z = 1.0 / direction.z; - - - if (invrd.x >= 0) - { - tmin = (vmin.x - origin.x) * invrd.x; - tmax = (vmax.x - origin.x) * invrd.x; - } - else - { - tmin = (vmax.x - origin.x) * invrd.x; - tmax = (vmin.x - origin.x) * invrd.x; - } - - if (invrd.y >= 0) - { - tymin = (vmin.y - origin.y) * invrd.y; - tymax = (vmax.y - origin.y) * invrd.y; - } - else - { - tymin = (vmax.y - origin.y) * invrd.y; - tymax = (vmin.y - origin.y) * invrd.y; - } - - if ( (tmin > tymax) || (tymin > tmax) ) return false; - if ( tymin > tmin) tmin = tymin; - if ( tymax < tmax) tmax = tymax; - - if (invrd.z >= 0) - { - tzmin = (vmin.z - origin.z) * invrd.z; - tzmax = (vmax.z - origin.z) * invrd.z; - }else - { - tzmin = (vmax.z - origin.z) * invrd.z; - tzmax = (vmin.z - origin.z) * invrd.z; - } - - if ( (tmin > tzmax) || (tzmin > tmax) ) return false; - if ( tzmin > tmin) tmin = tzmin; - if ( tzmax < tmax) tmax = tzmax; - - // check if values are valid - t = tmin; - if (t <= 0) t = tmax; - - return true; -} - - -// Read a fiber from file .trk -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) -{ - int N; - fread((char*)&N, 1, 4, fp); - - if ( N >= MAX_FIB_LEN || N <= 0 ) - return 0; - - float tmp[3]; - for(int i=0; i #include #include @@ -1207,4 +597,3 @@ unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN], float affine[ return i; } ->>>>>>> 3ec00e357fc859f6e9f8893a1b93e1d5fb53557c From e1638b6ea106c2d2a45614c29dba960f07c1a3e4 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 02:50:24 -0500 Subject: [PATCH 138/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 89578eb0..1767d776 100644 --- a/setup.py +++ b/setup.py @@ -162,6 +162,8 @@ def get_extensions_with_cuda(): libraries = ['cudart'], runtime_library_dirs = [CUDA['lib64']]) + return [ext1, ext2, ext3, ext4] + if CUDA == None: extensions = get_extensions() else: From 349efd2c50f125641ce7be4df27af0c1e4ff9b5a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 02:54:55 -0500 Subject: [PATCH 139/190] Merging CUDA version with the lastest COMMIT version --- commit/operator_withCUDA.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 2894bb9c..e3c8ccf7 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -1,4 +1,4 @@ -#include +//#include #include #include #include From 97cefb060bb03633cd8d80aec0b8a972f26f601e Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 02:58:38 -0500 Subject: [PATCH 140/190] Merging CUDA version with the lastest COMMIT version --- commit/operator_withCUDA.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 
e3c8ccf7..2894bb9c 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -1,4 +1,4 @@ -//#include +#include #include #include #include From 95d2bbf1556ecb51c5b75a585ac2a4fef58e23c8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 03:27:47 -0500 Subject: [PATCH 141/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1767d776..8de18f83 100644 --- a/setup.py +++ b/setup.py @@ -201,7 +201,7 @@ def run(self): # Add everything requires for build self.swig_opts = None - self.include_dirs = [get_include(), CUDA['lib64']] + self.include_dirs = [get_include(), CUDA['include']] self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) # Call original build_ext command From 9511ec7f453e2d73820e306b33882e3210ffd06c Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 03:37:11 -0500 Subject: [PATCH 142/190] Merging CUDA version with the lastest COMMIT version --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 8de18f83..411774ee 100644 --- a/setup.py +++ b/setup.py @@ -131,7 +131,7 @@ def get_extensions_with_cuda(): ext1 = Extension(name='commit.trk2dictionary', sources=['commit/trk2dictionary/trk2dictionary.pyx'], extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++') #include_dirs = [get_include]) @@ -139,7 +139,7 @@ def get_extensions_with_cuda(): ext2 = Extension(name='commit.core', sources=['commit/core.pyx'], extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++') #include_dirs = [get_include]) @@ -147,7 +147,7 @@ def get_extensions_with_cuda(): ext3 = Extension(name='commit.proximals', sources=['commit/proximals.pyx'], extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++') #include_dirs = [get_include]) @@ -155,7 +155,7 @@ def get_extensions_with_cuda(): ext4 = Extension(name='commit.cudaoperator', sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, language = 'c++', #include_dirs = [get_include, CUDA['include']], library_dirs = [CUDA['lib64']], From bd6518a8d801007e968bcdfe007a6ed8a9d72d85 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 03:39:50 -0500 Subject: [PATCH 143/190] Merging CUDA version with the lastest COMMIT version --- commit/operator/operator.pyxbld | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld index d7d86566..3b3d2d48 100755 --- a/commit/operator/operator.pyxbld +++ b/commit/operator/operator.pyxbld @@ -10,7 +10,7 @@ from commit.operator import config def make_ext(modname, pyxfilename): - if 
(config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255): + if (config.nTHREADS is None or config.nTHREADS < 0 or config.nTHREADS > 255): raise RuntimeError('config.nTHREADS must be between 1 and 255') if (config.nIC is None or config.nIC < 0 or config.nIC > 20): raise RuntimeError('config.nIC must be in the range [0..20]') From 234f750b83e5dbe82912313c877b92eff18c1aba Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 13:15:41 -0500 Subject: [PATCH 144/190] Adding tutorial to enable GPU acceleration --- CHANGELOG.md | 8 +++ docs/tutorials/CudaAcceleration/README.md | 86 +++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 docs/tutorials/CudaAcceleration/README.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f56bdca..0203367b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ # Change Log All notable changes to COMMIT will be documented in this file. +## [1.4.0] - 2020-07-18 + +### Changed +- Modify setup.py to add support for .cu files + +### Added +- GPU acceleration with CUDA for faster model fitting + ## [1.3.9] - 2020-06-09 ### Changed diff --git a/docs/tutorials/CudaAcceleration/README.md b/docs/tutorials/CudaAcceleration/README.md new file mode 100644 index 00000000..54982a5b --- /dev/null +++ b/docs/tutorials/CudaAcceleration/README.md @@ -0,0 +1,86 @@ +# Enable CUDA GPU Acceleration + +This tutorial illustrates how to enable the CUDA GPU acceleration for faster model fitting. + +## Getting Started + +This tutorial takes as starting point the [getting starting](https://github.com/daducci/COMMIT/tree/master/docs/tutorials/GettingStarted) tutorial. + +## Build the linear operator A on GPU + +Once the getting starting tutorial runs properly, focus on the commands `mit.set_threads()` and `mit.build_operator()`. The command `mit.set_threads()` allows to indicate (through the parameter `nthreads`) how many CPU threads will use COMMIT to build linear operator **A**. Then, the command `mit.build_operator()` builds **A** by using the same number of threads especified by `nthreads`. The GPU acceleration can be enabled by setting the parameter `nthreads` equal to zero in `mit.set_threads()`. + +```python +nthreads = 0 +mit.set_threads( nthreads=nthreads ) +mit.build_operator() +``` + +Once these commands area executed, COMMIT will build the linear operator ***A** but now on the GPU. The output should be something similar to this: + +``` +-> Building linear operator A: + * checking availability of CUDA... [ OK ] + * number of CUDA GPUs detected: 1 + * using GPU with ID 0... [ GeForce RTX 2080 SUPER ] + * using 0.31 GB of total 8.37 GB... [ OK ] + * compute capability: 7.5 [ OK ] + * constant values ... [ OK ] + * vectors x&y ... [ OK ] + * pre-processing ... [ OK ] + * loading LUTs ... [ OK ] + * A operator... [ OK ] + * A' operator... [ OK ] + [ 0.5 seconds ] +``` + +If there are more than one CUDA capable GPU installed, COMMIT will use by default the GPU with ID 0. The selected GPU can be changed with the parameter `select_gpu` in `mit.set_threads()`. For example, assuming there are at least two CUDA capable GPUs installed, the following commands build the linear operator **A** on the GPU with ID 1 rather than on the GPU with ID 0: + +```python +nthreads = 0 +gpu_id = 1 +mit.set_threads( nthreads=nthreads, select_gpu=gpu_id ) +mit.build_operator() +``` + +To show a list of GPUs and their IDs, open a system shell and run the command `nvidia-smi`. 
This command should output something similar to: + +``` + +-----------------------------------------------------------------------------+ + | NVIDIA-SMI 375.82 Driver Version: 375.82 | + |-------------------------------+----------------------+----------------------+ + | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | + | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | + |===============================+======================+======================| +this is the GPU ID -------> | 0 GeForce GTX TIT... Off | 0000:05:00.0 Off | N/A | + | 22% 55C P8 31W / 250W | 11853MiB / 12205MiB | 0% Default | + +-------------------------------+----------------------+----------------------+ +this is the GPU ID -------> | 1 GeForce GTX TIT... Off | 0000:06:00.0 Off | N/A | + | 22% 60C P8 18W / 250W | 114MiB / 12207MiB | 0% Default | + +-------------------------------+----------------------+----------------------+ +this is the GPU ID -------> | 2 GeForce GTX TIT... Off | 0000:09:00.0 Off | N/A | + | 27% 66C P2 72W / 250W | 8452MiB / 12207MiB | 0% Default | + +-------------------------------+----------------------+----------------------+ +``` + +**NOTE:** At this moment, COMMIT does not have support for multi-GPU acceleration. + +## Clearing GPU memory + +The commands apart from the command `mit.set_threads()` remain the same. But in the case when the GPU acceleration was enabled, the method `mit.A.destroy()` has to the executed in order to clear the GPU memory and reset the GPU for the next evaluation. That is, add the following command to the end of the script: + +```python +if nthreads == 0: + mit.A.destroy() +``` + +Then, something like the following is displayed: + +``` +-> Clearing GPU memory: + * deleting A... [ OK ] + * deleting A'... [ OK ] + * deleting x&y... [ OK ] + * deleting LUT... [ OK ] + * reseting GPU... [ OK ] +``` \ No newline at end of file From 13dfccf7f4593eb2a75dd31fc805bc544069ba05 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 13:19:05 -0500 Subject: [PATCH 145/190] Adding tutorial to enable GPU acceleration --- docs/tutorials/CudaAcceleration/README.md | 4 ++-- docs/tutorials/README.md | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/CudaAcceleration/README.md b/docs/tutorials/CudaAcceleration/README.md index 54982a5b..8d541280 100644 --- a/docs/tutorials/CudaAcceleration/README.md +++ b/docs/tutorials/CudaAcceleration/README.md @@ -4,11 +4,11 @@ This tutorial illustrates how to enable the CUDA GPU acceleration for faster mod ## Getting Started -This tutorial takes as starting point the [getting starting](https://github.com/daducci/COMMIT/tree/master/docs/tutorials/GettingStarted) tutorial. +This tutorial takes as starting point the [getting started](https://github.com/daducci/COMMIT/tree/master/docs/tutorials/GettingStarted) tutorial. ## Build the linear operator A on GPU -Once the getting starting tutorial runs properly, focus on the commands `mit.set_threads()` and `mit.build_operator()`. The command `mit.set_threads()` allows to indicate (through the parameter `nthreads`) how many CPU threads will use COMMIT to build linear operator **A**. Then, the command `mit.build_operator()` builds **A** by using the same number of threads especified by `nthreads`. The GPU acceleration can be enabled by setting the parameter `nthreads` equal to zero in `mit.set_threads()`. +Once the getting started tutorial runs properly, focus on the commands `mit.set_threads()` and `mit.build_operator()`. 
The command `mit.set_threads()` allows to indicate (through the parameter `nthreads`) how many CPU threads will use COMMIT to build linear operator **A**. Then, the command `mit.build_operator()` builds **A** by using the same number of threads especified by `nthreads`. The GPU acceleration can be enabled by setting the parameter `nthreads` equal to zero in `mit.set_threads()`. ```python nthreads = 0 diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md index e6438a1a..c910d7d7 100644 --- a/docs/tutorials/README.md +++ b/docs/tutorials/README.md @@ -3,3 +3,4 @@ Tutorials/demos using the COMMIT framework: * [Getting started](GettingStarted) * [Comparison to LiFE on STN96 data](LiFE_STN96) * [Advanced Solvers](AdvancedSolvers) +* [Enable CUDA GPU Acceleration](CudaAcceleration) From ced31cc11251e22eff93c8e41eb0877addb8784d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 13:22:42 -0500 Subject: [PATCH 146/190] Adding tutorial to enable GPU acceleration --- docs/tutorials/CudaAcceleration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/CudaAcceleration/README.md b/docs/tutorials/CudaAcceleration/README.md index 8d541280..66b405e5 100644 --- a/docs/tutorials/CudaAcceleration/README.md +++ b/docs/tutorials/CudaAcceleration/README.md @@ -16,7 +16,7 @@ mit.set_threads( nthreads=nthreads ) mit.build_operator() ``` -Once these commands area executed, COMMIT will build the linear operator ***A** but now on the GPU. The output should be something similar to this: +Once these commands are executed, COMMIT will build the linear operator **A** but now on the GPU. The output should be something similar to this: ``` -> Building linear operator A: @@ -67,7 +67,7 @@ this is the GPU ID -------> | 2 GeForce GTX TIT... Off | 0000:09:00.0 ## Clearing GPU memory -The commands apart from the command `mit.set_threads()` remain the same. But in the case when the GPU acceleration was enabled, the method `mit.A.destroy()` has to the executed in order to clear the GPU memory and reset the GPU for the next evaluation. That is, add the following command to the end of the script: +The commands apart from the command `mit.set_threads()` remain the same. But in the case when the GPU acceleration is enabled, the method `mit.A.destroy()` has to the executed in order to clear the GPU memory and reset the GPU for the next evaluation. That is, add the following command to the end of the script: ```python if nthreads == 0: From 52cbcc528a75da6e080114a2d223b2e9e373f864 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 13:36:31 -0500 Subject: [PATCH 147/190] Minor cleanup --- commit/core.pyx | 2 +- commit/operator_withCUDA.cu | 35 +++-------------------------------- 2 files changed, 4 insertions(+), 33 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 26052d0e..d5ae6c6e 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -623,7 +623,7 @@ cdef class Evaluation : print( '[ OK ]' ) - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) def build_operator( self, build_dir=None ) : diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index df10e76c..612015b3 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -26,7 +26,7 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { cudaDeviceProp gpu_properties; cudaGetDeviceProperties(&gpu_properties, gpu_id); - printf("\t* checking availability of CUDA ... 
[ OK ]\n"); + printf("\t* checking availability of CUDA... [ OK ]\n"); printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name); @@ -242,7 +242,7 @@ CudaLinearOperator::~CudaLinearOperator() {} void CudaLinearOperator::destroy(){ bool cudaStatus; - printf("\n-> Deleting GPU memory:\n"); + printf("\n-> Clearing GPU memory:\n"); printf("\t* deleting A... "); cudaStatus = true; @@ -348,68 +348,39 @@ void cudaCheckKernel(){ } void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - //cudaError_t cudaStatus; // Copy vector x to the GPU cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering x to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckKernel(); - // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckKernel(); - // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckKernel(); - // Copy back result to CPU cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering y to CPU ... [ OK ]\n");//*/ } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - - //cudaError_t cudaStatus; + // Copy vector y to the GPU - //cudaCheck( cudaMemset(gpu_x, 0, NUM_COLS*sizeof(float64_t)) ); - //cudaCheck( cudaMemcpy(gpu_x, x, NUM_COLS*sizeof(double), cudaMemcpyHostToDevice) ); cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering y to GPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering y to GPU ... [ OK ]\n");//*/ // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckKernel(); - // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckKernel(); - // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckKernel(); - // Copy back result to CPU cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - /*if (cudaStatus != cudaSuccess) printf("\t* tranfering x to CPU ... [ ERROR ]: %s\n", cudaGetErrorString(cudaStatus)); - else printf("\t* tranfering x to CPU ... 
[ OK ]\n");//*/ - - /*printf("\n\n VECTOR X EC PART:\n"); - for(int i = NUM_FIBERS*NUM_RESFUNCIC; i < NUM_FIBERS*NUM_RESFUNCIC+20; i++) - printf("%lf ", x[i]); - printf("\n\n");//*/ } // ------------------------------------------------------- KERNELS ------------------------------------------------------- // From 0ebf7dffc465008857c9b8c5d491032d7982d495 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 14:18:43 -0500 Subject: [PATCH 148/190] Adding Cuda toolkit to the installation guide --- docs/install.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/install.md b/docs/install.md index 2d070b8c..c06c13b1 100644 --- a/docs/install.md +++ b/docs/install.md @@ -23,6 +23,25 @@ Depending on the forward-model employed, COMMIT can require the [Camino](http:// Please follow the corresponding [documentation](http://cmic.cs.ucl.ac.uk/camino//index.php?n=Main.Installation) to install Camino and make sure to include the folder containing the script `datasynth` in your system path. +### Cuda toolkit (optional) + +COMMIT has GPU acceleration support for fast model fitting. In order to use COMMIT with GPU acceleration, it is necessary to install the [CUDA toolkit](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html#introduction). Make sure to add CUDA and the CUDA libraries to the PATH. For example, if the CUDA Toolkit 11.0 was installed in the default folder (`/usr/local/cuda-11.0/`), run in a system shell + +```bash +export PATH=/usr/local/cuda-11.0/bin${PATH:+:${PATH}} +export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64\ + ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +``` + +COMMIT uses the `CUDAHOME` variable to compile some parts of the code. In a system shell, under the previous context, run the command: + +```bash +export CUDAHOME=/usr/local/cuda-11.0/ +``` + +**NOTE:** Only NVIDIA GPUs with compute capability >= 5.0 are supported. +**NOTE:** It is recommended to have the latest NVIDIA drivers installed. + ## Install COMMIT Open the system shell, go to the folder where you downloaded this repository and run: From 714f066d8d1af1d98021a3ad759d080567a38527 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 14:20:36 -0500 Subject: [PATCH 149/190] Adding Cuda toolkit to the installation guide --- docs/install.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/install.md b/docs/install.md index c06c13b1..27840623 100644 --- a/docs/install.md +++ b/docs/install.md @@ -25,7 +25,7 @@ Please follow the corresponding [documentation](http://cmic.cs.ucl.ac.uk/camino/ ### Cuda toolkit (optional) -COMMIT has GPU acceleration support for fast model fitting. In order to use COMMIT with GPU acceleration, it is necessary to install the [CUDA toolkit](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html#introduction). Make sure to add CUDA and the CUDA libraries to the PATH. For example, if the CUDA Toolkit 11.0 was installed in the default folder (`/usr/local/cuda-11.0/`), run in a system shell +COMMIT has GPU acceleration support for fast model fitting. In order to use COMMIT with GPU acceleration, it is necessary to install the [CUDA toolkit](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html#introduction). After the installation of CUDA, make sure to add CUDA and the CUDA libraries to the PATH. 
For example, if the CUDA Toolkit 11.0 was installed in the default folder (`/usr/local/cuda-11.0/`), run in a system shell ```bash export PATH=/usr/local/cuda-11.0/bin${PATH:+:${PATH}} @@ -40,6 +40,7 @@ export CUDAHOME=/usr/local/cuda-11.0/ ``` **NOTE:** Only NVIDIA GPUs with compute capability >= 5.0 are supported. + **NOTE:** It is recommended to have the latest NVIDIA drivers installed. ## Install COMMIT From 02d00b1808cbd700a96935aa28703007760a0493 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 18 Jul 2020 14:22:43 -0500 Subject: [PATCH 150/190] Adding link to nvidia drivers --- docs/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install.md b/docs/install.md index 27840623..fc97cc81 100644 --- a/docs/install.md +++ b/docs/install.md @@ -41,7 +41,7 @@ export CUDAHOME=/usr/local/cuda-11.0/ **NOTE:** Only NVIDIA GPUs with compute capability >= 5.0 are supported. -**NOTE:** It is recommended to have the latest NVIDIA drivers installed. +**NOTE:** It is recommended to have the latest [NVIDIA drivers](https://www.nvidia.com/Download/index.aspx?lang=en-us) installed. ## Install COMMIT From c91b4d2ed08fb826ba7fccbfb5d3473f212b3b2a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 4 Jan 2021 21:45:04 -0600 Subject: [PATCH 151/190] Adding version information to CHANGELOG file --- CHANGELOG.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 060cf9ce..cb5a637d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,14 @@ - # Change Log All notable changes to COMMIT will be documented in this file. +## [1.5.0] - 2021-01-04 + +### Changed +- setup.py: Add compilation for .cu files + +### Added +- GPU acceleration with CUDA for faster model fitting + ## [1.4.5] - 2020-12-29 ### Fixed From b7e1c60771a3cf7b1267817cd93610cf66fe2cf8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 4 Jan 2021 22:16:45 -0600 Subject: [PATCH 152/190] Removing tutorial for cuda version --- docs/tutorials/CudaAcceleration/README.md | 86 ----------------------- 1 file changed, 86 deletions(-) delete mode 100644 docs/tutorials/CudaAcceleration/README.md diff --git a/docs/tutorials/CudaAcceleration/README.md b/docs/tutorials/CudaAcceleration/README.md deleted file mode 100644 index 66b405e5..00000000 --- a/docs/tutorials/CudaAcceleration/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# Enable CUDA GPU Acceleration - -This tutorial illustrates how to enable the CUDA GPU acceleration for faster model fitting. - -## Getting Started - -This tutorial takes as starting point the [getting started](https://github.com/daducci/COMMIT/tree/master/docs/tutorials/GettingStarted) tutorial. - -## Build the linear operator A on GPU - -Once the getting started tutorial runs properly, focus on the commands `mit.set_threads()` and `mit.build_operator()`. The command `mit.set_threads()` allows to indicate (through the parameter `nthreads`) how many CPU threads will use COMMIT to build linear operator **A**. Then, the command `mit.build_operator()` builds **A** by using the same number of threads especified by `nthreads`. The GPU acceleration can be enabled by setting the parameter `nthreads` equal to zero in `mit.set_threads()`. - -```python -nthreads = 0 -mit.set_threads( nthreads=nthreads ) -mit.build_operator() -``` - -Once these commands are executed, COMMIT will build the linear operator **A** but now on the GPU. 
The output should be something similar to this: - -``` --> Building linear operator A: - * checking availability of CUDA... [ OK ] - * number of CUDA GPUs detected: 1 - * using GPU with ID 0... [ GeForce RTX 2080 SUPER ] - * using 0.31 GB of total 8.37 GB... [ OK ] - * compute capability: 7.5 [ OK ] - * constant values ... [ OK ] - * vectors x&y ... [ OK ] - * pre-processing ... [ OK ] - * loading LUTs ... [ OK ] - * A operator... [ OK ] - * A' operator... [ OK ] - [ 0.5 seconds ] -``` - -If there are more than one CUDA capable GPU installed, COMMIT will use by default the GPU with ID 0. The selected GPU can be changed with the parameter `select_gpu` in `mit.set_threads()`. For example, assuming there are at least two CUDA capable GPUs installed, the following commands build the linear operator **A** on the GPU with ID 1 rather than on the GPU with ID 0: - -```python -nthreads = 0 -gpu_id = 1 -mit.set_threads( nthreads=nthreads, select_gpu=gpu_id ) -mit.build_operator() -``` - -To show a list of GPUs and their IDs, open a system shell and run the command `nvidia-smi`. This command should output something similar to: - -``` - +-----------------------------------------------------------------------------+ - | NVIDIA-SMI 375.82 Driver Version: 375.82 | - |-------------------------------+----------------------+----------------------+ - | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | - | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | - |===============================+======================+======================| -this is the GPU ID -------> | 0 GeForce GTX TIT... Off | 0000:05:00.0 Off | N/A | - | 22% 55C P8 31W / 250W | 11853MiB / 12205MiB | 0% Default | - +-------------------------------+----------------------+----------------------+ -this is the GPU ID -------> | 1 GeForce GTX TIT... Off | 0000:06:00.0 Off | N/A | - | 22% 60C P8 18W / 250W | 114MiB / 12207MiB | 0% Default | - +-------------------------------+----------------------+----------------------+ -this is the GPU ID -------> | 2 GeForce GTX TIT... Off | 0000:09:00.0 Off | N/A | - | 27% 66C P2 72W / 250W | 8452MiB / 12207MiB | 0% Default | - +-------------------------------+----------------------+----------------------+ -``` - -**NOTE:** At this moment, COMMIT does not have support for multi-GPU acceleration. - -## Clearing GPU memory - -The commands apart from the command `mit.set_threads()` remain the same. But in the case when the GPU acceleration is enabled, the method `mit.A.destroy()` has to the executed in order to clear the GPU memory and reset the GPU for the next evaluation. That is, add the following command to the end of the script: - -```python -if nthreads == 0: - mit.A.destroy() -``` - -Then, something like the following is displayed: - -``` --> Clearing GPU memory: - * deleting A... [ OK ] - * deleting A'... [ OK ] - * deleting x&y... [ OK ] - * deleting LUT... [ OK ] - * reseting GPU... 
[ OK ] -``` \ No newline at end of file From 2b613408bddddc83869cfd9e3aef2f4dab0481e3 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Mon, 4 Jan 2021 23:01:06 -0600 Subject: [PATCH 153/190] Minor cleanup --- setup.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/setup.py b/setup.py index fc363112..98fa2fb9 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,9 @@ -#from distutils.core import setup, Extension -#from Cython.Distutils import build_ext -#from Cython.Build import cythonize -#import numpy -#import amico from setuptools import Extension, setup from setuptools.command.build_ext import build_ext import os from os.path import join as pjoin -# taken from npcuda +# taken from https://github.com/rmcgibbo/npcuda-example/blob/master/cython/setup.py def find_in_path(name, path): """Find a file in a search path""" From ea871aac0c4467d2a316de992126b4efbceea456 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 5 Jan 2021 16:43:56 -0600 Subject: [PATCH 154/190] Minor cleanup --- LICENSE | 66 +- MANIFEST.in | 12 +- README.md | 60 +- commit/core.pyx | 35 +- extras/COMMIT_debugger/OPENGL_callbacks.cxx | 2264 +++++++++---------- extras/COMMIT_debugger/OPENGL_utils.h | 190 +- requirements.txt | 10 +- setup.cfg | 10 +- setup.py | 17 - 9 files changed, 1326 insertions(+), 1338 deletions(-) diff --git a/LICENSE b/LICENSE index 04e0c652..70808f61 100644 --- a/LICENSE +++ b/LICENSE @@ -1,33 +1,33 @@ -Unless otherwise specified by LICENSE.txt files in individual -directories, or within individual files or functions, all code is: - -Copyright (c) 2008-2020, COMMIT developers -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the COMMIT developers nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Unless otherwise specified by LICENSE.txt files in individual +directories, or within individual files or functions, all code is: + +Copyright (c) 2008-2020, COMMIT developers +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the COMMIT developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in index fa48479d..d3b5c5b7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ -include README.md -include LICENSE - -recursive-include commit *.h -recursive-include commit *.cpp -recursive-include commit *.pyx +include README.md +include LICENSE + +recursive-include commit *.h +recursive-include commit *.cpp +recursive-include commit *.pyx recursive-include commit *.c \ No newline at end of file diff --git a/README.md b/README.md index cdd2cb13..78bc5128 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ -# COMMIT - -The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. - -Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. - -These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. - - -## Main features - -- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). 
-- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. -- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. -- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). -- **Soon**: **GPU implementation** for even faster model fitting. - - -## Documentation - -More information/documentation, as well as a series of tutorials, can be found in the [wiki pages](https://github.com/daducci/COMMIT/wiki/Home). - -### Installation - -To install COMMIT, refer to the [installation guide](https://github.com/daducci/COMMIT/wiki/Installation). - -### Getting started - -To get started with the COMMIT framework, have a look at [this tutorial](https://github.com/daducci/COMMIT/wiki/Getting-started), which will guide you through the main steps of the processing. - +# COMMIT + +The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. + +Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. + +These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. + + +## Main features + +- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). +- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. +- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. +- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). +- **Soon**: **GPU implementation** for even faster model fitting. + + +## Documentation + +More information/documentation, as well as a series of tutorials, can be found in the [wiki pages](https://github.com/daducci/COMMIT/wiki/Home). + +### Installation + +To install COMMIT, refer to the [installation guide](https://github.com/daducci/COMMIT/wiki/Installation). 
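+
+### Enabling GPU acceleration (optional)
+
+If the CUDA toolkit is installed (see the installation guide above), COMMIT can build its linear operator **A** on an NVIDIA GPU instead of using CPU threads. Below is a minimal sketch of the intended usage, assuming an `Evaluation` object `mit` has already been created and the data/dictionary loaded as in the Getting started tutorial:
+
+```python
+mit.set_threads( nthreads=0 )    # nthreads=0 enables the CUDA code path instead of CPU threads
+# mit.set_threads( nthreads=0, select_gpu=1 )   # optionally select a specific GPU by its ID
+mit.build_operator()             # the operator A is now built on the GPU
+
+# ... set up the solver and run the fit as usual ...
+
+mit.A.destroy()                  # clear the GPU memory and reset the GPU when done
+```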
+ +### Getting started + +To get started with the COMMIT framework, have a look at [this tutorial](https://github.com/daducci/COMMIT/wiki/Getting-started), which will guide you through the main steps of the processing. + diff --git a/commit/core.pyx b/commit/core.pyx index 21735962..a9895b48 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -499,6 +499,9 @@ cdef class Evaluation : nthreads : integer Number of threads to use (nthreads = None ---> all the CPU threads available in the system nthreads = 0 ---> enable CUDA GPU acceleration) + select_gpu : integer + GPU ID of the Nvidia GPU where COMMIT will be executed, default=0 and it is only required if nthreads=0 + (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') """ if nthreads is None : # Set to the number of CPUs in the system @@ -509,7 +512,7 @@ cdef class Evaluation : nthreads = 1 if nthreads < 0 or nthreads > 255 : - raise RuntimeError( 'Number of threads must be between 0 and 255' ) + ERROR( 'Number of threads must be between 0 and 255' ) if self.DICTIONARY is None : ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) if self.KERNELS is None : @@ -517,13 +520,8 @@ cdef class Evaluation : self.THREADS = {} self.THREADS['n'] = nthreads - self.THREADS['IC'] = None - self.THREADS['EC'] = None - self.THREADS['ISO'] = None - self.THREADS['ICt'] = None - self.THREADS['ECt'] = None - self.THREADS['ISOt'] = None - self.THREADS['GPUID'] = select_gpu + if nthreads == 0: + self.THREADS['GPUID'] = select_gpu cdef : long [:] C @@ -533,13 +531,20 @@ cdef class Evaluation : tic = time.time() if nthreads > 0: - print( '\n-> Distributing workload to different threads:' ) + LOG( '\n-> Distributing workload to different threads:' ) print( '\t* number of threads : %d' % nthreads ) # Distribute load for the computation of A*x product - print( '\t* A operator... ', end="" ) + print( '\t* A operator... ', end='' ) sys.stdout.flush() + self.THREADS['IC'] = None + self.THREADS['EC'] = None + self.THREADS['ISO'] = None + self.THREADS['ICt'] = None + self.THREADS['ECt'] = None + self.THREADS['ISOt'] = None + if self.DICTIONARY['IC']['n'] > 0 : self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) if nthreads > 1 : @@ -558,7 +563,7 @@ cdef class Evaluation : # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : self.THREADS = None - raise RuntimeError( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) + ERROR( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) if self.DICTIONARY['EC']['nE'] > 0 : self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) @@ -569,7 +574,7 @@ cdef class Evaluation : # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) if self.DICTIONARY['nV'] > 0 : self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) @@ -580,7 +585,7 @@ cdef class Evaluation : # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' 
) + ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) print( '[ OK ]' ) @@ -617,7 +622,7 @@ cdef class Evaluation : # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : self.THREADS = None - raise RuntimeError( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) if self.DICTIONARY['nV'] > 0 : self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) @@ -629,7 +634,7 @@ cdef class Evaluation : # check if some threads are not assigned any segment if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : self.THREADS = None - raise RuntimeError( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) print( '[ OK ]' ) diff --git a/extras/COMMIT_debugger/OPENGL_callbacks.cxx b/extras/COMMIT_debugger/OPENGL_callbacks.cxx index fcf4bca3..e90e9c08 100755 --- a/extras/COMMIT_debugger/OPENGL_callbacks.cxx +++ b/extras/COMMIT_debugger/OPENGL_callbacks.cxx @@ -1,1132 +1,1132 @@ -#define GL_GLEXT_PROTOTYPES 1 -#ifdef __APPLE__ - #include - #include - #include -#else - #include - #include - #include -#endif - -#include "OPENGL_utils.h" -using namespace OPENGL_utils; - -/* global variables */ -GLfloat id[16], rot[16], rot1[16], rot2[16], rot3[16]; -Vec3Df translation; -Vec3Di start; -GLint moving; -GLfloat zoom; - -float ScreenX, ScreenY; - - -void drawString( const char *string ) -{ - static int y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; - if ( string=="" ) - y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; - else - { - glRasterPos2i(10, y); - for (const char* c=string; *c != '\0'; c++) - glutBitmapCharacter(GLUT_BITMAP_9_BY_15, *c); - y -= 18; - } -} - - -void PrintConfig() -{ - if ( !showConfig ) - return; - - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadIdentity(); - glMatrixMode( GL_MODELVIEW ) ; - glPushMatrix() ; - glLoadIdentity() ; - int w = glutGet( GLUT_WINDOW_WIDTH ); - int h = glutGet( GLUT_WINDOW_HEIGHT ); - glOrtho( 0, w, 0, h, -1, 1 ); - glDisable( GL_DEPTH_TEST ); - - char s[1024]; - glColor3f(1, 1, 0); - drawString( "" ); // reset initial position - - drawString( "MAP" ); - sprintf( s, " - value(%d,%d,%d) = %.2f", VOXEL.x, VOXEL.y, VOXEL.z, MAP(VOXEL.x, VOXEL.y, VOXEL.z) ); - drawString( s ); - sprintf( s, " - range = [ %.1f ... 
%.1f ]", MAP_min_view, MAP_max_view ); - drawString( s ); - sprintf( s, " - opacity = %.1f", MAP_opacity ); - drawString( s ); - - drawString( "SIGNAL" ); - sprintf( s, " - shell = %d/%d (b=%.1f)", GLYPHS_shell+1, SCHEME_shells_b.size(), SCHEME_shells_b[GLYPHS_shell] ); - drawString( s ); - sprintf( s, " - use affine = %s", GLYPHS_use_affine?"true":"false" ); - drawString( s ); - sprintf( s, " - flip = [ %d, %d, %d ]", GLYPHS_flip[0], GLYPHS_flip[1], GLYPHS_flip[2] ); - drawString( s ); - sprintf( s, " - b0 thr = %.1f", GLYPHS_b0_thr ); - drawString( s ); - - if ( PEAKS_n>0 ) - { - drawString( "PEAKS" ); - sprintf( s, " - use affine = %s", PEAKS_use_affine?"true":"false" ); - drawString( s ); - sprintf( s, " - flip = [ %d, %d, %d ]", PEAKS_flip[0], PEAKS_flip[1], PEAKS_flip[2] ); - drawString( s ); - sprintf( s, " - thr = %.1f", PEAKS_thr ); - drawString( s ); - sprintf( s, " - normalize = %s", PEAKS_doNormalize?"true":"false" ); - drawString( s ); - } - - if ( TRK_nTractsPlotted>0 ) - { - drawString( "FIBERS" ); - sprintf( s, " - shift = [ %.1f %.1f %.1f ] (voxels)", TRK_offset.x, TRK_offset.y, TRK_offset.z ); - drawString( s ); - sprintf( s, " - slab thickness = %.1f (voxels)", TRK_crop ); - drawString( s ); - } - - glEnable (GL_DEPTH_TEST); - glMatrixMode(GL_PROJECTION); - glPopMatrix(); - glMatrixMode(GL_MODELVIEW); - glPopMatrix(); -} - - -// KEYBOARD callback -// ----------------- -void GLUT__keyboard( unsigned char key, GLint x=0, GLint y=0 ) -{ - bool doRedraw = true; - - switch( key ) - { - case 'l': showConfig = 1 - showConfig; break; - - case '1': showPlane[0] = 1 - showPlane[0]; break; - case '2': showPlane[1] = 1 - showPlane[1]; break; - case '3': showPlane[2] = 1 - showPlane[2]; break; - case '4': - showPlane[0] = 1; - showPlane[1] = 0; - showPlane[2] = 0; - translation.x = translation.y = 0; - OPENGL_utils::identity(rot1); - OPENGL_utils::rotateX(rot1, 90.0, rot2); - OPENGL_utils::rotateZ(rot2, 90.0, rot); - break; - case '5': - showPlane[0] = 0; - showPlane[1] = 1; - showPlane[2] = 0; - translation.x = translation.y = 0; - OPENGL_utils::identity(rot1); - OPENGL_utils::rotateX(rot1, 90.0, rot); - break; - case '6': - showPlane[0] = 0; - showPlane[1] = 0; - showPlane[2] = 1; - translation.x = translation.y = 0; - OPENGL_utils::identity( rot ); - break; - - case '0': showAxes = 1 - showAxes; break; - case '-': zoom += 10.0; break; - case '+': zoom -= 10.0; break; - case 'm': MAP_max_view = fmaxf(0.0,MAP_max_view-MAP_max*0.05); break; - case 'M': MAP_max_view = fminf(MAP_max,MAP_max_view+MAP_max*0.05); break; - case 'o': MAP_opacity = fmaxf(0.0,MAP_opacity-0.1); break; - case 'O': MAP_opacity = fminf(1.0,MAP_opacity+0.1); break; - case 'w': LINE_width = fmaxf( 1,LINE_width-1); break; - case 'W': LINE_width = fminf(10,LINE_width+1); break; - case 'r': - showPlane[0] = showPlane[1] = showPlane[2] = 1; - translation.x = translation.y = 0; - zoom = 0; - OPENGL_utils::identity( rot ); - break; - - case 's': GLYPHS_show = 1 - GLYPHS_show; break; - case 'S': GLYPHS_shell = (GLYPHS_shell+1) % SCHEME_shells_idx.size(); break; - case 'a': GLYPHS_use_affine = 1 - GLYPHS_use_affine; break; - case 'x': GLYPHS_flip[0] = 1 - GLYPHS_flip[0]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].x *= -1; break; - case 'y': GLYPHS_flip[1] = 1 - GLYPHS_flip[1]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].y *= -1; break; - case 'z': GLYPHS_flip[2] = 1 - GLYPHS_flip[2]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].z *= -1; break; - case 'b': GLYPHS_b0_thr = 
fmaxf(0.0,GLYPHS_b0_thr-10.0); break; - case 'B': GLYPHS_b0_thr = fminf(MAP_max,GLYPHS_b0_thr+10.0); break; - - case 'p': if ( PEAKS_n>0 ) PEAKS_show = 1 - PEAKS_show; break; - case 'A': PEAKS_use_affine = 1 - PEAKS_use_affine; break; - case 'X': PEAKS_flip[0] = 1 - PEAKS_flip[0]; break; - case 'Y': PEAKS_flip[1] = 1 - PEAKS_flip[1]; break; - case 'Z': PEAKS_flip[2] = 1 - PEAKS_flip[2]; break; - case 't': PEAKS_thr = fmaxf(PEAKS_thr - 0.1, 0.0); break; - case 'T': PEAKS_thr = fminf(PEAKS_thr + 0.1, 1.0); break; - case 'n': PEAKS_doNormalize = 1 - PEAKS_doNormalize; break; - - case 'f': if ( TRK_nTractsPlotted>0 ) TRK_show = 1 - TRK_show; break; - case 'c': TRK_crop = fmaxf( 0.0,TRK_crop-0.5); break; - case 'C': TRK_crop = fminf(max(dim.x,max(dim.y,dim.z)),TRK_crop+0.5); break; - case ' ': TRK_crop_mode = 1 - TRK_crop_mode; break; - - case 'q': - case 27 : exit(0); break; - - default: doRedraw = false; - } - - if ( doRedraw ) - glutPostRedisplay(); -} - - -// MENU callback -// ------------- -void GLUT__menu( int id ) -{ - switch( id ) - { - case 0: GLUT__keyboard('q'); break; - - case 101: GLUT__keyboard('s'); break; - case 102: GLUT__keyboard('S'); break; - case 103: GLUT__keyboard('a'); break; - case 104: GLUT__keyboard('x'); break; - case 105: GLUT__keyboard('y'); break; - case 106: GLUT__keyboard('z'); break; - case 107: GLUT__keyboard('b'); break; - case 108: GLUT__keyboard('B'); break; - - case 201: GLUT__keyboard('p'); break; - case 202: GLUT__keyboard('A'); break; - case 203: GLUT__keyboard('X'); break; - case 204: GLUT__keyboard('Y'); break; - case 205: GLUT__keyboard('Z'); break; - case 206: GLUT__keyboard('t'); break; - case 207: GLUT__keyboard('T'); break; - case 208: GLUT__keyboard('n'); break; - - case 301: GLUT__keyboard('f'); break; - case 302: GLUT__keyboard('c'); break; - case 303: GLUT__keyboard('C'); break; - case 304: GLUT__keyboard(' '); break; - - case 401: GLUT__keyboard('1'); break; - case 402: GLUT__keyboard('2'); break; - case 403: GLUT__keyboard('3'); break; - case 404: GLUT__keyboard('4'); break; - case 405: GLUT__keyboard('5'); break; - case 406: GLUT__keyboard('6'); break; - case 407: GLUT__keyboard('0'); break; - case 408: GLUT__keyboard('-'); break; - case 409: GLUT__keyboard('+'); break; - case 410: GLUT__keyboard('m'); break; - case 411: GLUT__keyboard('M'); break; - case 412: GLUT__keyboard('o'); break; - case 413: GLUT__keyboard('O'); break; - case 414: GLUT__keyboard('w'); break; - case 415: GLUT__keyboard('W'); break; - case 416: GLUT__keyboard('r'); break; - case 417: GLUT__keyboard('l'); break; - } -} - - -// Create the dropdown MENU -// ------------------------ -void GLUT__createMenu() -{ - int submenu_SIGNAL_id, submenu_PEAKS_id, submenu_FIBERS_id, submenu_VIEW_id; - - submenu_SIGNAL_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[s] Show/hide", 101); - glutAddMenuEntry("[S] Change shell", 102); - glutAddMenuEntry("[a] Use affine", 103); - glutAddMenuEntry("[x] Flip X axis", 104); - glutAddMenuEntry("[y] Flip Y axis", 105); - glutAddMenuEntry("[z] Flip Z axis", 106); - glutAddMenuEntry("[b] Decrease b0 thr", 107); - glutAddMenuEntry("[B] Increase b0 thr", 108); - - if ( PEAKS_n>0 ) - { - submenu_PEAKS_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[p] Show/hide", 201); - glutAddMenuEntry("[A] Use affine", 202); - glutAddMenuEntry("[X] Flip X axis", 203); - glutAddMenuEntry("[Y] Flip Y axis", 204); - glutAddMenuEntry("[Z] Flip Z axis", 205); - glutAddMenuEntry("[t] Decrease threshold",206); - glutAddMenuEntry("[T] Increase 
threshold",207); - glutAddMenuEntry("[n] Normalize length", 208); - } - - if ( TRK_nTractsPlotted>0 ) - { - submenu_FIBERS_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[f] Show/hide", 301); - glutAddMenuEntry("[c] Decrease crop size",302); - glutAddMenuEntry("[C] Increase crop size",303); - glutAddMenuEntry("[ ] Change crop mode", 304); - } - - submenu_VIEW_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[1] Show/hide YZ plane", 401); - glutAddMenuEntry("[2] Show/hide XZ plane", 402); - glutAddMenuEntry("[3] Show/hide XY plane", 403); - glutAddMenuEntry("[4] Reset to YZ plane", 404); - glutAddMenuEntry("[5] Reset to XZ plane", 405); - glutAddMenuEntry("[6] Reset to XY plane", 406); - glutAddMenuEntry("[0] Show/hide axes", 407); - glutAddMenuEntry("[-] Decrease zoom", 408); - glutAddMenuEntry("[+] Increase zoom", 409); - glutAddMenuEntry("[m] Decrease max value", 410); - glutAddMenuEntry("[M] Increase max value", 411); - glutAddMenuEntry("[o] Decrease opacity", 412); - glutAddMenuEntry("[O] Increase opacity", 413); - glutAddMenuEntry("[t] Decrease line width",414); - glutAddMenuEntry("[T] Increase line width",415); - glutAddMenuEntry("[r] Reset view", 416); - glutAddMenuEntry("[l] Show/hide log", 417); - - int menu_id = glutCreateMenu( GLUT__menu ); - glutAddSubMenu("Signal", submenu_SIGNAL_id); - if ( PEAKS_n>0 ) - glutAddSubMenu("Peaks", submenu_PEAKS_id); - if ( TRK_nTractsPlotted>0 ) - glutAddSubMenu("Fibers", submenu_FIBERS_id); - glutAddSubMenu("View options", submenu_VIEW_id); - glutAddMenuEntry("Quit", 0); - glutAttachMenu(GLUT_RIGHT_BUTTON); -} - - -// RESHAPE callback -// ---------------- -void GLUT__reshape( GLint w, GLint h ) -{ - ScreenX = w; - ScreenY = h; - - glViewport( 0, 0, w, h ); - - glMatrixMode( GL_PROJECTION ); - glLoadIdentity(); - gluPerspective( 45.0f, ScreenX/ScreenY, 1.0f, 5000.0f ); - - glMatrixMode( GL_MODELVIEW ); - glLoadIdentity(); - gluLookAt( - 0.0, 0.0, 2.0 * max(pixdim.x*dim.x,pixdim.y*dim.y) * ScreenY/ScreenX, // eye point - 0.0, 0.0, 0.0, // reference point - 0.0, 1.0, 0.0 // up vector - ); -} - - -// SPECIALKEY callback -// ------------------- -void GLUT__specialkey( GLint key, GLint x, GLint y ) -{ - bool doRedraw = true; - GLint modif = glutGetModifiers(); - GLint ALT = modif & GLUT_ACTIVE_ALT; - GLint CTRL = modif & GLUT_ACTIVE_CTRL; - - switch( key ) - { - case GLUT_KEY_LEFT: - if ( ALT ) - TRK_offset.x -= 0.5; - else if ( CTRL ) - translation.x -= 2.0; - else - VOXEL.x--; - break; - case GLUT_KEY_RIGHT: - if ( ALT ) - TRK_offset.x += 0.5; - else if ( CTRL ) - translation.x += 2.0; - else - VOXEL.x++; - break; - case GLUT_KEY_DOWN: - if ( ALT ) - TRK_offset.y -= 0.5; - else if ( CTRL ) - translation.y -= 2.0; - else - VOXEL.y--; - break; - case GLUT_KEY_UP: - if ( ALT ) - TRK_offset.y += 0.5; - else if ( CTRL ) - translation.y += 2.0; - else - VOXEL.y++; - break; - case GLUT_KEY_PAGE_DOWN: - if ( ALT ) - TRK_offset.z -= 0.5; - else - VOXEL.z--; - break; - case GLUT_KEY_PAGE_UP: - if ( ALT ) - TRK_offset.z += 0.5; - else - VOXEL.z++; - break; - - default: - doRedraw = false; - } - - // check the bounds - VOXEL.x = max( VOXEL.x, 0 ); - VOXEL.y = max( VOXEL.y, 0 ); - VOXEL.z = max( VOXEL.z, 0 ); - VOXEL.x = min( VOXEL.x, dim.x-1 ); - VOXEL.y = min( VOXEL.y, dim.y-1 ); - VOXEL.z = min( VOXEL.z, dim.z-1 ); - - if ( doRedraw ) - glutPostRedisplay(); -} - - -// MOUSE callback -// -------------- -void GLUT__mouse( GLint button, GLint state, GLint x, GLint y ) -{ - if (state == GLUT_DOWN) - { - if ( button == GLUT_LEFT_BUTTON && 
glutGetModifiers() != GLUT_ACTIVE_CTRL ) - { - moving = 1; - start.x = x; - start.y = y; - } - // NOTE: does not work, issue with glutGetModifiers not getting CTRL - // else if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_CTRL ) - // { - // moving = 2; - // start.x = x; - // start.y = y; - // } - else if ( (button == GLUT_MIDDLE_BUTTON) || (button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_ALT) ) - { - moving = 3; - start.x = x; - start.y = y; - } - } - else if (state == GLUT_UP) - { - moving = 0; - } -} - - -// MOTION callback -// --------------- -void GLUT__motion( GLint x, GLint y ) -{ - if (moving==1) - { - OPENGL_utils::translate(id, 0,0,0, rot1); - - OPENGL_utils::rotateY(id,start.x-x,rot3); - OPENGL_utils::matXMat(rot,rot1,rot2); - OPENGL_utils::rotateX(id,start.y-y,rot1); - OPENGL_utils::matXMat(rot2,rot1,rot); - OPENGL_utils::matXMat(rot,rot3,rot2); - - OPENGL_utils::translate(id, 0,0,0, rot1); - OPENGL_utils::matXMat(rot2,rot1,rot); - - start.x = x; - start.y = y; - } - - else if (moving==2) - { - zoom = zoom + (y-start.y)/2.0; - start.y = y; - } - - else if (moving==3) - { - translation.x = translation.x - (start.x-x)/3.0; - translation.y = translation.y + (start.y-y)/3.0; - start.x = x; - start.y = y; - } - - glutPostRedisplay(); -} - - -// DISPLAY callback -// ---------------- -void GLUT__display( void ) -{ - glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); - - glPushMatrix(); - glTranslatef(translation.x, translation.y, -zoom); // mouse translation + zoom - glMultMatrixf(rot); // mouse rotation - glTranslatef( -pixdim.x*dim.x/2.0, -pixdim.y*dim.y/2.0, -pixdim.z*dim.z/2.0 ); // center the FOV - glScalef( pixdim.x, pixdim.y, pixdim.z ); // account for voxel size - - glEnable(GL_MULTISAMPLE_ARB); - - /* ============= */ - /* Draw the AXES */ - /* ============= */ - if ( showAxes ) - { - glLineWidth(2); - glBegin(GL_LINES); - glColor4f( 1,0,0,1); glVertex3f( 0,0,0 ); glVertex3f( 10, 0, 0 ); - glColor4f( 0,1,0,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 10, 0 ); - glColor4f( 0,0,1,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 0, 10 ); - glEnd(); - } - - /* =============== */ - /* Draw the TRACTS */ - /* =============== */ - if ( TRK_show ) - { - glPushMatrix(); - glTranslatef(TRK_offset.x, TRK_offset.y, TRK_offset.z); - - glLineWidth(1.0f); - - float *ptr = TRK_coords, *ptrc = TRK_colors; - VECTOR Vc( VOXEL.x+0.5, VOXEL.y+0.5, VOXEL.z+0.5 ); // voxel center - float thr = 0.5*TRK_crop; - for(int f=0; f < TRK_nTractsPlotted; f++) - { - glBegin(GL_LINE_STRIP); - for(int i=0; i < TRK_nPoints[f]; i++) - { - // plot segment only if it's close to center of VOXEL - if ( - ( - TRK_crop_mode && ( - ( showPlane[0] && abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) || - ( showPlane[1] && abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) || - ( showPlane[2] && abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) - ) - || - ( - !TRK_crop_mode && ( - ( abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) && - ( abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) && - ( abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) - ) - ) - { - glColor3f( ptrc[0], ptrc[1], ptrc[2] ); - glVertex3f( ptr[0], ptr[1], ptr[2] ); - } - else - { - glEnd(); - glBegin(GL_LINE_STRIP); - } - ptr += 3; - ptrc += 3; - } - glEnd(); - } - - glPopMatrix(); - } - - /* ============== */ - /* Draw the PEAKS */ - /* ============== */ - if ( PEAKS_show || GLYPHS_show ) - { - glDisable( GL_BLEND ); - glLineWidth( LINE_width ); - glPointSize( LINE_width ); - - glPushMatrix(); - glTranslatef(.5,.5,.5); - - Vec3Df dir, col; - int 
x,y,z,d,idx; - float norms[PEAKS_n], normMax, b0, w; - - // plane YZ - if ( showPlane[0] ) - { - x = (int)VOXEL.x; - for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - if ( GLYPHS_show ) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - // plane XZ - if ( showPlane[1] ) - { - y = (int)VOXEL.y; - for(x=0; ximg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = 
col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - - if ( GLYPHS_show ) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - // plane XY - if ( showPlane[2] ) - { - z = (int)VOXEL.z; - for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use 
"col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - - if( GLYPHS_show) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - glPopMatrix(); - } - - /* =================== */ - /* Draw the SCALAR MAP */ - /* =================== */ - if ( showPlane[0] || showPlane[1] || showPlane[2] ) - { - glDisable( GL_CULL_FACE ); - glEnable( GL_BLEND ); - glBlendFunc( GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA ); - - // to avoid z-fighting - glPolygonOffset( 1.0, 1.0 ); - glEnable(GL_POLYGON_OFFSET_FILL); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - - glLineWidth( 3 ); - - int x, y, z; // voxel coordinates NB: (0,0,0) -> corner of voxel - float color; - - // plane YZ - if ( showPlane[0] ) - { - glPushMatrix(); - glTranslatef(0.5,0,0); - - x = (int)VOXEL.x; - for(y=0; y + #include + #include +#else + #include + #include + #include +#endif + +#include "OPENGL_utils.h" +using namespace OPENGL_utils; + +/* global variables */ +GLfloat id[16], rot[16], rot1[16], rot2[16], rot3[16]; +Vec3Df translation; +Vec3Di start; +GLint moving; +GLfloat zoom; + +float ScreenX, ScreenY; + + +void drawString( const char *string ) +{ + static int y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; + if ( string=="" ) + y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; + else + { + glRasterPos2i(10, y); + for (const char* c=string; *c != '\0'; c++) 
+ glutBitmapCharacter(GLUT_BITMAP_9_BY_15, *c); + y -= 18; + } +} + + +void PrintConfig() +{ + if ( !showConfig ) + return; + + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + glMatrixMode( GL_MODELVIEW ) ; + glPushMatrix() ; + glLoadIdentity() ; + int w = glutGet( GLUT_WINDOW_WIDTH ); + int h = glutGet( GLUT_WINDOW_HEIGHT ); + glOrtho( 0, w, 0, h, -1, 1 ); + glDisable( GL_DEPTH_TEST ); + + char s[1024]; + glColor3f(1, 1, 0); + drawString( "" ); // reset initial position + + drawString( "MAP" ); + sprintf( s, " - value(%d,%d,%d) = %.2f", VOXEL.x, VOXEL.y, VOXEL.z, MAP(VOXEL.x, VOXEL.y, VOXEL.z) ); + drawString( s ); + sprintf( s, " - range = [ %.1f ... %.1f ]", MAP_min_view, MAP_max_view ); + drawString( s ); + sprintf( s, " - opacity = %.1f", MAP_opacity ); + drawString( s ); + + drawString( "SIGNAL" ); + sprintf( s, " - shell = %d/%d (b=%.1f)", GLYPHS_shell+1, SCHEME_shells_b.size(), SCHEME_shells_b[GLYPHS_shell] ); + drawString( s ); + sprintf( s, " - use affine = %s", GLYPHS_use_affine?"true":"false" ); + drawString( s ); + sprintf( s, " - flip = [ %d, %d, %d ]", GLYPHS_flip[0], GLYPHS_flip[1], GLYPHS_flip[2] ); + drawString( s ); + sprintf( s, " - b0 thr = %.1f", GLYPHS_b0_thr ); + drawString( s ); + + if ( PEAKS_n>0 ) + { + drawString( "PEAKS" ); + sprintf( s, " - use affine = %s", PEAKS_use_affine?"true":"false" ); + drawString( s ); + sprintf( s, " - flip = [ %d, %d, %d ]", PEAKS_flip[0], PEAKS_flip[1], PEAKS_flip[2] ); + drawString( s ); + sprintf( s, " - thr = %.1f", PEAKS_thr ); + drawString( s ); + sprintf( s, " - normalize = %s", PEAKS_doNormalize?"true":"false" ); + drawString( s ); + } + + if ( TRK_nTractsPlotted>0 ) + { + drawString( "FIBERS" ); + sprintf( s, " - shift = [ %.1f %.1f %.1f ] (voxels)", TRK_offset.x, TRK_offset.y, TRK_offset.z ); + drawString( s ); + sprintf( s, " - slab thickness = %.1f (voxels)", TRK_crop ); + drawString( s ); + } + + glEnable (GL_DEPTH_TEST); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); + glMatrixMode(GL_MODELVIEW); + glPopMatrix(); +} + + +// KEYBOARD callback +// ----------------- +void GLUT__keyboard( unsigned char key, GLint x=0, GLint y=0 ) +{ + bool doRedraw = true; + + switch( key ) + { + case 'l': showConfig = 1 - showConfig; break; + + case '1': showPlane[0] = 1 - showPlane[0]; break; + case '2': showPlane[1] = 1 - showPlane[1]; break; + case '3': showPlane[2] = 1 - showPlane[2]; break; + case '4': + showPlane[0] = 1; + showPlane[1] = 0; + showPlane[2] = 0; + translation.x = translation.y = 0; + OPENGL_utils::identity(rot1); + OPENGL_utils::rotateX(rot1, 90.0, rot2); + OPENGL_utils::rotateZ(rot2, 90.0, rot); + break; + case '5': + showPlane[0] = 0; + showPlane[1] = 1; + showPlane[2] = 0; + translation.x = translation.y = 0; + OPENGL_utils::identity(rot1); + OPENGL_utils::rotateX(rot1, 90.0, rot); + break; + case '6': + showPlane[0] = 0; + showPlane[1] = 0; + showPlane[2] = 1; + translation.x = translation.y = 0; + OPENGL_utils::identity( rot ); + break; + + case '0': showAxes = 1 - showAxes; break; + case '-': zoom += 10.0; break; + case '+': zoom -= 10.0; break; + case 'm': MAP_max_view = fmaxf(0.0,MAP_max_view-MAP_max*0.05); break; + case 'M': MAP_max_view = fminf(MAP_max,MAP_max_view+MAP_max*0.05); break; + case 'o': MAP_opacity = fmaxf(0.0,MAP_opacity-0.1); break; + case 'O': MAP_opacity = fminf(1.0,MAP_opacity+0.1); break; + case 'w': LINE_width = fmaxf( 1,LINE_width-1); break; + case 'W': LINE_width = fminf(10,LINE_width+1); break; + case 'r': + showPlane[0] = showPlane[1] = showPlane[2] = 1; + 
translation.x = translation.y = 0; + zoom = 0; + OPENGL_utils::identity( rot ); + break; + + case 's': GLYPHS_show = 1 - GLYPHS_show; break; + case 'S': GLYPHS_shell = (GLYPHS_shell+1) % SCHEME_shells_idx.size(); break; + case 'a': GLYPHS_use_affine = 1 - GLYPHS_use_affine; break; + case 'x': GLYPHS_flip[0] = 1 - GLYPHS_flip[0]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].x *= -1; break; + case 'y': GLYPHS_flip[1] = 1 - GLYPHS_flip[1]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].y *= -1; break; + case 'z': GLYPHS_flip[2] = 1 - GLYPHS_flip[2]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].z *= -1; break; + case 'b': GLYPHS_b0_thr = fmaxf(0.0,GLYPHS_b0_thr-10.0); break; + case 'B': GLYPHS_b0_thr = fminf(MAP_max,GLYPHS_b0_thr+10.0); break; + + case 'p': if ( PEAKS_n>0 ) PEAKS_show = 1 - PEAKS_show; break; + case 'A': PEAKS_use_affine = 1 - PEAKS_use_affine; break; + case 'X': PEAKS_flip[0] = 1 - PEAKS_flip[0]; break; + case 'Y': PEAKS_flip[1] = 1 - PEAKS_flip[1]; break; + case 'Z': PEAKS_flip[2] = 1 - PEAKS_flip[2]; break; + case 't': PEAKS_thr = fmaxf(PEAKS_thr - 0.1, 0.0); break; + case 'T': PEAKS_thr = fminf(PEAKS_thr + 0.1, 1.0); break; + case 'n': PEAKS_doNormalize = 1 - PEAKS_doNormalize; break; + + case 'f': if ( TRK_nTractsPlotted>0 ) TRK_show = 1 - TRK_show; break; + case 'c': TRK_crop = fmaxf( 0.0,TRK_crop-0.5); break; + case 'C': TRK_crop = fminf(max(dim.x,max(dim.y,dim.z)),TRK_crop+0.5); break; + case ' ': TRK_crop_mode = 1 - TRK_crop_mode; break; + + case 'q': + case 27 : exit(0); break; + + default: doRedraw = false; + } + + if ( doRedraw ) + glutPostRedisplay(); +} + + +// MENU callback +// ------------- +void GLUT__menu( int id ) +{ + switch( id ) + { + case 0: GLUT__keyboard('q'); break; + + case 101: GLUT__keyboard('s'); break; + case 102: GLUT__keyboard('S'); break; + case 103: GLUT__keyboard('a'); break; + case 104: GLUT__keyboard('x'); break; + case 105: GLUT__keyboard('y'); break; + case 106: GLUT__keyboard('z'); break; + case 107: GLUT__keyboard('b'); break; + case 108: GLUT__keyboard('B'); break; + + case 201: GLUT__keyboard('p'); break; + case 202: GLUT__keyboard('A'); break; + case 203: GLUT__keyboard('X'); break; + case 204: GLUT__keyboard('Y'); break; + case 205: GLUT__keyboard('Z'); break; + case 206: GLUT__keyboard('t'); break; + case 207: GLUT__keyboard('T'); break; + case 208: GLUT__keyboard('n'); break; + + case 301: GLUT__keyboard('f'); break; + case 302: GLUT__keyboard('c'); break; + case 303: GLUT__keyboard('C'); break; + case 304: GLUT__keyboard(' '); break; + + case 401: GLUT__keyboard('1'); break; + case 402: GLUT__keyboard('2'); break; + case 403: GLUT__keyboard('3'); break; + case 404: GLUT__keyboard('4'); break; + case 405: GLUT__keyboard('5'); break; + case 406: GLUT__keyboard('6'); break; + case 407: GLUT__keyboard('0'); break; + case 408: GLUT__keyboard('-'); break; + case 409: GLUT__keyboard('+'); break; + case 410: GLUT__keyboard('m'); break; + case 411: GLUT__keyboard('M'); break; + case 412: GLUT__keyboard('o'); break; + case 413: GLUT__keyboard('O'); break; + case 414: GLUT__keyboard('w'); break; + case 415: GLUT__keyboard('W'); break; + case 416: GLUT__keyboard('r'); break; + case 417: GLUT__keyboard('l'); break; + } +} + + +// Create the dropdown MENU +// ------------------------ +void GLUT__createMenu() +{ + int submenu_SIGNAL_id, submenu_PEAKS_id, submenu_FIBERS_id, submenu_VIEW_id; + + submenu_SIGNAL_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[s] Show/hide", 101); + glutAddMenuEntry("[S] 
Change shell", 102); + glutAddMenuEntry("[a] Use affine", 103); + glutAddMenuEntry("[x] Flip X axis", 104); + glutAddMenuEntry("[y] Flip Y axis", 105); + glutAddMenuEntry("[z] Flip Z axis", 106); + glutAddMenuEntry("[b] Decrease b0 thr", 107); + glutAddMenuEntry("[B] Increase b0 thr", 108); + + if ( PEAKS_n>0 ) + { + submenu_PEAKS_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[p] Show/hide", 201); + glutAddMenuEntry("[A] Use affine", 202); + glutAddMenuEntry("[X] Flip X axis", 203); + glutAddMenuEntry("[Y] Flip Y axis", 204); + glutAddMenuEntry("[Z] Flip Z axis", 205); + glutAddMenuEntry("[t] Decrease threshold",206); + glutAddMenuEntry("[T] Increase threshold",207); + glutAddMenuEntry("[n] Normalize length", 208); + } + + if ( TRK_nTractsPlotted>0 ) + { + submenu_FIBERS_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[f] Show/hide", 301); + glutAddMenuEntry("[c] Decrease crop size",302); + glutAddMenuEntry("[C] Increase crop size",303); + glutAddMenuEntry("[ ] Change crop mode", 304); + } + + submenu_VIEW_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[1] Show/hide YZ plane", 401); + glutAddMenuEntry("[2] Show/hide XZ plane", 402); + glutAddMenuEntry("[3] Show/hide XY plane", 403); + glutAddMenuEntry("[4] Reset to YZ plane", 404); + glutAddMenuEntry("[5] Reset to XZ plane", 405); + glutAddMenuEntry("[6] Reset to XY plane", 406); + glutAddMenuEntry("[0] Show/hide axes", 407); + glutAddMenuEntry("[-] Decrease zoom", 408); + glutAddMenuEntry("[+] Increase zoom", 409); + glutAddMenuEntry("[m] Decrease max value", 410); + glutAddMenuEntry("[M] Increase max value", 411); + glutAddMenuEntry("[o] Decrease opacity", 412); + glutAddMenuEntry("[O] Increase opacity", 413); + glutAddMenuEntry("[t] Decrease line width",414); + glutAddMenuEntry("[T] Increase line width",415); + glutAddMenuEntry("[r] Reset view", 416); + glutAddMenuEntry("[l] Show/hide log", 417); + + int menu_id = glutCreateMenu( GLUT__menu ); + glutAddSubMenu("Signal", submenu_SIGNAL_id); + if ( PEAKS_n>0 ) + glutAddSubMenu("Peaks", submenu_PEAKS_id); + if ( TRK_nTractsPlotted>0 ) + glutAddSubMenu("Fibers", submenu_FIBERS_id); + glutAddSubMenu("View options", submenu_VIEW_id); + glutAddMenuEntry("Quit", 0); + glutAttachMenu(GLUT_RIGHT_BUTTON); +} + + +// RESHAPE callback +// ---------------- +void GLUT__reshape( GLint w, GLint h ) +{ + ScreenX = w; + ScreenY = h; + + glViewport( 0, 0, w, h ); + + glMatrixMode( GL_PROJECTION ); + glLoadIdentity(); + gluPerspective( 45.0f, ScreenX/ScreenY, 1.0f, 5000.0f ); + + glMatrixMode( GL_MODELVIEW ); + glLoadIdentity(); + gluLookAt( + 0.0, 0.0, 2.0 * max(pixdim.x*dim.x,pixdim.y*dim.y) * ScreenY/ScreenX, // eye point + 0.0, 0.0, 0.0, // reference point + 0.0, 1.0, 0.0 // up vector + ); +} + + +// SPECIALKEY callback +// ------------------- +void GLUT__specialkey( GLint key, GLint x, GLint y ) +{ + bool doRedraw = true; + GLint modif = glutGetModifiers(); + GLint ALT = modif & GLUT_ACTIVE_ALT; + GLint CTRL = modif & GLUT_ACTIVE_CTRL; + + switch( key ) + { + case GLUT_KEY_LEFT: + if ( ALT ) + TRK_offset.x -= 0.5; + else if ( CTRL ) + translation.x -= 2.0; + else + VOXEL.x--; + break; + case GLUT_KEY_RIGHT: + if ( ALT ) + TRK_offset.x += 0.5; + else if ( CTRL ) + translation.x += 2.0; + else + VOXEL.x++; + break; + case GLUT_KEY_DOWN: + if ( ALT ) + TRK_offset.y -= 0.5; + else if ( CTRL ) + translation.y -= 2.0; + else + VOXEL.y--; + break; + case GLUT_KEY_UP: + if ( ALT ) + TRK_offset.y += 0.5; + else if ( CTRL ) + translation.y += 2.0; + else + VOXEL.y++; + break; + case 
GLUT_KEY_PAGE_DOWN: + if ( ALT ) + TRK_offset.z -= 0.5; + else + VOXEL.z--; + break; + case GLUT_KEY_PAGE_UP: + if ( ALT ) + TRK_offset.z += 0.5; + else + VOXEL.z++; + break; + + default: + doRedraw = false; + } + + // check the bounds + VOXEL.x = max( VOXEL.x, 0 ); + VOXEL.y = max( VOXEL.y, 0 ); + VOXEL.z = max( VOXEL.z, 0 ); + VOXEL.x = min( VOXEL.x, dim.x-1 ); + VOXEL.y = min( VOXEL.y, dim.y-1 ); + VOXEL.z = min( VOXEL.z, dim.z-1 ); + + if ( doRedraw ) + glutPostRedisplay(); +} + + +// MOUSE callback +// -------------- +void GLUT__mouse( GLint button, GLint state, GLint x, GLint y ) +{ + if (state == GLUT_DOWN) + { + if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() != GLUT_ACTIVE_CTRL ) + { + moving = 1; + start.x = x; + start.y = y; + } + // NOTE: does not work, issue with glutGetModifiers not getting CTRL + // else if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_CTRL ) + // { + // moving = 2; + // start.x = x; + // start.y = y; + // } + else if ( (button == GLUT_MIDDLE_BUTTON) || (button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_ALT) ) + { + moving = 3; + start.x = x; + start.y = y; + } + } + else if (state == GLUT_UP) + { + moving = 0; + } +} + + +// MOTION callback +// --------------- +void GLUT__motion( GLint x, GLint y ) +{ + if (moving==1) + { + OPENGL_utils::translate(id, 0,0,0, rot1); + + OPENGL_utils::rotateY(id,start.x-x,rot3); + OPENGL_utils::matXMat(rot,rot1,rot2); + OPENGL_utils::rotateX(id,start.y-y,rot1); + OPENGL_utils::matXMat(rot2,rot1,rot); + OPENGL_utils::matXMat(rot,rot3,rot2); + + OPENGL_utils::translate(id, 0,0,0, rot1); + OPENGL_utils::matXMat(rot2,rot1,rot); + + start.x = x; + start.y = y; + } + + else if (moving==2) + { + zoom = zoom + (y-start.y)/2.0; + start.y = y; + } + + else if (moving==3) + { + translation.x = translation.x - (start.x-x)/3.0; + translation.y = translation.y + (start.y-y)/3.0; + start.x = x; + start.y = y; + } + + glutPostRedisplay(); +} + + +// DISPLAY callback +// ---------------- +void GLUT__display( void ) +{ + glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); + + glPushMatrix(); + glTranslatef(translation.x, translation.y, -zoom); // mouse translation + zoom + glMultMatrixf(rot); // mouse rotation + glTranslatef( -pixdim.x*dim.x/2.0, -pixdim.y*dim.y/2.0, -pixdim.z*dim.z/2.0 ); // center the FOV + glScalef( pixdim.x, pixdim.y, pixdim.z ); // account for voxel size + + glEnable(GL_MULTISAMPLE_ARB); + + /* ============= */ + /* Draw the AXES */ + /* ============= */ + if ( showAxes ) + { + glLineWidth(2); + glBegin(GL_LINES); + glColor4f( 1,0,0,1); glVertex3f( 0,0,0 ); glVertex3f( 10, 0, 0 ); + glColor4f( 0,1,0,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 10, 0 ); + glColor4f( 0,0,1,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 0, 10 ); + glEnd(); + } + + /* =============== */ + /* Draw the TRACTS */ + /* =============== */ + if ( TRK_show ) + { + glPushMatrix(); + glTranslatef(TRK_offset.x, TRK_offset.y, TRK_offset.z); + + glLineWidth(1.0f); + + float *ptr = TRK_coords, *ptrc = TRK_colors; + VECTOR Vc( VOXEL.x+0.5, VOXEL.y+0.5, VOXEL.z+0.5 ); // voxel center + float thr = 0.5*TRK_crop; + for(int f=0; f < TRK_nTractsPlotted; f++) + { + glBegin(GL_LINE_STRIP); + for(int i=0; i < TRK_nPoints[f]; i++) + { + // plot segment only if it's close to center of VOXEL + if ( + ( + TRK_crop_mode && ( + ( showPlane[0] && abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) || + ( showPlane[1] && abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) || + ( showPlane[2] && abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) + ) + || + ( + 
!TRK_crop_mode && ( + ( abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) && + ( abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) && + ( abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) + ) + ) + { + glColor3f( ptrc[0], ptrc[1], ptrc[2] ); + glVertex3f( ptr[0], ptr[1], ptr[2] ); + } + else + { + glEnd(); + glBegin(GL_LINE_STRIP); + } + ptr += 3; + ptrc += 3; + } + glEnd(); + } + + glPopMatrix(); + } + + /* ============== */ + /* Draw the PEAKS */ + /* ============== */ + if ( PEAKS_show || GLYPHS_show ) + { + glDisable( GL_BLEND ); + glLineWidth( LINE_width ); + glPointSize( LINE_width ); + + glPushMatrix(); + glTranslatef(.5,.5,.5); + + Vec3Df dir, col; + int x,y,z,d,idx; + float norms[PEAKS_n], normMax, b0, w; + + // plane YZ + if ( showPlane[0] ) + { + x = (int)VOXEL.x; + for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + if ( GLYPHS_show ) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * 
SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + // plane XZ + if ( showPlane[1] ) + { + y = (int)VOXEL.y; + for(x=0; ximg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + + if ( GLYPHS_show ) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + // plane XY + if ( showPlane[2] ) + { + z = (int)VOXEL.z; + for(y=0; yimg)(x,y,z,3*d+0); // use 
"col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + + if( GLYPHS_show) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + glPopMatrix(); + } + + /* =================== */ + /* Draw the SCALAR MAP */ + /* =================== */ + if ( showPlane[0] || showPlane[1] || showPlane[2] ) + { + glDisable( GL_CULL_FACE ); + glEnable( GL_BLEND ); + glBlendFunc( GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA ); + + // to avoid z-fighting + glPolygonOffset( 1.0, 1.0 ); + glEnable(GL_POLYGON_OFFSET_FILL); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + + glLineWidth( 3 ); + + int x, y, z; // voxel coordinates NB: (0,0,0) -> corner of voxel + float 
color; + + // plane YZ + if ( showPlane[0] ) + { + glPushMatrix(); + glTranslatef(0.5,0,0); + + x = (int)VOXEL.x; + for(y=0; y - -#include "VECTOR.h" -typedef VECTOR Vec3Di; -typedef VECTOR Vec3Df; - - -namespace OPENGL_utils -{ - -void identity(GLfloat* result) -{ - for (int i=0; i<4; i++) - for (int j=0; j<4; j++) - if (i==j) result[4*i+j]=1; else result[4*i+j]=0; -} - - -void matXMat(GLfloat* m, GLfloat* m1, GLfloat* result) -{ - for (int i=0; i<4; i++) - for (int j=0; j<4; j++) - { - result[4*i+j]=0; - for (int t=0; t<4; t++) - result[4*i+j]=result[4*i+j]+m[4*i+t]*m1[4*t+j]; - } -} - - -void rotateZ(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = cos(ang/180*3.1415); - matrix[5] = cos(ang/180*3.1415); - matrix[1] = -sin(ang/180*3.1415); - matrix[4] = sin(ang/180*3.1415); - matrix[10] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void rotateY(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = cos(ang/180*3.1415); - matrix[10] = cos(ang/180*3.1415); - matrix[8] = -sin(ang/180*3.1415); - matrix[2] = sin(ang/180*3.1415); - matrix[5] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void rotateX(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[5] = cos(ang/180*3.1415); - matrix[10] = cos(ang/180*3.1415); - matrix[6] = -sin(ang/180*3.1415); - matrix[9] = sin(ang/180*3.1415); - matrix[0] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void translate(GLfloat* m, GLfloat x,GLfloat y,GLfloat z, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = 1; - matrix[5] = 1; - matrix[10] = 1; - matrix[15] = 1; - matrix[12] = x; - matrix[13] = y; - matrix[14] = z; - matXMat(matrix,m,result); -} - -} -#endif +#ifndef __OPENGL_UTILS_H__ +#define __OPENGL_UTILS_H__ + +#include + +#include "VECTOR.h" +typedef VECTOR Vec3Di; +typedef VECTOR Vec3Df; + + +namespace OPENGL_utils +{ + +void identity(GLfloat* result) +{ + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) + if (i==j) result[4*i+j]=1; else result[4*i+j]=0; +} + + +void matXMat(GLfloat* m, GLfloat* m1, GLfloat* result) +{ + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) + { + result[4*i+j]=0; + for (int t=0; t<4; t++) + result[4*i+j]=result[4*i+j]+m[4*i+t]*m1[4*t+j]; + } +} + + +void rotateZ(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = cos(ang/180*3.1415); + matrix[5] = cos(ang/180*3.1415); + matrix[1] = -sin(ang/180*3.1415); + matrix[4] = sin(ang/180*3.1415); + matrix[10] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void rotateY(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = cos(ang/180*3.1415); + matrix[10] = cos(ang/180*3.1415); + matrix[8] = -sin(ang/180*3.1415); + matrix[2] = sin(ang/180*3.1415); + matrix[5] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void rotateX(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[5] = cos(ang/180*3.1415); + matrix[10] = cos(ang/180*3.1415); + matrix[6] = -sin(ang/180*3.1415); + matrix[9] = sin(ang/180*3.1415); + matrix[0] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void translate(GLfloat* m, 
GLfloat x,GLfloat y,GLfloat z, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = 1; + matrix[5] = 1; + matrix[10] = 1; + matrix[15] = 1; + matrix[12] = x; + matrix[13] = y; + matrix[14] = z; + matXMat(matrix,m,result); +} + +} +#endif diff --git a/requirements.txt b/requirements.txt index 9234880c..1c03d182 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -Cython>=0.29 -dipy>=1.0 -dmri-amico>=1.2.3 -numpy>=1.12 -setuptools>=46.1 +Cython>=0.29 +dipy>=1.0 +dmri-amico>=1.2.3 +numpy>=1.12 +setuptools>=46.1 diff --git a/setup.cfg b/setup.cfg index a96a1715..3463cc53 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ -# Inside of setup.cfg -[metadata] -description-file = README.md - -[bdist_wheel] +# Inside of setup.cfg +[metadata] +description-file = README.md + +[bdist_wheel] universal = 1 \ No newline at end of file diff --git a/setup.py b/setup.py index 98fa2fb9..325ca04d 100644 --- a/setup.py +++ b/setup.py @@ -104,24 +104,11 @@ def get_extensions(): extra_compile_args=['-w'], language='c++') - """if CUDA != None: - ext4 = Extension(name='commit.cudaoperator', - sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_30', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - language = 'c++', - library_dirs = [CUDA['lib64']], - libraries = ['cudart'], - runtime_library_dirs = [CUDA['lib64']]) - - return [ext1, ext2, ext3, ext4]""" - return [ext1, ext2, ext3] def get_extensions_with_cuda(): # Cython extension to create the sparse data structure from a tractogram # for the computation of matrix-vector multiplications - from numpy import get_include ext1 = Extension(name='commit.trk2dictionary', sources=['commit/trk2dictionary/trk2dictionary.pyx'], @@ -129,7 +116,6 @@ def get_extensions_with_cuda(): 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++') - #include_dirs = [get_include]) ext2 = Extension(name='commit.core', sources=['commit/core.pyx'], @@ -137,7 +123,6 @@ def get_extensions_with_cuda(): 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++') - #include_dirs = [get_include]) ext3 = Extension(name='commit.proximals', sources=['commit/proximals.pyx'], @@ -145,14 +130,12 @@ def get_extensions_with_cuda(): 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, extra_link_args=[], language='c++') - #include_dirs = [get_include]) ext4 = Extension(name='commit.cudaoperator', sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, language = 'c++', - #include_dirs = [get_include, CUDA['include']], library_dirs = [CUDA['lib64']], libraries = ['cudart'], runtime_library_dirs = [CUDA['lib64']]) From 5142c490b8bb07502ae9bd94c3115d446fe34446 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 7 Jan 2021 22:42:42 -0600 Subject: [PATCH 155/190] Minor Cleanup --- commit/core.pyx | 4 ++-- commit/operator/operator.pyxbld | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index a9895b48..d0d8b175 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -408,7 +408,7 @@ cdef class Evaluation : self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size 
self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size
- # reorder the segments based on the "v" field
+ # reorder the segments based first on the "v" field and then on the "o" field
 idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] )
 self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ]
 self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ]
@@ -439,7 +439,7 @@ cdef class Evaluation :
 self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 )
 self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size
- # reorder the segments based on the "v" field
+ # reorder the segments based first on the "v" field and then on the "o" field
 idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] )
 self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ]
 self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ]
diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld
index 44cab311..03b4310f 100755
--- a/commit/operator/operator.pyxbld
+++ b/commit/operator/operator.pyxbld
@@ -11,7 +11,7 @@ from commit.operator import config
 def make_ext(modname, pyxfilename):
 if (config.nTHREADS is None or config.nTHREADS < 0 or config.nTHREADS > 255):
- raise RuntimeError('config.nTHREADS must be between 1 and 255')
+ raise RuntimeError('config.nTHREADS must be between 0 and 255')
 if (config.nIC is None or config.nIC < 0 or config.nIC > 20):
 raise RuntimeError('config.nIC must be in the range [0..20]')
 if (config.nEC is None or config.nEC < 0 or config.nEC > 20):
From 1deff30350340f2e4ea65516142433c3b677a5d6 Mon Sep 17 00:00:00 2001
From: ErickHernandezGutierrez
Date: Tue, 12 Jan 2021 22:29:01 -0600
Subject: [PATCH 156/190] Solving Bug

---
 commit/operator_withCUDA.cu | 28 ++++++++++++++++++++++++----
 commit/operator_withCUDA.cuh | 2 ++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu
index 612015b3..dcb447bf 100644
--- a/commit/operator_withCUDA.cu
+++ b/commit/operator_withCUDA.cu
@@ -53,6 +53,16 @@ bool checkCompatibility(size_t required_mem, int gpu_id) {
 }
 }
 
+void cudaCheckLastError()
+{
+ cudaError_t err = cudaGetLastError();
+
+ if(err != cudaSuccess){
+ printf("CUDA Error: %s\n", cudaGetErrorString(err));
+ exit(-1);
+ }
+}
+
 void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){
 
 // fill arrays with zeros
@@ -128,7 +138,7 @@ CudaLinearOperator::CudaLinearOperator(
 cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) );
 cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) );
 if (cudaStatus) printf("[ OK ]\n");
- else printf("[ CUDA ERROR ]\n");
+ else cudaError = 1;
 
 // alloc memory in GPU for vectors x and y
 printf("\t* vectors x&y ... ");
@@ -136,7 +146,7 @@ CudaLinearOperator::CudaLinearOperator(
 cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) );
 cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) );
 if (cudaStatus) printf("[ OK ]\n");
- else printf("[ CUDA ERROR ]\n");
+ else cudaError = 2;
 
 // pre-process data for GPU
 printf("\t* pre-processing ... 
"); @@ -165,7 +175,7 @@ CudaLinearOperator::CudaLinearOperator( free(segmentsPerBlock); free(offsetPerBlock); if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + else cudaError = 3; // alloc and transfer LUTs printf("\t* loading LUTs ... "); @@ -208,7 +218,7 @@ CudaLinearOperator::CudaLinearOperator( } if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + else cudaError = 4; // alloc and transfer operator A @@ -351,36 +361,46 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ // Copy vector x to the GPU cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + cudaCheckLastError(); // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + cudaCheckLastError(); // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + cudaCheckLastError(); // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + cudaCheckLastError(); // Copy back result to CPU cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + cudaCheckLastError(); } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + cudaCheckLastError(); // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + cudaCheckLastError(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + cudaCheckLastError(); // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + cudaCheckLastError(); // Copy back result to CPU cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + cudaCheckLastError(); } // ------------------------------------------------------- KERNELS ------------------------------------------------------- // diff --git a/commit/operator_withCUDA.cuh b/commit/operator_withCUDA.cuh index 2894bb9c..c66d07c1 100644 --- a/commit/operator_withCUDA.cuh +++ b/commit/operator_withCUDA.cuh @@ -19,6 +19,7 @@ typedef double float64_t; // ==================================================== bool cudaCheck(cudaError_t cudaStatus); bool checkCompatibility(size_t required_mem, int gpu_id); +void cudaCheckLastError(); // ==================================================== // Function to preprocess data for GPU @@ -153,6 +154,7 @@ class CudaLinearOperator { // CUDA GPU status bool cudaStatus; + int cudaError; public: CudaLinearOperator( From f84b4eede9a6145c65ca2769081eef7083ec82a2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 12 Jan 2021 23:44:20 -0600 Subject: [PATCH 157/190] Solving Bug --- commit/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index dcb447bf..035625e5 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -111,7 +111,7 @@ CudaLinearOperator::CudaLinearOperator( this->nvoxels = nvoxels; this->nfibers = nfibers; this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + nzeppelins*nzeppelins + nvoxels*nballs; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; 
if (fcall == 1) { int size_lutic = ndiameters*norientations*nsamples; From 083c1eeeb01e916e504b6540393a34d0239bd0cd Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 13 Jan 2021 16:00:30 -0600 Subject: [PATCH 158/190] Avoid compilation of CPU version when GPU version is selected --- commit/core.pyx | 84 +++++++++++++++--------------- commit/operator/operator.pyxbld | 4 +- commit/operator/operator_noLUT.c | 2 +- commit/operator/operator_withLUT.c | 2 +- commit/operator_withCUDA.cu | 20 +++---- 5 files changed, 56 insertions(+), 56 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index d0d8b175..9c1da40f 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -670,50 +670,50 @@ cdef class Evaluation : tic = time.time() LOG( '\n-> Building linear operator A:' ) - # need to pass these parameters at runtime for compiling the C code - from commit.operator import config + if self.THREADS['n'] > 0: + # need to pass these parameters at runtime for compiling the C code + from commit.operator import config - compilation_is_needed = False - - if config.nTHREADS is None or config.nTHREADS != self.THREADS['n']: - compilation_is_needed = True - if config.nIC is None or config.nIC != self.KERNELS['wmr'].shape[0]: - compilation_is_needed = True - if config.model is None or config.model != self.model.id: - compilation_is_needed = True - if config.nEC is None or config.nEC != self.KERNELS['wmh'].shape[0]: - compilation_is_needed = True - if config.nISO is None or config.nISO != self.KERNELS['iso'].shape[0]: - compilation_is_needed = True - if config.build_dir != build_dir: - compilation_is_needed = True - - if compilation_is_needed or not 'commit.operator.operator' in sys.modules : - - if build_dir is not None: - if isdir(build_dir) and not len(listdir(build_dir)) == 0: - ERROR( '\nbuild_dir is not empty, unsafe build option.' ) - elif config.nTHREADS is not None: - ERROR( '\nThe parameter build_dir has changed, unsafe build option.' ) - else: - WARNING( '\nUsing build_dir, always quit your python console between COMMIT Evaluation.' ) - - config.nTHREADS = self.THREADS['n'] - config.model = self.model.id - config.nIC = self.KERNELS['wmr'].shape[0] - config.nEC = self.KERNELS['wmh'].shape[0] - config.nISO = self.KERNELS['iso'].shape[0] - config.build_dir = build_dir - - pyximport.install( reload_support=True, language_level=3, build_dir=build_dir, build_in_temp=True, inplace=False ) - - if not 'commit.operator.operator' in sys.modules : - import commit.operator.operator - else : - reload( sys.modules['commit.operator.operator'] ) + compilation_is_needed = False - if self.THREADS['n'] > 0: - self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + if config.nTHREADS is None or config.nTHREADS != self.THREADS['n']: + compilation_is_needed = True + if config.nIC is None or config.nIC != self.KERNELS['wmr'].shape[0]: + compilation_is_needed = True + if config.model is None or config.model != self.model.id: + compilation_is_needed = True + if config.nEC is None or config.nEC != self.KERNELS['wmh'].shape[0]: + compilation_is_needed = True + if config.nISO is None or config.nISO != self.KERNELS['iso'].shape[0]: + compilation_is_needed = True + if config.build_dir != build_dir: + compilation_is_needed = True + + if compilation_is_needed or not 'commit.operator.operator' in sys.modules : + + if build_dir is not None: + if isdir(build_dir) and not len(listdir(build_dir)) == 0: + ERROR( '\nbuild_dir is not empty, unsafe build option.' 
) + elif config.nTHREADS is not None: + ERROR( '\nThe parameter build_dir has changed, unsafe build option.' ) + else: + WARNING( '\nUsing build_dir, always quit your python console between COMMIT Evaluation.' ) + + config.nTHREADS = self.THREADS['n'] + config.model = self.model.id + config.nIC = self.KERNELS['wmr'].shape[0] + config.nEC = self.KERNELS['wmh'].shape[0] + config.nISO = self.KERNELS['iso'].shape[0] + config.build_dir = build_dir + + pyximport.install( reload_support=True, language_level=3, build_dir=build_dir, build_in_temp=True, inplace=False ) + + if not 'commit.operator.operator' in sys.modules : + import commit.operator.operator + else : + reload( sys.modules['commit.operator.operator'] ) + + self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) else: import commit.cudaoperator self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld index 03b4310f..f3967a15 100755 --- a/commit/operator/operator.pyxbld +++ b/commit/operator/operator.pyxbld @@ -10,8 +10,8 @@ from commit.operator import config def make_ext(modname, pyxfilename): - if (config.nTHREADS is None or config.nTHREADS < 0 or config.nTHREADS > 255): - raise RuntimeError('config.nTHREADS must be between 0 and 255') + if (config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255): + raise RuntimeError('config.nTHREADS must be between 1 and 255') if (config.nIC is None or config.nIC < 0 or config.nIC > 20): raise RuntimeError('config.nIC must be in the range [0..20]') if (config.nEC is None or config.nEC < 0 or config.nEC > 20): diff --git a/commit/operator/operator_noLUT.c b/commit/operator/operator_noLUT.c index d8b6706b..061ca1d1 100644 --- a/commit/operator/operator_noLUT.c +++ b/commit/operator/operator_noLUT.c @@ -3,7 +3,7 @@ // number of THREADS #ifdef nTHREADS - #if (nTHREADS<0 || nTHREADS>255) + #if (nTHREADS<1 || nTHREADS>255) #error "nTHREADS" must be in the range 0..255 #endif #else diff --git a/commit/operator/operator_withLUT.c b/commit/operator/operator_withLUT.c index 1b6fd1ae..2137d4a3 100644 --- a/commit/operator/operator_withLUT.c +++ b/commit/operator/operator_withLUT.c @@ -3,7 +3,7 @@ // number of THREADS #ifdef nTHREADS - #if (nTHREADS<0 || nTHREADS>255) + #if (nTHREADS<1 || nTHREADS>255) #error "nTHREADS" must be in the range 0..255 #endif #else diff --git a/commit/operator_withCUDA.cu b/commit/operator_withCUDA.cu index 035625e5..3d807757 100644 --- a/commit/operator_withCUDA.cu +++ b/commit/operator_withCUDA.cu @@ -361,46 +361,46 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ // Copy vector x to the GPU cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - cudaCheckLastError(); + //cudaCheckLastError(); // Multiply IC part in the GPU multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - cudaCheckLastError(); + //cudaCheckLastError(); // Multiply EC part in the GPU multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - cudaCheckLastError(); + //cudaCheckLastError(); // Multiply ISO part in the GPU multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckLastError(); + //cudaCheckLastError(); // Copy back result to CPU cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - 
cudaCheckLastError(); + //cudaCheckLastError(); } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - cudaCheckLastError(); + //cudaCheckLastError(); // Multiply IC part in the GPU multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - cudaCheckLastError(); + //cudaCheckLastError(); // Multiply EC part in the GPU multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - cudaCheckLastError(); + //cudaCheckLastError(); // Multiply ISO part in the GPU multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - cudaCheckLastError(); + //cudaCheckLastError(); // Copy back result to CPU cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - cudaCheckLastError(); + //cudaCheckLastError(); } // ------------------------------------------------------- KERNELS ------------------------------------------------------- // From b33e6683af0fa4a00ea3e6482fc421744c06b681 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 13 Jan 2021 21:28:36 -0600 Subject: [PATCH 159/190] Create cudaoperator folder --- commit/{cudaoperator.pyx => cudaoperator/operator.pyx} | 0 commit/{ => cudaoperator}/operator_withCUDA.cu | 0 commit/{ => cudaoperator}/operator_withCUDA.cuh | 0 setup.py | 6 +++--- 4 files changed, 3 insertions(+), 3 deletions(-) rename commit/{cudaoperator.pyx => cudaoperator/operator.pyx} (100%) rename commit/{ => cudaoperator}/operator_withCUDA.cu (100%) rename commit/{ => cudaoperator}/operator_withCUDA.cuh (100%) diff --git a/commit/cudaoperator.pyx b/commit/cudaoperator/operator.pyx similarity index 100% rename from commit/cudaoperator.pyx rename to commit/cudaoperator/operator.pyx diff --git a/commit/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu similarity index 100% rename from commit/operator_withCUDA.cu rename to commit/cudaoperator/operator_withCUDA.cu diff --git a/commit/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh similarity index 100% rename from commit/operator_withCUDA.cuh rename to commit/cudaoperator/operator_withCUDA.cuh diff --git a/setup.py b/setup.py index 325ca04d..b0a29135 100644 --- a/setup.py +++ b/setup.py @@ -131,8 +131,8 @@ def get_extensions_with_cuda(): extra_link_args=[], language='c++') - ext4 = Extension(name='commit.cudaoperator', - sources = ['commit/operator_withCUDA.cu', 'commit/cudaoperator.pyx'], + ext4 = Extension(name='commit.cudaoperator.operator', + sources = ['commit/cudaoperator/operator_withCUDA.cu', 'commit/cudaoperator/operator.pyx'], extra_compile_args= {'gcc': ['-w'], 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, language = 'c++', @@ -179,7 +179,7 @@ def run(self): # Add everything requires for build self.swig_opts = None - self.include_dirs = [get_include(), CUDA['include']] + self.include_dirs = [get_include(), CUDA['include'], 'commit/cudaoperator'] self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) # Call original build_ext command From c34128cd4750447a9dfdc7dc3548222a259aea98 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 13 Jan 2021 21:44:39 -0600 Subject: [PATCH 160/190] Create cudaoperator folder --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 
9c1da40f..46fc1c72 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -716,7 +716,7 @@ cdef class Evaluation : self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) else: import commit.cudaoperator - self.A = commit.cudaoperator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) + self.A = commit.cudaoperator.operator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) From aabaed0f0a4386d2bd8e0ee7c6574adb4e7ffb24 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Wed, 13 Jan 2021 21:51:30 -0600 Subject: [PATCH 161/190] Create cudaoperator folder --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 46fc1c72..f4467e8e 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -715,7 +715,7 @@ cdef class Evaluation : self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) else: - import commit.cudaoperator + import commit.cudaoperator.operator self.A = commit.cudaoperator.operator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) From a563a52151dffdcd67a7b44ea2ea9f97138c18e4 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 14 Jan 2021 00:27:29 -0600 Subject: [PATCH 162/190] Change cuda error handling --- commit/cudaoperator/operator.pyx | 24 +++++++++++++++-------- commit/cudaoperator/operator_withCUDA.cu | 14 ++++++------- commit/cudaoperator/operator_withCUDA.cuh | 2 +- commit/operator/operator.pyx | 3 ++- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 85eb3778..bf639d97 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -4,6 +4,10 @@ import cython import numpy as np cimport numpy as np +from amico.util import ERROR, LOG + +cdef extern from "operator_withCUDA.cuh": + void checkCompatibility(np.uint64_t, int) cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": @@ -63,8 +67,8 @@ cdef class CudaLinearOperator : cdef float* LUT_EC cdef float* LUT_ISO - # pointer to the operator in GPU memory - cdef C_CudaLinearOperator* GPU_COMMIT_A + # pointer to this operator in GPU memory + cdef C_CudaLinearOperator* thisptr # these should be always None, they remain for compatibility cdef unsigned int* ICthreads @@ -127,8 +131,12 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] + LOG( '\n-> Checking availability of CUDA:' ) + #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) + checkCompatibility(0, self.gpu_id) + # create the operator in GPU memory - self.GPU_COMMIT_A = new C_CudaLinearOperator( + self.thisptr = new C_CudaLinearOperator( &ICv[0], &ICf[0], &ICo[0], @@ -173,7 +181,7 @@ cdef class CudaLinearOperator : self.ICv = &ICv[0] self.ICo = &ICo[0] - self.GPU_COMMIT_A.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + self.thisptr.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) @property def T( self ) : @@ -207,7 +215,7 @@ cdef class CudaLinearOperator : # Permit only 
matrix-vector multiplications if v_in.size != self.shape[1] : - raise RuntimeError( "A.dot(): dimensions do not match" ) + ERROR( "A.dot(): dimensions do not match" ) # Create output array cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) @@ -215,14 +223,14 @@ cdef class CudaLinearOperator : # Call the cython function to read the memory pointers if not self.adjoint : # DIRECT PRODUCT A*x - self.GPU_COMMIT_A.dot(&v_in[0], &v_out[0]) + self.thisptr.dot(&v_in[0], &v_out[0]) else : # INVERSE PRODUCT A'*y - self.GPU_COMMIT_A.Tdot(&v_in[0], &v_out[0]) + self.thisptr.Tdot(&v_in[0], &v_out[0]) return v_out def destroy( self ): """Free all memory of the CUDA GPU""" - self.GPU_COMMIT_A.destroy() + self.thisptr.destroy() diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 3d807757..5f5c77b4 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -9,7 +9,7 @@ bool cudaCheck(cudaError_t cudaStatus){ return cudaStatus == cudaSuccess; } -bool checkCompatibility(size_t required_mem, int gpu_id) { +static void checkCompatibility(size_t required_mem, int gpu_id) { int num_gpus; cudaError_t cudaStatus; @@ -17,7 +17,7 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { if (num_gpus <= 0 || num_gpus <= gpu_id || cudaStatus != cudaSuccess) { printf("\t* the selected GPU does not exist or it is not detected \n"); - return false; + //return false; } cudaStatus = cudaSetDevice(gpu_id); @@ -42,14 +42,14 @@ bool checkCompatibility(size_t required_mem, int gpu_id) { } else{ printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); - return false; + //return false; } - return true; + //return true; } else{ printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); - return false; + //return false; } } @@ -118,8 +118,8 @@ CudaLinearOperator::CudaLinearOperator( int size_lutec = nzeppelins*norientations*nsamples; int size_lutiso = nballs*nsamples; - size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); - checkCompatibility(required_mem, gpu_id); + //size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); + //checkCompatibility(required_mem, gpu_id); // transfer constant values to the GPU printf("\t* constant values ... 
"); diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index c66d07c1..f8ea2057 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -18,7 +18,7 @@ typedef double float64_t; // Util functions to check CUDA GPU compatibility // ==================================================== bool cudaCheck(cudaError_t cudaStatus); -bool checkCompatibility(size_t required_mem, int gpu_id); +void checkCompatibility(uint64_t required_mem, int gpu_id); void cudaCheckLastError(); // ==================================================== diff --git a/commit/operator/operator.pyx b/commit/operator/operator.pyx index 6d83202a..a4187f95 100755 --- a/commit/operator/operator.pyx +++ b/commit/operator/operator.pyx @@ -3,6 +3,7 @@ import cython import numpy as np +from amico.util import ERROR cimport numpy as np # Interfaces to actual C code performing the multiplications @@ -161,7 +162,7 @@ cdef class LinearOperator : # Permit only matrix-vector multiplications if v_in.size != self.shape[1] : - raise RuntimeError( "A.dot(): dimensions do not match" ) + ERROR( "A.dot(): dimensions do not match" ) # Create output array cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) From 84036cfec161f51637aa70745966b191d53a562b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 14 Jan 2021 00:30:24 -0600 Subject: [PATCH 163/190] Change cuda error handling --- commit/cudaoperator/operator_withCUDA.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index f8ea2057..4d38393c 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -18,7 +18,7 @@ typedef double float64_t; // Util functions to check CUDA GPU compatibility // ==================================================== bool cudaCheck(cudaError_t cudaStatus); -void checkCompatibility(uint64_t required_mem, int gpu_id); +static void checkCompatibility(uint64_t required_mem, int gpu_id); void cudaCheckLastError(); // ==================================================== From 6b9cd46ba2e8c33ef887386a8a226a643f9b85c2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Thu, 14 Jan 2021 00:35:17 -0600 Subject: [PATCH 164/190] Change cuda error handling --- commit/cudaoperator/operator_withCUDA.cu | 2 +- commit/cudaoperator/operator_withCUDA.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 5f5c77b4..a4185904 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -9,7 +9,7 @@ bool cudaCheck(cudaError_t cudaStatus){ return cudaStatus == cudaSuccess; } -static void checkCompatibility(size_t required_mem, int gpu_id) { +void checkCompatibility(uint64_t required_mem, int gpu_id) { int num_gpus; cudaError_t cudaStatus; diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index 4d38393c..f8ea2057 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -18,7 +18,7 @@ typedef double float64_t; // Util functions to check CUDA GPU compatibility // ==================================================== bool cudaCheck(cudaError_t cudaStatus); -static void checkCompatibility(uint64_t required_mem, int gpu_id); +void checkCompatibility(uint64_t required_mem, 
int gpu_id); void cudaCheckLastError(); // ==================================================== From 4082bdd4003dc889be36de8592d8055f1c9c271d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 20:55:32 -0600 Subject: [PATCH 165/190] Check GPU properties before build operator --- commit/cudaoperator/operator.pyx | 14 +++-- commit/cudaoperator/operator_withCUDA.cu | 63 ++++++++++++++++------- commit/cudaoperator/operator_withCUDA.cuh | 2 +- 3 files changed, 56 insertions(+), 23 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index bf639d97..a035606f 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -7,7 +7,7 @@ cimport numpy as np from amico.util import ERROR, LOG cdef extern from "operator_withCUDA.cuh": - void checkCompatibility(np.uint64_t, int) + int checkCompatibility(np.uint64_t, int) cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": @@ -131,9 +131,17 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] - LOG( '\n-> Checking availability of CUDA:' ) + LOG( '\n-> Checking CUDA GPU:' ) #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - checkCompatibility(0, self.gpu_id) + cdef int ans = checkCompatibility(0, self.gpu_id) + if ans == 1: + ERROR( 'The selected GPU is not detected; check "gpu_id" in "set_threads()"' ) + elif ans == 2: + ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) + elif ans == 3: + ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) + elif ans == 4: + ERROR( 'Compute capability must be at least 5.0' ) # create the operator in GPU memory self.thisptr = new C_CudaLinearOperator( diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index a4185904..113e254c 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -9,39 +9,64 @@ bool cudaCheck(cudaError_t cudaStatus){ return cudaStatus == cudaSuccess; } -void checkCompatibility(uint64_t required_mem, int gpu_id) { - int num_gpus; +int checkCompatibility(uint64_t required_mem, int gpuID) { + int gpuCount; cudaError_t cudaStatus; - cudaStatus = cudaGetDeviceCount(&num_gpus); + cudaStatus = cudaGetDeviceCount(&gpuCount); - if (num_gpus <= 0 || num_gpus <= gpu_id || cudaStatus != cudaSuccess) { - printf("\t* the selected GPU does not exist or it is not detected \n"); - //return false; + if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) { + //printf("\t* the selected GPU does not exist or it is not detected \n"); + return 1; + } + + cudaStatus = cudaSetDevice(gpuID); + + if (cudaStatus != cudaSuccess){ + //printf("\t* checking availability of CUDA ... 
[ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); + //there was a problem setting CUDA GPU with ID=gpuID + return 2; } - cudaStatus = cudaSetDevice(gpu_id); + cudaDeviceProp gpuProperties; + cudaStatus = cudaGetDeviceProperties(&gpuProperties, gpuID); + + if (cudaStatus != cudaSuccess){ + //problem getting properties from CUDA GPU + return 3; + } - if(cudaStatus == cudaSuccess){ - cudaDeviceProp gpu_properties; - cudaGetDeviceProperties(&gpu_properties, gpu_id); + printf("\t* using CUDA GPU: [ %s ]\n", gpuProperties.name); + printf("\t* total memory: [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); + printf("\t* compute capability: [ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); + + if(gpuProperties.major < 5){ + //printf("\t* GPU compute capability must be at least 5.0\n", gpuProperties.major, gpuProperties.minor); + return 4; + } + + return 0; + + /*if(cudaStatus == cudaSuccess){ + cudaDeviceProp gpuProperties; + cudaGetDeviceProperties(&gpuProperties, gpuID); printf("\t* checking availability of CUDA... [ OK ]\n"); - printf("\t* number of CUDA GPUs detected: %d\n", num_gpus); - printf("\t* using GPU with ID %d... [ %s ]\n", gpu_id, gpu_properties.name); + printf("\t* number of CUDA GPUs detected: %d\n", gpuCount); + printf("\t* using GPU with ID %d... [ %s ]\n", gpuID, gpuProperties.name); - if (required_mem <= gpu_properties.totalGlobalMem) { - printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + if (required_mem <= gpuProperties.totalGlobalMem) { + printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpuProperties.totalGlobalMem*1e-9); } else { - printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpu_properties.totalGlobalMem*1e-9); + printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpuProperties.totalGlobalMem*1e-9); } - if(gpu_properties.major >= 5){ - printf("\t* compute capability: %d.%d [ OK ]\n", gpu_properties.major, gpu_properties.minor); + if(gpuProperties.major >= 5){ + printf("\t* compute capability: %d.%d [ OK ]\n", gpuProperties.major, gpuProperties.minor); } else{ - printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpu_properties.major, gpu_properties.minor); + printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpuProperties.major, gpuProperties.minor); //return false; } @@ -50,7 +75,7 @@ void checkCompatibility(uint64_t required_mem, int gpu_id) { else{ printf("\t* checking availability of CUDA ... 
[ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); //return false; - } + }//*/ } void cudaCheckLastError() diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index f8ea2057..6be091cd 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -18,7 +18,7 @@ typedef double float64_t; // Util functions to check CUDA GPU compatibility // ==================================================== bool cudaCheck(cudaError_t cudaStatus); -void checkCompatibility(uint64_t required_mem, int gpu_id); +int checkCompatibility(uint64_t required_mem, int gpu_id); void cudaCheckLastError(); // ==================================================== From 8c98e699e4d170732743744fb04b1667161c1971 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 21:03:36 -0600 Subject: [PATCH 166/190] Check GPU properties before build operator --- commit/cudaoperator/operator.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index a035606f..68bfd015 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -137,9 +137,9 @@ cdef class CudaLinearOperator : if ans == 1: ERROR( 'The selected GPU is not detected; check "gpu_id" in "set_threads()"' ) elif ans == 2: - ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) + ERROR( 'Impossible to set GPU with ID=%d' % self.gpu_id ) elif ans == 3: - ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) + ERROR( 'Impossible to get properties from GPU with ID=%d' % self.gpu_id ) elif ans == 4: ERROR( 'Compute capability must be at least 5.0' ) From cd80599cbd8dffc5dac6755f9620208106615969 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 21:35:47 -0600 Subject: [PATCH 167/190] Check GPU properties before build operator --- commit/core.pyx | 22 +++++++++++++++++++--- commit/cudaoperator/operator.pyx | 12 ------------ commit/cudaoperator/operator_withCUDA.cu | 2 +- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index f4467e8e..41884ee2 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -491,7 +491,7 @@ cdef class Evaluation : LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - def set_threads( self, nthreads = None, select_gpu = 0 ) : + def set_threads( self, nthreads = None, gpu_id = 0 ) : """Set the number of threads to use for the matrix-vector operations with A and A'. 
Parameters @@ -499,7 +499,7 @@ cdef class Evaluation : nthreads : integer Number of threads to use (nthreads = None ---> all the CPU threads available in the system nthreads = 0 ---> enable CUDA GPU acceleration) - select_gpu : integer + gpu_id : integer GPU ID of the Nvidia GPU where COMMIT will be executed, default=0 and it is only required if nthreads=0 (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') """ @@ -521,7 +521,23 @@ cdef class Evaluation : self.THREADS = {} self.THREADS['n'] = nthreads if nthreads == 0: - self.THREADS['GPUID'] = select_gpu + self.THREADS['GPUID'] = gpu_id + LOG( '\n-> Checking CUDA GPU:' ) + + from commit.cudaoperator.operator import checkCompatibility + #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) + cdef int ans = checkCompatibility(0, gpu_id) + if ans == 1: + ERROR( 'The selected GPU is not detected' ) + elif ans == 2: + ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) + elif ans == 3: + ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) + elif ans == 4: + ERROR( 'Compute capability must be at least 5.0' ) + + if gpu_id == 0: + LOG( ' Using default GPU. Use option "gpu_id" in "set_threads()" to change selection' ) cdef : long [:] C diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 68bfd015..4fc381ca 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -131,18 +131,6 @@ cdef class CudaLinearOperator : cdef float [:, ::1] isoSFP = KERNELS['iso'] self.LUT_ISO = &isoSFP[0,0] - LOG( '\n-> Checking CUDA GPU:' ) - #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - cdef int ans = checkCompatibility(0, self.gpu_id) - if ans == 1: - ERROR( 'The selected GPU is not detected; check "gpu_id" in "set_threads()"' ) - elif ans == 2: - ERROR( 'Impossible to set GPU with ID=%d' % self.gpu_id ) - elif ans == 3: - ERROR( 'Impossible to get properties from GPU with ID=%d' % self.gpu_id ) - elif ans == 4: - ERROR( 'Compute capability must be at least 5.0' ) - # create the operator in GPU memory self.thisptr = new C_CudaLinearOperator( &ICv[0], diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 113e254c..633529f3 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -36,7 +36,7 @@ int checkCompatibility(uint64_t required_mem, int gpuID) { return 3; } - printf("\t* using CUDA GPU: [ %s ]\n", gpuProperties.name); + printf("\t* selected GPU: [ %s ]\n", gpuProperties.name); printf("\t* total memory: [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); printf("\t* compute capability: [ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); From b0f8e3af7660217920e3dae6fd4273da6874264a Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 21:37:49 -0600 Subject: [PATCH 168/190] Check GPU properties before build operator --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 41884ee2..be7f7eb9 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -526,7 +526,7 @@ cdef class Evaluation : from 
commit.cudaoperator.operator import checkCompatibility #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - cdef int ans = checkCompatibility(0, gpu_id) + ans = checkCompatibility(0, gpu_id) if ans == 1: ERROR( 'The selected GPU is not detected' ) elif ans == 2: From 7570d1ce53ddb92360ac9a96315f55aaefa3e299 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 21:44:34 -0600 Subject: [PATCH 169/190] Check GPU properties before build operator --- commit/core.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index be7f7eb9..ebd22caf 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -524,9 +524,9 @@ cdef class Evaluation : self.THREADS['GPUID'] = gpu_id LOG( '\n-> Checking CUDA GPU:' ) - from commit.cudaoperator.operator import checkCompatibility + import commit.cudaoperator.operator #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - ans = checkCompatibility(0, gpu_id) + ans = commit.cudaoperator.operator.checkCompatibility(0, gpu_id) if ans == 1: ERROR( 'The selected GPU is not detected' ) elif ans == 2: From 4e320509c03d10a3fdf2eefec9d7fb3dc29890e4 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 21:52:31 -0600 Subject: [PATCH 170/190] Check GPU properties before build operator --- commit/core.pyx | 4 ++-- commit/cudaoperator/operator.pyx | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index ebd22caf..3426d924 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -524,9 +524,9 @@ cdef class Evaluation : self.THREADS['GPUID'] = gpu_id LOG( '\n-> Checking CUDA GPU:' ) - import commit.cudaoperator.operator + from commit.cudaoperator.operator import check_compatibility #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - ans = commit.cudaoperator.operator.checkCompatibility(0, gpu_id) + ans = check_compatibility(0, gpu_id) if ans == 1: ERROR( 'The selected GPU is not detected' ) elif ans == 2: diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 4fc381ca..eff8cf30 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -9,6 +9,9 @@ from amico.util import ERROR, LOG cdef extern from "operator_withCUDA.cuh": int checkCompatibility(np.uint64_t, int) +def check_compatibility(mem, gpu_id): + return checkCompatibility(mem, gpu_id) + cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": C_CudaLinearOperator( From 1da8091408b8a19437e8a8075a053a26294343c8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 22:46:01 -0600 Subject: [PATCH 171/190] Check GPU properties before build operator --- commit/core.pyx | 27 +++++++++++++++--------- commit/cudaoperator/operator.pyx | 8 +++---- commit/cudaoperator/operator_withCUDA.cu | 8 +++---- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 3426d924..26ebafd3 100755 --- 
a/commit/core.pyx +++ b/commit/core.pyx @@ -491,11 +491,14 @@ cdef class Evaluation : LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - def set_threads( self, nthreads = None, gpu_id = 0 ) : + def set_threads( self, n = None, nthreads = None, gpu_id = 0 ) : """Set the number of threads to use for the matrix-vector operations with A and A'. Parameters ---------- + n : integer + Same as nthreads. This remains just for compatibility with previous versions + nthreads : integer Number of threads to use (nthreads = None ---> all the CPU threads available in the system nthreads = 0 ---> enable CUDA GPU acceleration) @@ -504,12 +507,16 @@ cdef class Evaluation : (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') """ if nthreads is None : - # Set to the number of CPUs in the system - try : - import multiprocessing - nthreads = multiprocessing.cpu_count() - except : - nthreads = 1 + if n != None: + WARNING( '"n" parameter is deprecated, use "nthreads" instead' ) + nthreads = n + else: + # Set to the number of CPUs in the system + try : + import multiprocessing + nthreads = multiprocessing.cpu_count() + except : + nthreads = 1 if nthreads < 0 or nthreads > 255 : ERROR( 'Number of threads must be between 0 and 255' ) @@ -521,12 +528,12 @@ cdef class Evaluation : self.THREADS = {} self.THREADS['n'] = nthreads if nthreads == 0: - self.THREADS['GPUID'] = gpu_id + self.THREADS['gpu_id'] = gpu_id LOG( '\n-> Checking CUDA GPU:' ) from commit.cudaoperator.operator import check_compatibility #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - ans = check_compatibility(0, gpu_id) + ans = check_compatibility(gpu_id) if ans == 1: ERROR( 'The selected GPU is not detected' ) elif ans == 2: @@ -537,7 +544,7 @@ cdef class Evaluation : ERROR( 'Compute capability must be at least 5.0' ) if gpu_id == 0: - LOG( ' Using default GPU. Use option "gpu_id" in "set_threads()" to change selection' ) + LOG( ' [ Default selected GPU. 
Use option "gpu_id" in "set_threads()" to change selection ]' ) cdef : long [:] C diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index eff8cf30..6246f84f 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -7,10 +7,10 @@ cimport numpy as np from amico.util import ERROR, LOG cdef extern from "operator_withCUDA.cuh": - int checkCompatibility(np.uint64_t, int) + int checkCompatibility(int) -def check_compatibility(mem, gpu_id): - return checkCompatibility(mem, gpu_id) +def check_compatibility(gpu_id): + return checkCompatibility(gpu_id) cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": @@ -96,7 +96,7 @@ cdef class CudaLinearOperator : self.nI = KERNELS['iso'].shape[0] # number of ISO contributions self.n = DICTIONARY['IC']['n'] # numbner of IC segments self.ndirs = KERNELS['wmr'].shape[1] # number of directions - self.gpu_id = THREADS['GPUID'] # id of the CUDA GPU + self.gpu_id = THREADS['gpu_id'] # id of the CUDA GPU if KERNELS['wmr'].size > 0 : self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 633529f3..cb53753a 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -9,7 +9,7 @@ bool cudaCheck(cudaError_t cudaStatus){ return cudaStatus == cudaSuccess; } -int checkCompatibility(uint64_t required_mem, int gpuID) { +int checkCompatibility(int gpuID) { int gpuCount; cudaError_t cudaStatus; @@ -36,9 +36,9 @@ int checkCompatibility(uint64_t required_mem, int gpuID) { return 3; } - printf("\t* selected GPU: [ %s ]\n", gpuProperties.name); - printf("\t* total memory: [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); - printf("\t* compute capability: [ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); + printf("\t* selected GPU... [ %s ]\n", gpuProperties.name); + printf("\t* total memory... [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); + printf("\t* compute capability... 
[ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); if(gpuProperties.major < 5){ //printf("\t* GPU compute capability must be at least 5.0\n", gpuProperties.major, gpuProperties.minor); From a5f279aed7fded2964c813fca6bc533bc251ff8b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 22:49:26 -0600 Subject: [PATCH 172/190] Show warning message when using 'n' in 'set_threads()' --- commit/core.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/core.pyx b/commit/core.pyx index 26ebafd3..903c9ac0 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -507,7 +507,7 @@ cdef class Evaluation : (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') """ if nthreads is None : - if n != None: + if n != None : WARNING( '"n" parameter is deprecated, use "nthreads" instead' ) nthreads = n else: From 5bc8b16b41d66876658c2117aeff62611026cc3b Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Fri, 22 Jan 2021 22:51:06 -0600 Subject: [PATCH 173/190] Show warning message when using 'n' in 'set_threads()' --- commit/cudaoperator/operator_withCUDA.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index 6be091cd..15651ea5 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -18,7 +18,7 @@ typedef double float64_t; // Util functions to check CUDA GPU compatibility // ==================================================== bool cudaCheck(cudaError_t cudaStatus); -int checkCompatibility(uint64_t required_mem, int gpu_id); +int checkCompatibility(int gpu_id); void cudaCheckLastError(); // ==================================================== From 421ba45da8af29d1321b5e212b9fa4190b2e5a90 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:11:42 -0600 Subject: [PATCH 174/190] Modify CUDA error messages --- commit/core.pyx | 2 +- commit/cudaoperator/operator.pyx | 26 ++- commit/cudaoperator/operator_withCUDA.cu | 255 +++++++++++++++++++++- commit/cudaoperator/operator_withCUDA.cuh | 29 ++- 4 files changed, 295 insertions(+), 17 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 903c9ac0..1999cd65 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -544,7 +544,7 @@ cdef class Evaluation : ERROR( 'Compute capability must be at least 5.0' ) if gpu_id == 0: - LOG( ' [ Default selected GPU. Use option "gpu_id" in "set_threads()" to change selection ]' ) + LOG( ' [ Default GPU selected. 
Use option "gpu_id" in "set_threads()" to change selection ]' ) cdef : long [:] C diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 6246f84f..3110dfc9 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -12,6 +12,21 @@ cdef extern from "operator_withCUDA.cuh": def check_compatibility(gpu_id): return checkCompatibility(gpu_id) +def check_cuda(error_id): + if ans == -1: + ERROR( 'Impossible to allocate auxiliar memory in CPU' ) + elif ans == 1: + ERROR( 'Impossible to allocate memory in GPU' ) + elif ans == 2: + ERROR( 'Impossible to transfer memory to GPU' ) + elif ans == 3: + ERROR( 'Impossible to bind texture memory' ) + elif ans == 4: + ERROR( 'Impossible to transfer constant values to GPU' ) + elif ans == 0: + print( '[ OK ]' ) + + cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": C_CudaLinearOperator( @@ -163,6 +178,14 @@ cdef class CudaLinearOperator : # create the transpose of the operator in GPU memory if fcall == 1: + check_cuda( self.thisptr.setConstants() ) + + check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) + + check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) + + check_cuda( self.thisptr.setVectors() ) + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] @@ -180,7 +203,8 @@ cdef class CudaLinearOperator : self.ICv = &ICv[0] self.ICo = &ICo[0] - self.thisptr.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + #self.thisptr.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) @property def T( self ) : diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index cb53753a..7e59deb6 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -104,6 +104,234 @@ void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compar offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; } +int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC){ + + cudaError_t cudaStatus; + + printf("\t* pre-processing... 
"); + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + if (segmentsPerBlock == NULL || offsetPerBlock == NULL) return -1; + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + if (npeaks > 0){ + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + } + + free(segmentsPerBlock); + free(offsetPerBlock); + + printf("\t* A operator... "); + + // alloc IC part of the dictionary in GPU + cudaStatus = cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + + // transfer IC part of the dictionary to GPU + cudaStatus = cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + if (npeaks > 0){ + // alloc EC part of the dictionary in GPU + cudaStatus = cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)); + if (cudaStatus != cudaSuccess) return 1; + + // transfer EC part of the dictionary to GPU + cudaStatus = cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + } + + return 0; +} + +int CudaLinearOperator::setTransposeDictionary(uint32_t* TvoxelIC, 
uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC){ + + cudaError_t cudaStatus; + + printf("\t* A' operator... "); + + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + if(fibersPerBlock == NULL || offsetPerBlock == NULL) return -1; + + preprocessDataForGPU(TfiberIC, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + free(fibersPerBlock); + free(offsetPerBlock); + + cudaStatus = cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_TvoxelIC, TvoxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TfiberIC, TfiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TorienIC, TorienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TlengthIC, TlengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + return 0; +} + +int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO){ + + cudaError_t cudaStatus; + + printf("\t* loading LUT... 
"); + + if (ndiameters > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutIC.addressMode[0] = cudaAddressModeBorder; + tex_lutIC.addressMode[1] = cudaAddressModeBorder; + tex_lutIC.filterMode = cudaFilterModePoint; + tex_lutIC.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + if (nzeppelins > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutEC.addressMode[0] = cudaAddressModeBorder; + tex_lutEC.addressMode[1] = cudaAddressModeBorder; + tex_lutEC.filterMode = cudaFilterModePoint; + tex_lutEC.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + if (nballs > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutISO.addressMode[0] = cudaAddressModeBorder; + tex_lutISO.addressMode[1] = cudaAddressModeBorder; + tex_lutISO.filterMode = cudaFilterModePoint; + tex_lutISO.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + return 0; +} + +int setVectors(){ + + cudaError_t cudaStatus; + + printf("\t* vectors x&y... "); + + cudaStatus = cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)); + if (cudaStatus != cudaSuccess) return 1; + + return 0; +} + +int setConstants(){ + + cudaError_t cudaStatus; + + printf("\t* constant values... 
"); + + cudaStatus = cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + + return 0; +} + CudaLinearOperator::CudaLinearOperator( // pointers to IC data in CPU memory uint32_t* voxelIC, @@ -132,13 +360,28 @@ CudaLinearOperator::CudaLinearOperator( // id of the selected CUDA gpu int gpu_id) { - this->nsegments = nsegments; + /*this->nsegments = nsegments; this->nvoxels = nvoxels; this->nfibers = nfibers; this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs;//*/ - if (fcall == 1) { + this->nsegments = nsegments; + this->nvoxels = nvoxels; + this->nfibers = nfibers; + this->npeaks = npeaks; + this->norientations = norientations; + this->nsamples = nsamples; + this->ndiameters = ndiameters; + this->nzeppelins = nzeppelins; + this->nballs = nballs; + this->size_lutic = ndiameters*norientations*nsamples; + this->size_lutec = nzeppelins*norientations*nsamples; + this->size_lutiso = nballs*nsamples; + this->nrows = nvoxels*nsamples; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; + + /*if (fcall == 1) { int size_lutic = ndiameters*norientations*nsamples; int size_lutec = nzeppelins*norientations*nsamples; int size_lutiso = nballs*nsamples; @@ -268,11 +511,13 @@ CudaLinearOperator::CudaLinearOperator( } if (cudaStatus) printf("[ OK ]\n"); else printf("[ CUDA ERROR ]\n"); - } + }//*/ } -CudaLinearOperator::~CudaLinearOperator() {} +CudaLinearOperator::~CudaLinearOperator() { + printf("DESTRUCTOR!!!!!!!!!!!!!!!!!!!!!!"); +} void CudaLinearOperator::destroy(){ bool cudaStatus; diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index 15651ea5..69f3fa7d 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -128,7 +128,7 @@ static uint32_t* gpu_segmentsPerBlockEC; static uint32_t* gpu_offsetPerBlockEC; // ==================================================== -// Pointers to LUTs in the GPU +// Pointers to LUT in the GPU // 
==================================================== static float32_t* gpu_lutIC; static float32_t* gpu_lutEC; @@ -146,15 +146,20 @@ static float64_t* gpu_y; class CudaLinearOperator { // constant values in CPU + int nsegments; + int nvoxels; + int nfibers; + int npeaks; + int norientations; + int nsamples; + int ndiameters; + int nzeppelins; + int nballs; + int size_lutic; + int size_lutec; + int size_lutiso; int nrows; int ncols; - int nvoxels; - int nfibers; - int nsegments; - - // CUDA GPU status - bool cudaStatus; - int cudaError; public: CudaLinearOperator( @@ -170,7 +175,7 @@ class CudaLinearOperator { float* lutEC, // pointer to ISO data in CPU memory float* lutISO, - // dataset constant values + // operator constant values int nsegments, int nvoxels, int nfibers, @@ -187,7 +192,11 @@ class CudaLinearOperator { ~CudaLinearOperator(); - int getCudaStatus() { return (int)cudaStatus; } + int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); + int setTransposeDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC); + int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); + int setVectors(); + int setConstants(); void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void destroy(); From 82deb61f6bde081df627f833ddb54cd3a4a216fe Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:20:24 -0600 Subject: [PATCH 175/190] Modify CUDA error messages --- commit/cudaoperator/operator.pyx | 6 +++++- commit/cudaoperator/operator_withCUDA.cuh | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 3110dfc9..92fbb2ae 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -55,7 +55,11 @@ cdef extern from "operator_withCUDA.cuh": int, int) - int getCudaStatus() + int setDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, np.uint32_t*, np.uint16_t*) + int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) + int setVectors() + int setConstants() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) void destroy() void dot(np.float64_t*, np.float64_t*) diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index 69f3fa7d..a4a0aa95 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -193,7 +193,7 @@ class CudaLinearOperator { ~CudaLinearOperator(); int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); - int setTransposeDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC); + int setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC); int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); int setVectors(); int setConstants(); From c84ee154a866dcf708031ec5e319afe3fcc96270 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:22:03 -0600 Subject: [PATCH 176/190] Modify CUDA error messages --- commit/cudaoperator/operator.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/commit/cudaoperator/operator.pyx 
b/commit/cudaoperator/operator.pyx index 92fbb2ae..466444a6 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -13,17 +13,17 @@ def check_compatibility(gpu_id): return checkCompatibility(gpu_id) def check_cuda(error_id): - if ans == -1: + if error_id == -1: ERROR( 'Impossible to allocate auxiliar memory in CPU' ) - elif ans == 1: + elif error_id == 1: ERROR( 'Impossible to allocate memory in GPU' ) - elif ans == 2: + elif error_id == 2: ERROR( 'Impossible to transfer memory to GPU' ) - elif ans == 3: + elif error_id == 3: ERROR( 'Impossible to bind texture memory' ) - elif ans == 4: + elif error_id == 4: ERROR( 'Impossible to transfer constant values to GPU' ) - elif ans == 0: + elif error_id == 0: print( '[ OK ]' ) From 3ad7365094ee71cff1b45876dff17ac818e120fd Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:24:47 -0600 Subject: [PATCH 177/190] Modify CUDA error messages --- commit/cudaoperator/operator_withCUDA.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 7e59deb6..bb886e60 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -282,7 +282,7 @@ int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t return 0; } -int setVectors(){ +int CudaLinearOperator::setVectors(){ cudaError_t cudaStatus; @@ -296,7 +296,7 @@ int setVectors(){ return 0; } -int setConstants(){ +int CudaLinearOperator::setConstants(){ cudaError_t cudaStatus; From d780d5698256daac76565c86f06156769b1d7274 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:26:40 -0600 Subject: [PATCH 178/190] Modify CUDA error messages --- commit/cudaoperator/operator_withCUDA.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index bb886e60..d3e84fcd 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -580,7 +580,7 @@ void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, uint16_t* orienIDs, float32_t* lengths) { - printf("\t* A' operator... "); + /*printf("\t* A' operator... 
"); cudaStatus = true; uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); @@ -608,7 +608,7 @@ void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + else printf("[ CUDA ERROR ]\n");//*/ } void cudaCheckKernel(){ From d95aaa07773295391341156d33d918c363342ea5 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:36:31 -0600 Subject: [PATCH 179/190] Modify CUDA error messages --- commit/cudaoperator/operator.pyx | 13 +++++++++---- commit/cudaoperator/operator_withCUDA.cu | 13 +------------ commit/cudaoperator/operator_withCUDA.cuh | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 466444a6..006deab2 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -20,7 +20,7 @@ def check_cuda(error_id): elif error_id == 2: ERROR( 'Impossible to transfer memory to GPU' ) elif error_id == 3: - ERROR( 'Impossible to bind texture memory' ) + ERROR( 'Impossible to bind textures' ) elif error_id == 4: ERROR( 'Impossible to transfer constant values to GPU' ) elif error_id == 0: @@ -59,7 +59,7 @@ cdef extern from "operator_withCUDA.cuh": int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) int setVectors() - int setConstants() + int setGlobals() void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) void destroy() void dot(np.float64_t*, np.float64_t*) @@ -182,13 +182,17 @@ cdef class CudaLinearOperator : # create the transpose of the operator in GPU memory if fcall == 1: + print( '\t* global values... ' ) check_cuda( self.thisptr.setConstants() ) - check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) - + print( '\t* lookup tables... ' ) check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) + print( '\t* x&y vectors... ' ) check_cuda( self.thisptr.setVectors() ) + + print( '\t* A operator... ' ) + check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) @@ -208,6 +212,7 @@ cdef class CudaLinearOperator : self.ICo = &ICo[0] #self.thisptr.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) + print( '\t* A\' operator... ' ) check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) @property diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index d3e84fcd..1b09eb44 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -108,7 +108,6 @@ int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint cudaError_t cudaStatus; - printf("\t* pre-processing... 
"); uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); @@ -142,8 +141,6 @@ int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint free(segmentsPerBlock); free(offsetPerBlock); - - printf("\t* A operator... "); // alloc IC part of the dictionary in GPU cudaStatus = cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)); @@ -185,8 +182,6 @@ int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint int CudaLinearOperator::setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC){ cudaError_t cudaStatus; - - printf("\t* A' operator... "); uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); @@ -232,8 +227,6 @@ int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t cudaError_t cudaStatus; - printf("\t* loading LUT... "); - if (ndiameters > 0){ cudaStatus = cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)); if (cudaStatus != cudaSuccess) return 1; @@ -286,8 +279,6 @@ int CudaLinearOperator::setVectors(){ cudaError_t cudaStatus; - printf("\t* vectors x&y... "); - cudaStatus = cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)); if (cudaStatus != cudaSuccess) return 1; cudaStatus = cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)); @@ -296,11 +287,9 @@ int CudaLinearOperator::setVectors(){ return 0; } -int CudaLinearOperator::setConstants(){ +int CudaLinearOperator::setGlobals(){ cudaError_t cudaStatus; - - printf("\t* constant values... "); cudaStatus = cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)); if (cudaStatus != cudaSuccess) return -1; diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index a4a0aa95..e1ce3689 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -196,7 +196,7 @@ class CudaLinearOperator { int setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC); int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); int setVectors(); - int setConstants(); + int setGlobals(); void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); void destroy(); From 5a33853ca56404e80043aaedca0593e132f0dca8 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 23 Jan 2021 21:38:18 -0600 Subject: [PATCH 180/190] Modify CUDA error messages --- commit/cudaoperator/operator.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 006deab2..8aacf6ca 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -183,7 +183,7 @@ cdef class CudaLinearOperator : # create the transpose of the operator in GPU memory if fcall == 1: print( '\t* global values... ' ) - check_cuda( self.thisptr.setConstants() ) + check_cuda( self.thisptr.setGlobals() ) print( '\t* lookup tables... 
' ) check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) From 7841549671a737a4e193c6c012b5a6b7e6c0c60f Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 24 Jan 2021 00:44:49 -0600 Subject: [PATCH 181/190] Add color to COMMIT when CUDA is enabled --- commit/cudaoperator/operator.pyx | 82 ++--- commit/cudaoperator/operator_withCUDA.cu | 363 ++++------------------ commit/cudaoperator/operator_withCUDA.cuh | 42 +-- 3 files changed, 89 insertions(+), 398 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 8aacf6ca..52ff10ba 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -23,45 +23,26 @@ def check_cuda(error_id): ERROR( 'Impossible to bind textures' ) elif error_id == 4: ERROR( 'Impossible to transfer constant values to GPU' ) + elif error_id == 5: + ERROR( 'There was a problem deleting GPU memory' ) + elif error_id == 6: + ERROR( 'There was a problem unbinding texture memory' ) + elif error_id == 7: + ERROR( 'There was a problem resetting GPU' ) elif error_id == 0: print( '[ OK ]' ) - cdef extern from "operator_withCUDA.cuh": cdef cppclass C_CudaLinearOperator "CudaLinearOperator": - C_CudaLinearOperator( - np.uint32_t*, - np.uint32_t*, - np.uint16_t*, - np.float32_t*, - np.float32_t*, - - np.uint32_t*, - np.uint16_t*, - np.float32_t*, - - np.float32_t*, - - int, - int, - int, - int, - int, - int, - int, - int, - int, - - int, - int) + C_CudaLinearOperator(int, int, int, int, int, int, int, int, int) int setDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, np.uint32_t*, np.uint16_t*) int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) int setVectors() int setGlobals() - void setTransposeData(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) - void destroy() + int destroy() + void dot(np.float64_t*, np.float64_t*) void Tdot(np.float64_t*, np.float64_t*) @@ -154,44 +135,20 @@ cdef class CudaLinearOperator : self.LUT_ISO = &isoSFP[0,0] # create the operator in GPU memory - self.thisptr = new C_CudaLinearOperator( - &ICv[0], - &ICf[0], - &ICo[0], - &ICl[0], - &wmrSFP[0,0,0], - - &ECv[0], - &ECo[0], - &wmhSFP[0,0,0], - - &isoSFP[0,0], - - self.n, - self.nV, - self.nF, - self.nE, - self.ndirs, - self.nS, - self.nR, - self.nT, - self.nI, - - fcall, - self.gpu_id) - - # create the transpose of the operator in GPU memory + self.thisptr = new C_CudaLinearOperator(self.n, self.nV, self.nF, self.nE, self.ndirs, self.nS, self.nR, self.nT, self.nI) + + # build operator in GPU only one time if fcall == 1: - print( '\t* global values... ' ) + print( '\t* global values... ', end='' ) check_cuda( self.thisptr.setGlobals() ) - print( '\t* lookup tables... ' ) + print( '\t* lookup tables... ', end='' ) check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) - print( '\t* x&y vectors... ' ) + print( '\t* x&y vectors... ', end='' ) check_cuda( self.thisptr.setVectors() ) - print( '\t* A operator... ' ) + print( '\t* A operator... ', end='' ) check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) @@ -211,10 +168,13 @@ cdef class CudaLinearOperator : self.ICv = &ICv[0] self.ICo = &ICo[0] - #self.thisptr.setTransposeData(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) - print( '\t* A\' operator... 
' ) + print( '\t* A\' operator... ', end='' ) check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) + def __del__( self ): + LOG( '\n-> Clearing GPU memory:' ) + check_cuda( self.thisptr.destroy() ) + @property def T( self ) : """Transpose of the explicit matrix.""" diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 1b09eb44..d4fa5223 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -1,81 +1,29 @@ #include "operator_withCUDA.cuh" -// textures in GPU -texture tex_lutIC; -texture tex_lutEC; -texture tex_lutISO; - -bool cudaCheck(cudaError_t cudaStatus){ - return cudaStatus == cudaSuccess; -} - int checkCompatibility(int gpuID) { int gpuCount; cudaError_t cudaStatus; cudaStatus = cudaGetDeviceCount(&gpuCount); - if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) { - //printf("\t* the selected GPU does not exist or it is not detected \n"); - return 1; - } + if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) return 1; cudaStatus = cudaSetDevice(gpuID); - if (cudaStatus != cudaSuccess){ - //printf("\t* checking availability of CUDA ... [ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); - //there was a problem setting CUDA GPU with ID=gpuID - return 2; - } + if (cudaStatus != cudaSuccess) return 2; cudaDeviceProp gpuProperties; cudaStatus = cudaGetDeviceProperties(&gpuProperties, gpuID); - if (cudaStatus != cudaSuccess){ - //problem getting properties from CUDA GPU - return 3; - } + if (cudaStatus != cudaSuccess) return 3; printf("\t* selected GPU... [ %s ]\n", gpuProperties.name); printf("\t* total memory... [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); printf("\t* compute capability... [ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); - if(gpuProperties.major < 5){ - //printf("\t* GPU compute capability must be at least 5.0\n", gpuProperties.major, gpuProperties.minor); - return 4; - } + if(gpuProperties.major < 5) return 4; return 0; - - /*if(cudaStatus == cudaSuccess){ - cudaDeviceProp gpuProperties; - cudaGetDeviceProperties(&gpuProperties, gpuID); - - printf("\t* checking availability of CUDA... [ OK ]\n"); - printf("\t* number of CUDA GPUs detected: %d\n", gpuCount); - printf("\t* using GPU with ID %d... [ %s ]\n", gpuID, gpuProperties.name); - - if (required_mem <= gpuProperties.totalGlobalMem) { - printf("\t* using %.2f GB of total %.2f GB... [ OK ]\n", required_mem*1e-9, gpuProperties.totalGlobalMem*1e-9); - } - else { - printf("\t* using %f GB of total %f GB... [ ERROR ]: dictionary too big for GPU memory\n", required_mem*1e-9, gpuProperties.totalGlobalMem*1e-9); - } - - if(gpuProperties.major >= 5){ - printf("\t* compute capability: %d.%d [ OK ]\n", gpuProperties.major, gpuProperties.minor); - } - else{ - printf("\t* compute capability: %d.%d [ ERROR ]. GPU compute capability must be at least 5.0\n", gpuProperties.major, gpuProperties.minor); - //return false; - } - - //return true; - } - else{ - printf("\t* checking availability of CUDA ... 
[ ERROR ]: CUDA is not available or GPU is not CUDA compatible\n"); - //return false; - }//*/ } void cudaCheckLastError() @@ -321,39 +269,7 @@ int CudaLinearOperator::setGlobals(){ return 0; } -CudaLinearOperator::CudaLinearOperator( - // pointers to IC data in CPU memory - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float* lengthIC, - float* lutIC, - // pointers to EC data in CPU memory - uint32_t* voxelEC, - uint16_t* orienEC, - float* lutEC, - // pointer to ISO data in CPU memory - float* lutISO, - // dataset constant values - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int ndiameters, - int nzeppelins, - int nballs, - // flag to ensure we create the operator only one time - int fcall, - // id of the selected CUDA gpu - int gpu_id) -{ - /*this->nsegments = nsegments; - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->nrows = nvoxels * nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs;//*/ +CudaLinearOperator::CudaLinearOperator(int nsegments, int nvoxels, int nfibers, int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs){ this->nsegments = nsegments; this->nvoxels = nvoxels; @@ -369,235 +285,72 @@ CudaLinearOperator::CudaLinearOperator( this->size_lutiso = nballs*nsamples; this->nrows = nvoxels*nsamples; this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; - - /*if (fcall == 1) { - int size_lutic = ndiameters*norientations*nsamples; - int size_lutec = nzeppelins*norientations*nsamples; - int size_lutiso = nballs*nsamples; - - //size_t required_mem = 28*(size_t)nsegments + 6.0*(size_t)nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols); - //checkCompatibility(required_mem, gpu_id); - - // transfer constant values to the GPU - printf("\t* constant values ... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)) ); - if (cudaStatus) printf("[ OK ]\n"); - else cudaError = 1; - - // alloc memory in GPU for vectors x and y - printf("\t* vectors x&y ... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)) ); - if (cudaStatus) printf("[ OK ]\n"); - else cudaError = 2; - - // pre-process data for GPU - printf("\t* pre-processing ... "); - cudaStatus = true; - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - if (npeaks > 0){ - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - } - - free(segmentsPerBlock); - free(offsetPerBlock); - if (cudaStatus) printf("[ OK ]\n"); - else cudaError = 3; - - // alloc and transfer LUTs - printf("\t* loading LUTs ... 
"); - cudaStatus = true; - - if (ndiameters > 0){ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - tex_lutIC.addressMode[0] = cudaAddressModeBorder; - tex_lutIC.addressMode[1] = cudaAddressModeBorder; - tex_lutIC.filterMode = cudaFilterModePoint; - tex_lutIC.normalized = false; - - cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic * sizeof(float32_t)) ); - } - - if (nzeppelins > 0){ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - tex_lutEC.addressMode[0] = cudaAddressModeBorder; - tex_lutEC.addressMode[1] = cudaAddressModeBorder; - tex_lutEC.filterMode = cudaFilterModePoint; - tex_lutEC.normalized = false; - - cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec * sizeof(float32_t)) ); - } - - if (nballs > 0){ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice) ); - - tex_lutISO.addressMode[0] = cudaAddressModeBorder; - tex_lutISO.addressMode[1] = cudaAddressModeBorder; - tex_lutISO.filterMode = cudaFilterModePoint; - tex_lutISO.normalized = false; - - cudaStatus = cudaStatus && cudaCheck( cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso * sizeof(float32_t)) ); - } - - if (cudaStatus) printf("[ OK ]\n"); - else cudaError = 4; - - - // alloc and transfer operator A - printf("\t* A operator... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)) ); - if (npeaks > 0){ - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)) ); - } - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (npeaks > 0){ - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - } - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); - }//*/ - -} - -CudaLinearOperator::~CudaLinearOperator() { - printf("DESTRUCTOR!!!!!!!!!!!!!!!!!!!!!!"); } -void CudaLinearOperator::destroy(){ - bool cudaStatus; +CudaLinearOperator::~CudaLinearOperator() {} - printf("\n-> Clearing GPU memory:\n"); +int CudaLinearOperator::destroy(){ + cudaError_t cudaStatus; printf("\t* deleting A... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_fiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_voxelEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_orienEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_segmentsPerBlockEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_offsetPerBlockEC) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + cudaStatus = cudaFree(gpu_voxelIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_fiberIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_orienIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_lengthIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_voxelEC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_orienEC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_segmentsPerBlockIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_offsetPerBlockIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_segmentsPerBlockEC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_offsetPerBlockEC); + if (cudaStatus != cudaSuccess) return 5; printf("\t* deleting A'... 
"); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TvoxelIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfiberIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TorienIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TlengthIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_TfibersPerBlockIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_ToffsetPerBlockIC) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + cudaStatus = cudaFree(gpu_TvoxelIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_TfiberIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_TorienIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_TlengthIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_TfibersPerBlockIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaFree(gpu_ToffsetPerBlockIC); + if (cudaStatus != cudaSuccess) return 5; printf("\t* deleting x&y... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_x) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_y) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + cudaStatus = cudaCheck( cudaFree(gpu_x); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaCheck( cudaFree(gpu_y); + if (cudaStatus != cudaSuccess) return 5; printf("\t* deleting LUT... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaFree(gpu_lutISO) ); - cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutIC) ); - cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutEC) ); - cudaStatus = cudaStatus && cudaCheck( cudaUnbindTexture(tex_lutISO) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); + cudaStatus = cudaCheck( cudaFree(gpu_lutIC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaCheck( cudaFree(gpu_lutEC); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaCheck( cudaFree(gpu_lutISO); + if (cudaStatus != cudaSuccess) return 5; + cudaStatus = cudaCheck( cudaUnbindTexture(tex_lutIC); + if (cudaStatus != cudaSuccess) return 6; + cudaStatus = cudaCheck( cudaUnbindTexture(tex_lutEC); + if (cudaStatus != cudaSuccess) return 6; + cudaStatus = cudaCheck( cudaUnbindTexture(tex_lutISO); + if (cudaStatus != cudaSuccess) return 6; printf("\t* reseting GPU... "); - cudaStatus = true; - cudaStatus = cudaStatus && cudaCheck( cudaDeviceReset() ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n"); -} - -void CudaLinearOperator::setTransposeData(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths) -{ - /*printf("\t* A' operator... 
"); - cudaStatus = true; - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - - if(fibersPerBlock == NULL || offsetPerBlock == NULL) printf("problemas\n"); - - preprocessDataForGPU(fiberIDs, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - - free(fibersPerBlock); - free(offsetPerBlock); - - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ); - cudaStatus = cudaStatus && cudaCheck( cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)) ); - - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TvoxelIC, voxelIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TfiberIC, fiberIDs, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TorienIC, orienIDs, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice) ); - cudaStatus = cudaStatus && cudaCheck( cudaMemcpy(gpu_TlengthIC, lengths, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice) ); - if (cudaStatus) printf("[ OK ]\n"); - else printf("[ CUDA ERROR ]\n");//*/ + cudaStatus = cudaDeviceReset(); + if (cudaStatus != cudaSuccess) return 7; } void cudaCheckKernel(){ diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index e1ce3689..87891efb 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -17,7 +17,6 @@ typedef double float64_t; // ==================================================== // Util functions to check CUDA GPU compatibility // ==================================================== -bool cudaCheck(cudaError_t cudaStatus); int checkCompatibility(int gpu_id); void cudaCheckLastError(); @@ -134,6 +133,13 @@ static float32_t* gpu_lutIC; static float32_t* gpu_lutEC; static float32_t* gpu_lutISO; +// ==================================================== +// Textures for LUT in the GPU +// ==================================================== +texture tex_lutIC; +texture tex_lutEC; +texture tex_lutISO; + // ==================================================== // Pointers to x and y in the GPU // ==================================================== @@ -162,34 +168,7 @@ class CudaLinearOperator { int ncols; public: - CudaLinearOperator( - // pointers to IC data in CPU memory - uint32_t* voxelIC, - uint32_t* fiberIC, - uint16_t* orienIC, - float* lengthIC, - float* lutIC, - // pointers to EC data in CPU memory - uint32_t* voxelEC, - uint16_t* orienEC, - float* lutEC, - // pointer to ISO data in CPU memory - float* lutISO, - // operator constant values - int nsegments, - int nvoxels, - int nfibers, - int npeaks, - int norientations, - int nsamples, - int 
ndiameters, - int nzeppelins, - int nballs, - // flag to ensure we create the operator only one time - int fcall, - // id of the selected CUDA gpu - int gpu_id); - + CudaLinearOperator(int nsegments, int nvoxels, int nfibers, int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs); ~CudaLinearOperator(); int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); @@ -197,9 +176,8 @@ class CudaLinearOperator { int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); int setVectors(); int setGlobals(); - void setTransposeData(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths); - void destroy(); - + int destroy(); + void dot(float64_t* v_in, float64_t* v_out); void Tdot(float64_t* v_in, float64_t* v_out); }; \ No newline at end of file From fbb90ea55a7ce36138000b351a94884036e50ebc Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 24 Jan 2021 00:46:50 -0600 Subject: [PATCH 182/190] Add color to COMMIT when CUDA is enabled --- commit/cudaoperator/operator_withCUDA.cu | 8 ++++++++ commit/cudaoperator/operator_withCUDA.cuh | 9 +-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index d4fa5223..c1c678ac 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -1,5 +1,13 @@ #include "operator_withCUDA.cuh" +// ==================================================== +// Textures for LUT in the GPU +// ==================================================== +texture tex_lutIC; +texture tex_lutEC; +texture tex_lutISO; + + int checkCompatibility(int gpuID) { int gpuCount; cudaError_t cudaStatus; diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index 87891efb..6b3d09bc 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -133,13 +133,6 @@ static float32_t* gpu_lutIC; static float32_t* gpu_lutEC; static float32_t* gpu_lutISO; -// ==================================================== -// Textures for LUT in the GPU -// ==================================================== -texture tex_lutIC; -texture tex_lutEC; -texture tex_lutISO; - // ==================================================== // Pointers to x and y in the GPU // ==================================================== @@ -177,7 +170,7 @@ class CudaLinearOperator { int setVectors(); int setGlobals(); int destroy(); - + void dot(float64_t* v_in, float64_t* v_out); void Tdot(float64_t* v_in, float64_t* v_out); }; \ No newline at end of file From 9926bc6510e0f1e523e241ab4a6a170240f409bd Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 24 Jan 2021 00:49:44 -0600 Subject: [PATCH 183/190] Add color to COMMIT when CUDA is enabled --- commit/cudaoperator/operator_withCUDA.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index c1c678ac..c56fd7a2 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -337,28 +337,30 @@ int CudaLinearOperator::destroy(){ if (cudaStatus != cudaSuccess) return 5; printf("\t* deleting x&y... 
"); - cudaStatus = cudaCheck( cudaFree(gpu_x); + cudaStatus = cudaFree(gpu_x); if (cudaStatus != cudaSuccess) return 5; - cudaStatus = cudaCheck( cudaFree(gpu_y); + cudaStatus = cudaFree(gpu_y); if (cudaStatus != cudaSuccess) return 5; printf("\t* deleting LUT... "); - cudaStatus = cudaCheck( cudaFree(gpu_lutIC); + cudaStatus = cudaFree(gpu_lutIC); if (cudaStatus != cudaSuccess) return 5; - cudaStatus = cudaCheck( cudaFree(gpu_lutEC); + cudaStatus = cudaFree(gpu_lutEC); if (cudaStatus != cudaSuccess) return 5; - cudaStatus = cudaCheck( cudaFree(gpu_lutISO); + cudaStatus = cudaFree(gpu_lutISO); if (cudaStatus != cudaSuccess) return 5; - cudaStatus = cudaCheck( cudaUnbindTexture(tex_lutIC); + cudaStatus = cudaUnbindTexture(tex_lutIC); if (cudaStatus != cudaSuccess) return 6; - cudaStatus = cudaCheck( cudaUnbindTexture(tex_lutEC); + cudaStatus = cudaUnbindTexture(tex_lutEC); if (cudaStatus != cudaSuccess) return 6; - cudaStatus = cudaCheck( cudaUnbindTexture(tex_lutISO); + cudaStatus = cudaUnbindTexture(tex_lutISO); if (cudaStatus != cudaSuccess) return 6; printf("\t* reseting GPU... "); cudaStatus = cudaDeviceReset(); if (cudaStatus != cudaSuccess) return 7; + + return 0; } void cudaCheckKernel(){ From 8920a1663120ed341eca3591f4bfe11173d89da0 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 24 Jan 2021 00:56:46 -0600 Subject: [PATCH 184/190] Add color to COMMIT when CUDA is enabled --- commit/cudaoperator/operator_withCUDA.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index c56fd7a2..80481654 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -360,6 +360,10 @@ int CudaLinearOperator::destroy(){ cudaStatus = cudaDeviceReset(); if (cudaStatus != cudaSuccess) return 7; + uint32_t* calis; + cudaStatus = cudaFree(calis); + if (cudaStatus != cudaSuccess) return 5; + return 0; } From 024603618c55372bff98eea5d6764b31d4e0d11c Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 24 Jan 2021 02:16:14 -0600 Subject: [PATCH 185/190] Add destructor to CudaLinearOperator class --- commit/core.pyx | 10 +++++----- commit/cudaoperator/operator.pyx | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/commit/core.pyx b/commit/core.pyx index 1999cd65..0e3028c3 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -533,14 +533,14 @@ cdef class Evaluation : from commit.cudaoperator.operator import check_compatibility #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - ans = check_compatibility(gpu_id) - if ans == 1: + error_id = check_compatibility(gpu_id) + if error_id == 1: ERROR( 'The selected GPU is not detected' ) - elif ans == 2: + elif error_id == 2: ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) - elif ans == 3: + elif error_id == 3: ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) - elif ans == 4: + elif error_id == 4: ERROR( 'Compute capability must be at least 5.0' ) if gpu_id == 0: diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 52ff10ba..9e92b091 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -168,7 +168,7 @@ cdef class CudaLinearOperator : self.ICv = &ICv[0] self.ICo = &ICo[0] - print( '\t* A\' operator... 
', end='' ) + print( '\t* A\' operator... ', end='' ) check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) def __del__( self ): From 4e3098209c325316c5564619e59e095c58f423e2 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sun, 24 Jan 2021 02:24:36 -0600 Subject: [PATCH 186/190] Remove messages when deleting GPU memory --- commit/cudaoperator/operator.pyx | 6 +---- commit/cudaoperator/operator_withCUDA.cu | 34 ------------------------ 2 files changed, 1 insertion(+), 39 deletions(-) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 9e92b091..027bf484 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -172,8 +172,7 @@ cdef class CudaLinearOperator : check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) def __del__( self ): - LOG( '\n-> Clearing GPU memory:' ) - check_cuda( self.thisptr.destroy() ) + self.thisptr.destroy() @property def T( self ) : @@ -222,7 +221,4 @@ cdef class CudaLinearOperator : return v_out - def destroy( self ): - """Free all memory of the CUDA GPU""" - self.thisptr.destroy() diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 80481654..e0efdafb 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -300,69 +300,35 @@ CudaLinearOperator::~CudaLinearOperator() {} int CudaLinearOperator::destroy(){ cudaError_t cudaStatus; - printf("\t* deleting A... "); cudaStatus = cudaFree(gpu_voxelIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_fiberIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_orienIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_lengthIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_voxelEC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_orienEC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_segmentsPerBlockIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_offsetPerBlockIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_segmentsPerBlockEC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_offsetPerBlockEC); - if (cudaStatus != cudaSuccess) return 5; - printf("\t* deleting A'... "); cudaStatus = cudaFree(gpu_TvoxelIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_TfiberIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_TorienIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_TlengthIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_TfibersPerBlockIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_ToffsetPerBlockIC); - if (cudaStatus != cudaSuccess) return 5; - printf("\t* deleting x&y... "); cudaStatus = cudaFree(gpu_x); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_y); - if (cudaStatus != cudaSuccess) return 5; - printf("\t* deleting LUT... 
"); cudaStatus = cudaFree(gpu_lutIC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_lutEC); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaFree(gpu_lutISO); - if (cudaStatus != cudaSuccess) return 5; cudaStatus = cudaUnbindTexture(tex_lutIC); - if (cudaStatus != cudaSuccess) return 6; cudaStatus = cudaUnbindTexture(tex_lutEC); - if (cudaStatus != cudaSuccess) return 6; cudaStatus = cudaUnbindTexture(tex_lutISO); - if (cudaStatus != cudaSuccess) return 6; - printf("\t* reseting GPU... "); cudaStatus = cudaDeviceReset(); - if (cudaStatus != cudaSuccess) return 7; - - uint32_t* calis; - cudaStatus = cudaFree(calis); - if (cudaStatus != cudaSuccess) return 5; return 0; } From 512eea8b9e1e8ac665866a1c6ca3aecb43feac62 Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 3 Jul 2021 23:27:02 -0500 Subject: [PATCH 187/190] Assign 1 thread per voxel --- .gitattributes | 6 +- .gitignore | 40 +- CHANGELOG.md | 310 +- LICENSE | 66 +- MANIFEST.in | 12 +- README.md | 60 +- commit/__init__.py | 10 +- commit/core.pyx | 2058 ++++----- commit/cudaoperator/operator.pyx | 448 +- commit/cudaoperator/operator_withCUDA.cu | 1345 +++--- commit/cudaoperator/operator_withCUDA.cuh | 350 +- commit/operator/config.py | 12 +- commit/operator/operator.pyx | 384 +- commit/operator/operator.pyxbld | 78 +- commit/operator/operator_noLUT.c | 374 +- commit/operator/operator_withLUT.c | 4494 +++++++++---------- commit/proximals.pyx | 280 +- commit/solvers.py | 806 ++-- commit/trk2dictionary/trk2dictionary.pyx | 858 ++-- commit/trk2dictionary/trk2dictionary_c.cpp | 1196 ++--- extras/CMakeLists.txt | 22 +- extras/COMMIT_debugger/OPENGL_callbacks.cxx | 2264 +++++----- extras/COMMIT_debugger/OPENGL_utils.h | 190 +- extras/COMMIT_debugger/main.cxx | 1302 +++--- extras/include/COLOR_ui.h | 146 +- requirements.txt | 10 +- setup.cfg | 10 +- setup.py | 408 +- 28 files changed, 8791 insertions(+), 8748 deletions(-) diff --git a/.gitattributes b/.gitattributes index 6b4d6de6..e6cb0270 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,4 @@ -extras/* linguist-vendored - -# Never modify line endings of our bash scripts +extras/* linguist-vendored + +# Never modify line endings of our bash scripts *.sh -lf \ No newline at end of file diff --git a/.gitignore b/.gitignore index de91de1c..7a67b8de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,21 +1,21 @@ -build -.ipynb_checkpoints -.DS_Store -.DS_Store? -._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db -__pycache__/ -.vscode/ -.eggs/ -*.egg-info/ -*.so -*.cpp -dist/ - -trk2dictionary.c - -# Never modify line endings of our bash scripts +build +.ipynb_checkpoints +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db +__pycache__/ +.vscode/ +.eggs/ +*.egg-info/ +*.so +*.cpp +dist/ + +trk2dictionary.c + +# Never modify line endings of our bash scripts *.sh -lf \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index cb5a637d..e6a263b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,155 +1,155 @@ -# Change Log -All notable changes to COMMIT will be documented in this file. 
- -## [1.5.0] - 2021-01-04 - -### Changed -- setup.py: Add compilation for .cu files - -### Added -- GPU acceleration with CUDA for faster model fitting - -## [1.4.5] - 2020-12-29 - -### Fixed -- operator.pyxbld: Changed the condition to create a new operator - -### Added -- core.pyx: Add to the function build_operator the parameter build_dir - -### Changed -- core.pyx: The function build_operator checks if the LUT configuration - changed before build a new operator - -## [1.4.4] - 2020-10-28 - -### Changed -- Option to set one single direction in the resolution of the LUT - -## [1.4.3] - 2020-10-22 - -### Added -- store model parameters to results.pickle - -## [1.4.2] - 2020-10-22 - -### Fixed -- trk2dictionary.run(): check for invalid parameters passed to the blur - -## [1.4.1] - 2020-10-21 - -### Fixed -- operator.pyxbld: Changed the condition to create a new operator - -### Added -- COMMIT version is stored in results.pickle -- COMMIT version is stored in output NIFTI files - -## [1.4.0.4] - 2020-09-24 - -### Fixed -- trk2dictionary.run(): bug in the blurring functionality -- trk2dictionary.run(): 'blur_sigma' defaults to 0 - -## [1.4.0.3] - 2020-08-07 - -### Fixed -- COMMIT_debugger: compilation problem -- COMMIT_debugger: wrong visualization in Linux - -## [1.4.0.2] - 2020-08-07 - -### Changed -- Moved the documentation to the Wiki - -## [1.4.0.1] - 2020-08-03 - -### Changed -- Updated the installation guide - -## [1.4.0.0] - 2020-07-30 - -### Changed -- trk2dictionary.run(): removed 'gen_trk' option -- save_results(): removed 'save_coeff' and 'save_opt_details' parameters -- save_results(): now saving only streamline_weights.txt (not anymore xic.txt, xec.txt, xiso.txt) -- load_dictionary(): renamed 'use_mask' to 'use_all_voxels_in_mask' -- Removed unused 'dictionary_ndirs.dict' file -- trk2dictionary.run(): 'min_fiber_len' defaults to 0.0 for backward compatibility - -### Added -- added 'get_coeffs()' function to get all estimated coefficients -- save_results(): added 'stat_coeffs' parameter for saving streamline weights -- trk2dictionary.run(): added 'max_fiber_len' parameter to discard long streamlines -- load_data(): added 'b0_min_signal' to discard voxels with very low signal - -## [1.3.9] - 2020-06-09 - -### Changed -- Modify setup.py and fix spams dependencies - -## [1.3.8] - 2020-05-12 - -### Changed -- Improvements to the COMMIT_debugger. - -## [1.3.7] - 2020-04-25 - -### Changed -- Adapt demos to use d_perps instead of ICVFs for setting model parameters. - -## [1.3.6] - 2020-04-22 - -### Fixed -- Bug when the selected model has EC compartments but no peaks are provided (in trk2dictionary). - -## [1.3.5] - 2020-04-08 - -### Added -- Parameter 'min_fiber_len' in trk2dictionary to discard streamlines shorter than a given length in mm. - -### Fixed -- Bug when 'points_to_skip' was higher then streamline length. -- Few corrections to docstring of trk2dictionary. - -## [1.3.4] - 2020-04-02 - -### Changed -- Added colorized output. NB: needs AMICO 1.2.0 or above. - -## [1.3.3] - 2020-03-31 - -### Added -- Added possibility to save the predicted DW-MR signal in save_results. - -### Fixed -- Minor cleanup. - - -## [1.3.2] - 2020-03-27 - -### Added -- Check if dictionary (upon loading) and data have the same geometry. - -### Fixed -- Bug while saving coefficients in save_results. 
- - -## [1.3.1] - 2020-03-27 - -### Fixed -- Improved the loading of the streamlines in trk2dictionary - - -## [1.3] - 2019-10-30 - -This version of COMMIT *is not compatible* with [AMICO](https://github.com/daducci/AMICO) v1.0.1 of below. If you update COMMIT to this version, please update AMICO to version 1.1.0 or above. - -### Added -- Changelog file to keep tracking of the COMMIT versions. - -### Changed -- Added compatibility with low resolution LUTs. - -### Fixed -- Nothing. +# Change Log +All notable changes to COMMIT will be documented in this file. + +## [1.5.0] - 2021-01-04 + +### Changed +- setup.py: Add compilation for .cu files + +### Added +- GPU acceleration with CUDA for faster model fitting + +## [1.4.5] - 2020-12-29 + +### Fixed +- operator.pyxbld: Changed the condition to create a new operator + +### Added +- core.pyx: Add to the function build_operator the parameter build_dir + +### Changed +- core.pyx: The function build_operator checks if the LUT configuration + changed before build a new operator + +## [1.4.4] - 2020-10-28 + +### Changed +- Option to set one single direction in the resolution of the LUT + +## [1.4.3] - 2020-10-22 + +### Added +- store model parameters to results.pickle + +## [1.4.2] - 2020-10-22 + +### Fixed +- trk2dictionary.run(): check for invalid parameters passed to the blur + +## [1.4.1] - 2020-10-21 + +### Fixed +- operator.pyxbld: Changed the condition to create a new operator + +### Added +- COMMIT version is stored in results.pickle +- COMMIT version is stored in output NIFTI files + +## [1.4.0.4] - 2020-09-24 + +### Fixed +- trk2dictionary.run(): bug in the blurring functionality +- trk2dictionary.run(): 'blur_sigma' defaults to 0 + +## [1.4.0.3] - 2020-08-07 + +### Fixed +- COMMIT_debugger: compilation problem +- COMMIT_debugger: wrong visualization in Linux + +## [1.4.0.2] - 2020-08-07 + +### Changed +- Moved the documentation to the Wiki + +## [1.4.0.1] - 2020-08-03 + +### Changed +- Updated the installation guide + +## [1.4.0.0] - 2020-07-30 + +### Changed +- trk2dictionary.run(): removed 'gen_trk' option +- save_results(): removed 'save_coeff' and 'save_opt_details' parameters +- save_results(): now saving only streamline_weights.txt (not anymore xic.txt, xec.txt, xiso.txt) +- load_dictionary(): renamed 'use_mask' to 'use_all_voxels_in_mask' +- Removed unused 'dictionary_ndirs.dict' file +- trk2dictionary.run(): 'min_fiber_len' defaults to 0.0 for backward compatibility + +### Added +- added 'get_coeffs()' function to get all estimated coefficients +- save_results(): added 'stat_coeffs' parameter for saving streamline weights +- trk2dictionary.run(): added 'max_fiber_len' parameter to discard long streamlines +- load_data(): added 'b0_min_signal' to discard voxels with very low signal + +## [1.3.9] - 2020-06-09 + +### Changed +- Modify setup.py and fix spams dependencies + +## [1.3.8] - 2020-05-12 + +### Changed +- Improvements to the COMMIT_debugger. + +## [1.3.7] - 2020-04-25 + +### Changed +- Adapt demos to use d_perps instead of ICVFs for setting model parameters. + +## [1.3.6] - 2020-04-22 + +### Fixed +- Bug when the selected model has EC compartments but no peaks are provided (in trk2dictionary). + +## [1.3.5] - 2020-04-08 + +### Added +- Parameter 'min_fiber_len' in trk2dictionary to discard streamlines shorter than a given length in mm. + +### Fixed +- Bug when 'points_to_skip' was higher then streamline length. +- Few corrections to docstring of trk2dictionary. 
+ +## [1.3.4] - 2020-04-02 + +### Changed +- Added colorized output. NB: needs AMICO 1.2.0 or above. + +## [1.3.3] - 2020-03-31 + +### Added +- Added possibility to save the predicted DW-MR signal in save_results. + +### Fixed +- Minor cleanup. + + +## [1.3.2] - 2020-03-27 + +### Added +- Check if dictionary (upon loading) and data have the same geometry. + +### Fixed +- Bug while saving coefficients in save_results. + + +## [1.3.1] - 2020-03-27 + +### Fixed +- Improved the loading of the streamlines in trk2dictionary + + +## [1.3] - 2019-10-30 + +This version of COMMIT *is not compatible* with [AMICO](https://github.com/daducci/AMICO) v1.0.1 of below. If you update COMMIT to this version, please update AMICO to version 1.1.0 or above. + +### Added +- Changelog file to keep tracking of the COMMIT versions. + +### Changed +- Added compatibility with low resolution LUTs. + +### Fixed +- Nothing. diff --git a/LICENSE b/LICENSE index 70808f61..04e0c652 100644 --- a/LICENSE +++ b/LICENSE @@ -1,33 +1,33 @@ -Unless otherwise specified by LICENSE.txt files in individual -directories, or within individual files or functions, all code is: - -Copyright (c) 2008-2020, COMMIT developers -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the COMMIT developers nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Unless otherwise specified by LICENSE.txt files in individual +directories, or within individual files or functions, all code is: + +Copyright (c) 2008-2020, COMMIT developers +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ + * Neither the name of the COMMIT developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in index d3b5c5b7..fa48479d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ -include README.md -include LICENSE - -recursive-include commit *.h -recursive-include commit *.cpp -recursive-include commit *.pyx +include README.md +include LICENSE + +recursive-include commit *.h +recursive-include commit *.cpp +recursive-include commit *.pyx recursive-include commit *.c \ No newline at end of file diff --git a/README.md b/README.md index 78bc5128..cdd2cb13 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ -# COMMIT - -The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. - -Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. - -These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. - - -## Main features - -- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). -- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. -- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. -- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). 
-- **Soon**: **GPU implementation** for even faster model fitting. - - -## Documentation - -More information/documentation, as well as a series of tutorials, can be found in the [wiki pages](https://github.com/daducci/COMMIT/wiki/Home). - -### Installation - -To install COMMIT, refer to the [installation guide](https://github.com/daducci/COMMIT/wiki/Installation). - -### Getting started - -To get started with the COMMIT framework, have a look at [this tutorial](https://github.com/daducci/COMMIT/wiki/Getting-started), which will guide you through the main steps of the processing. - +# COMMIT + +The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. + +Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. + +These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. + + +## Main features + +- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). +- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. +- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. +- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). +- **Soon**: **GPU implementation** for even faster model fitting. + + +## Documentation + +More information/documentation, as well as a series of tutorials, can be found in the [wiki pages](https://github.com/daducci/COMMIT/wiki/Home). + +### Installation + +To install COMMIT, refer to the [installation guide](https://github.com/daducci/COMMIT/wiki/Installation). + +### Getting started + +To get started with the COMMIT framework, have a look at [this tutorial](https://github.com/daducci/COMMIT/wiki/Getting-started), which will guide you through the main steps of the processing. 
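According to its subject line, the final patch in this series ("Assign 1 thread per voxel") reorganises the GPU matrix-vector products so that each CUDA thread accumulates the contributions of a single voxel, presumably using the per-voxel segment counts and offsets already built by preprocessDataForGPU(). The actual kernels in operator_withCUDA.cu also have to handle fiber orientations, the lookup tables bound to textures and the multiple DWI samples per voxel; the sketch below is only a simplified illustration of the one-thread-per-voxel launch pattern, with hypothetical parameter names, and it collapses the signal to a single value per voxel.

    #include <stdint.h>

    // Illustrative kernel: one thread per voxel. Each thread walks the segments
    // grouped in its voxel (offset/count arrays) and accumulates their weighted
    // contribution into the output vector y.
    __global__ void dot_one_thread_per_voxel( const uint32_t* offsetPerVoxel,
                                              const uint32_t* segmentsPerVoxel,
                                              const uint32_t* fiberID,
                                              const float*    segmentLength,
                                              const double*   x,
                                              double*         y,
                                              int             nvoxels )
    {
        int v = blockIdx.x * blockDim.x + threadIdx.x;
        if ( v >= nvoxels )
            return;

        double acc = 0.0;
        const uint32_t offset = offsetPerVoxel[v];
        const uint32_t nseg   = segmentsPerVoxel[v];
        for ( uint32_t s = 0; s < nseg; s++ )
            acc += segmentLength[offset + s] * x[ fiberID[offset + s] ];

        // each voxel owns exactly one output entry, so no atomics are needed
        y[v] = acc;
    }

    // Possible launch: enough 256-thread blocks to cover all voxels, e.g.
    //   dot_one_thread_per_voxel<<< (nvoxels + 255)/256, 256 >>>( ... );

Grouping the segments by voxel before copying them to the GPU (the segmentsPerBlock/offsetPerBlock arrays in the constructor shown earlier) is what makes this per-voxel assignment possible without any synchronisation between threads.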
+ diff --git a/commit/__init__.py b/commit/__init__.py index 3ab179d3..e7e71d6c 100755 --- a/commit/__init__.py +++ b/commit/__init__.py @@ -1,5 +1,5 @@ -from .core import Evaluation -__all__ = ['core','models','solvers','trk2dictionary'] - -from pkg_resources import get_distribution -__version__ = get_distribution('dmri-commit').version +from .core import Evaluation +__all__ = ['core','models','solvers','trk2dictionary'] + +from pkg_resources import get_distribution +__version__ = get_distribution('dmri-commit').version diff --git a/commit/core.pyx b/commit/core.pyx index 0e3028c3..c3606410 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -1,1029 +1,1029 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False -from __future__ import print_function -cimport cython -import numpy as np -cimport numpy as np - -import time -import glob -import sys -from os import makedirs, remove, getcwd, listdir -from os.path import exists, join as pjoin, isfile, isdir -import nibabel -import pickle -import commit.models -import commit.solvers -import amico.scheme -import amico.lut -import pyximport -from pkg_resources import get_distribution - -from amico.util import LOG, NOTE, WARNING, ERROR - - -def setup( lmax=12, ndirs=32761 ) : - """General setup/initialization of the COMMIT framework. - - Parameters - ---------- - lmax : int - Maximum SH order to use for the rotation phase (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - - if not amico.lut.is_valid(ndirs): - ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - amico.lut.precompute_rotation_matrices( lmax, ndirs ) - - -def load_dictionary_info( filename ): - """Function to load dictionary info file - - Parameters - ---------- - filename : string - This value is always COMMIT_PATH + dictionary_info.pickle - """ - if not isfile( filename ): - ERROR( 'Dictionary is outdated or not found. Execute "trk2dictionary" script first' ) - with open( filename, 'rb' ) as dictionary_info_file: - if sys.version_info.major == 3: - aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) - # Pickle files written by Python 2 are loaded with byte - # keys, whereas those written by Python 3 are loaded with - # str keys, even when both are written using protocol=2 - result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} - return result_aux - else: - return pickle.load( dictionary_info_file ) - - -cdef class Evaluation : - """Class to hold all the information (data and parameters) when performing an - evaluation with the COMMIT framework. - """ - cdef public niiDWI - cdef public niiDWI_img - cdef public scheme - cdef public model - cdef public KERNELS - cdef public DICTIONARY - cdef public THREADS - cdef public A - cdef public x - cdef public CONFIG - - def __init__( self, study_path, subject ) : - """Setup the data structures with default values. 
- - Parameters - ---------- - study_path : string - The path to the folder containing all the subjects from one study - subject : string - The path (relative to previous folder) to the subject folder - """ - self.niiDWI = None # set by "load_data" method - self.scheme = None # set by "load_data" method - self.model = None # set by "set_model" method - self.KERNELS = None # set by "load_kernels" method - self.DICTIONARY = None # set by "load_dictionary" method - self.THREADS = None # set by "set_threads" method - self.A = None # set by "build_operator" method - self.x = None # set by "fit" method - - # store all the parameters of an evaluation with COMMIT - self.CONFIG = {} - self.set_config('version', get_distribution('dmri-commit').version) - self.set_config('study_path', study_path) - self.set_config('subject', subject) - self.set_config('DATA_path', pjoin( study_path, subject )) - - self.set_config('doNormalizeSignal', True) - self.set_config('doMergeB0', False) - self.set_config('doNormalizeKernels', True) - self.set_config('doDemean', False) - self.set_config('doNormalizeMaps', False) - - - def set_config( self, key, value ) : - self.CONFIG[ key ] = value - - - def get_config( self, key ) : - return self.CONFIG.get( key ) - - - def load_data( self, dwi_filename='DWI.nii', scheme_filename='DWI.scheme', b0_thr=0, b0_min_signal=0 ) : - """Load the diffusion signal and its corresponding acquisition scheme. - - Parameters - ---------- - dwi_filename : string - The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') - scheme_filename : string - The file name of the corresponding acquisition scheme (default : 'DWI.scheme') - b0_thr : float - The threshold below which a b-value is considered a b0 (default : 0) - b0_min_signal : float - Crop to zero the signal in voxels where the b0 <= b0_min_signal * mean(b0[b0>0]) (default : 0) - """ - - # Loading data and acquisition scheme - tic = time.time() - LOG( '\n-> Loading data:' ) - - print( '\t* DWI signal:' ) - self.set_config('dwi_filename', dwi_filename) - self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) - self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) - if self.niiDWI_img.ndim ==3 : - self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) - hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() - self.set_config('dim', self.niiDWI_img.shape[0:3]) - self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) - print( '\t\t- dim : %d x %d x %d x %d' % self.niiDWI_img.shape ) - print( '\t\t- pixdim : %.3f x %.3f x %.3f' % self.get_config('pixdim') ) - print( '\t\t- values : min=%.2f, max=%.2f, mean=%.2f' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) - - print( '\t* Acquisition scheme:' ) - self.set_config('scheme_filename', scheme_filename) - self.set_config('b0_thr', b0_thr) - self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) - print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) - print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end='' ) - for i in xrange(len(self.scheme.shells)) : - print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end='' ) - print() - - if self.scheme.nS != self.niiDWI_img.shape[3] : - ERROR( 'Scheme does not match with DWI data' ) - - if self.scheme.dwi_count == 0 : - ERROR( 'There are no DWI volumes in the data' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic 
) ) - - # Preprocessing - tic = time.time() - LOG( '\n-> Preprocessing:' ) - - if self.get_config('doNormalizeSignal') : - if self.scheme.b0_count > 0 : - print( '\t* Normalizing to b0... ', end='' ) - sys.stdout.flush() - b0 = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) - idx = b0 <= b0_min_signal * b0[b0>0].mean() - b0[ idx ] = 1 - b0 = 1.0 / b0 - b0[ idx ] = 0 - for i in xrange(self.scheme.nS) : - self.niiDWI_img[:,:,:,i] *= b0 - print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) - del idx, b0 - else : - WARNING( 'There are no b0 volumes for normalization' ) - - if self.scheme.b0_count > 1 : - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)... ', end='' ) - mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) - self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) - del mean - else : - print( '\t* Keeping all b0 volume(s)... ', end='' ) - print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) - - if self.get_config('doDemean') : - print( '\t* Demeaning signal... ', end='' ) - sys.stdout.flush() - mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) - self.niiDWI_img = self.niiDWI_img - mean - print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_model( self, model_name ) : - """Set the model to use to describe the signal contributions in each voxel. - - Parameters - ---------- - model_name : string - The name of the model (must match a class name in "commit.models" module) - """ - # Call the specific model constructor - if hasattr(commit.models, model_name ) : - self.model = getattr(commit.models,model_name)() - else : - ERROR( 'Model "%s" not recognized' % model_name ) - - self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) - - - def generate_kernels( self, regenerate=False, lmax=12, ndirs=32761 ) : - """Generate the high-resolution response functions for each compartment. - Dispatch to the proper function, depending on the model. - - Parameters - ---------- - regenerate : boolean - Regenerate kernels if they already exist (default : False) - lmax : int - Maximum SH order to use for the rotation procedure (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - if not amico.lut.is_valid(ndirs): - ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - if self.scheme is None : - ERROR( 'Scheme not loaded; call "load_data()" first' ) - if self.model is None : - ERROR( 'Model not set; call "set_model()" method first' ) - - # store some values for later use - self.set_config('lmax', lmax) - self.set_config('ndirs', ndirs) - self.set_config('model', self.model.get_params()) - self.model.scheme = self.scheme - - LOG( '\n-> Simulating with "%s" model:' % self.model.name ) - - # check if kernels were already generated - tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) - if len(tmp)>0 and not regenerate : - LOG( ' [ Kernels already computed. 
Use option "regenerate=True" to force regeneration ]' ) - return - - # create folder or delete existing files (if any) - if not exists( self.get_config('ATOMS_path') ) : - makedirs( self.get_config('ATOMS_path') ) - else : - for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : - remove( f ) - - # auxiliary data structures - aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) - idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) - - # Dispatch to the right handler for each model - tic = time.time() - self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def load_kernels( self ) : - """Load rotated kernels and project to the specific gradient scheme of this subject. - Dispatch to the proper function, depending on the model. - """ - if self.model is None : - ERROR( 'Model not set; call "set_model()" method first' ) - if self.scheme is None : - ERROR( 'Scheme not loaded; call "load_data()" first' ) - - tic = time.time() - LOG( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) - - # auxiliary data structures - idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) - - # Dispatch to the right handler for each model - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)...' ) - else : - print( '\t* Keeping all b0 volume(s)...' ) - self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) - nIC = self.KERNELS['wmr'].shape[0] - nEC = self.KERNELS['wmh'].shape[0] - nISO = self.KERNELS['iso'].shape[0] - print( '\t [ OK ]' ) - - # ensure contiguous arrays for C part - self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) - self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) - self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) - - # De-mean kernels - if self.get_config('doDemean') : - print( '\t* Demeaning signal...', end='' ) - for j in xrange(self.get_config('ndirs')) : - for i in xrange(nIC) : - self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() - for i in xrange(nEC) : - self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() - for i in xrange(nISO) : - self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() - print( '[ OK ]' ) - - # Normalize atoms - if self.get_config('doNormalizeKernels') : - print( '\t* Normalizing... ', end='' ) - - self.KERNELS['wmr_norm'] = np.zeros( nIC ) - for i in xrange(nIC) : - self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] - - self.KERNELS['wmh_norm'] = np.zeros( nEC ) - for i in xrange(nEC) : - self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] - - self.KERNELS['iso_norm'] = np.zeros( nISO ) - for i in xrange(nISO) : - self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) - self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] - - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - cpdef load_dictionary( self, path, use_all_voxels_in_mask=False ) : - """Load the sparse structure previously created with "trk2dictionary" script. 
- - Parameters - ---------- - path : string - Folder containing the output of the trk2dictionary script (relative to subject path) - use_all_voxels_in_mask : boolean - If False (default) the optimization will be conducted only on the voxels actually - traversed by tracts. If True, then all voxels present in the mask specified in - trk2dictionary.run(), i.e. "filename_mask" parameter, will be used instead. - NB: if no mask was specified in trk2dictionary, this parameter is irrelevant. - """ - if self.niiDWI is None : - ERROR( 'Data not loaded; call "load_data()" first' ) - - tic = time.time() - LOG( '\n-> Loading the dictionary:' ) - self.DICTIONARY = {} - self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) - - # check that ndirs of dictionary matches with that of the kernels - dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) - if dictionary_info['ndirs'] != self.get_config('ndirs'): - ERROR( '"ndirs" of the dictionary (%d) does not match with the kernels (%d)' % (dictionary_info['ndirs'], self.get_config('ndirs')) ) - self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] - - # load mask - self.set_config('dictionary_mask', 'mask' if use_all_voxels_in_mask else 'tdi' ) - mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) - if not exists( mask_filename ) : - mask_filename += '.gz' - if not exists( mask_filename ) : - ERROR( 'Dictionary not found. Execute "trk2dictionary" script first' ); - niiMASK = nibabel.load( mask_filename ) - niiMASK_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() - if ( self.get_config('dim')[0]!=niiMASK.shape[0] or - self.get_config('dim')[1]!=niiMASK.shape[1] or - self.get_config('dim')[2]!=niiMASK.shape[2] or - abs(self.get_config('pixdim')[0]-niiMASK_hdr['pixdim'][1])>1e-3 or - abs(self.get_config('pixdim')[1]-niiMASK_hdr['pixdim'][2])>1e-3 or - abs(self.get_config('pixdim')[2]-niiMASK_hdr['pixdim'][3])>1e-3 ) : - WARNING( 'Dictionary does not have the same geometry as the dataset' ) - self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) - - # segments from the tracts - # ------------------------ - print( '\t* Segments from the tracts... 
', end='' ) - sys.stdout.flush() - - self.DICTIONARY['TRK'] = {} - self.DICTIONARY['TRK']['kept'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_kept.dict'), dtype=np.uint8 ) - self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) - self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) - - - self.DICTIONARY['IC'] = {} - self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) - self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size - self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size - - # reorder the segments based, first, on the "v" field and after based on the "o" field - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - - # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length - # NB: it works in conjunction with the normalization of the kernels - cdef : - np.float32_t [:] sl = self.DICTIONARY['IC']['len'] - np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] - np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] - int s - if self.get_config('doNormalizeKernels') : - for s in xrange(self.DICTIONARY['IC']['n']) : - sl[s] /= tl[ f[s] ] - - print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) - - # segments from the peaks - # ----------------------- - print( '\t* Segments from the peaks... ', end='' ) - sys.stdout.flush() - - self.DICTIONARY['EC'] = {} - self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size - - # reorder the segments based, first, on the "v" field and after based on the "o" field - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx - - print( '[ %d segments ]' % self.DICTIONARY['EC']['nE'] ) - - # isotropic compartments - # ---------------------- - print( '\t* Isotropic contributions... 
', end='' ) - sys.stdout.flush() - - self.DICTIONARY['ISO'] = {} - - self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() - - vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) - vx = vx.astype(np.int32) - vy = vy.astype(np.int32) - vz = vz.astype(np.int32) - self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) - del vx, vy, vz - - # reorder the segments based on the "v" field - idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) - self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] - del idx - - print( '[ %d voxels ]' % self.DICTIONARY['nV'] ) - - # post-processing - # --------------- - print( '\t* Post-processing... ', end='' ) - sys.stdout.flush() - - # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) - idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] - self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) - - lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() - for i in xrange(idx.size) : - lut[ idx[i] ] = i - self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] - self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] - self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] - - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_threads( self, n = None, nthreads = None, gpu_id = 0 ) : - """Set the number of threads to use for the matrix-vector operations with A and A'. - - Parameters - ---------- - n : integer - Same as nthreads. This remains just for compatibility with previous versions - - nthreads : integer - Number of threads to use (nthreads = None ---> all the CPU threads available in the system - nthreads = 0 ---> enable CUDA GPU acceleration) - gpu_id : integer - GPU ID of the Nvidia GPU where COMMIT will be executed, default=0 and it is only required if nthreads=0 - (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') - """ - if nthreads is None : - if n != None : - WARNING( '"n" parameter is deprecated, use "nthreads" instead' ) - nthreads = n - else: - # Set to the number of CPUs in the system - try : - import multiprocessing - nthreads = multiprocessing.cpu_count() - except : - nthreads = 1 - - if nthreads < 0 or nthreads > 255 : - ERROR( 'Number of threads must be between 0 and 255' ) - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.KERNELS is None : - ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) - - self.THREADS = {} - self.THREADS['n'] = nthreads - if nthreads == 0: - self.THREADS['gpu_id'] = gpu_id - LOG( '\n-> Checking CUDA GPU:' ) - - from commit.cudaoperator.operator import check_compatibility - #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - error_id = check_compatibility(gpu_id) - if error_id == 1: - ERROR( 'The selected GPU is not detected' ) - elif error_id == 2: - ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) - elif error_id == 3: - ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) - elif error_id == 4: - ERROR( 'Compute 
capability must be at least 5.0' ) - - if gpu_id == 0: - LOG( ' [ Default GPU selected. Use option "gpu_id" in "set_threads()" to change selection ]' ) - - cdef : - long [:] C - long t, tot, i1, i2, N, c - int i - - tic = time.time() - - if nthreads > 0: - LOG( '\n-> Distributing workload to different threads:' ) - print( '\t* number of threads : %d' % nthreads ) - - # Distribute load for the computation of A*x product - print( '\t* A operator... ', end='' ) - sys.stdout.flush() - - self.THREADS['IC'] = None - self.THREADS['EC'] = None - self.THREADS['ISO'] = None - self.THREADS['ICt'] = None - self.THREADS['ECt'] = None - self.THREADS['ISOt'] = None - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - if nthreads > 1 : - N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) - t = 1 - tot = 0 - C = np.bincount( self.DICTIONARY['IC']['v'] ) - for c in C : - tot += c - if tot >= N : - self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot - t += 1 - tot = 0 - self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - # Distribute load for the computation of At*y product - print( '\t* A\' operator... 
', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) - if nthreads > 1 : - idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) - C = np.bincount( self.DICTIONARY['IC']['fiber'] ) - t = tot = i1 = i2 = 0 - N = np.floor(self.DICTIONARY['IC']['n']/nthreads) - for c in C : - i2 += c - tot += c - if tot >= N : - self.THREADS['ICt'][ i1:i2 ] = t - t += 1 - if t==nthreads-1 : - break - i1 = i2 - tot = c - self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N - self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['nV']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N - self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def build_operator( self, build_dir=None ) : - """Compile/build the operator for computing the matrix-vector multiplications by A and A' - using the informations from self.DICTIONARY, self.KERNELS and self.THREADS. - NB: needs to call this function to update pointers to data structures in case - the data is changed in self.DICTIONARY, self.KERNELS or self.THREADS. - - Parameters - ---------- - build_dir : string - The folder in which to store the compiled files. - If None (default), they will end up in the .pyxbld directory in the user’s home directory. - If using this option, it is recommended to use a temporary directory, quit your python - console between each build, and delete the content of the temporary directory. 
- """ - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.KERNELS is None : - ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) - if self.THREADS is None : - ERROR( 'Threads not set; call "set_threads()" first' ) - - if self.DICTIONARY['IC']['nF'] <= 0 : - ERROR( 'No streamline found in the dictionary; check your data' ) - if self.DICTIONARY['EC']['nE'] <= 0 and self.KERNELS['wmh'].shape[0] > 0 : - ERROR( 'The selected model has EC compartments, but no peaks have been provided; check your data' ) - - tic = time.time() - LOG( '\n-> Building linear operator A:' ) - - if self.THREADS['n'] > 0: - # need to pass these parameters at runtime for compiling the C code - from commit.operator import config - - compilation_is_needed = False - - if config.nTHREADS is None or config.nTHREADS != self.THREADS['n']: - compilation_is_needed = True - if config.nIC is None or config.nIC != self.KERNELS['wmr'].shape[0]: - compilation_is_needed = True - if config.model is None or config.model != self.model.id: - compilation_is_needed = True - if config.nEC is None or config.nEC != self.KERNELS['wmh'].shape[0]: - compilation_is_needed = True - if config.nISO is None or config.nISO != self.KERNELS['iso'].shape[0]: - compilation_is_needed = True - if config.build_dir != build_dir: - compilation_is_needed = True - - if compilation_is_needed or not 'commit.operator.operator' in sys.modules : - - if build_dir is not None: - if isdir(build_dir) and not len(listdir(build_dir)) == 0: - ERROR( '\nbuild_dir is not empty, unsafe build option.' ) - elif config.nTHREADS is not None: - ERROR( '\nThe parameter build_dir has changed, unsafe build option.' ) - else: - WARNING( '\nUsing build_dir, always quit your python console between COMMIT Evaluation.' ) - - config.nTHREADS = self.THREADS['n'] - config.model = self.model.id - config.nIC = self.KERNELS['wmr'].shape[0] - config.nEC = self.KERNELS['wmh'].shape[0] - config.nISO = self.KERNELS['iso'].shape[0] - config.build_dir = build_dir - - pyximport.install( reload_support=True, language_level=3, build_dir=build_dir, build_in_temp=True, inplace=False ) - - if not 'commit.operator.operator' in sys.modules : - import commit.operator.operator - else : - reload( sys.modules['commit.operator.operator'] ) - - self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - else: - import commit.cudaoperator.operator - self.A = commit.cudaoperator.operator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def get_y( self ): - """ - Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. - NB: this can be run only after having loaded the dictionary and the data. - """ - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.niiDWI is None : - ERROR( 'Data not loaded; call "load_data()" first' ) - return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) - - - def fit( self, tol_fun=1e-3, tol_x=1e-6, max_iter=100, verbose=1, x0=None, regularisation=None ) : - """Fit the model to the data. 
- - Parameters - ---------- - tol_fun : float - Tolerance on the objective function (default : 1e-3) - max_iter : integer - Maximum number of iterations (default : 100) - verbose : integer - Level of verbosity: 0=no print, 1=print progress (default : 1) - x0 : np.array - Initial guess for the solution of the problem (default : None) - regularisation : commit.solvers.init_regularisation object - Python dictionary that describes the wanted regularisation term. - Check the documentation of commit.solvers.init_regularisation to see - how to properly define the wanted mathematical formulation - ( default : None ) - """ - if self.niiDWI is None : - ERROR( 'Data not loaded; call "load_data()" first' ) - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.KERNELS is None : - ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) - if self.THREADS is None : - ERROR( 'Threads not set; call "set_threads()" first' ) - if self.A is None : - ERROR( 'Operator not built; call "build_operator()" first' ) - - if x0 is not None : - if x0.shape[0] != self.A.shape[1] : - ERROR( 'x0 dimension does not match the number of columns of the dictionary' ) - if regularisation is None : - regularisation = commit.solvers.init_regularisation(self) - - self.CONFIG['optimization'] = {} - self.CONFIG['optimization']['tol_fun'] = tol_fun - self.CONFIG['optimization']['tol_x'] = tol_x - self.CONFIG['optimization']['max_iter'] = max_iter - self.CONFIG['optimization']['verbose'] = verbose - self.CONFIG['optimization']['regularisation'] = regularisation - - # run solver - t = time.time() - LOG( '\n-> Fit model:' ) - - self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) - - self.CONFIG['optimization']['fit_details'] = opt_details - self.CONFIG['optimization']['fit_time'] = time.time()-t - - LOG( '\n [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) - - - def get_coeffs( self ): - """ - Returns the coefficients, corresponding to the original optimisation problem, - i.e. the input tractogram to trk2dictionary, divided in three classes (ic, ec, iso). - """ - if self.x is None : - ERROR( 'Model not fitted to the data; call "fit()" first' ) - - nF = self.DICTIONARY['IC']['nF'] - nE = self.DICTIONARY['EC']['nE'] - nV = self.DICTIONARY['nV'] - - if self.get_config('doNormalizeKernels') : - # renormalize the coefficients - norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) - norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) - norm3 = np.repeat(self.KERNELS['iso_norm'],nV) - norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) - x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) - else : - x = self.x - - offset1 = nF * self.KERNELS['wmr'].shape[0] - offset2 = offset1 + nE * self.KERNELS['wmh'].shape[0] - kept = np.tile( self.DICTIONARY['TRK']['kept'], self.KERNELS['wmr'].shape[0] ) - xic = np.zeros( kept.size ) - xic[kept==1] = x[:offset1] - xec = x[offset1:offset2] - xiso = x[offset2:] - - return xic, xec, xiso - - - def save_results( self, path_suffix=None, stat_coeffs='sum', save_est_dwi=False, save_coeff=None, save_opt_details=None ) : - """Save the output (coefficients, errors, maps etc). 
- - Parameters - ---------- - path_suffix : string - Text to be appended to "Results" to create the output path (default : None) - stat_coeffs : string - Stat to be used if more coefficients are estimated for each streamline. - Options: 'sum', 'mean', 'median', 'min', 'max', 'all' (default : 'sum') - save_est_dwi : boolean - Save the estimated DW-MRI signal (default : False) - save_opt_details : boolean - DEPRECATED. The details of the optimization and the coefficients are always saved. - save_coeff : boolean - DEPRECATED. The estimated weights for the streamlines are always saved. - """ - RESULTS_path = 'Results_' + self.model.id - if path_suffix : - self.set_config('path_suffix', path_suffix) - RESULTS_path = RESULTS_path + path_suffix - - LOG( '\n-> Saving results to "%s/*":' % RESULTS_path ) - tic = time.time() - - if self.x is None : - ERROR( 'Model not fitted to the data; call "fit()" first' ) - - if save_coeff is not None : - WARNING('"save_coeff" parameter is deprecated') - - if save_opt_details is not None : - WARNING('"save_opt_details" parameter is deprecated') - - nF = self.DICTIONARY['IC']['nF'] - nE = self.DICTIONARY['EC']['nE'] - nV = self.DICTIONARY['nV'] - norm_fib = np.ones( nF ) - # x is the x of the original problem - # self.x is the x preconditioned - if self.get_config('doNormalizeKernels') : - # renormalize the coefficients - norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) - norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) - norm3 = np.repeat(self.KERNELS['iso_norm'],nV) - norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) - x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) - else : - x = self.x - - # create folder or delete existing files (if any) - RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) - if not exists( RESULTS_path ) : - makedirs( RESULTS_path ) - else : - for f in glob.glob( pjoin(RESULTS_path,'*') ) : - remove( f ) - self.set_config('RESULTS_path', RESULTS_path) - - # Map of voxelwise errors - print( '\t* Fitting errors:' ) - - niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() - niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) - niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() - niiMAP_hdr['descrip'] = 'Created with COMMIT %s'%self.get_config('version') - - y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) - y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) - - print( '\t\t- RMSE... ', end='' ) - sys.stdout.flush() - tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = tmp.max() - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) - print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - print( '\t\t- NRMSE... 
', end='' ) - sys.stdout.flush() - tmp = np.sum(y_mea**2,axis=1) - idx = np.where( tmp < 1E-12 ) - tmp[ idx ] = 1 - tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) - tmp[ idx ] = 0 - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = 1 - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) - print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - # Map of compartment contributions - print( '\t* Voxelwise contributions:' ) - - print( '\t\t- Intra-axonal... ', end='' ) - sys.stdout.flush() - niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmr']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, - weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] - ).astype(np.float32) - niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- Extra-axonal... ', end='' ) - sys.stdout.flush() - niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmh']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) - niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- Isotropic... ', end='' ) - sys.stdout.flush() - niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['iso']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] - xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) - niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( ' [ OK ]' ) - - if self.get_config('doNormalizeMaps') : - niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + 1e-16), affine, header=niiMAP_hdr ) - niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) - niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) - else: - niiIC = nibabel.Nifti1Image( niiIC_img, affine, header=niiMAP_hdr ) - niiEC = nibabel.Nifti1Image( niiEC_img, affine, header=niiMAP_hdr ) - niiISO = nibabel.Nifti1Image( niiISO_img, affine, header=niiMAP_hdr ) - - nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) - nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) - nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) - - # Configuration and results - print( '\t* Configuration and results:' ) - - print( '\t\t- streamline_weights.txt... 
', end='' ) - sys.stdout.flush() - xic, _, _ = self.get_coeffs() - if stat_coeffs != 'all' and xic.size > 0 : - xic = np.reshape( xic, (-1,self.DICTIONARY['TRK']['kept'].size) ) - if stat_coeffs == 'sum' : - xic = np.sum( xic, axis=0 ) - elif stat_coeffs == 'mean' : - xic = np.mean( xic, axis=0 ) - elif stat_coeffs == 'median' : - xic = np.median( xic, axis=0 ) - elif stat_coeffs == 'min' : - xic = np.min( xic, axis=0 ) - elif stat_coeffs == 'max' : - xic = np.max( xic, axis=0 ) - else : - ERROR( 'Stat not allowed. Possible values: sum, mean, median, min, max, all.', prefix='\n' ) - np.savetxt( pjoin(RESULTS_path,'streamline_weights.txt'), xic, fmt='%.5e' ) - self.set_config('stat_coeffs', stat_coeffs) - print( '[ OK ]' ) - - # Save to a pickle file the following items: - # item 0: dictionary with all the configuration details - # item 1: np.array obtained through the optimisation process with the normalised kernels - # item 2: np.array renormalisation of coeffs in item 1 - print( '\t\t- results.pickle... ', end='' ) - sys.stdout.flush() - with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : - pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) - print( ' [ OK ]' ) - - if save_est_dwi : - print( '\t\t- Estimated signal... ', end='' ) - sys.stdout.flush() - self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_est - nibabel.save( nibabel.Nifti1Image( self.niiDWI_img , affine ), pjoin(RESULTS_path,'fit_signal_estimated.nii.gz') ) - self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_mea - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False +from __future__ import print_function +cimport cython +import numpy as np +cimport numpy as np + +import time +import glob +import sys +from os import makedirs, remove, getcwd, listdir +from os.path import exists, join as pjoin, isfile, isdir +import nibabel +import pickle +import commit.models +import commit.solvers +import amico.scheme +import amico.lut +import pyximport +from pkg_resources import get_distribution + +from amico.util import LOG, NOTE, WARNING, ERROR + + +def setup( lmax=12, ndirs=32761 ) : + """General setup/initialization of the COMMIT framework. + + Parameters + ---------- + lmax : int + Maximum SH order to use for the rotation phase (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + + if not amico.lut.is_valid(ndirs): + ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + amico.lut.precompute_rotation_matrices( lmax, ndirs ) + + +def load_dictionary_info( filename ): + """Function to load dictionary info file + + Parameters + ---------- + filename : string + This value is always COMMIT_PATH + dictionary_info.pickle + """ + if not isfile( filename ): + ERROR( 'Dictionary is outdated or not found. 
Execute "trk2dictionary" script first' ) + with open( filename, 'rb' ) as dictionary_info_file: + if sys.version_info.major == 3: + aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) + # Pickle files written by Python 2 are loaded with byte + # keys, whereas those written by Python 3 are loaded with + # str keys, even when both are written using protocol=2 + result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} + return result_aux + else: + return pickle.load( dictionary_info_file ) + + +cdef class Evaluation : + """Class to hold all the information (data and parameters) when performing an + evaluation with the COMMIT framework. + """ + cdef public niiDWI + cdef public niiDWI_img + cdef public scheme + cdef public model + cdef public KERNELS + cdef public DICTIONARY + cdef public THREADS + cdef public A + cdef public x + cdef public CONFIG + + def __init__( self, study_path, subject ) : + """Setup the data structures with default values. + + Parameters + ---------- + study_path : string + The path to the folder containing all the subjects from one study + subject : string + The path (relative to previous folder) to the subject folder + """ + self.niiDWI = None # set by "load_data" method + self.scheme = None # set by "load_data" method + self.model = None # set by "set_model" method + self.KERNELS = None # set by "load_kernels" method + self.DICTIONARY = None # set by "load_dictionary" method + self.THREADS = None # set by "set_threads" method + self.A = None # set by "build_operator" method + self.x = None # set by "fit" method + + # store all the parameters of an evaluation with COMMIT + self.CONFIG = {} + self.set_config('version', get_distribution('dmri-commit').version) + self.set_config('study_path', study_path) + self.set_config('subject', subject) + self.set_config('DATA_path', pjoin( study_path, subject )) + + self.set_config('doNormalizeSignal', True) + self.set_config('doMergeB0', False) + self.set_config('doNormalizeKernels', True) + self.set_config('doDemean', False) + self.set_config('doNormalizeMaps', False) + + + def set_config( self, key, value ) : + self.CONFIG[ key ] = value + + + def get_config( self, key ) : + return self.CONFIG.get( key ) + + + def load_data( self, dwi_filename='DWI.nii', scheme_filename='DWI.scheme', b0_thr=0, b0_min_signal=0 ) : + """Load the diffusion signal and its corresponding acquisition scheme. 
+ + Parameters + ---------- + dwi_filename : string + The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') + scheme_filename : string + The file name of the corresponding acquisition scheme (default : 'DWI.scheme') + b0_thr : float + The threshold below which a b-value is considered a b0 (default : 0) + b0_min_signal : float + Crop to zero the signal in voxels where the b0 <= b0_min_signal * mean(b0[b0>0]) (default : 0) + """ + + # Loading data and acquisition scheme + tic = time.time() + LOG( '\n-> Loading data:' ) + + print( '\t* DWI signal:' ) + self.set_config('dwi_filename', dwi_filename) + self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) + self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) + if self.niiDWI_img.ndim ==3 : + self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) + hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() + self.set_config('dim', self.niiDWI_img.shape[0:3]) + self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) + print( '\t\t- dim : %d x %d x %d x %d' % self.niiDWI_img.shape ) + print( '\t\t- pixdim : %.3f x %.3f x %.3f' % self.get_config('pixdim') ) + print( '\t\t- values : min=%.2f, max=%.2f, mean=%.2f' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) + + print( '\t* Acquisition scheme:' ) + self.set_config('scheme_filename', scheme_filename) + self.set_config('b0_thr', b0_thr) + self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) + print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) + print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end='' ) + for i in xrange(len(self.scheme.shells)) : + print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end='' ) + print() + + if self.scheme.nS != self.niiDWI_img.shape[3] : + ERROR( 'Scheme does not match with DWI data' ) + + if self.scheme.dwi_count == 0 : + ERROR( 'There are no DWI volumes in the data' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + # Preprocessing + tic = time.time() + LOG( '\n-> Preprocessing:' ) + + if self.get_config('doNormalizeSignal') : + if self.scheme.b0_count > 0 : + print( '\t* Normalizing to b0... ', end='' ) + sys.stdout.flush() + b0 = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) + idx = b0 <= b0_min_signal * b0[b0>0].mean() + b0[ idx ] = 1 + b0 = 1.0 / b0 + b0[ idx ] = 0 + for i in xrange(self.scheme.nS) : + self.niiDWI_img[:,:,:,i] *= b0 + print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) + del idx, b0 + else : + WARNING( 'There are no b0 volumes for normalization' ) + + if self.scheme.b0_count > 1 : + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)... ', end='' ) + mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) + self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) + del mean + else : + print( '\t* Keeping all b0 volume(s)... ', end='' ) + print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) + + if self.get_config('doDemean') : + print( '\t* Demeaning signal... 
', end='' ) + sys.stdout.flush() + mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) + self.niiDWI_img = self.niiDWI_img - mean + print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_model( self, model_name ) : + """Set the model to use to describe the signal contributions in each voxel. + + Parameters + ---------- + model_name : string + The name of the model (must match a class name in "commit.models" module) + """ + # Call the specific model constructor + if hasattr(commit.models, model_name ) : + self.model = getattr(commit.models,model_name)() + else : + ERROR( 'Model "%s" not recognized' % model_name ) + + self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) + + + def generate_kernels( self, regenerate=False, lmax=12, ndirs=32761 ) : + """Generate the high-resolution response functions for each compartment. + Dispatch to the proper function, depending on the model. + + Parameters + ---------- + regenerate : boolean + Regenerate kernels if they already exist (default : False) + lmax : int + Maximum SH order to use for the rotation procedure (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + if not amico.lut.is_valid(ndirs): + ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + if self.scheme is None : + ERROR( 'Scheme not loaded; call "load_data()" first' ) + if self.model is None : + ERROR( 'Model not set; call "set_model()" method first' ) + + # store some values for later use + self.set_config('lmax', lmax) + self.set_config('ndirs', ndirs) + self.set_config('model', self.model.get_params()) + self.model.scheme = self.scheme + + LOG( '\n-> Simulating with "%s" model:' % self.model.name ) + + # check if kernels were already generated + tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) + if len(tmp)>0 and not regenerate : + LOG( ' [ Kernels already computed. Use option "regenerate=True" to force regeneration ]' ) + return + + # create folder or delete existing files (if any) + if not exists( self.get_config('ATOMS_path') ) : + makedirs( self.get_config('ATOMS_path') ) + else : + for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : + remove( f ) + + # auxiliary data structures + aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) + idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) + + # Dispatch to the right handler for each model + tic = time.time() + self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def load_kernels( self ) : + """Load rotated kernels and project to the specific gradient scheme of this subject. + Dispatch to the proper function, depending on the model. 
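+
+        Typical usage (editor's sketch; "mit" denotes an instance of the
+        Evaluation class defined in this file, and 'StickZeppelinBall' is just
+        one example of a model name from the commit.models module):
+
+            mit.load_data()
+            mit.set_model( 'StickZeppelinBall' )
+            mit.generate_kernels( regenerate=True )
+            mit.load_kernels()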
+ """ + if self.model is None : + ERROR( 'Model not set; call "set_model()" method first' ) + if self.scheme is None : + ERROR( 'Scheme not loaded; call "load_data()" first' ) + + tic = time.time() + LOG( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) + + # auxiliary data structures + idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) + + # Dispatch to the right handler for each model + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)...' ) + else : + print( '\t* Keeping all b0 volume(s)...' ) + self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) + nIC = self.KERNELS['wmr'].shape[0] + nEC = self.KERNELS['wmh'].shape[0] + nISO = self.KERNELS['iso'].shape[0] + print( '\t [ OK ]' ) + + # ensure contiguous arrays for C part + self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) + self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) + self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) + + # De-mean kernels + if self.get_config('doDemean') : + print( '\t* Demeaning signal...', end='' ) + for j in xrange(self.get_config('ndirs')) : + for i in xrange(nIC) : + self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() + for i in xrange(nEC) : + self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() + for i in xrange(nISO) : + self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() + print( '[ OK ]' ) + + # Normalize atoms + if self.get_config('doNormalizeKernels') : + print( '\t* Normalizing... ', end='' ) + + self.KERNELS['wmr_norm'] = np.zeros( nIC ) + for i in xrange(nIC) : + self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] + + self.KERNELS['wmh_norm'] = np.zeros( nEC ) + for i in xrange(nEC) : + self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] + + self.KERNELS['iso_norm'] = np.zeros( nISO ) + for i in xrange(nISO) : + self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) + self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] + + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + cpdef load_dictionary( self, path, use_all_voxels_in_mask=False ) : + """Load the sparse structure previously created with "trk2dictionary" script. + + Parameters + ---------- + path : string + Folder containing the output of the trk2dictionary script (relative to subject path) + use_all_voxels_in_mask : boolean + If False (default) the optimization will be conducted only on the voxels actually + traversed by tracts. If True, then all voxels present in the mask specified in + trk2dictionary.run(), i.e. "filename_mask" parameter, will be used instead. + NB: if no mask was specified in trk2dictionary, this parameter is irrelevant. 
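+
+        Example (editor's sketch; "mit" denotes an instance of the Evaluation
+        class defined in this file, and 'CommitOutput' is a hypothetical folder
+        name, i.e. the output path previously given to trk2dictionary):
+
+            mit.load_dictionary( 'CommitOutput' )
+            # or, to optimize over every voxel of the mask passed to trk2dictionary:
+            mit.load_dictionary( 'CommitOutput', use_all_voxels_in_mask=True )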
+ """ + if self.niiDWI is None : + ERROR( 'Data not loaded; call "load_data()" first' ) + + tic = time.time() + LOG( '\n-> Loading the dictionary:' ) + self.DICTIONARY = {} + self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) + + # check that ndirs of dictionary matches with that of the kernels + dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) + if dictionary_info['ndirs'] != self.get_config('ndirs'): + ERROR( '"ndirs" of the dictionary (%d) does not match with the kernels (%d)' % (dictionary_info['ndirs'], self.get_config('ndirs')) ) + self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] + + # load mask + self.set_config('dictionary_mask', 'mask' if use_all_voxels_in_mask else 'tdi' ) + mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) + if not exists( mask_filename ) : + mask_filename += '.gz' + if not exists( mask_filename ) : + ERROR( 'Dictionary not found. Execute "trk2dictionary" script first' ); + niiMASK = nibabel.load( mask_filename ) + niiMASK_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() + if ( self.get_config('dim')[0]!=niiMASK.shape[0] or + self.get_config('dim')[1]!=niiMASK.shape[1] or + self.get_config('dim')[2]!=niiMASK.shape[2] or + abs(self.get_config('pixdim')[0]-niiMASK_hdr['pixdim'][1])>1e-3 or + abs(self.get_config('pixdim')[1]-niiMASK_hdr['pixdim'][2])>1e-3 or + abs(self.get_config('pixdim')[2]-niiMASK_hdr['pixdim'][3])>1e-3 ) : + WARNING( 'Dictionary does not have the same geometry as the dataset' ) + self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) + + # segments from the tracts + # ------------------------ + print( '\t* Segments from the tracts... 
', end='' ) + sys.stdout.flush() + + self.DICTIONARY['TRK'] = {} + self.DICTIONARY['TRK']['kept'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_kept.dict'), dtype=np.uint8 ) + self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) + self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) + + + self.DICTIONARY['IC'] = {} + self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) + self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) + self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) + self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) + self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size + self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size + + # reorder the segments based, first, on the "v" field and after based on the "o" field + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + del idx + + # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length + # NB: it works in conjunction with the normalization of the kernels + cdef : + np.float32_t [:] sl = self.DICTIONARY['IC']['len'] + np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] + np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] + int s + if self.get_config('doNormalizeKernels') : + for s in xrange(self.DICTIONARY['IC']['n']) : + sl[s] /= tl[ f[s] ] + + print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) + + # segments from the peaks + # ----------------------- + print( '\t* Segments from the peaks... ', end='' ) + sys.stdout.flush() + + self.DICTIONARY['EC'] = {} + self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) + self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) + self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size + + # reorder the segments based, first, on the "v" field and after based on the "o" field + idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) + self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] + self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] + del idx + + print( '[ %d segments ]' % self.DICTIONARY['EC']['nE'] ) + + # isotropic compartments + # ---------------------- + print( '\t* Isotropic contributions... 
', end='' ) + sys.stdout.flush() + + self.DICTIONARY['ISO'] = {} + + self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() + + vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) + vx = vx.astype(np.int32) + vy = vy.astype(np.int32) + vz = vz.astype(np.int32) + self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) + del vx, vy, vz + + # reorder the segments based on the "v" field + idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) + self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] + del idx + + print( '[ %d voxels ]' % self.DICTIONARY['nV'] ) + + # post-processing + # --------------- + print( '\t* Post-processing... ', end='' ) + sys.stdout.flush() + + # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) + idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] + self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) + + lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() + for i in xrange(idx.size) : + lut[ idx[i] ] = i + self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] + self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] + self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] + + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_threads( self, n = None, nthreads = None, gpu_id = 0 ) : + """Set the number of threads to use for the matrix-vector operations with A and A'. + + Parameters + ---------- + n : integer + Same as nthreads. This remains just for compatibility with previous versions + + nthreads : integer + Number of threads to use (nthreads = None ---> all the CPU threads available in the system + nthreads = 0 ---> enable CUDA GPU acceleration) + gpu_id : integer + GPU ID of the Nvidia GPU where COMMIT will be executed, default=0 and it is only required if nthreads=0 + (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') + """ + if nthreads is None : + if n != None : + WARNING( '"n" parameter is deprecated, use "nthreads" instead' ) + nthreads = n + else: + # Set to the number of CPUs in the system + try : + import multiprocessing + nthreads = multiprocessing.cpu_count() + except : + nthreads = 1 + + if nthreads < 0 or nthreads > 255 : + ERROR( 'Number of threads must be between 0 and 255' ) + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.KERNELS is None : + ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) + + self.THREADS = {} + self.THREADS['n'] = nthreads + if nthreads == 0: + self.THREADS['gpu_id'] = gpu_id + LOG( '\n-> Checking CUDA GPU:' ) + + from commit.cudaoperator.operator import check_compatibility + #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) + error_id = check_compatibility(gpu_id) + if error_id == 1: + ERROR( 'The selected GPU is not detected' ) + elif error_id == 2: + ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) + elif error_id == 3: + ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) + elif error_id == 4: + ERROR( 'Compute 
capability must be at least 5.0' ) + + if gpu_id == 0: + LOG( ' [ Default GPU selected. Use option "gpu_id" in "set_threads()" to change selection ]' ) + + cdef : + long [:] C + long t, tot, i1, i2, N, c + int i + + tic = time.time() + + if nthreads > 0: + LOG( '\n-> Distributing workload to different threads:' ) + print( '\t* number of threads : %d' % nthreads ) + + # Distribute load for the computation of A*x product + print( '\t* A operator... ', end='' ) + sys.stdout.flush() + + self.THREADS['IC'] = None + self.THREADS['EC'] = None + self.THREADS['ISO'] = None + self.THREADS['ICt'] = None + self.THREADS['ECt'] = None + self.THREADS['ISOt'] = None + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + if nthreads > 1 : + N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) + t = 1 + tot = 0 + C = np.bincount( self.DICTIONARY['IC']['v'] ) + for c in C : + tot += c + if tot >= N : + self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot + t += 1 + tot = 0 + self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + + print( '[ OK ]' ) + + # Distribute load for the computation of At*y product + print( '\t* A\' operator... 
', end="" ) + sys.stdout.flush() + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) + if nthreads > 1 : + idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) + C = np.bincount( self.DICTIONARY['IC']['fiber'] ) + t = tot = i1 = i2 = 0 + N = np.floor(self.DICTIONARY['IC']['n']/nthreads) + for c in C : + i2 += c + tot += c + if tot >= N : + self.THREADS['ICt'][ i1:i2 ] = t + t += 1 + if t==nthreads-1 : + break + i1 = i2 + tot = c + self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N + self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) + N = np.floor( self.DICTIONARY['nV']/nthreads ) + for i in xrange(1,nthreads) : + self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N + self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def build_operator( self, build_dir=None ) : + """Compile/build the operator for computing the matrix-vector multiplications by A and A' + using the informations from self.DICTIONARY, self.KERNELS and self.THREADS. + NB: needs to call this function to update pointers to data structures in case + the data is changed in self.DICTIONARY, self.KERNELS or self.THREADS. + + Parameters + ---------- + build_dir : string + The folder in which to store the compiled files. + If None (default), they will end up in the .pyxbld directory in the user’s home directory. + If using this option, it is recommended to use a temporary directory, quit your python + console between each build, and delete the content of the temporary directory. 
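+
+        Example (editor's sketch; "mit" denotes an instance of the Evaluation
+        class defined in this file, and '/tmp/commit_build' is a hypothetical
+        temporary directory used only to illustrate the "build_dir" option):
+
+            mit.set_threads()       # distribute the workload over all CPU threads
+            mit.build_operator()    # compiled files go to .pyxbld in the home directory
+            # or compile into a disposable folder (quit the console between builds):
+            # mit.build_operator( build_dir='/tmp/commit_build' )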
+ """ + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.KERNELS is None : + ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) + if self.THREADS is None : + ERROR( 'Threads not set; call "set_threads()" first' ) + + if self.DICTIONARY['IC']['nF'] <= 0 : + ERROR( 'No streamline found in the dictionary; check your data' ) + if self.DICTIONARY['EC']['nE'] <= 0 and self.KERNELS['wmh'].shape[0] > 0 : + ERROR( 'The selected model has EC compartments, but no peaks have been provided; check your data' ) + + tic = time.time() + LOG( '\n-> Building linear operator A:' ) + + if self.THREADS['n'] > 0: + # need to pass these parameters at runtime for compiling the C code + from commit.operator import config + + compilation_is_needed = False + + if config.nTHREADS is None or config.nTHREADS != self.THREADS['n']: + compilation_is_needed = True + if config.nIC is None or config.nIC != self.KERNELS['wmr'].shape[0]: + compilation_is_needed = True + if config.model is None or config.model != self.model.id: + compilation_is_needed = True + if config.nEC is None or config.nEC != self.KERNELS['wmh'].shape[0]: + compilation_is_needed = True + if config.nISO is None or config.nISO != self.KERNELS['iso'].shape[0]: + compilation_is_needed = True + if config.build_dir != build_dir: + compilation_is_needed = True + + if compilation_is_needed or not 'commit.operator.operator' in sys.modules : + + if build_dir is not None: + if isdir(build_dir) and not len(listdir(build_dir)) == 0: + ERROR( '\nbuild_dir is not empty, unsafe build option.' ) + elif config.nTHREADS is not None: + ERROR( '\nThe parameter build_dir has changed, unsafe build option.' ) + else: + WARNING( '\nUsing build_dir, always quit your python console between COMMIT Evaluation.' ) + + config.nTHREADS = self.THREADS['n'] + config.model = self.model.id + config.nIC = self.KERNELS['wmr'].shape[0] + config.nEC = self.KERNELS['wmh'].shape[0] + config.nISO = self.KERNELS['iso'].shape[0] + config.build_dir = build_dir + + pyximport.install( reload_support=True, language_level=3, build_dir=build_dir, build_in_temp=True, inplace=False ) + + if not 'commit.operator.operator' in sys.modules : + import commit.operator.operator + else : + reload( sys.modules['commit.operator.operator'] ) + + self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + else: + import commit.cudaoperator.operator + self.A = commit.cudaoperator.operator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def get_y( self ): + """ + Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. + NB: this can be run only after having loaded the dictionary and the data. + """ + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.niiDWI is None : + ERROR( 'Data not loaded; call "load_data()" first' ) + return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) + + + def fit( self, tol_fun=1e-3, tol_x=1e-6, max_iter=100, verbose=1, x0=None, regularisation=None ) : + """Fit the model to the data. 
+ + Parameters + ---------- + tol_fun : float + Tolerance on the objective function (default : 1e-3) + max_iter : integer + Maximum number of iterations (default : 100) + verbose : integer + Level of verbosity: 0=no print, 1=print progress (default : 1) + x0 : np.array + Initial guess for the solution of the problem (default : None) + regularisation : commit.solvers.init_regularisation object + Python dictionary that describes the wanted regularisation term. + Check the documentation of commit.solvers.init_regularisation to see + how to properly define the wanted mathematical formulation + ( default : None ) + """ + if self.niiDWI is None : + ERROR( 'Data not loaded; call "load_data()" first' ) + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.KERNELS is None : + ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) + if self.THREADS is None : + ERROR( 'Threads not set; call "set_threads()" first' ) + if self.A is None : + ERROR( 'Operator not built; call "build_operator()" first' ) + + if x0 is not None : + if x0.shape[0] != self.A.shape[1] : + ERROR( 'x0 dimension does not match the number of columns of the dictionary' ) + if regularisation is None : + regularisation = commit.solvers.init_regularisation(self) + + self.CONFIG['optimization'] = {} + self.CONFIG['optimization']['tol_fun'] = tol_fun + self.CONFIG['optimization']['tol_x'] = tol_x + self.CONFIG['optimization']['max_iter'] = max_iter + self.CONFIG['optimization']['verbose'] = verbose + self.CONFIG['optimization']['regularisation'] = regularisation + + # run solver + t = time.time() + LOG( '\n-> Fit model:' ) + + self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) + + self.CONFIG['optimization']['fit_details'] = opt_details + self.CONFIG['optimization']['fit_time'] = time.time()-t + + LOG( '\n [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) + + + def get_coeffs( self ): + """ + Returns the coefficients, corresponding to the original optimisation problem, + i.e. the input tractogram to trk2dictionary, divided in three classes (ic, ec, iso). + """ + if self.x is None : + ERROR( 'Model not fitted to the data; call "fit()" first' ) + + nF = self.DICTIONARY['IC']['nF'] + nE = self.DICTIONARY['EC']['nE'] + nV = self.DICTIONARY['nV'] + + if self.get_config('doNormalizeKernels') : + # renormalize the coefficients + norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) + norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) + norm3 = np.repeat(self.KERNELS['iso_norm'],nV) + norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) + x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) + else : + x = self.x + + offset1 = nF * self.KERNELS['wmr'].shape[0] + offset2 = offset1 + nE * self.KERNELS['wmh'].shape[0] + kept = np.tile( self.DICTIONARY['TRK']['kept'], self.KERNELS['wmr'].shape[0] ) + xic = np.zeros( kept.size ) + xic[kept==1] = x[:offset1] + xec = x[offset1:offset2] + xiso = x[offset2:] + + return xic, xec, xiso + + + def save_results( self, path_suffix=None, stat_coeffs='sum', save_est_dwi=False, save_coeff=None, save_opt_details=None ) : + """Save the output (coefficients, errors, maps etc). 
+ + Parameters + ---------- + path_suffix : string + Text to be appended to "Results" to create the output path (default : None) + stat_coeffs : string + Stat to be used if more coefficients are estimated for each streamline. + Options: 'sum', 'mean', 'median', 'min', 'max', 'all' (default : 'sum') + save_est_dwi : boolean + Save the estimated DW-MRI signal (default : False) + save_opt_details : boolean + DEPRECATED. The details of the optimization and the coefficients are always saved. + save_coeff : boolean + DEPRECATED. The estimated weights for the streamlines are always saved. + """ + RESULTS_path = 'Results_' + self.model.id + if path_suffix : + self.set_config('path_suffix', path_suffix) + RESULTS_path = RESULTS_path + path_suffix + + LOG( '\n-> Saving results to "%s/*":' % RESULTS_path ) + tic = time.time() + + if self.x is None : + ERROR( 'Model not fitted to the data; call "fit()" first' ) + + if save_coeff is not None : + WARNING('"save_coeff" parameter is deprecated') + + if save_opt_details is not None : + WARNING('"save_opt_details" parameter is deprecated') + + nF = self.DICTIONARY['IC']['nF'] + nE = self.DICTIONARY['EC']['nE'] + nV = self.DICTIONARY['nV'] + norm_fib = np.ones( nF ) + # x is the x of the original problem + # self.x is the x preconditioned + if self.get_config('doNormalizeKernels') : + # renormalize the coefficients + norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) + norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) + norm3 = np.repeat(self.KERNELS['iso_norm'],nV) + norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) + x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) + else : + x = self.x + + # create folder or delete existing files (if any) + RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) + if not exists( RESULTS_path ) : + makedirs( RESULTS_path ) + else : + for f in glob.glob( pjoin(RESULTS_path,'*') ) : + remove( f ) + self.set_config('RESULTS_path', RESULTS_path) + + # Map of voxelwise errors + print( '\t* Fitting errors:' ) + + niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() + niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) + niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() + niiMAP_hdr['descrip'] = 'Created with COMMIT %s'%self.get_config('version') + + y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) + y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) + + print( '\t\t- RMSE... ', end='' ) + sys.stdout.flush() + tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = tmp.max() + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) + print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + print( '\t\t- NRMSE... 
', end='' ) + sys.stdout.flush() + tmp = np.sum(y_mea**2,axis=1) + idx = np.where( tmp < 1E-12 ) + tmp[ idx ] = 1 + tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) + tmp[ idx ] = 0 + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = 1 + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) + print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + # Map of compartment contributions + print( '\t* Voxelwise contributions:' ) + + print( '\t\t- Intra-axonal... ', end='' ) + sys.stdout.flush() + niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmr']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, + weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] + ).astype(np.float32) + niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- Extra-axonal... ', end='' ) + sys.stdout.flush() + niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmh']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) + niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- Isotropic... ', end='' ) + sys.stdout.flush() + niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['iso']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] + xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) + niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( ' [ OK ]' ) + + if self.get_config('doNormalizeMaps') : + niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + 1e-16), affine, header=niiMAP_hdr ) + niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) + niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) + else: + niiIC = nibabel.Nifti1Image( niiIC_img, affine, header=niiMAP_hdr ) + niiEC = nibabel.Nifti1Image( niiEC_img, affine, header=niiMAP_hdr ) + niiISO = nibabel.Nifti1Image( niiISO_img, affine, header=niiMAP_hdr ) + + nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) + nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) + nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) + + # Configuration and results + print( '\t* Configuration and results:' ) + + print( '\t\t- streamline_weights.txt... 
', end='' ) + sys.stdout.flush() + xic, _, _ = self.get_coeffs() + if stat_coeffs != 'all' and xic.size > 0 : + xic = np.reshape( xic, (-1,self.DICTIONARY['TRK']['kept'].size) ) + if stat_coeffs == 'sum' : + xic = np.sum( xic, axis=0 ) + elif stat_coeffs == 'mean' : + xic = np.mean( xic, axis=0 ) + elif stat_coeffs == 'median' : + xic = np.median( xic, axis=0 ) + elif stat_coeffs == 'min' : + xic = np.min( xic, axis=0 ) + elif stat_coeffs == 'max' : + xic = np.max( xic, axis=0 ) + else : + ERROR( 'Stat not allowed. Possible values: sum, mean, median, min, max, all.', prefix='\n' ) + np.savetxt( pjoin(RESULTS_path,'streamline_weights.txt'), xic, fmt='%.5e' ) + self.set_config('stat_coeffs', stat_coeffs) + print( '[ OK ]' ) + + # Save to a pickle file the following items: + # item 0: dictionary with all the configuration details + # item 1: np.array obtained through the optimisation process with the normalised kernels + # item 2: np.array renormalisation of coeffs in item 1 + print( '\t\t- results.pickle... ', end='' ) + sys.stdout.flush() + with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : + pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) + print( ' [ OK ]' ) + + if save_est_dwi : + print( '\t\t- Estimated signal... ', end='' ) + sys.stdout.flush() + self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_est + nibabel.save( nibabel.Nifti1Image( self.niiDWI_img , affine ), pjoin(RESULTS_path,'fit_signal_estimated.nii.gz') ) + self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_mea + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index 027bf484..a6278830 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -1,224 +1,224 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -cimport numpy as np -from amico.util import ERROR, LOG - -cdef extern from "operator_withCUDA.cuh": - int checkCompatibility(int) - -def check_compatibility(gpu_id): - return checkCompatibility(gpu_id) - -def check_cuda(error_id): - if error_id == -1: - ERROR( 'Impossible to allocate auxiliar memory in CPU' ) - elif error_id == 1: - ERROR( 'Impossible to allocate memory in GPU' ) - elif error_id == 2: - ERROR( 'Impossible to transfer memory to GPU' ) - elif error_id == 3: - ERROR( 'Impossible to bind textures' ) - elif error_id == 4: - ERROR( 'Impossible to transfer constant values to GPU' ) - elif error_id == 5: - ERROR( 'There was a problem deleting GPU memory' ) - elif error_id == 6: - ERROR( 'There was a problem unbinding texture memory' ) - elif error_id == 7: - ERROR( 'There was a problem resetting GPU' ) - elif error_id == 0: - print( '[ OK ]' ) - -cdef extern from "operator_withCUDA.cuh": - cdef cppclass C_CudaLinearOperator "CudaLinearOperator": - C_CudaLinearOperator(int, int, int, int, int, int, int, int, int) - - int setDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, np.uint32_t*, np.uint16_t*) - int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) - int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) - int setVectors() - int setGlobals() - int destroy() - - void dot(np.float64_t*, np.float64_t*) - void Tdot(np.float64_t*, np.float64_t*) - -cdef class CudaLinearOperator : - """This class is a wrapper to the 
CUDA C++ code for performing marix-vector multiplications - with the COMMIT linear operator A in a CUDA GPU. The multiplications are done using CUDA C++ code - that uses information from the DICTIONARY and KERNELS data structures. - """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs, gpu_id - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - # pointer to this operator in GPU memory - cdef C_CudaLinearOperator* thisptr - - # these should be always None, they remain for compatibility - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - - - def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - self.gpu_id = THREADS['gpu_id'] # id of the CUDA GPU - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = &wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - # create the operator in GPU memory - self.thisptr = new C_CudaLinearOperator(self.n, self.nV, self.nF, self.nE, self.ndirs, self.nS, self.nR, self.nT, self.nI) - - # build operator in GPU only one time - if fcall == 1: - print( '\t* global values... ', end='' ) - check_cuda( self.thisptr.setGlobals() ) - - print( '\t* lookup tables... ', end='' ) - check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) - - print( '\t* x&y vectors... ', end='' ) - check_cuda( self.thisptr.setVectors() ) - - print( '\t* A operator... 
', end='' ) - check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) - - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - ICf = self.DICTIONARY['IC']['fiber'] - ICl = self.DICTIONARY['IC']['len'] - ICv = self.DICTIONARY['IC']['v'] - ICo = self.DICTIONARY['IC']['o'] - - self.ICf = &ICf[0] - self.ICl = &ICl[0] - self.ICv = &ICv[0] - self.ICo = &ICo[0] - - print( '\t* A\' operator... ', end='' ) - check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) - - def __del__( self ): - self.thisptr.destroy() - - @property - def T( self ) : - """Transpose of the explicit matrix.""" - C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. - - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - ERROR( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - self.thisptr.dot(&v_in[0], &v_out[0]) - else : - # INVERSE PRODUCT A'*y - self.thisptr.Tdot(&v_in[0], &v_out[0]) - - return v_out - - +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np +from amico.util import ERROR, LOG + +cdef extern from "operator_withCUDA.cuh": + int checkCompatibility(int) + +def check_compatibility(gpu_id): + return checkCompatibility(gpu_id) + +def check_cuda(error_id): + if error_id == -1: + ERROR( 'Impossible to allocate auxiliar memory in CPU' ) + elif error_id == 1: + ERROR( 'Impossible to allocate memory in GPU' ) + elif error_id == 2: + ERROR( 'Impossible to transfer memory to GPU' ) + elif error_id == 3: + ERROR( 'Impossible to bind textures' ) + elif error_id == 4: + ERROR( 'Impossible to transfer constant values to GPU' ) + elif error_id == 5: + ERROR( 'There was a problem deleting GPU memory' ) + elif error_id == 6: + ERROR( 'There was a problem unbinding texture memory' ) + elif error_id == 7: + ERROR( 'There was a problem resetting GPU' ) + elif error_id == 0: + print( '[ OK ]' ) + +cdef extern from "operator_withCUDA.cuh": + cdef cppclass C_CudaLinearOperator "CudaLinearOperator": + C_CudaLinearOperator(int, int, int, int, int, int, int, int, int) + + int setDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, np.uint32_t*, np.uint16_t*) + int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) + int setVectors() + int setGlobals() + int destroy() + + 
void dot(np.float64_t*, np.float64_t*) + void Tdot(np.float64_t*, np.float64_t*) + +cdef class CudaLinearOperator : + """This class is a wrapper to the CUDA C++ code for performing marix-vector multiplications + with the COMMIT linear operator A in a CUDA GPU. The multiplications are done using CUDA C++ code + that uses information from the DICTIONARY and KERNELS data structures. + """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs, gpu_id + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + # pointer to this operator in GPU memory + cdef C_CudaLinearOperator* thisptr + + # these should be always None, they remain for compatibility + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + self.gpu_id = THREADS['gpu_id'] # id of the CUDA GPU + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # create the operator in GPU memory + self.thisptr = new C_CudaLinearOperator(self.n, self.nV, self.nF, self.nE, self.ndirs, self.nS, self.nR, self.nT, self.nI) + + # build operator in GPU only one time + if fcall == 1: + print( '\t* global values... ', end='' ) + check_cuda( self.thisptr.setGlobals() ) + + print( '\t* lookup tables... ', end='' ) + check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) + + print( '\t* x&y vectors... 
', end='' ) + check_cuda( self.thisptr.setVectors() ) + + print( '\t* A operator... ', end='' ) + check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) + + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + ICf = self.DICTIONARY['IC']['fiber'] + ICl = self.DICTIONARY['IC']['len'] + ICv = self.DICTIONARY['IC']['v'] + ICo = self.DICTIONARY['IC']['o'] + + self.ICf = &ICf[0] + self.ICl = &ICl[0] + self.ICv = &ICv[0] + self.ICo = &ICo[0] + + print( '\t* A\' operator... ', end='' ) + check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) + + def __del__( self ): + self.thisptr.destroy() + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + ERROR( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + self.thisptr.dot(&v_in[0], &v_out[0]) + else : + # INVERSE PRODUCT A'*y + self.thisptr.Tdot(&v_in[0], &v_out[0]) + + return v_out + + diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index e0efdafb..6ccd0363 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -1,651 +1,694 @@ -#include "operator_withCUDA.cuh" - -// ==================================================== -// Textures for LUT in the GPU -// ==================================================== -texture tex_lutIC; -texture tex_lutEC; -texture tex_lutISO; - - -int checkCompatibility(int gpuID) { - int gpuCount; - cudaError_t cudaStatus; - - cudaStatus = cudaGetDeviceCount(&gpuCount); - - if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaSetDevice(gpuID); - - if (cudaStatus != cudaSuccess) return 2; - - cudaDeviceProp gpuProperties; - cudaStatus = cudaGetDeviceProperties(&gpuProperties, gpuID); - - if (cudaStatus != cudaSuccess) return 3; - - printf("\t* selected GPU... [ %s ]\n", gpuProperties.name); - printf("\t* total memory... [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); - printf("\t* compute capability... 
[ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); - - if(gpuProperties.major < 5) return 4; - - return 0; -} - -void cudaCheckLastError() -{ - cudaError_t err = cudaGetLastError(); - - if(err != cudaSuccess){ - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ - - // fill arrays with zeros - memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - - // count compartments per block - for(int i = 0; i < NUM_COMPARTMENTS; i++) - compartmentsPerBlock[data[i]]++; - - // calculate offset per block - offsetPerBlock[0] = 0; - for(int i = 1; i < NUM_BLOCKS; i++) - offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; -} - -int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC){ - - cudaError_t cudaStatus; - - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - if (segmentsPerBlock == NULL || offsetPerBlock == NULL) return -1; - - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - if (npeaks > 0){ - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - } - - free(segmentsPerBlock); - free(offsetPerBlock); - - // alloc IC part of the dictionary in GPU - cudaStatus = cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - - // transfer IC part of the dictionary to GPU - cudaStatus = cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) 
return 2; - cudaStatus = cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - if (npeaks > 0){ - // alloc EC part of the dictionary in GPU - cudaStatus = cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)); - if (cudaStatus != cudaSuccess) return 1; - - // transfer EC part of the dictionary to GPU - cudaStatus = cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - } - - return 0; -} - -int CudaLinearOperator::setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC){ - - cudaError_t cudaStatus; - - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - if(fibersPerBlock == NULL || offsetPerBlock == NULL) return -1; - - preprocessDataForGPU(TfiberIC, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - cudaStatus = cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - free(fibersPerBlock); - free(offsetPerBlock); - - cudaStatus = cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ; - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ; - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ; - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_TvoxelIC, TvoxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_TfiberIC, TfiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_TorienIC, TorienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_TlengthIC, TlengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - return 0; -} - -int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO){ - - cudaError_t cudaStatus; - - if (ndiameters > 0){ - cudaStatus = cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - 
- tex_lutIC.addressMode[0] = cudaAddressModeBorder; - tex_lutIC.addressMode[1] = cudaAddressModeBorder; - tex_lutIC.filterMode = cudaFilterModePoint; - tex_lutIC.normalized = false; - - cudaStatus = cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 3; - } - - if (nzeppelins > 0){ - cudaStatus = cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - tex_lutEC.addressMode[0] = cudaAddressModeBorder; - tex_lutEC.addressMode[1] = cudaAddressModeBorder; - tex_lutEC.filterMode = cudaFilterModePoint; - tex_lutEC.normalized = false; - - cudaStatus = cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 3; - } - - if (nballs > 0){ - cudaStatus = cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - tex_lutISO.addressMode[0] = cudaAddressModeBorder; - tex_lutISO.addressMode[1] = cudaAddressModeBorder; - tex_lutISO.filterMode = cudaFilterModePoint; - tex_lutISO.normalized = false; - - cudaStatus = cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 3; - } - - return 0; -} - -int CudaLinearOperator::setVectors(){ - - cudaError_t cudaStatus; - - cudaStatus = cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)); - if (cudaStatus != cudaSuccess) return 1; - - return 0; -} - -int CudaLinearOperator::setGlobals(){ - - cudaError_t cudaStatus; - - cudaStatus = cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - - return 0; -} - -CudaLinearOperator::CudaLinearOperator(int nsegments, int nvoxels, int nfibers, 
int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs){ - - this->nsegments = nsegments; - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->npeaks = npeaks; - this->norientations = norientations; - this->nsamples = nsamples; - this->ndiameters = ndiameters; - this->nzeppelins = nzeppelins; - this->nballs = nballs; - this->size_lutic = ndiameters*norientations*nsamples; - this->size_lutec = nzeppelins*norientations*nsamples; - this->size_lutiso = nballs*nsamples; - this->nrows = nvoxels*nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; -} - -CudaLinearOperator::~CudaLinearOperator() {} - -int CudaLinearOperator::destroy(){ - cudaError_t cudaStatus; - - cudaStatus = cudaFree(gpu_voxelIC); - cudaStatus = cudaFree(gpu_fiberIC); - cudaStatus = cudaFree(gpu_orienIC); - cudaStatus = cudaFree(gpu_lengthIC); - cudaStatus = cudaFree(gpu_voxelEC); - cudaStatus = cudaFree(gpu_orienEC); - cudaStatus = cudaFree(gpu_segmentsPerBlockIC); - cudaStatus = cudaFree(gpu_offsetPerBlockIC); - cudaStatus = cudaFree(gpu_segmentsPerBlockEC); - cudaStatus = cudaFree(gpu_offsetPerBlockEC); - - cudaStatus = cudaFree(gpu_TvoxelIC); - cudaStatus = cudaFree(gpu_TfiberIC); - cudaStatus = cudaFree(gpu_TorienIC); - cudaStatus = cudaFree(gpu_TlengthIC); - cudaStatus = cudaFree(gpu_TfibersPerBlockIC); - cudaStatus = cudaFree(gpu_ToffsetPerBlockIC); - - cudaStatus = cudaFree(gpu_x); - cudaStatus = cudaFree(gpu_y); - - cudaStatus = cudaFree(gpu_lutIC); - cudaStatus = cudaFree(gpu_lutEC); - cudaStatus = cudaFree(gpu_lutISO); - cudaStatus = cudaUnbindTexture(tex_lutIC); - cudaStatus = cudaUnbindTexture(tex_lutEC); - cudaStatus = cudaUnbindTexture(tex_lutISO); - - cudaStatus = cudaDeviceReset(); - - return 0; -} - -void cudaCheckKernel(){ - cudaError_t cudaStatus; - - cudaStatus = cudaGetLastError(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); - else - printf("\t* kernel launch... [ OK ]\n"); - - cudaStatus = cudaDeviceSynchronize(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); - else - printf("\t* cudaDeviceSynchronize() after launching kernel... 
[ OK ]\n"); -} - -void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - - // Copy vector x to the GPU - cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - //cudaCheckLastError(); - - // Multiply IC part in the GPU - multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply ISO part in the GPU - multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Copy back result to CPU - cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - //cudaCheckLastError(); -} - -void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - - // Copy vector y to the GPU - cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - //cudaCheckLastError(); - - // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply EC part in the GPU - multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply ISO part in the GPU - multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Copy back result to CPU - cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - //cudaCheckLastError(); -} - -// ------------------------------------------------------- KERNELS ------------------------------------------------------- // -__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[1024]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - uint32_t gid = threadIdx.x / 512; - uint32_t sid = threadIdx.x - 512*gid; - - shmem[tid] = 0.0; - - if(sid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; - uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; - - uint32_t* voxel = voxelIDs + offset; - uint32_t* fiber = fiberIDs + offset; - uint16_t* orien = orienIDs + offset; - float32_t* length = lengths + offset; - - float64_t sum = 0.0; - - for(int i = 0; i < nsegments; i++){ - int offset_lut = (*orien)*NUM_SAMPLES + sid; - - float64_t aux = 0.0; - for(int j = 0; j < NUM_DIAMETERS; j++){ - aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; - //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; - } - - sum += aux * (*length); - - fiber++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < NUM_SAMPLES) - y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; -} - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = 
segmentsPerBlock[bid]; - - uint32_t* voxel = voxelIDs + offset; - uint16_t* orien = orienIDs + offset; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; - - float64_t sum = 0.0; - for(int i = 0; i < nsegments; i++){ - uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; - - for(int j = 0; j < NUM_ZEPPELINS; j++) - sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; - //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; - - orien++; - } - - y[(*voxel)*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - float64_t sum = 0.0; - for(int j = 0; j < NUM_BALLS; j++) - sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; - //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; - - - y[bid*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Aty_ICpart( - uint32_t* voxelICt, - uint32_t* fiberICt, - uint16_t* orienICt, - float32_t* lengthICt, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = offset + compartmentsPerBlock[bid]; - - uint32_t* voxel = voxelICt + offset; - uint32_t* fiber = fiberICt + offset; - uint16_t* orien = orienICt + offset; - float32_t* length = lengthICt + offset; - - for(int j = 0; j < NUM_DIAMETERS; j++){ - int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - float64_t sum = 0.0; - voxel = voxelICt + offset; - orien = orienICt + offset; - length = lengthICt + offset; - for(int i = offset; i < nsegments; i++){ - sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; - //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; - - voxel++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - - if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - - __syncthreads(); - } -} - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t ncompartments = segmentsPerBlock[bid] + offset; - - uint32_t* voxel = voxelEC + offset; - uint16_t* orien = orienEC + offset; - - for(int j = 0; j < NUM_ZEPPELINS; j++){ - uint32_t offset_lut = 
j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - voxel = voxelEC + offset; - orien = orienEC + offset; - for(int i = offset; i < ncompartments; i++){ - shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); - - if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; - - voxel++; - orien++; - __syncthreads(); - } - } -} - -__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ - __shared__ double shmem[512]; - - uint bid = blockIdx.x; - uint tid = threadIdx.x; - uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - for(int j = 0; j < NUM_BALLS; j++){ - shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - - if(tid == 0) - x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - } -} - +#include "operator_withCUDA.cuh" + +// ==================================================== +// Textures for LUT in the GPU +// ==================================================== +texture tex_lutIC; +texture tex_lutEC; +texture tex_lutISO; + + +int checkCompatibility(int gpuID) { + int gpuCount; + cudaError_t cudaStatus; + + cudaStatus = cudaGetDeviceCount(&gpuCount); + + if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaSetDevice(gpuID); + + if (cudaStatus != cudaSuccess) return 2; + + cudaDeviceProp gpuProperties; + cudaStatus = cudaGetDeviceProperties(&gpuProperties, gpuID); + + if (cudaStatus != cudaSuccess) return 3; + + printf("\t* selected GPU... [ %s ]\n", gpuProperties.name); + printf("\t* total memory... [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); + printf("\t* compute capability... 
[ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); + + if(gpuProperties.major < 5) return 4; + + return 0; +} + +void cudaCheckLastError() +{ + cudaError_t err = cudaGetLastError(); + + if(err != cudaSuccess){ + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ + + // fill arrays with zeros + memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + + // count compartments per block + for(int i = 0; i < NUM_COMPARTMENTS; i++) + compartmentsPerBlock[data[i]]++; + + // calculate offset per block + offsetPerBlock[0] = 0; + for(int i = 1; i < NUM_BLOCKS; i++) + offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; +} + +int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC){ + + cudaError_t cudaStatus; + + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + if (segmentsPerBlock == NULL || offsetPerBlock == NULL) return -1; + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + if (npeaks > 0){ + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + } + + free(segmentsPerBlock); + free(offsetPerBlock); + + // alloc IC part of the dictionary in GPU + cudaStatus = cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + + // transfer IC part of the dictionary to GPU + cudaStatus = cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) 
return 2; + cudaStatus = cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + if (npeaks > 0){ + // alloc EC part of the dictionary in GPU + cudaStatus = cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)); + if (cudaStatus != cudaSuccess) return 1; + + // transfer EC part of the dictionary to GPU + cudaStatus = cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + } + + return 0; +} + +int CudaLinearOperator::setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC){ + + cudaError_t cudaStatus; + + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + if(fibersPerBlock == NULL || offsetPerBlock == NULL) return -1; + + preprocessDataForGPU(TfiberIC, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + free(fibersPerBlock); + free(offsetPerBlock); + + cudaStatus = cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_TvoxelIC, TvoxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TfiberIC, TfiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TorienIC, TorienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TlengthIC, TlengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + return 0; +} + +int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO){ + + cudaError_t cudaStatus; + + if (ndiameters > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + 
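+        // Configure the texture reference used to sample the IC lookup table:
+        // out-of-range fetches return zero (border addressing), texels are read
+        // without interpolation (point filtering) and indexing uses plain integer
+        // coordinates (normalized = false); the linear buffer gpu_lutIC is then
+        // bound to it. As above, the error codes returned here are translated by
+        // check_cuda() on the Python side (1 = cudaMalloc, 2 = cudaMemcpy, 3 = texture binding).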
+ tex_lutIC.addressMode[0] = cudaAddressModeBorder; + tex_lutIC.addressMode[1] = cudaAddressModeBorder; + tex_lutIC.filterMode = cudaFilterModePoint; + tex_lutIC.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + if (nzeppelins > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutEC.addressMode[0] = cudaAddressModeBorder; + tex_lutEC.addressMode[1] = cudaAddressModeBorder; + tex_lutEC.filterMode = cudaFilterModePoint; + tex_lutEC.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + if (nballs > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutISO.addressMode[0] = cudaAddressModeBorder; + tex_lutISO.addressMode[1] = cudaAddressModeBorder; + tex_lutISO.filterMode = cudaFilterModePoint; + tex_lutISO.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + return 0; +} + +int CudaLinearOperator::setVectors(){ + + cudaError_t cudaStatus; + + cudaStatus = cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)); + if (cudaStatus != cudaSuccess) return 1; + + return 0; +} + +int CudaLinearOperator::setGlobals(){ + + cudaError_t cudaStatus; + + cudaStatus = cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + + return 0; +} + +CudaLinearOperator::CudaLinearOperator(int nsegments, int nvoxels, int nfibers, 
int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs){ + + this->nsegments = nsegments; + this->nvoxels = nvoxels; + this->nfibers = nfibers; + this->npeaks = npeaks; + this->norientations = norientations; + this->nsamples = nsamples; + this->ndiameters = ndiameters; + this->nzeppelins = nzeppelins; + this->nballs = nballs; + this->size_lutic = ndiameters*norientations*nsamples; + this->size_lutec = nzeppelins*norientations*nsamples; + this->size_lutiso = nballs*nsamples; + this->nrows = nvoxels*nsamples; + this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; +} + +CudaLinearOperator::~CudaLinearOperator() {} + +int CudaLinearOperator::destroy(){ + cudaError_t cudaStatus; + + cudaStatus = cudaFree(gpu_voxelIC); + cudaStatus = cudaFree(gpu_fiberIC); + cudaStatus = cudaFree(gpu_orienIC); + cudaStatus = cudaFree(gpu_lengthIC); + cudaStatus = cudaFree(gpu_voxelEC); + cudaStatus = cudaFree(gpu_orienEC); + cudaStatus = cudaFree(gpu_segmentsPerBlockIC); + cudaStatus = cudaFree(gpu_offsetPerBlockIC); + cudaStatus = cudaFree(gpu_segmentsPerBlockEC); + cudaStatus = cudaFree(gpu_offsetPerBlockEC); + + cudaStatus = cudaFree(gpu_TvoxelIC); + cudaStatus = cudaFree(gpu_TfiberIC); + cudaStatus = cudaFree(gpu_TorienIC); + cudaStatus = cudaFree(gpu_TlengthIC); + cudaStatus = cudaFree(gpu_TfibersPerBlockIC); + cudaStatus = cudaFree(gpu_ToffsetPerBlockIC); + + cudaStatus = cudaFree(gpu_x); + cudaStatus = cudaFree(gpu_y); + + cudaStatus = cudaFree(gpu_lutIC); + cudaStatus = cudaFree(gpu_lutEC); + cudaStatus = cudaFree(gpu_lutISO); + cudaStatus = cudaUnbindTexture(tex_lutIC); + cudaStatus = cudaUnbindTexture(tex_lutEC); + cudaStatus = cudaUnbindTexture(tex_lutISO); + + cudaStatus = cudaDeviceReset(); + + return 0; +} + +void cudaCheckKernel(){ + cudaError_t cudaStatus; + + cudaStatus = cudaGetLastError(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); + else + printf("\t* kernel launch... [ OK ]\n"); + + cudaStatus = cudaDeviceSynchronize(); + if(cudaStatus != cudaSuccess) + fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); + else + printf("\t* cudaDeviceSynchronize() after launching kernel... 
[ OK ]\n"); +} + +void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ + + // Copy vector x to the GPU + cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + //cudaCheckLastError(); + + // Multiply IC part in the GPU + //multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + //cudaCheckLastError(); + + // Multiply EC part in the GPU + multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + //cudaCheckLastError(); + + // Multiply ISO part in the GPU + multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + //cudaCheckLastError(); + + // Copy back result to CPU + cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + //cudaCheckLastError(); +} + +void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ + + // Copy vector y to the GPU + cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + //cudaCheckLastError(); + + // Multiply IC part in the GPU + multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + //cudaCheckLastError(); + + // Multiply EC part in the GPU + multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); + //cudaCheckLastError(); + + // Multiply ISO part in the GPU + multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); + //cudaCheckLastError(); + + // Copy back result to CPU + cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + //cudaCheckLastError(); +} + +// ------------------------------------------------------- KERNELS ------------------------------------------------------- // +/*__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[1024]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + uint32_t gid = threadIdx.x / 512; + uint32_t sid = threadIdx.x - 512*gid; + + shmem[tid] = 0.0; + + if(sid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; + uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; + + uint32_t* voxel = voxelIDs + offset; + uint32_t* fiber = fiberIDs + offset; + uint16_t* orien = orienIDs + offset; + float32_t* length = lengths + offset; + + float64_t sum = 0.0; + + for(int i = 0; i < nsegments; i++){ + int offset_lut = (*orien)*NUM_SAMPLES + sid; + + float64_t aux = 0.0; + for(int j = 0; j < NUM_DIAMETERS; j++){ + aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; + //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; + } + + sum += aux * (*length); + + fiber++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < NUM_SAMPLES) + y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; +}//*/ + +__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerVoxel, + uint32_t* offsetPerVoxel, + float32_t* lut, 
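/* Unlike the commented-out variant above, which appears to use one thread per sample
   in two 512-thread halves of each block, this rewritten multiply_Ax_ICpart maps one
   thread per voxel: vid = blockIdx.x*256 + threadIdx.x, so the indexing assumes
   256-thread blocks covering all NUM_VOXELS voxels (a grid of (NUM_VOXELS + 255)/256
   blocks would be consistent with it, though the actual launch configuration is not
   shown here). Each thread then walks the segment range given by offsetPerVoxel[vid]
   and segmentsPerVoxel[vid] for its own voxel. */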
+ float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + uint32_t vid = bid*256 + tid; + + if (vid >= NUM_VOXELS) return; + + uint32_t offset = offsetPerVoxel[ vid ]; + uint32_t nsegments = segmentsPerVoxel[ vid ]; + + uint32_t* voxel = voxelIDs + offset; + uint32_t* fiber = fiberIDs + offset; + uint16_t* orien = orienIDs + offset; + float32_t* length = lengths + offset; + + for(int i=0; i= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = segmentsPerBlock[bid]; + + uint32_t* voxel = voxelIDs + offset; + uint16_t* orien = orienIDs + offset; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; + + float64_t sum = 0.0; + for(int i = 0; i < nsegments; i++){ + uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; + + for(int j = 0; j < NUM_ZEPPELINS; j++) + sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; + + orien++; + } + + y[(*voxel)*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + float64_t sum = 0.0; + for(int j = 0; j < NUM_BALLS; j++) + sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; + + + y[bid*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Aty_ICpart( + uint32_t* voxelICt, + uint32_t* fiberICt, + uint16_t* orienICt, + float32_t* lengthICt, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = offset + compartmentsPerBlock[bid]; + + uint32_t* voxel = voxelICt + offset; + uint32_t* fiber = fiberICt + offset; + uint16_t* orien = orienICt + offset; + float32_t* length = lengthICt + offset; + + for(int j = 0; j < NUM_DIAMETERS; j++){ + int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + float64_t sum = 0.0; + voxel = voxelICt + offset; + orien = orienICt + offset; + length = lengthICt + offset; + for(int i = offset; i < nsegments; i++){ + sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; + + voxel++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + + __syncthreads(); + } +} + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* 
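/* multiply_Aty_ICpart above and this multiply_Aty_ECpart both collapse the per-sample
   products with a shared-memory tree reduction: thread tid writes its partial product
   into shmem[tid] and, after each __syncthreads(), the lower half of the active range
   adds the upper half onto itself. The unrolled if(tid < 256) ... cascade follows the
   same pattern as this generic sketch (BLOCK a power of two, here 512; the kernels stop
   the cascade a few steps early and add the last few entries explicitly):

       for (int s = BLOCK/2; s > 0; s >>= 1) {
           if (tid < s) shmem[tid] += shmem[tid + s];
           __syncthreads();
       }
       // shmem[0] then holds the sum over all samples
*/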
segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t ncompartments = segmentsPerBlock[bid] + offset; + + uint32_t* voxel = voxelEC + offset; + uint16_t* orien = orienEC + offset; + + for(int j = 0; j < NUM_ZEPPELINS; j++){ + uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + voxel = voxelEC + offset; + orien = orienEC + offset; + for(int i = offset; i < ncompartments; i++){ + shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); + + if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; + + voxel++; + orien++; + __syncthreads(); + } + } +} + +__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ + __shared__ double shmem[512]; + + uint bid = blockIdx.x; + uint tid = threadIdx.x; + uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + for(int j = 0; j < NUM_BALLS; j++){ + shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) + x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + } +} + diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index 6b3d09bc..c6fb879a 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -1,176 +1,176 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -typedef unsigned int uint32_t; -typedef unsigned short int uint16_t; -typedef float float32_t; -typedef double float64_t; - -// ==================================================== -// Util functions to check CUDA GPU compatibility -// ==================================================== -int checkCompatibility(int gpu_id); -void cudaCheckLastError(); - -// ==================================================== -// Function to preprocess data for GPU -// ==================================================== -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* 
compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); - -// ==================================================== -// CUDA Kernels for Ax operation -// ==================================================== -__global__ void multiply_Ax_ICpart( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y); - -// ==================================================== -// CUDA Kernels for A'y operation -// ==================================================== -__global__ void multiply_Aty_ICpart( - uint32_t* TvoxelIC, - uint32_t* TfiberIC, - uint16_t* TorienIC, - float32_t* TlengthIC, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ISOpart( - float* lut, - double* x, - double* y); - -// ==================================================== -// Constant global values in the GPU -// ==================================================== -__constant__ int NUM_VOXELS; -__constant__ int NUM_FIBERS; -__constant__ int NUM_PEAKS; -__constant__ int NUM_ORIENTATIONS; -__constant__ int NUM_SAMPLES; -__constant__ int NUM_DIAMETERS; -__constant__ int NUM_ZEPPELINS; -__constant__ int NUM_BALLS; -__constant__ int NUM_ROWS; -__constant__ int NUM_COLS; -__constant__ int SIZE_LUTIC; -__constant__ int SIZE_LUTEC; -__constant__ int SIZE_LUTISO; - -// ==================================================== -// Pointers to A (IC part) in the GPU -// ==================================================== -static uint32_t* gpu_voxelIC; -static uint32_t* gpu_fiberIC; -static uint16_t* gpu_orienIC; -static float32_t* gpu_lengthIC; -static uint32_t* gpu_segmentsPerBlockIC; -static uint32_t* gpu_offsetPerBlockIC; - -// ==================================================== -// Pointers to A' (IC part) in the GPU -// ==================================================== -static uint32_t* gpu_TvoxelIC; -static uint32_t* gpu_TfiberIC; -static uint16_t* gpu_TorienIC; -static float32_t* gpu_TlengthIC; -static uint32_t* gpu_TfibersPerBlockIC; -static uint32_t* gpu_ToffsetPerBlockIC; - -// ==================================================== -// Pointers to A (EC part) in the GPU -// ==================================================== -static uint32_t* gpu_voxelEC; -static uint16_t* gpu_orienEC; -static uint32_t* gpu_segmentsPerBlockEC; -static uint32_t* gpu_offsetPerBlockEC; - -// ==================================================== -// Pointers to LUT in the GPU -// ==================================================== -static float32_t* gpu_lutIC; -static float32_t* gpu_lutEC; -static float32_t* gpu_lutISO; - -// ==================================================== -// Pointers to x and y in the GPU -// ==================================================== -static float64_t* gpu_x; -static float64_t* gpu_y; - -// ============================================================================ -// This class creates an instance of the 
LinearOperator in GPU memory -// ============================================================================ -class CudaLinearOperator { - - // constant values in CPU - int nsegments; - int nvoxels; - int nfibers; - int npeaks; - int norientations; - int nsamples; - int ndiameters; - int nzeppelins; - int nballs; - int size_lutic; - int size_lutec; - int size_lutiso; - int nrows; - int ncols; - - public: - CudaLinearOperator(int nsegments, int nvoxels, int nfibers, int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs); - ~CudaLinearOperator(); - - int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); - int setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC); - int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); - int setVectors(); - int setGlobals(); - int destroy(); - - void dot(float64_t* v_in, float64_t* v_out); - void Tdot(float64_t* v_in, float64_t* v_out); +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef unsigned int uint32_t; +typedef unsigned short int uint16_t; +typedef float float32_t; +typedef double float64_t; + +// ==================================================== +// Util functions to check CUDA GPU compatibility +// ==================================================== +int checkCompatibility(int gpu_id); +void cudaCheckLastError(); + +// ==================================================== +// Function to preprocess data for GPU +// ==================================================== +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +// ==================================================== +// CUDA Kernels for Ax operation +// ==================================================== +__global__ void multiply_Ax_ICpart( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y); + +// ==================================================== +// CUDA Kernels for A'y operation +// ==================================================== +__global__ void multiply_Aty_ICpart( + uint32_t* TvoxelIC, + uint32_t* TfiberIC, + uint16_t* TorienIC, + float32_t* TlengthIC, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ISOpart( + float* lut, + double* x, + double* y); + +// ==================================================== +// Constant global values in the GPU +// ==================================================== +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int 
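/* These __constant__ symbols mirror the host-side fields of CudaLinearOperator and are
   filled once by setGlobals() through cudaMemcpyToSymbol, so the kernels can read the
   problem sizes without extra arguments. From the constructor, the operator shape and
   LUT sizes they hold are:

       NUM_ROWS    = nvoxels * nsamples
       NUM_COLS    = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs
       SIZE_LUTIC  = ndiameters * norientations * nsamples
       SIZE_LUTEC  = nzeppelins * norientations * nsamples
       SIZE_LUTISO = nballs * nsamples
*/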
NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + +// ==================================================== +// Pointers to A (IC part) in the GPU +// ==================================================== +static uint32_t* gpu_voxelIC; +static uint32_t* gpu_fiberIC; +static uint16_t* gpu_orienIC; +static float32_t* gpu_lengthIC; +static uint32_t* gpu_segmentsPerBlockIC; +static uint32_t* gpu_offsetPerBlockIC; + +// ==================================================== +// Pointers to A' (IC part) in the GPU +// ==================================================== +static uint32_t* gpu_TvoxelIC; +static uint32_t* gpu_TfiberIC; +static uint16_t* gpu_TorienIC; +static float32_t* gpu_TlengthIC; +static uint32_t* gpu_TfibersPerBlockIC; +static uint32_t* gpu_ToffsetPerBlockIC; + +// ==================================================== +// Pointers to A (EC part) in the GPU +// ==================================================== +static uint32_t* gpu_voxelEC; +static uint16_t* gpu_orienEC; +static uint32_t* gpu_segmentsPerBlockEC; +static uint32_t* gpu_offsetPerBlockEC; + +// ==================================================== +// Pointers to LUT in the GPU +// ==================================================== +static float32_t* gpu_lutIC; +static float32_t* gpu_lutEC; +static float32_t* gpu_lutISO; + +// ==================================================== +// Pointers to x and y in the GPU +// ==================================================== +static float64_t* gpu_x; +static float64_t* gpu_y; + +// ============================================================================ +// This class creates an instance of the LinearOperator in GPU memory +// ============================================================================ +class CudaLinearOperator { + + // constant values in CPU + int nsegments; + int nvoxels; + int nfibers; + int npeaks; + int norientations; + int nsamples; + int ndiameters; + int nzeppelins; + int nballs; + int size_lutic; + int size_lutec; + int size_lutiso; + int nrows; + int ncols; + + public: + CudaLinearOperator(int nsegments, int nvoxels, int nfibers, int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs); + ~CudaLinearOperator(); + + int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); + int setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC); + int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); + int setVectors(); + int setGlobals(); + int destroy(); + + void dot(float64_t* v_in, float64_t* v_out); + void Tdot(float64_t* v_in, float64_t* v_out); }; \ No newline at end of file diff --git a/commit/operator/config.py b/commit/operator/config.py index 8cbac4ed..e4c6bf58 100755 --- a/commit/operator/config.py +++ b/commit/operator/config.py @@ -1,6 +1,6 @@ -nTHREADS = None -model = None -nIC = None -nEC = None -nISO = None -build_dir = None +nTHREADS = None +model = None +nIC = None +nEC = None +nISO = None +build_dir = None diff --git a/commit/operator/operator.pyx b/commit/operator/operator.pyx index a4187f95..4fc3a835 100755 --- a/commit/operator/operator.pyx +++ b/commit/operator/operator.pyx @@ -1,192 +1,192 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -from amico.util import ERROR 
-cimport numpy as np - -# Interfaces to actual C code performing the multiplications -cdef extern void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads -) nogil - -cdef extern void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT -) nogil - - - -cdef class LinearOperator : - """This class is a wrapper to the C code for performing marix-vector multiplications - with the COMMIT linear operator A. The multiplications are done using C code - that uses information from the DICTIONARY, KERNELS and THREADS data structures. - """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - - - def __init__( self, DICTIONARY, KERNELS, THREADS ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = 
&wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - # get C pointers to arrays in THREADS - cdef unsigned int [::1] ICthreads = THREADS['IC'] - self.ICthreads = &ICthreads[0] - cdef unsigned int [::1] ECthreads = THREADS['EC'] - self.ECthreads = &ECthreads[0] - cdef unsigned int [::1] ISOthreads = THREADS['ISO'] - self.ISOthreads = &ISOthreads[0] - - cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] - self.ICthreadsT = &ICthreadsT[0] - cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] - self.ECthreadsT = &ECthreadsT[0] - cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] - self.ISOthreadsT = &ISOthreadsT[0] - - - @property - def T( self ) : - """Transpose of the explicit matrix.""" - C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. - - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - ERROR( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - with nogil : - COMMIT_A( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreads, self.ECthreads, self.ISOthreads - ) - else : - # INVERSE PRODUCT A'*y - with nogil : - COMMIT_At( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT - ) - - return v_out +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +from amico.util import ERROR +cimport numpy as np + +# Interfaces to actual C code performing the multiplications +cdef extern void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads +) nogil + +cdef extern void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT +) nogil + + + +cdef class LinearOperator : + """This class is a wrapper to the C code for performing marix-vector multiplications + with the COMMIT linear operator A. 
The multiplications are done using C code + that uses information from the DICTIONARY, KERNELS and THREADS data structures. + """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # get C pointers to arrays in THREADS + cdef unsigned int [::1] ICthreads = THREADS['IC'] + self.ICthreads = &ICthreads[0] + cdef unsigned int [::1] ECthreads = THREADS['EC'] + self.ECthreads = &ECthreads[0] + cdef unsigned int [::1] ISOthreads = THREADS['ISO'] + self.ISOthreads = &ISOthreads[0] + + cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] + self.ICthreadsT = &ICthreadsT[0] + cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] + self.ECthreadsT = &ECthreadsT[0] + cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] + self.ISOthreadsT = &ISOthreadsT[0] + + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): 
+ """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + ERROR( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + with nogil : + COMMIT_A( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreads, self.ECthreads, self.ISOthreads + ) + else : + # INVERSE PRODUCT A'*y + with nogil : + COMMIT_At( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT + ) + + return v_out diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld index f3967a15..1f79d974 100755 --- a/commit/operator/operator.pyxbld +++ b/commit/operator/operator.pyxbld @@ -1,39 +1,39 @@ -import numpy -from os import utime -from os.path import dirname, join -from setuptools import Extension - -# pass parameters to the compiler at runtime -# [TODO] find a way to avoid using this fake module -from commit.operator import config - - -def make_ext(modname, pyxfilename): - - if (config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255): - raise RuntimeError('config.nTHREADS must be between 1 and 255') - if (config.nIC is None or config.nIC < 0 or config.nIC > 20): - raise RuntimeError('config.nIC must be in the range [0..20]') - if (config.nEC is None or config.nEC < 0 or config.nEC > 20): - raise RuntimeError('config.nEC must be in the range [0..20]') - if (config.nISO is None or config.nISO < 0 or config.nISO > 20): - raise RuntimeError('config.nISO must be in the range [0..20]') - - # Force recompilation - if config.model == "VolumeFractions": - filename = "operator_noLUT.c" - else: - filename = "operator_withLUT.c" - path = dirname(pyxfilename) - - if config.build_dir is None: - utime( join(path,filename), None) - - return Extension(name=modname, - sources=[pyxfilename, join(path, filename)], - include_dirs=[numpy.get_include()], - define_macros=[('nTHREADS', config.nTHREADS), - ('nIC', config.nIC), - ('nEC', config.nEC), - ('nISO', config.nISO)], - extra_compile_args=['-w', '-O3', '-Ofast']) +import numpy +from os import utime +from os.path import dirname, join +from setuptools import Extension + +# pass parameters to the compiler at runtime +# [TODO] find a way to avoid using this fake module +from commit.operator import config + + +def make_ext(modname, pyxfilename): + + if (config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255): + raise RuntimeError('config.nTHREADS must be between 1 and 255') + if (config.nIC is None or config.nIC < 0 or config.nIC > 20): + raise RuntimeError('config.nIC must be in the range [0..20]') + if (config.nEC is None or config.nEC < 0 or config.nEC > 20): + raise RuntimeError('config.nEC must be in the range [0..20]') + if (config.nISO is None or config.nISO < 0 or config.nISO > 20): + raise 
RuntimeError('config.nISO must be in the range [0..20]') + + # Force recompilation + if config.model == "VolumeFractions": + filename = "operator_noLUT.c" + else: + filename = "operator_withLUT.c" + path = dirname(pyxfilename) + + if config.build_dir is None: + utime( join(path,filename), None) + + return Extension(name=modname, + sources=[pyxfilename, join(path, filename)], + include_dirs=[numpy.get_include()], + define_macros=[('nTHREADS', config.nTHREADS), + ('nIC', config.nIC), + ('nEC', config.nEC), + ('nISO', config.nISO)], + extra_compile_args=['-w', '-O3', '-Ofast']) diff --git a/commit/operator/operator_noLUT.c b/commit/operator/operator_noLUT.c index 061ca1d1..1bdfd5f9 100644 --- a/commit/operator/operator_noLUT.c +++ b/commit/operator/operator_noLUT.c @@ -1,187 +1,187 @@ -#include -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<1 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n; -double *x, *Y; -uint32_t *ICthreads, *ISOthreads; -uint8_t *ICthreadsT; -uint32_t *ISOthreadsT; -uint32_t *ICf, *ICv, *ISOv; -float *ICl; - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - double x0; - double *xPtr; - uint32_t *t_v, *t_vEnd, *t_f; - float *t_l; - - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = x[*t_f]; - if ( x0 != 0 ) - Y[*t_v] += (double)(*t_l) * x0; - t_f++; - t_v++; - t_l++; - } - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - xPtr = x + nF + ISOthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = *xPtr++; - if ( x0 != 0 ) - Y[*t_v] += x0; - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreads = _ICthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - xPtr = x + nF + ISOthreadsT[id]; - - while( t_v != t_vEnd ) - (*xPtr++) += Y[*t_v++]; -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = 
_ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreadsT = _ICthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<1 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n; +double *x, *Y; +uint32_t *ICthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ISOthreadsT; +uint32_t *ICf, *ICv, *ISOv; +float *ICl; + + +// ==================================================== +// Compute a sub-block of the A*x MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + double x0; + double *xPtr; + uint32_t *t_v, *t_vEnd, *t_f; + float *t_l; + + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = x[*t_f]; + if ( x0 != 0 ) + Y[*t_v] += (double)(*t_l) * x0; + t_f++; + t_v++; + t_l++; + } + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + xPtr = x + nF + ISOthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = *xPtr++; + if ( x0 != 0 ) + Y[*t_v] += x0; + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreads = _ICthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + xPtr = x + nF + ISOthreadsT[id]; + + while( t_v != t_vEnd ) + (*xPtr++) += Y[*t_v++]; +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreadsT = _ICthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<1 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n, nE, nV, nS, ndirs; -double *x, *Y; -uint32_t *ICthreads, *ECthreads, *ISOthreads; -uint8_t 
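/* operator_withLUT.c is specialised at compile time rather than at run time:
   operator.pyxbld passes -DnTHREADS, -DnIC, -DnEC and -DnISO, and the #if nIC>=k
   guards together with the wmrSFP0..wmrSFP19 / wmhSFP0..wmhSFP19 / isoSFP0..isoSFP19
   pointer fan-out below let the compiler emit a fully unrolled inner loop for exactly
   the configured number of compartments. Schematically (a simplified sketch, not the
   actual code):

       #if nIC >= 2
           sum += x1 * (*SFP1ptr++);
       #endif

   so a configuration with fewer compartments never compiles the unused terms at all. */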
*ICthreadsT; -uint32_t *ECthreadsT, *ISOthreadsT; -uint32_t *ICf, *ICv, *ECv, *ISOv; -uint16_t *ICo, *ECo; -float *ICl; -float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; -float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; -float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; - - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - int offset; - double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; - double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; - double *Yptr, *YptrEnd; - float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; - uint32_t *t_v, *t_vEnd, *t_f; - uint16_t *t_o; - float *t_l; - -#if nIC>=1 - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_o = ICo + ICthreads[id]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x_Ptr0 = x + *t_f; - x0 = *x_Ptr0; - #if nIC>=2 - x_Ptr1 = x_Ptr0 + nF; - x1 = *x_Ptr1; - #endif - #if nIC>=3 - x_Ptr2 = x_Ptr1 + nF; - x2 = *x_Ptr2; - #endif - #if nIC>=4 - x_Ptr3 = x_Ptr2 + nF; - x3 = *x_Ptr3; - #endif - #if nIC>=5 - x_Ptr4 = x_Ptr3 + nF; - x4 = *x_Ptr4; - #endif - #if nIC>=6 - x_Ptr5 = x_Ptr4 + nF; - x5 = *x_Ptr5; - #endif - #if nIC>=7 - x_Ptr6 = x_Ptr5 + nF; - x6 = *x_Ptr6; - #endif - #if nIC>=8 - x_Ptr7 = x_Ptr6 + nF; - x7 = *x_Ptr7; - #endif - #if nIC>=9 - x_Ptr8 = x_Ptr7 + nF; - x8 = *x_Ptr8; - #endif - #if nIC>=10 - x_Ptr9 = x_Ptr8 + nF; - x9 = *x_Ptr9; - #endif - #if nIC>=11 - x_Ptr10 = x_Ptr9 + nF; - x10 = *x_Ptr10; - #endif - #if nIC>=12 - x_Ptr11 = x_Ptr10 + nF; - x11 = *x_Ptr11; - #endif - #if nIC>=13 - x_Ptr12 = x_Ptr11 + nF; - x12 = *x_Ptr12; - #endif - #if nIC>=14 - x_Ptr13 = x_Ptr12 + nF; - x13 = *x_Ptr13; - #endif - #if nIC>=15 - x_Ptr14 = x_Ptr13 + nF; - x14 = *x_Ptr14; - #endif - #if nIC>=16 - x_Ptr15 = x_Ptr14 + nF; - x15 = *x_Ptr15; - #endif - #if nIC>=17 - x_Ptr16 = x_Ptr15 + nF; - x16 = *x_Ptr16; - #endif - #if nIC>=18 - x_Ptr17 = x_Ptr16 + nF; - x17 = *x_Ptr17; - #endif - #if nIC>=19 - x_Ptr18 = x_Ptr17 + nF; - x18 = *x_Ptr18; - #endif - #if nIC>=20 - x_Ptr19 = x_Ptr18 + nF; - x19 = *x_Ptr19; - #endif - - if ( x0 != 0 - #if nIC>=2 - || x1 != 0 - #endif - #if nIC>=3 - || x2 != 0 - #endif - #if nIC>=4 - || x3 != 0 - #endif - #if nIC>=5 - || x4 != 0 - #endif - #if nIC>=6 - || x5 != 0 - #endif - #if nIC>=7 - || x6 != 0 - #endif - #if nIC>=8 - || x7 != 0 - #endif - #if nIC>=9 - || x8 != 0 - #endif - #if nIC>=10 - || x9 != 0 - #endif - #if nIC>=11 - || x10 != 0 - #endif - #if nIC>=12 - || x11 != 0 - #endif - #if nIC>=13 - || x12 
!= 0 - #endif - #if nIC>=14 - || x13 != 0 - #endif - #if nIC>=15 - || x14 != 0 - #endif - #if nIC>=16 - || x15 != 0 - #endif - #if nIC>=17 - || x16 != 0 - #endif - #if nIC>=18 - || x17 != 0 - #endif - #if nIC>=19 - || x18 != 0 - #endif - #if nIC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - w = (double)(*t_l); - offset = nS * (*t_o); - SFP0ptr = wmrSFP0 + offset; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - #endif - #if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += w * ( - x0 * (*SFP0ptr++) - #if nIC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nIC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nIC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nIC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nIC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nIC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nIC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nIC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nIC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nIC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nIC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nIC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nIC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nIC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nIC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nIC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nIC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nIC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nIC>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - - t_f++; - t_v++; - t_o++; - t_l++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreads[id]; - t_vEnd = ECv + ECthreads[id+1]; - t_o = ECo + ECthreads[id]; - - x_Ptr0 = x + nIC*nF + ECthreads[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; 
- #if nEC>=2 - x1 = *x_Ptr1++; - #endif - #if nEC>=3 - x2 = *x_Ptr2++; - #endif - #if nEC>=4 - x3 = *x_Ptr3++; - #endif - #if nEC>=5 - x4 = *x_Ptr4++; - #endif - #if nEC>=6 - x5 = *x_Ptr5++; - #endif - #if nEC>=7 - x6 = *x_Ptr6++; - #endif - #if nEC>=8 - x7 = *x_Ptr7++; - #endif - #if nEC>=9 - x8 = *x_Ptr8++; - #endif - #if nEC>=10 - x9 = *x_Ptr9++; - #endif - #if nEC>=11 - x10 = *x_Ptr10++; - #endif - #if nEC>=12 - x11 = *x_Ptr11++; - #endif - #if nEC>=13 - x12 = *x_Ptr12++; - #endif - #if nEC>=14 - x13 = *x_Ptr13++; - #endif - #if nEC>=15 - x14 = *x_Ptr14++; - #endif - #if nEC>=16 - x15 = *x_Ptr15++; - #endif - #if nEC>=17 - x16 = *x_Ptr16++; - #endif - #if nEC>=18 - x17 = *x_Ptr17++; - #endif - #if nEC>=19 - x18 = *x_Ptr18++; - #endif - #if nEC>=20 - x19 = *x_Ptr19++; - #endif - if ( - x0 != 0 - #if nEC>=2 - || x1 != 0 - #endif - #if nEC>=3 - || x2 != 0 - #endif - #if nEC>=4 - || x3 != 0 - #endif - #if nEC>=5 - || x4 != 0 - #endif - #if nEC>=6 - || x5 != 0 - #endif - #if nEC>=7 - || x6 != 0 - #endif - #if nEC>=8 - || x7 != 0 - #endif - #if nEC>=9 - || x8 != 0 - #endif - #if nEC>=10 - || x9 != 0 - #endif - #if nEC>=11 - || x10 != 0 - #endif - #if nEC>=12 - || x11 != 0 - #endif - #if nEC>=13 - || x12 != 0 - #endif - #if nEC>=14 - || x13 != 0 - #endif - #if nEC>=15 - || x14 != 0 - #endif - #if nEC>=16 - || x15 != 0 - #endif - #if nEC>=17 - || x16 != 0 - #endif - #if nEC>=18 - || x17 != 0 - #endif - #if nEC>=19 - || x18 != 0 - #endif - #if nEC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - SFP0ptr = wmhSFP0 + offset; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nEC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nEC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nEC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nEC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nEC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nEC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nEC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nEC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nEC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nEC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nEC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nEC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nEC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nEC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nEC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nEC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nEC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nEC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nEC>=20 - + x19 * (*SFP19ptr++) - 
#endif - - ); - } - t_v++; - t_o++; - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; - #if nISO>=2 - x1 = *x_Ptr1++; - #endif - #if nISO>=3 - x2 = *x_Ptr2++; - #endif - #if nISO>=4 - x3 = *x_Ptr3++; - #endif - #if nISO>=5 - x4 = *x_Ptr4++; - #endif - #if nISO>=6 - x5 = *x_Ptr5++; - #endif - #if nISO>=7 - x6 = *x_Ptr6++; - #endif - #if nISO>=8 - x7 = *x_Ptr7++; - #endif - #if nISO>=9 - x8 = *x_Ptr8++; - #endif - #if nISO>=10 - x9 = *x_Ptr9++; - #endif - #if nISO>=11 - x10 = *x_Ptr10++; - #endif - #if nISO>=12 - x11 = *x_Ptr11++; - #endif - #if nISO>=13 - x12 = *x_Ptr12++; - #endif - #if nISO>=14 - x13 = *x_Ptr13++; - #endif - #if nISO>=15 - x14 = *x_Ptr14++; - #endif - #if nISO>=16 - x15 = *x_Ptr15++; - #endif - #if nISO>=17 - x16 = *x_Ptr16++; - #endif - #if nISO>=18 - x17 = *x_Ptr17++; - #endif - #if nISO>=19 - x18 = *x_Ptr18++; - #endif - #if nISO>=20 - x19 = *x_Ptr19++; - #endif - - if ( - x0 != 0 - #if nISO>=2 - || x1 != 0 - #endif - #if nISO>=3 - || x2 != 0 - #endif - #if nISO>=4 - || x3 != 0 - #endif - #if nISO>=5 - || x4 != 0 - #endif - #if nISO>=6 - || x5 != 0 - #endif - #if nISO>=7 - || x6 != 0 - #endif - #if nISO>=8 - || x7 != 0 - #endif - #if nISO>=9 - || x8 != 0 - #endif - #if nISO>=10 - || x9 != 0 - #endif - #if nISO>=11 - || x10 != 0 - #endif - #if nISO>=12 - || x11 != 0 - #endif - #if nISO>=13 - || x12 != 0 - #endif - #if nISO>=14 - || x13 != 0 - #endif - #if nISO>=15 - || x14 != 0 - #endif - #if nISO>=16 - || x15 != 0 - #endif - #if nISO>=17 - || x16 != 0 - #endif - #if nISO>=18 - || x17 != 0 - #endif - #if nISO>=19 - || x18 != 0 - #endif - #if nISO>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = 
isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nISO>=2 - + x1 * (*SFP1ptr++) - #endif - #if nISO>=3 - + x2 * (*SFP2ptr++) - #endif - #if nISO>=4 - + x3 * (*SFP3ptr++) - #endif - #if nISO>=5 - + x4 * (*SFP4ptr++) - #endif - #if nISO>=6 - + x5 * (*SFP5ptr++) - #endif - #if nISO>=7 - + x6 * (*SFP6ptr++) - #endif - #if nISO>=8 - + x7 * (*SFP7ptr++) - #endif - #if nISO>=9 - + x8 * (*SFP8ptr++) - #endif - #if nISO>=10 - + x9 * (*SFP9ptr++) - #endif - #if nISO>=11 - + x10 * (*SFP10ptr++) - #endif - #if nISO>=12 - + x11 * (*SFP11ptr++) - #endif - #if nISO>=13 - + x12 * (*SFP12ptr++) - #endif - #if nISO>=14 - + x13 * (*SFP13ptr++) - #endif - #if nISO>=15 - + x14 * (*SFP14ptr++) - #endif - #if nISO>=16 - + x15 * (*SFP15ptr++) - #endif - #if nISO>=17 - + x16 * (*SFP16ptr++) - #endif - #if nISO>=18 - + x17 * (*SFP17ptr++) - #endif - #if nISO>=19 - + x18 * (*SFP18ptr++) - #endif - #if nISO>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + _ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + 
_ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreads = _ICthreads; - ECthreads = _ECthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // intra-cellular compartments - t_v = ICv; - t_vEnd = ICv + n; - t_o = ICo; - t_l = ICl; - t_f = ICf; - t_t = ICthreadsT; - - while( t_v != t_vEnd ) - { - // in this case, I need to walk throug because the segments are ordered in "voxel order" - if ( *t_t == id ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - - Y_tmp = *Yptr; - SFP0ptr = wmrSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - 
#if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nIC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - w = (double)(*t_l); - x[*t_f] += w * x0; - #if nIC>=2 - x[*t_f+nF] += w * x1; - #endif - #if nIC>=3 - x[*t_f+2*nF] += w * x2; - #endif - #if nIC>=4 - x[*t_f+3*nF] += w * x3; - #endif - #if nIC>=5 - x[*t_f+4*nF] += w * x4; - #endif - #if nIC>=6 - x[*t_f+5*nF] += w * x5; - #endif - #if nIC>=7 - x[*t_f+6*nF] += w * x6; - #endif - #if nIC>=8 - x[*t_f+7*nF] += w * x7; - #endif - #if nIC>=9 - x[*t_f+8*nF] += w * x8; - #endif - #if nIC>=10 - x[*t_f+9*nF] += w * x9; - #endif - #if nIC>=11 - x[*t_f+10*nF] += w * x10; - #endif - #if nIC>=12 - x[*t_f+11*nF] += w * x11; - #endif - #if nIC>=13 - x[*t_f+12*nF] += w * x12; - #endif - #if nIC>=14 - x[*t_f+13*nF] += w * x13; - #endif - #if nIC>=15 - x[*t_f+14*nF] += w * x14; - #endif - #if nIC>=16 - x[*t_f+15*nF] += w * x15; - #endif - #if nIC>=17 - x[*t_f+16*nF] += w * x16; - #endif - #if nIC>=18 - x[*t_f+17*nF] += w * x17; - #endif - #if nIC>=19 - x[*t_f+18*nF] += w * x18; - #endif - #if nIC>=20 - x[*t_f+19*nF] += w * x19; - #endif - } - - t_f++; - t_v++; - t_o++; - t_l++; - t_t++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreadsT[id]; - t_vEnd = ECv + ECthreadsT[id+1]; - t_o = ECo + ECthreadsT[id]; - - x_Ptr0 = x + nIC*nF + ECthreadsT[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - offset = nS * (*t_o++); - - Y_tmp = *Yptr; - SFP0ptr = 
wmhSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - (*x_Ptr0++) += x0; - #if nEC>=2 - (*x_Ptr1++) += x1; - #endif - #if nEC>=3 - (*x_Ptr2++) += x2; - #endif - #if nEC>=4 - (*x_Ptr3++) += x3; - #endif - #if nEC>=5 - (*x_Ptr4++) += x4; - #endif - #if nEC>=6 - (*x_Ptr5++) += x5; - #endif - #if nEC>=7 - (*x_Ptr6++) += x6; - #endif - #if nEC>=8 - (*x_Ptr7++) += x7; - #endif - #if nEC>=9 - (*x_Ptr8++) += x8; - #endif - #if nEC>=10 - (*x_Ptr9++) += x9; - #endif - #if nEC>=11 - (*x_Ptr10++) += x10; - #endif - #if nEC>=12 - (*x_Ptr11++) += x11; - #endif - #if nEC>=13 - (*x_Ptr12++) += x12; - #endif - #if nEC>=14 - (*x_Ptr13++) += x13; - #endif - #if nEC>=15 - (*x_Ptr14++) += x14; - #endif - #if nEC>=16 - (*x_Ptr15++) += x15; - #endif - #if nEC>=17 - (*x_Ptr16++) += x16; - #endif - #if nEC>=18 - (*x_Ptr17++) += x17; - #endif - #if nEC>=19 - (*x_Ptr18++) += 
x18; - #endif - #if nEC>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - Y_tmp = *Yptr; - x0 = (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nISO>=7 - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if 
nISO>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - (*x_Ptr0++) += x0; - #if nISO>=2 - (*x_Ptr1++) += x1; - #endif - #if nISO>=3 - (*x_Ptr2++) += x2; - #endif - #if nISO>=4 - (*x_Ptr3++) += x3; - #endif - #if nISO>=5 - (*x_Ptr4++) += x4; - #endif - #if nISO>=6 - (*x_Ptr5++) += x5; - #endif - #if nISO>=7 - (*x_Ptr6++) += x6; - #endif - #if nISO>=8 - (*x_Ptr7++) += x7; - #endif - #if nISO>=9 - (*x_Ptr8++) += x8; - #endif - #if nISO>=10 - (*x_Ptr9++) += x9; - #endif - #if nISO>=11 - (*x_Ptr10++) += x10; - #endif - #if nISO>=12 - (*x_Ptr11++) += x11; - #endif - #if nISO>=13 - (*x_Ptr12++) += x12; - #endif - #if nISO>=14 - (*x_Ptr13++) += x13; - #endif - #if nISO>=15 - (*x_Ptr14++) += x14; - #endif - #if nISO>=16 - (*x_Ptr15++) += x15; - #endif - #if nISO>=17 - (*x_Ptr16++) += x16; - #endif - #if nISO>=18 - (*x_Ptr17++) += x17; - #endif - #if nISO>=19 - (*x_Ptr18++) += x18; - #endif - #if nISO>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + 
_ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + _ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreadsT = _ICthreadsT; - ECthreadsT = _ECthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<1 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n, nE, nV, nS, ndirs; +double *x, *Y; +uint32_t *ICthreads, *ECthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ECthreadsT, *ISOthreadsT; +uint32_t *ICf, *ICv, *ECv, *ISOv; +uint16_t *ICo, *ECo; +float *ICl; +float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; +float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; +float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; + + + +// ==================================================== +// Compute a sub-block of the A*x 
MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + int offset; + double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; + double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; + double *Yptr, *YptrEnd; + float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; + uint32_t *t_v, *t_vEnd, *t_f; + uint16_t *t_o; + float *t_l; + +#if nIC>=1 + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_o = ICo + ICthreads[id]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x_Ptr0 = x + *t_f; + x0 = *x_Ptr0; + #if nIC>=2 + x_Ptr1 = x_Ptr0 + nF; + x1 = *x_Ptr1; + #endif + #if nIC>=3 + x_Ptr2 = x_Ptr1 + nF; + x2 = *x_Ptr2; + #endif + #if nIC>=4 + x_Ptr3 = x_Ptr2 + nF; + x3 = *x_Ptr3; + #endif + #if nIC>=5 + x_Ptr4 = x_Ptr3 + nF; + x4 = *x_Ptr4; + #endif + #if nIC>=6 + x_Ptr5 = x_Ptr4 + nF; + x5 = *x_Ptr5; + #endif + #if nIC>=7 + x_Ptr6 = x_Ptr5 + nF; + x6 = *x_Ptr6; + #endif + #if nIC>=8 + x_Ptr7 = x_Ptr6 + nF; + x7 = *x_Ptr7; + #endif + #if nIC>=9 + x_Ptr8 = x_Ptr7 + nF; + x8 = *x_Ptr8; + #endif + #if nIC>=10 + x_Ptr9 = x_Ptr8 + nF; + x9 = *x_Ptr9; + #endif + #if nIC>=11 + x_Ptr10 = x_Ptr9 + nF; + x10 = *x_Ptr10; + #endif + #if nIC>=12 + x_Ptr11 = x_Ptr10 + nF; + x11 = *x_Ptr11; + #endif + #if nIC>=13 + x_Ptr12 = x_Ptr11 + nF; + x12 = *x_Ptr12; + #endif + #if nIC>=14 + x_Ptr13 = x_Ptr12 + nF; + x13 = *x_Ptr13; + #endif + #if nIC>=15 + x_Ptr14 = x_Ptr13 + nF; + x14 = *x_Ptr14; + #endif + #if nIC>=16 + x_Ptr15 = x_Ptr14 + nF; + x15 = *x_Ptr15; + #endif + #if nIC>=17 + x_Ptr16 = x_Ptr15 + nF; + x16 = *x_Ptr16; + #endif + #if nIC>=18 + x_Ptr17 = x_Ptr16 + nF; + x17 = *x_Ptr17; + #endif + #if nIC>=19 + x_Ptr18 = x_Ptr17 + nF; + x18 = *x_Ptr18; + #endif + #if nIC>=20 + x_Ptr19 = x_Ptr18 + nF; + x19 = *x_Ptr19; + #endif + + if ( x0 != 0 + #if nIC>=2 + || x1 != 0 + #endif + #if nIC>=3 + || x2 != 0 + #endif + #if nIC>=4 + || x3 != 0 + #endif + #if nIC>=5 + || x4 != 0 + #endif + #if nIC>=6 + || x5 != 0 + #endif + #if nIC>=7 + || x6 != 0 + #endif + #if nIC>=8 + || x7 != 0 + #endif + #if nIC>=9 + || x8 != 0 + #endif + #if nIC>=10 + || x9 != 0 + #endif + #if nIC>=11 + || x10 != 0 + #endif + #if nIC>=12 + || x11 != 0 + #endif + #if nIC>=13 + || x12 != 0 + #endif + #if nIC>=14 + || x13 != 0 + #endif + #if nIC>=15 + || x14 != 0 + #endif + #if nIC>=16 + || x15 != 0 + #endif + #if nIC>=17 + || x16 != 0 + #endif + #if nIC>=18 + || x17 != 0 + #endif + #if nIC>=19 + || x18 != 0 + #endif + #if nIC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + w = (double)(*t_l); + offset = nS * (*t_o); + SFP0ptr = wmrSFP0 + offset; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + 
#endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += w * ( + x0 * (*SFP0ptr++) + #if nIC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nIC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nIC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nIC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nIC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nIC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nIC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nIC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nIC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nIC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nIC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nIC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nIC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nIC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nIC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nIC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nIC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nIC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nIC>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + + t_f++; + t_v++; + t_o++; + t_l++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreads[id]; + t_vEnd = ECv + ECthreads[id+1]; + t_o = ECo + ECthreads[id]; + + x_Ptr0 = x + nIC*nF + ECthreads[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nEC>=2 + x1 = *x_Ptr1++; + #endif + #if nEC>=3 + x2 = *x_Ptr2++; + #endif + #if nEC>=4 + x3 = *x_Ptr3++; + #endif + #if nEC>=5 + x4 = *x_Ptr4++; + #endif + #if nEC>=6 + x5 = *x_Ptr5++; + #endif + #if nEC>=7 + x6 = *x_Ptr6++; + #endif + #if nEC>=8 + x7 = *x_Ptr7++; + #endif + #if nEC>=9 + x8 = *x_Ptr8++; + #endif + #if nEC>=10 + x9 = *x_Ptr9++; + #endif + #if nEC>=11 + x10 = *x_Ptr10++; + #endif + #if nEC>=12 + x11 = *x_Ptr11++; + #endif + #if nEC>=13 + x12 = *x_Ptr12++; + #endif + #if nEC>=14 + x13 = *x_Ptr13++; + #endif + #if nEC>=15 + x14 = *x_Ptr14++; + #endif + #if nEC>=16 + x15 = *x_Ptr15++; + #endif + #if nEC>=17 + x16 = *x_Ptr16++; + #endif + #if nEC>=18 + x17 = *x_Ptr17++; + #endif + #if nEC>=19 + x18 = *x_Ptr18++; + #endif + #if nEC>=20 + x19 = *x_Ptr19++; + #endif + if ( + x0 != 0 + #if nEC>=2 + || x1 != 0 + #endif + #if nEC>=3 + || x2 != 0 
+ #endif + #if nEC>=4 + || x3 != 0 + #endif + #if nEC>=5 + || x4 != 0 + #endif + #if nEC>=6 + || x5 != 0 + #endif + #if nEC>=7 + || x6 != 0 + #endif + #if nEC>=8 + || x7 != 0 + #endif + #if nEC>=9 + || x8 != 0 + #endif + #if nEC>=10 + || x9 != 0 + #endif + #if nEC>=11 + || x10 != 0 + #endif + #if nEC>=12 + || x11 != 0 + #endif + #if nEC>=13 + || x12 != 0 + #endif + #if nEC>=14 + || x13 != 0 + #endif + #if nEC>=15 + || x14 != 0 + #endif + #if nEC>=16 + || x15 != 0 + #endif + #if nEC>=17 + || x16 != 0 + #endif + #if nEC>=18 + || x17 != 0 + #endif + #if nEC>=19 + || x18 != 0 + #endif + #if nEC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + SFP0ptr = wmhSFP0 + offset; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + #endif + #if nEC>=12 + SFP11ptr = wmhSFP11 + offset; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nEC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nEC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nEC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nEC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nEC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nEC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nEC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nEC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nEC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nEC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nEC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nEC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nEC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nEC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nEC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nEC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nEC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nEC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nEC>=20 + + x19 * (*SFP19ptr++) + #endif + + ); + } + t_v++; + t_o++; + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + x_Ptr14 = x_Ptr13 + nV; 
+ #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nISO>=2 + x1 = *x_Ptr1++; + #endif + #if nISO>=3 + x2 = *x_Ptr2++; + #endif + #if nISO>=4 + x3 = *x_Ptr3++; + #endif + #if nISO>=5 + x4 = *x_Ptr4++; + #endif + #if nISO>=6 + x5 = *x_Ptr5++; + #endif + #if nISO>=7 + x6 = *x_Ptr6++; + #endif + #if nISO>=8 + x7 = *x_Ptr7++; + #endif + #if nISO>=9 + x8 = *x_Ptr8++; + #endif + #if nISO>=10 + x9 = *x_Ptr9++; + #endif + #if nISO>=11 + x10 = *x_Ptr10++; + #endif + #if nISO>=12 + x11 = *x_Ptr11++; + #endif + #if nISO>=13 + x12 = *x_Ptr12++; + #endif + #if nISO>=14 + x13 = *x_Ptr13++; + #endif + #if nISO>=15 + x14 = *x_Ptr14++; + #endif + #if nISO>=16 + x15 = *x_Ptr15++; + #endif + #if nISO>=17 + x16 = *x_Ptr16++; + #endif + #if nISO>=18 + x17 = *x_Ptr17++; + #endif + #if nISO>=19 + x18 = *x_Ptr18++; + #endif + #if nISO>=20 + x19 = *x_Ptr19++; + #endif + + if ( + x0 != 0 + #if nISO>=2 + || x1 != 0 + #endif + #if nISO>=3 + || x2 != 0 + #endif + #if nISO>=4 + || x3 != 0 + #endif + #if nISO>=5 + || x4 != 0 + #endif + #if nISO>=6 + || x5 != 0 + #endif + #if nISO>=7 + || x6 != 0 + #endif + #if nISO>=8 + || x7 != 0 + #endif + #if nISO>=9 + || x8 != 0 + #endif + #if nISO>=10 + || x9 != 0 + #endif + #if nISO>=11 + || x10 != 0 + #endif + #if nISO>=12 + || x11 != 0 + #endif + #if nISO>=13 + || x12 != 0 + #endif + #if nISO>=14 + || x13 != 0 + #endif + #if nISO>=15 + || x14 != 0 + #endif + #if nISO>=16 + || x15 != 0 + #endif + #if nISO>=17 + || x16 != 0 + #endif + #if nISO>=18 + || x17 != 0 + #endif + #if nISO>=19 + || x18 != 0 + #endif + #if nISO>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nISO>=2 + + x1 * (*SFP1ptr++) + #endif + #if nISO>=3 + + x2 * (*SFP2ptr++) + #endif + #if nISO>=4 + + x3 * (*SFP3ptr++) + #endif + #if nISO>=5 + + x4 * (*SFP4ptr++) + #endif + #if nISO>=6 + + x5 * (*SFP5ptr++) + #endif + #if nISO>=7 + + x6 * (*SFP6ptr++) + #endif + #if nISO>=8 + + x7 * (*SFP7ptr++) + #endif + #if nISO>=9 + + x8 * (*SFP8ptr++) + #endif + #if nISO>=10 + + x9 * (*SFP9ptr++) + #endif + #if nISO>=11 + + x10 * (*SFP10ptr++) + #endif + #if nISO>=12 + + x11 * (*SFP11ptr++) + #endif + #if nISO>=13 + + x12 * (*SFP12ptr++) + #endif + #if nISO>=14 + + x13 * (*SFP13ptr++) + #endif + #if nISO>=15 + + x14 * (*SFP14ptr++) + #endif + #if nISO>=16 + + x15 * (*SFP15ptr++) + #endif + #if 
nISO>=17 + + x16 * (*SFP16ptr++) + #endif + #if nISO>=18 + + x17 * (*SFP17ptr++) + #endif + #if nISO>=19 + + x18 * (*SFP18ptr++) + #endif + #if nISO>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = 
isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreads = _ICthreads; + ECthreads = _ECthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // intra-cellular compartments + t_v = ICv; + t_vEnd = ICv + n; + t_o = ICo; + t_l = ICl; + t_f = ICf; + t_t = ICthreadsT; + + while( t_v != t_vEnd ) + { + // in this case, I need to walk throug because the segments are ordered in "voxel order" + if ( *t_t == id ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + + Y_tmp = *Yptr; + SFP0ptr = wmrSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + 
#endif + #if nIC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + w = (double)(*t_l); + x[*t_f] += w * x0; + #if nIC>=2 + x[*t_f+nF] += w * x1; + #endif + #if nIC>=3 + x[*t_f+2*nF] += w * x2; + #endif + #if nIC>=4 + x[*t_f+3*nF] += w * x3; + #endif + #if nIC>=5 + x[*t_f+4*nF] += w * x4; + #endif + #if nIC>=6 + x[*t_f+5*nF] += w * x5; + #endif + #if nIC>=7 + x[*t_f+6*nF] += w * x6; + #endif + #if nIC>=8 + x[*t_f+7*nF] += w * x7; + #endif + #if nIC>=9 + x[*t_f+8*nF] += w * x8; + #endif + #if nIC>=10 + x[*t_f+9*nF] += w * x9; + #endif + #if nIC>=11 + x[*t_f+10*nF] += w * x10; + #endif + #if nIC>=12 + x[*t_f+11*nF] += w * x11; + #endif + #if nIC>=13 + x[*t_f+12*nF] += w * x12; + #endif + #if nIC>=14 + x[*t_f+13*nF] += w * x13; + #endif + #if nIC>=15 + x[*t_f+14*nF] += w * x14; + #endif + #if nIC>=16 + x[*t_f+15*nF] += w * x15; + #endif + #if nIC>=17 + x[*t_f+16*nF] += w * x16; + #endif + #if nIC>=18 + x[*t_f+17*nF] += w * x17; + #endif + #if nIC>=19 + x[*t_f+18*nF] += w * x18; + #endif + #if nIC>=20 + x[*t_f+19*nF] += w * x19; + #endif + } + + t_f++; + t_v++; + t_o++; + t_l++; + t_t++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreadsT[id]; + t_vEnd = ECv + ECthreadsT[id+1]; + t_o = ECo + ECthreadsT[id]; + + x_Ptr0 = x + nIC*nF + ECthreadsT[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + offset = nS * (*t_o++); + + Y_tmp = *Yptr; + SFP0ptr = wmhSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + 
SFP11ptr = wmhSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + (*x_Ptr0++) += x0; + #if nEC>=2 + (*x_Ptr1++) += x1; + #endif + #if nEC>=3 + (*x_Ptr2++) += x2; + #endif + #if nEC>=4 + (*x_Ptr3++) += x3; + #endif + #if nEC>=5 + (*x_Ptr4++) += x4; + #endif + #if nEC>=6 + (*x_Ptr5++) += x5; + #endif + #if nEC>=7 + (*x_Ptr6++) += x6; + #endif + #if nEC>=8 + (*x_Ptr7++) += x7; + #endif + #if nEC>=9 + (*x_Ptr8++) += x8; + #endif + #if nEC>=10 + (*x_Ptr9++) += x9; + #endif + #if nEC>=11 + (*x_Ptr10++) += x10; + #endif + #if nEC>=12 + (*x_Ptr11++) += x11; + #endif + #if nEC>=13 + (*x_Ptr12++) += x12; + #endif + #if nEC>=14 + (*x_Ptr13++) += x13; + #endif + #if nEC>=15 + (*x_Ptr14++) += x14; + #endif + #if nEC>=16 + (*x_Ptr15++) += x15; + #endif + #if nEC>=17 + (*x_Ptr16++) += x16; + #endif + #if nEC>=18 + (*x_Ptr17++) += x17; + #endif + #if nEC>=19 + (*x_Ptr18++) += x18; + #endif + #if nEC>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + 
x_Ptr14 = x_Ptr13 + nV; + #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + Y_tmp = *Yptr; + x0 = (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + (*x_Ptr0++) += x0; + #if nISO>=2 + (*x_Ptr1++) += x1; + #endif + #if nISO>=3 + (*x_Ptr2++) += x2; + 
#endif + #if nISO>=4 + (*x_Ptr3++) += x3; + #endif + #if nISO>=5 + (*x_Ptr4++) += x4; + #endif + #if nISO>=6 + (*x_Ptr5++) += x5; + #endif + #if nISO>=7 + (*x_Ptr6++) += x6; + #endif + #if nISO>=8 + (*x_Ptr7++) += x7; + #endif + #if nISO>=9 + (*x_Ptr8++) += x8; + #endif + #if nISO>=10 + (*x_Ptr9++) += x9; + #endif + #if nISO>=11 + (*x_Ptr10++) += x10; + #endif + #if nISO>=12 + (*x_Ptr11++) += x11; + #endif + #if nISO>=13 + (*x_Ptr12++) += x12; + #endif + #if nISO>=14 + (*x_Ptr13++) += x13; + #endif + #if nISO>=15 + (*x_Ptr14++) += x14; + #endif + #if nISO>=16 + (*x_Ptr15++) += x15; + #endif + #if nISO>=17 + (*x_Ptr16++) += x16; + #endif + #if nISO>=18 + (*x_Ptr17++) += x17; + #endif + #if nISO>=19 + (*x_Ptr18++) += x18; + #endif + #if nISO>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + 
_ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreadsT = _ICthreadsT; + ECthreadsT = _ECthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t lam : - k = 1. - lam/xn - for i in xrange(compartment_start, compartment_start+compartment_size): - x[i] = x[i]*k - else : - for i in xrange(compartment_start, compartment_start+compartment_size): - x[i] = 0 - return np.asarray( x ) - - -cpdef omega_group_sparsity(double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n) : - """ - References: - [1] Jenatton et al. - `Proximal Methods for Hierarchical Sparse Coding` - """ - cdef: - int nG = group_size.size, N - int k, i, j = 0 - double omega = 0.0, gNorm, x_i - - if lam != 0: - if n == 2: - for k in xrange(nG): - N = group_size[k] - gNorm = 0.0 - for i in xrange(j,j+N) : - x_i = x[group_idx[i]] - gNorm += x_i*x_i - omega += group_weight[k] * sqrt( gNorm ) - j += N - elif n == np.inf: - for k in xrange(nG): - N = group_size[k] - gNorm = x[group_idx[j]] - for i in xrange(j+1,j+N) : - x_i = x[group_idx[i]] - if x_i > gNorm : - gNorm = x_i - omega += group_weight[k] * gNorm - j += N - return lam*omega - - -cpdef prox_group_sparsity( double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n ) : - """ - References: - [1] Jenatton et al. 
- `Proximal Methods for Hierarchical Sparse Coding` - """ - cdef: - int nG = group_size.size, N - int k, i, j = 0 - double wl, gNorm, x_i - - k = x.size - for i in xrange(k): - if x[i] <= 0.0: - x[i] = 0.0 - - if lam != 0: - if n == 2 : - for k in xrange(nG) : - N = group_size[k] - gNorm = 0.0 - for i in xrange(j,j+N) : - x_i = x[group_idx[i]] - gNorm += x_i*x_i - gNorm = sqrt( gNorm ) - - wl = group_weight[k] * lam - if gNorm <= wl : - for i in xrange(j,j+N) : - x[ group_idx[i] ] = 0.0 - else : - wl = (gNorm-wl)/gNorm - for i in xrange(j,j+N) : - x[ group_idx[i] ] *= wl - j += N - # elif n == np.inf : - # [TODO] TO be correctly implemented - # for k in range(nG) : - # idx = subtree[k] - # # xn = max( v[idx] ) - # r = weight[k] * lam - # for i in idx : - # if v[i] <= r: - # v[i] = 0.0 - # else : - # v[i] -= r +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +cimport cython +import numpy as np +cimport numpy as np +from libc.math cimport sqrt + + +cpdef non_negativity(double [::1] x, int compartment_start, int compartment_size): + """ + POCS for the first orthant (non-negativity) + """ + cdef: + int i + for i in xrange(compartment_start, compartment_start+compartment_size): + if x[i] <= 0.0 : + x[i] = 0.0 + return np.asarray( x ) + + +cpdef soft_thresholding(double [::1] x, double lam, int compartment_start, int compartment_size) : + """ + Proximal of L1 norm + """ + # NB: this preserves non-negativity + cdef: + int i + for i in xrange(compartment_start, compartment_start+compartment_size): + if x[i] <= lam: + x[i] = 0.0 + else: + x[i] = x[i] - lam + return np.asarray( x ) + + +cpdef projection_onto_l2_ball(double [::1] x, double lam, int compartment_start, int compartment_size) : + """ + Proximal of L2 norm + """ + # NB: this preserves non-negativity + cdef: + double xn = 0.0, k + int i + for i in xrange(compartment_start, compartment_start+compartment_size): + xn += x[i]*x[i] + xn = sqrt(xn) + if xn > lam : + k = 1. - lam/xn + for i in xrange(compartment_start, compartment_start+compartment_size): + x[i] = x[i]*k + else : + for i in xrange(compartment_start, compartment_start+compartment_size): + x[i] = 0 + return np.asarray( x ) + + +cpdef omega_group_sparsity(double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n) : + """ + References: + [1] Jenatton et al. - `Proximal Methods for Hierarchical Sparse Coding` + """ + cdef: + int nG = group_size.size, N + int k, i, j = 0 + double omega = 0.0, gNorm, x_i + + if lam != 0: + if n == 2: + for k in xrange(nG): + N = group_size[k] + gNorm = 0.0 + for i in xrange(j,j+N) : + x_i = x[group_idx[i]] + gNorm += x_i*x_i + omega += group_weight[k] * sqrt( gNorm ) + j += N + elif n == np.inf: + for k in xrange(nG): + N = group_size[k] + gNorm = x[group_idx[j]] + for i in xrange(j+1,j+N) : + x_i = x[group_idx[i]] + if x_i > gNorm : + gNorm = x_i + omega += group_weight[k] * gNorm + j += N + return lam*omega + + +cpdef prox_group_sparsity( double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n ) : + """ + References: + [1] Jenatton et al. 
- `Proximal Methods for Hierarchical Sparse Coding` + """ + cdef: + int nG = group_size.size, N + int k, i, j = 0 + double wl, gNorm, x_i + + k = x.size + for i in xrange(k): + if x[i] <= 0.0: + x[i] = 0.0 + + if lam != 0: + if n == 2 : + for k in xrange(nG) : + N = group_size[k] + gNorm = 0.0 + for i in xrange(j,j+N) : + x_i = x[group_idx[i]] + gNorm += x_i*x_i + gNorm = sqrt( gNorm ) + + wl = group_weight[k] * lam + if gNorm <= wl : + for i in xrange(j,j+N) : + x[ group_idx[i] ] = 0.0 + else : + wl = (gNorm-wl)/gNorm + for i in xrange(j,j+N) : + x[ group_idx[i] ] *= wl + j += N + # elif n == np.inf : + # [TODO] TO be correctly implemented + # for k in range(nG) : + # idx = subtree[k] + # # xn = max( v[idx] ) + # r = weight[k] * lam + # for i in idx : + # if v[i] <= r: + # v[i] = 0.0 + # else : + # v[i] -= r return np.asarray( x ) \ No newline at end of file diff --git a/commit/solvers.py b/commit/solvers.py index dc7767ce..29bc8374 100755 --- a/commit/solvers.py +++ b/commit/solvers.py @@ -1,403 +1,403 @@ -""" -Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona - -This structure is based on the previous work of Rafael Carrillo and was -supported by the LTS5 laboratory at EPFL, Lausanne. -""" -from __future__ import print_function -import numpy as np -from math import sqrt -import sys -import warnings -eps = np.finfo(float).eps - -from commit.proximals import (non_negativity, - omega_group_sparsity, - prox_group_sparsity, - soft_thresholding, - projection_onto_l2_ball) -group_sparsity = -1 -non_negative = 0 -norm1 = 1 -norm2 = 2 -norminf = np.inf -list_regnorms = [group_sparsity, non_negative, norm1, norm2] -list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 - - -def init_regularisation(commit_evaluation, - regnorms = (non_negative, non_negative, non_negative), - structureIC = None, weightsIC = None, group_norm = 2, - lambdas = (.0,.0,.0) ): - """ - Initialise the data structure that defines Omega in - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - - Input - ----- - commit_evaluation - commit.Evaluation object : - dictionary and model have to be loaded beforehand. - - - regnorms - tuple : - this sets the penalty term to be used for each compartment. - Default = (non_negative,non_negative,non_negative). - - regnorms[0] corresponds to the Intracellular compartment - regnorms[1] corresponds to the Extracellular compartment - regnorms[2] corresponds to the Isotropic compartment - - Each regnorms[k] must be one of commit.solvers. - {group_sparsity, non_negative, norm1, norm2}. - - commit.solvers.group_sparsity considers both the non-overlapping - and the hierarchical group sparsity (see [1]). This option is - allowed only in the IC compartment. The mathematical formulation - of this term is - $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| - - commit.solvers.non_negative puts a non negativity constraint on the - coefficients corresponding to the compartment. This is the - default option for each compartment - - commit.solvers.norm1 penalises with the 1-norm of the coefficients - corresponding to the compartment. - - commit.solvers.norm2 penalises with the 2-norm of the coefficients - corresponding to the compartment. - - - structureIC - np.array(list(list)) : - group structure for the IC compartment. - This field is necessary only if regterm[0]=commit.solver.group_sparsity. 
- Example: - structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) - - that is equivalent to - [0,1,2,3,4,5] [6] - / \ - [0,2,5] [1,3,4] - which has two non overlapping groups, one of which is the union - of two other non-overlapping groups. - - - weightsIC - np.array(np.float64) : - this defines the weights associated to each group of structure IC. - - - group_norm - number : - norm type for the commit.solver.group_sparsity penalisation of the IC compartment. - Default: group_norm = commit.solver.norm2 - To be chosen among commit.solver.{norm2,norminf}. - - lambdas - tuple : - regularisation parameter for each compartment. - Default: lambdas = (0.0, 0.0, 0.0) - The lambdas correspond to the onse described in the mathematical - formulation of the regularisation term - $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ - - - References: - [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' - """ - regularisation = {} - - regularisation['startIC'] = 0 - regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) - regularisation['startEC'] = int( regularisation['sizeIC'] ) - regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) - regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) - regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) - - regularisation['normIC'] = regnorms[0] - regularisation['normEC'] = regnorms[1] - regularisation['normISO'] = regnorms[2] - - regularisation['lambdaIC'] = float( lambdas[0] ) - regularisation['lambdaEC'] = float( lambdas[1] ) - regularisation['lambdaISO'] = float( lambdas[2] ) - - # Solver-specific fields - regularisation['structureIC'] = structureIC - regularisation['weightsIC'] = weightsIC - regularisation['group_norm'] = group_norm - - return regularisation - - -def regularisation2omegaprox(regularisation): - lambdaIC = float(regularisation.get('lambdaIC')) - lambdaEC = float(regularisation.get('lambdaEC')) - lambdaISO = float(regularisation.get('lambdaISO')) - if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: - raise ValueError('Negative regularisation parameters are not allowed') - - normIC = regularisation.get('normIC') - normEC = regularisation.get('normEC') - normISO = regularisation.get('normISO') - if not normIC in list_regnorms: - raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normEC in list_regnorms: - raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normISO in list_regnorms: - raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - - ## NNLS case - if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - return omega, prox - - ## All other cases - # Intracellular Compartment - startIC = regularisation.get('startIC') - sizeIC = regularisation.get('sizeIC') - if lambdaIC == 0.0: - omegaIC = lambda x: 0.0 - proxIC = lambda x: x - elif normIC == norm2: - omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) - proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) - elif normIC == norm1: - omegaIC = lambda x: 
lambdaIC * sum( x[startIC:sizeIC] ) - proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) - elif normIC == non_negative: - omegaIC = lambda x: 0.0 - proxIC = lambda x: non_negativity(x, startIC, sizeIC) - elif normIC == group_sparsity: - structureIC = regularisation.get('structureIC') - groupWeightIC = regularisation.get('weightsIC') - if not len(structureIC) == len(groupWeightIC): - raise ValueError('Number of groups and weights do not coincide.') - group_norm = regularisation.get('group_norm') - if not group_norm in list_group_sparsity_norms: - raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) - - # convert to new data structure (needed for faster access) - N = np.sum([g.size for g in structureIC]) - groupIdxIC = np.zeros( (N,), dtype=np.int32 ) - groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) - pos = 0 - for i, g in enumerate(structureIC) : - groupSizeIC[i] = g.size - groupIdxIC[pos:(pos+g.size)] = g[:] - pos += g.size - - omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - else: - raise ValueError('Type of regularisation for IC compartment not recognized.') - - - # Extracellular Compartment - startEC = regularisation.get('startEC') - sizeEC = regularisation.get('sizeEC') - if lambdaEC == 0.0: - omegaEC = lambda x: 0.0 - proxEC = lambda x: x - elif normEC == norm2: - omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) - proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) - elif normEC == norm1: - omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) - proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) - elif normEC == non_negative: - omegaEC = lambda x: 0.0 - proxEC = lambda x: non_negativity(x, startEC, sizeEC) - else: - raise ValueError('Type of regularisation for EC compartment not recognized.') - - # Isotropic Compartment - startISO = regularisation.get('startISO') - sizeISO = regularisation.get('sizeISO') - if lambdaISO == 0.0: - omegaISO = lambda x: 0.0 - proxISO = lambda x: x - elif normISO == norm2: - omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) - proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) - elif normISO == norm1: - omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) - proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) - elif normISO == non_negative: - omegaISO = lambda x: 0.0 - proxISO = lambda x: non_negativity(x, startISO, sizeISO) - else: - raise ValueError('Type of regularisation for ISO compartment not recognized.') - - omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) - prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced - - return omega, prox - - -def evaluate_model(y, A, x, regularisation = None): - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - else: - omega, _ = regularisation2omegaprox(regularisation) - - return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) - - -def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the Omega described by 'regularisation'. 
- - Check the documentation of commit.solvers.init_regularisation to see how to - solve a specific problem. - """ - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, x.size) - else: - omega, prox = regularisation2omegaprox(regularisation) - - if x0 is None: - x0 = np.zeros(A.shape[1]) - - return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) - - -def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the FISTA algorithm described in [1]. - - The penalty term and its proximal operator must be defined in such a way - that they already contain the regularisation parameter. - - References: - [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding - Algorithm for Linear Inverse Problems` - """ - - # Initialization - res = -y.copy() - xhat = x0.copy() - x = np.zeros_like(xhat) - res += A.dot(xhat) - proximal( xhat ) - reg_term = omega( xhat ) - prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term - - told = 1 - beta = 0.9 - prev_x = xhat.copy() - grad = np.asarray(At.dot(res)) - qfval = prev_obj - - # Step size computation - L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 - mu = 1.9 / L - - # Main loop - if verbose >= 1 : - print() - print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) - print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) - iter = 1 - while True : - if verbose >= 1 : - print( "%4d |" % iter, end="" ) - sys.stdout.flush() - - # Smooth step - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Backtracking - while curr_obj > q : - # Smooth step - mu = beta*mu - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Global stopping criterion - abs_obj = abs(curr_obj - prev_obj) - rel_obj = abs_obj / curr_obj - abs_x = np.linalg.norm(x - prev_x) - rel_x = abs_x / ( np.linalg.norm(x) + eps ) - if verbose >= 1 : - print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) - - if abs_obj < eps : - criterion = "Absolute tolerance on the objective" - break - elif rel_obj < tol_fun : - criterion = "Relative tolerance on the objective" - break - elif abs_x < eps : - criterion = "Absolute tolerance on the unknown" - break - elif rel_x < tol_x : - criterion = "Relative tolerance on the unknown" - break - elif iter >= max_iter : - criterion = "Maximum number of iterations" - break - - # FISTA update - t = 0.5 * ( 1 + sqrt(1+4*told**2) ) - xhat = x + (told-1)/t * (x - prev_x) - - # Gradient computation - res = A.dot(xhat) - y - xarr = np.asarray(x) - - grad = np.asarray(At.dot(res)) - - # Update variables - iter += 1 - prev_obj = curr_obj - prev_x = x.copy() - told = t - qfval = 0.5 * np.linalg.norm(res)**2 - - - if verbose >= 1 : - print( "< Stopping criterion: %s >" % criterion ) - - 
opt_details = {} - opt_details['residual'] = 0.5*res_norm**2 - opt_details['regterm'] = reg_term_x - opt_details['cost_function'] = curr_obj - opt_details['abs_cost'] = abs_obj - opt_details['rel_cost'] = rel_obj - opt_details['abs_x'] = abs_x - opt_details['rel _x'] = rel_x - opt_details['iterations'] = iter - opt_details['stopping_criterion'] = criterion - - return x, opt_details +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +from __future__ import print_function +import numpy as np +from math import sqrt +import sys +import warnings +eps = np.finfo(float).eps + +from commit.proximals import (non_negativity, + omega_group_sparsity, + prox_group_sparsity, + soft_thresholding, + projection_onto_l2_ball) +group_sparsity = -1 +non_negative = 0 +norm1 = 1 +norm2 = 2 +norminf = np.inf +list_regnorms = [group_sparsity, non_negative, norm1, norm2] +list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 + + +def init_regularisation(commit_evaluation, + regnorms = (non_negative, non_negative, non_negative), + structureIC = None, weightsIC = None, group_norm = 2, + lambdas = (.0,.0,.0) ): + """ + Initialise the data structure that defines Omega in + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + + Input + ----- + commit_evaluation - commit.Evaluation object : + dictionary and model have to be loaded beforehand. + + + regnorms - tuple : + this sets the penalty term to be used for each compartment. + Default = (non_negative,non_negative,non_negative). + + regnorms[0] corresponds to the Intracellular compartment + regnorms[1] corresponds to the Extracellular compartment + regnorms[2] corresponds to the Isotropic compartment + + Each regnorms[k] must be one of commit.solvers. + {group_sparsity, non_negative, norm1, norm2}. + + commit.solvers.group_sparsity considers both the non-overlapping + and the hierarchical group sparsity (see [1]). This option is + allowed only in the IC compartment. The mathematical formulation + of this term is + $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| + + commit.solvers.non_negative puts a non negativity constraint on the + coefficients corresponding to the compartment. This is the + default option for each compartment + + commit.solvers.norm1 penalises with the 1-norm of the coefficients + corresponding to the compartment. + + commit.solvers.norm2 penalises with the 2-norm of the coefficients + corresponding to the compartment. + + + structureIC - np.array(list(list)) : + group structure for the IC compartment. + This field is necessary only if regterm[0]=commit.solver.group_sparsity. + Example: + structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) + + that is equivalent to + [0,1,2,3,4,5] [6] + / \ + [0,2,5] [1,3,4] + which has two non overlapping groups, one of which is the union + of two other non-overlapping groups. + + + weightsIC - np.array(np.float64) : + this defines the weights associated to each group of structure IC. + + + group_norm - number : + norm type for the commit.solver.group_sparsity penalisation of the IC compartment. + Default: group_norm = commit.solver.norm2 + To be chosen among commit.solver.{norm2,norminf}. + + lambdas - tuple : + regularisation parameter for each compartment. 
+ Default: lambdas = (0.0, 0.0, 0.0) + The lambdas correspond to the onse described in the mathematical + formulation of the regularisation term + $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ + + + References: + [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' + """ + regularisation = {} + + regularisation['startIC'] = 0 + regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) + regularisation['startEC'] = int( regularisation['sizeIC'] ) + regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) + regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) + regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) + + regularisation['normIC'] = regnorms[0] + regularisation['normEC'] = regnorms[1] + regularisation['normISO'] = regnorms[2] + + regularisation['lambdaIC'] = float( lambdas[0] ) + regularisation['lambdaEC'] = float( lambdas[1] ) + regularisation['lambdaISO'] = float( lambdas[2] ) + + # Solver-specific fields + regularisation['structureIC'] = structureIC + regularisation['weightsIC'] = weightsIC + regularisation['group_norm'] = group_norm + + return regularisation + + +def regularisation2omegaprox(regularisation): + lambdaIC = float(regularisation.get('lambdaIC')) + lambdaEC = float(regularisation.get('lambdaEC')) + lambdaISO = float(regularisation.get('lambdaISO')) + if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: + raise ValueError('Negative regularisation parameters are not allowed') + + normIC = regularisation.get('normIC') + normEC = regularisation.get('normEC') + normISO = regularisation.get('normISO') + if not normIC in list_regnorms: + raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normEC in list_regnorms: + raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normISO in list_regnorms: + raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + + ## NNLS case + if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + return omega, prox + + ## All other cases + # Intracellular Compartment + startIC = regularisation.get('startIC') + sizeIC = regularisation.get('sizeIC') + if lambdaIC == 0.0: + omegaIC = lambda x: 0.0 + proxIC = lambda x: x + elif normIC == norm2: + omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) + proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) + elif normIC == norm1: + omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) + proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) + elif normIC == non_negative: + omegaIC = lambda x: 0.0 + proxIC = lambda x: non_negativity(x, startIC, sizeIC) + elif normIC == group_sparsity: + structureIC = regularisation.get('structureIC') + groupWeightIC = regularisation.get('weightsIC') + if not len(structureIC) == len(groupWeightIC): + raise ValueError('Number of groups and weights do not coincide.') + group_norm = regularisation.get('group_norm') + if not group_norm in list_group_sparsity_norms: + raise ValueError('Wrong norm in the structured sparsity term. 
Choose between %s.' % str(list_group_sparsity_norms)) + + # convert to new data structure (needed for faster access) + N = np.sum([g.size for g in structureIC]) + groupIdxIC = np.zeros( (N,), dtype=np.int32 ) + groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) + pos = 0 + for i, g in enumerate(structureIC) : + groupSizeIC[i] = g.size + groupIdxIC[pos:(pos+g.size)] = g[:] + pos += g.size + + omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + else: + raise ValueError('Type of regularisation for IC compartment not recognized.') + + + # Extracellular Compartment + startEC = regularisation.get('startEC') + sizeEC = regularisation.get('sizeEC') + if lambdaEC == 0.0: + omegaEC = lambda x: 0.0 + proxEC = lambda x: x + elif normEC == norm2: + omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) + proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) + elif normEC == norm1: + omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) + proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) + elif normEC == non_negative: + omegaEC = lambda x: 0.0 + proxEC = lambda x: non_negativity(x, startEC, sizeEC) + else: + raise ValueError('Type of regularisation for EC compartment not recognized.') + + # Isotropic Compartment + startISO = regularisation.get('startISO') + sizeISO = regularisation.get('sizeISO') + if lambdaISO == 0.0: + omegaISO = lambda x: 0.0 + proxISO = lambda x: x + elif normISO == norm2: + omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) + proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) + elif normISO == norm1: + omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) + proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) + elif normISO == non_negative: + omegaISO = lambda x: 0.0 + proxISO = lambda x: non_negativity(x, startISO, sizeISO) + else: + raise ValueError('Type of regularisation for ISO compartment not recognized.') + + omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) + prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced + + return omega, prox + + +def evaluate_model(y, A, x, regularisation = None): + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + else: + omega, _ = regularisation2omegaprox(regularisation) + + return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) + + +def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the Omega described by 'regularisation'. + + Check the documentation of commit.solvers.init_regularisation to see how to + solve a specific problem. + """ + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, x.size) + else: + omega, prox = regularisation2omegaprox(regularisation) + + if x0 is None: + x0 = np.zeros(A.shape[1]) + + return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) + + +def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the FISTA algorithm described in [1]. 
+ + The penalty term and its proximal operator must be defined in such a way + that they already contain the regularisation parameter. + + References: + [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding + Algorithm for Linear Inverse Problems` + """ + + # Initialization + res = -y.copy() + xhat = x0.copy() + x = np.zeros_like(xhat) + res += A.dot(xhat) + proximal( xhat ) + reg_term = omega( xhat ) + prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term + + told = 1 + beta = 0.9 + prev_x = xhat.copy() + grad = np.asarray(At.dot(res)) + qfval = prev_obj + + # Step size computation + L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 + mu = 1.9 / L + + # Main loop + if verbose >= 1 : + print() + print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) + print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) + iter = 1 + while True : + if verbose >= 1 : + print( "%4d |" % iter, end="" ) + sys.stdout.flush() + + # Smooth step + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Backtracking + while curr_obj > q : + # Smooth step + mu = beta*mu + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Global stopping criterion + abs_obj = abs(curr_obj - prev_obj) + rel_obj = abs_obj / curr_obj + abs_x = np.linalg.norm(x - prev_x) + rel_x = abs_x / ( np.linalg.norm(x) + eps ) + if verbose >= 1 : + print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) + + if abs_obj < eps : + criterion = "Absolute tolerance on the objective" + break + elif rel_obj < tol_fun : + criterion = "Relative tolerance on the objective" + break + elif abs_x < eps : + criterion = "Absolute tolerance on the unknown" + break + elif rel_x < tol_x : + criterion = "Relative tolerance on the unknown" + break + elif iter >= max_iter : + criterion = "Maximum number of iterations" + break + + # FISTA update + t = 0.5 * ( 1 + sqrt(1+4*told**2) ) + xhat = x + (told-1)/t * (x - prev_x) + + # Gradient computation + res = A.dot(xhat) - y + xarr = np.asarray(x) + + grad = np.asarray(At.dot(res)) + + # Update variables + iter += 1 + prev_obj = curr_obj + prev_x = x.copy() + told = t + qfval = 0.5 * np.linalg.norm(res)**2 + + + if verbose >= 1 : + print( "< Stopping criterion: %s >" % criterion ) + + opt_details = {} + opt_details['residual'] = 0.5*res_norm**2 + opt_details['regterm'] = reg_term_x + opt_details['cost_function'] = curr_obj + opt_details['abs_cost'] = abs_obj + opt_details['rel_cost'] = rel_obj + opt_details['abs_x'] = abs_x + opt_details['rel _x'] = rel_x + opt_details['iterations'] = iter + opt_details['stopping_criterion'] = criterion + + return x, opt_details diff --git a/commit/trk2dictionary/trk2dictionary.pyx b/commit/trk2dictionary/trk2dictionary.pyx index 9a0b6099..181222d3 100755 --- a/commit/trk2dictionary/trk2dictionary.pyx +++ b/commit/trk2dictionary/trk2dictionary.pyx @@ -1,430 +1,430 @@ 
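For orientation before the old/new listings of trk2dictionary.pyx that follow: the run() function defined in this file is the user-facing entry point that converts a tractogram into the sparse data structure used by the operator A. A minimal usage sketch, assuming placeholder file names (tractogram.tck, peaks.nii.gz, wm_mask.nii.gz are illustrative, not files referenced by this patch); all keyword arguments appear in the cpdef run() signature below:

    from commit import trk2dictionary

    trk2dictionary.run(
        filename_tractogram = 'tractogram.tck',  # placeholder input tractogram
        TCK_ref_image       = 'peaks.nii.gz',    # geometry reference, required for .tck inputs
        path_out            = 'COMMIT',          # folder for the sparse data structure
        filename_peaks      = 'peaks.nii.gz',    # optional extra-cellular contributions
        filename_mask       = 'wm_mask.nii.gz',  # optional mask restricting the analysis
        fiber_shift         = 0.5,               # half-voxel shift, in voxel units
        ndirs               = 32761              # default orientation discretisation
    )

The values shown are illustrative only; see the parameter descriptions in the docstring below for their meaning and defaults.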
-#!python -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False -from __future__ import print_function -import cython -import numpy as np -cimport numpy as np -import nibabel -from os.path import join, exists, splitext, dirname, isdir -from os import makedirs, remove -import time -import amico -import pickle -from amico.util import LOG, NOTE, WARNING, ERROR -from pkg_resources import get_distribution - - -# Interface to actual C code -cdef extern from "trk2dictionary_c.cpp": - int trk2dictionary( - char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, - int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* prtHashTable - ) nogil - - -cpdef run( filename_tractogram=None, path_out=None, filename_peaks=None, filename_mask=None, do_intersect=True, - fiber_shift=0, min_seg_len=1e-3, min_fiber_len=0.0, max_fiber_len=250.0, points_to_skip=0, - vf_THR=0.1, peaks_use_affine=False, flip_peaks=[False,False,False], - blur_radii=[], blur_samples=[], blur_sigma=0.0, - filename_trk=None, gen_trk=None, TCK_ref_image=None, ndirs=32761 - ): - """Perform the conversion of a tractoram to the sparse data-structure internally - used by COMMIT to perform the matrix-vector multiplications with the operator A - during the inversion of the linear system. - - Parameters - ---------- - filename_tractogram : string - Path to the tractogram (.trk or .tck) containing the streamlines to load. - - TCK_ref_image: string - When loading a .tck tractogram, path to the NIFTI file containing the information about - the geometry to be used for the tractogram to load. If not specified, it will try to use - the information from filename_peaks or filename_mask. - - path_out : string - Path to the folder for storing the sparse data structure. If not specified (default), - a folder name "COMMIT" will be created in the same folder of the tractogram. - - filename_mask : string - Path to a binary mask for restricting the analysis to specific areas. - Segments outside this mask are discarded. If not specified (default), - the mask is created from all voxels intersected by the tracts. - - do_intersect : boolean - If True then fiber segments that intersect voxel boundaries are splitted (default). - If False then the centroid of the segment is used as its voxel position. - - fiber_shift : float or list of three float - If necessary, apply a translation to fiber coordinates (default : 0) to account - for differences between the reference system of the tracking algorithm and COMMIT. - The value is specified in voxel units, eg 0.5 translates by half voxel. - - min_seg_len : float - Discard segments <= than this length in mm (default : 1e-3). - - min_fiber_len : float - Discard streamlines <= than this length in mm (default : 0.0). - - max_fiber_len : float - Discard streamlines >= than this length in mm (default : 250.0). - - points_to_skip : integer - If necessary, discard first points at beginning/end of a fiber (default : 0). 
- - filename_peaks : string - Path to the NIFTI file containing the peaks to use as extra-cellular contributions. - The data matrix should be 4D with last dimension 3*N, where N is the number - of peaks in each voxel. (default : no extra-cellular contributions). - - peaks_use_affine : boolean - Whether to rotate the peaks according to the affine matrix (default : False). - - vf_THR : float - Discard peaks smaller than vf_THR * max peak (default : 0.1). - - flip_peaks : list of three boolean - If necessary, flips peak orientations along each axis (default : no flipping). - - blur_radii : list of float - Translate each segment to given radii to assign a broader fiber contribution (default : []). - - blur_samples : list of integer - Segments are duplicated along a circle at a given radius; this parameter controls the - number of samples to take over a given circle (defaut : []). - - blur_sigma: float - The contributions of the segments at different radii are damped as a Gaussian (default : 0.0). - - ndirs : int - Number of orientations on the sphere used to discretize the orientation of each - each segment in a streamline (default : 32761). - - filename_trk : string - DEPRECATED. Use filename_tractogram instead. - - gen_trk : string - DEPRECATED. No tractogram will be saved any more, but the returned coefficients will account - for the streamlines that were pre-filtered in this function. - """ - - # check the value of ndirs - if not amico.lut.is_valid(ndirs): - ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - # check conflicts of fiber_shift - if np.isscalar(fiber_shift) : - fiber_shiftX = fiber_shift - fiber_shiftY = fiber_shift - fiber_shiftZ = fiber_shift - elif len(fiber_shift) == 3 : - fiber_shiftX = fiber_shift[0] - fiber_shiftY = fiber_shift[1] - fiber_shiftZ = fiber_shift[2] - else : - ERROR( '"fiber_shift" must be a scalar or a vector with 3 elements' ) - - # check for invalid parameters in the blur - if type(blur_radii)==list: - blur_radii = np.ndarray(blur_radii, np.double) - if type(blur_samples)==list: - blur_samples = np.ndarray(blur_samples, np.int32) - - if blur_sigma > 0 : - if blur_radii.size != blur_samples.size : - ERROR( 'The number of blur radii and blur samples must match' ) - - if np.count_nonzero( blur_radii<=0 ): - ERROR( 'A blur radius was <= 0; only positive radii can be used' ) - - if np.count_nonzero( blur_samples<1 ): - ERROR( 'Please specify at least 1 sample per blur radius' ) - - tic = time.time() - LOG( '\n-> Creating the dictionary from tractogram:' ) - - LOG( '\n * Configuration:' ) - print( '\t- Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) - print( '\t- Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) - print( '\t- Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) - print( '\t- Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) - print( '\t- Points to skip = %d' % points_to_skip ) - if min_seg_len >= 1e-3: - print( '\t- Min segment len = %.3f mm' % min_seg_len ) - else: - print( '\t- Min segment len = %.2e mm' % min_seg_len ) - print( '\t- Min fiber len = %.2f mm' % min_fiber_len ) - print( '\t- Max fiber len = %.2f mm' % max_fiber_len ) - - # check blur params - cdef : - double [:] blurRadii - int [:] blurSamples - double [:] blurWeights - double* ptrBlurRadii - int* ptrBlurSamples - double* ptrBlurWeights - int 
nBlurRadii - float [:] ArrayInvM - float* ptrArrayInvM - - # convert to numpy arrays (and add fake radius for original segment) - if blur_sigma == 0: - nBlurRadii = 1 - blurRadii = np.array( [0.0], np.double ) - blurSamples = np.array( [1], np.int32 ) - blurWeights = np.array( [1], np.double ) - else: - nBlurRadii = len(blur_radii)+1 - blurRadii = np.insert( blur_radii, 0, 0.0 ).astype(np.double) - blurSamples = np.insert( blur_samples, 0, 1 ).astype(np.int32) - - # compute weights for gaussian damping - blurWeights = np.empty_like( blurRadii ) - for i in xrange(nBlurRadii): - blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) - - if nBlurRadii == 1 : - print( '\t- Do not blur fibers' ) - else : - print( '\t- Blur fibers:' ) - print( '\t\t- sigma = %.3f' % blur_sigma ) - print( '\t\t- radii = [ ', end="" ) - for i in xrange( 1, blurRadii.size ) : - print( '%.3f ' % blurRadii[i], end="" ) - print( ']' ) - print( '\t\t- weights = [ ', end="" ) - for i in xrange( 1, blurWeights.size ) : - print( '%.3f ' % blurWeights[i], end="" ) - print( ']' ) - print( '\t\t- samples = [ ', end="" ) - for i in xrange( 1, blurSamples.size ) : - print( '%5d ' % blurSamples[i], end="" ) - print( ']' ) - - ptrBlurRadii = &blurRadii[0] - ptrBlurSamples = &blurSamples[0] - ptrBlurWeights = &blurWeights[0] - - if min_seg_len < 0 : - ERROR( '"min_seg_len" must be >= 0' ) - if min_fiber_len < 0 : - ERROR( '"min_fiber_len" must be >= 0' ) - if max_fiber_len < min_fiber_len : - ERROR( '"max_fiber_len" must be >= "min_fiber_len"' ) - - if filename_trk is None and filename_tractogram is None: - ERROR( '"filename_tractogram" not defined' ) - - if filename_trk is not None and filename_tractogram is not None: - WARNING('"filename_trk" will not be considered, "filename_tractogram" will be used') - - if filename_trk is not None and filename_tractogram is None: - filename_tractogram = filename_trk - WARNING('"filename_trk" parameter is deprecated, use "filename_tractogram" instead') - - if path_out is None: - path_out = dirname(filename_tractogram) - if path_out == '': - path_out = '.' 
- if not isdir(path_out): - ERROR( '"path_out" cannot be inferred from "filename_tractogram"' ) - path_out = join(path_out,'COMMIT') - - if gen_trk is not None: - WARNING('"gen_trk" parameter is deprecated') - - # create output path - print( '\t- Output written to "%s"' % path_out ) - if not exists( path_out ): - makedirs( path_out ) - - # Load data from files - LOG( '\n * Loading data:' ) - cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs) - cdef short* ptrHashTable = &htable[0] - - # Streamlines from tractogram - print( '\t- Tractogram' ) - - extension = splitext(filename_tractogram)[1] - if extension != ".trk" and extension != ".tck": - ERROR( 'Invalid input file: only .trk and .tck are supported' ) - try : - hdr = nibabel.streamlines.load( filename_tractogram, lazy_load=True ).header - except : - ERROR( 'Tractogram file not found' ) - - if extension == ".trk": - Nx = hdr['dimensions'][0] - Ny = hdr['dimensions'][1] - Nz = hdr['dimensions'][2] - Px = hdr['voxel_sizes'][0] - Py = hdr['voxel_sizes'][1] - Pz = hdr['voxel_sizes'][2] - - data_offset = 1000 - n_count = hdr['nb_streamlines'] - n_scalars = hdr['nb_scalars_per_point'] - n_properties = hdr['nb_properties_per_streamline'] - - if extension == ".tck": - if TCK_ref_image is None: - if filename_peaks is not None: - TCK_ref_image = filename_peaks - elif filename_mask is not None: - TCK_ref_image = filename_mask - else: - ERROR( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that' ) - - print ('\t\t- geometry taken from "%s"' %TCK_ref_image) - - nii_image = nibabel.load(TCK_ref_image) - nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header() - Nx = nii_image.shape[0] - Ny = nii_image.shape[1] - Nz = nii_image.shape[2] - Px = nii_hdr['pixdim'][1] - Py = nii_hdr['pixdim'][2] - Pz = nii_hdr['pixdim'][3] - data_offset = int(hdr['_offset_data']) #set offset - n_count = int(hdr['count']) #set number of fibers - n_scalars = 0 - n_properties = 0 - - print( '\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) ) - print( '\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) ) - print( '\t\t- %d fibers' % n_count ) - if Nx >= 2**16 or Nz >= 2**16 or Nz >= 2**16 : - ERROR( 'The max dim size is 2^16 voxels' ) - - # get the affine matrix - if extension == ".tck": - scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz])) - M = nii_hdr.get_best_affine() - - # Affine matrix without scaling, i.e. 
diagonal is 1 - M[:3, :3] = np.dot(scaleMat, M[:3, :3]) - M = M.astype('= '2.0.0' else niiMASK.get_header() - print( '\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) ) - print( '\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) ) - if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or - abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) : - WARNING( 'Dataset does not have the same geometry as the tractogram' ) - niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) ) - ptrMASK = &niiMASK_img[0,0,0] - else : - print( '\t- No mask specified to filter IC compartments' ) - ptrMASK = NULL - - # peaks file for EC contributions - cdef float* ptrPEAKS - cdef float [:, :, :, ::1] niiPEAKS_img - cdef int Np - cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) ) - cdef float* ptrTDI = &niiTDI_img[0,0,0] - cdef double [:, ::1] affine - cdef double* ptrAFFINE - if filename_peaks is not None : - print( '\t- EC orientations' ) - niiPEAKS = nibabel.load( filename_peaks ) - niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header() - print( '\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) ) - print( '\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) ) - print( '\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR ) - print( '\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) ) - print( '\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) ) - if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or - abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) : - WARNING( "Dataset does not have the same geometry as the tractogram" ) - if niiPEAKS.shape[3] % 3 : - ERROR( 'PEAKS dataset must have 3*k volumes' ) - if vf_THR < 0 or vf_THR > 1 : - ERROR( '"vf_THR" must be between 0 and 1' ) - niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) ) - ptrPEAKS = &niiPEAKS_img[0,0,0,0] - Np = niiPEAKS.shape[3]/3 - - # affine matrix to rotate gradien directions (if required) - if peaks_use_affine : - affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T ) - else : - affine = np.ascontiguousarray( np.eye(3) ) - ptrAFFINE = &affine[0,0] - else : - print( '\t- No dataset specified for EC compartments' ) - Np = 0 - ptrPEAKS = NULL - ptrAFFINE = NULL - - # write dictionary information info file - dictionary_info = {} - dictionary_info['filename_tractogram'] = filename_tractogram - dictionary_info['TCK_ref_image'] = TCK_ref_image - dictionary_info['path_out'] = path_out - dictionary_info['filename_peaks'] = filename_peaks - dictionary_info['filename_mask'] = filename_mask - dictionary_info['do_intersect'] = do_intersect - dictionary_info['fiber_shift'] = fiber_shift - dictionary_info['min_seg_len'] = min_seg_len - dictionary_info['min_fiber_len'] = min_fiber_len - dictionary_info['max_fiber_len'] = max_fiber_len - dictionary_info['points_to_skip'] = points_to_skip - dictionary_info['vf_THR'] = vf_THR - dictionary_info['peaks_use_affine'] = peaks_use_affine - dictionary_info['flip_peaks'] = flip_peaks - dictionary_info['blur_radii'] = blur_radii - 
dictionary_info['blur_samples'] = blur_samples - dictionary_info['blur_sigma'] = blur_sigma - dictionary_info['ndirs'] = ndirs - with open( join(path_out,'dictionary_info.pickle'), 'wb+' ) as dictionary_info_file: - pickle.dump(dictionary_info, dictionary_info_file, protocol=2) - - # calling actual C code - ret = trk2dictionary( filename_tractogram, data_offset, - Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, - fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, min_fiber_len, max_fiber_len, - ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, - ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, - nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); - if ret == 0 : - WARNING( 'DICTIONARY not generated' ) - return None - - # save TDI and MASK maps - if filename_mask is not None : - affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() - elif filename_peaks is not None : - affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() - else : - affine = np.diag( [Px, Py, Pz, 1] ) - - niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) - nii_hdr = niiTDI.header if nibabel.__version__ >= '2.0.0' else niiTDI.get_header() - nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version - nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) - - if filename_mask is not None : - niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) - else : - niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) - nii_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() - nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version - nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) - +#!python +# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False +from __future__ import print_function +import cython +import numpy as np +cimport numpy as np +import nibabel +from os.path import join, exists, splitext, dirname, isdir +from os import makedirs, remove +import time +import amico +import pickle +from amico.util import LOG, NOTE, WARNING, ERROR +from pkg_resources import get_distribution + + +# Interface to actual C code +cdef extern from "trk2dictionary_c.cpp": + int trk2dictionary( + char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, + int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, + float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, + float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, + int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* prtHashTable + ) nogil + + +cpdef run( filename_tractogram=None, path_out=None, filename_peaks=None, filename_mask=None, do_intersect=True, + fiber_shift=0, min_seg_len=1e-3, min_fiber_len=0.0, max_fiber_len=250.0, points_to_skip=0, + vf_THR=0.1, peaks_use_affine=False, flip_peaks=[False,False,False], + blur_radii=[], blur_samples=[], blur_sigma=0.0, + filename_trk=None, gen_trk=None, TCK_ref_image=None, ndirs=32761 + ): + """Perform the conversion of a 
tractogram to the sparse data-structure internally + used by COMMIT to perform the matrix-vector multiplications with the operator A + during the inversion of the linear system. + + Parameters + ---------- + filename_tractogram : string + Path to the tractogram (.trk or .tck) containing the streamlines to load. + + TCK_ref_image : string + When loading a .tck tractogram, path to the NIFTI file containing the information about + the geometry to be used for the tractogram to load. If not specified, it will try to use + the information from filename_peaks or filename_mask. + + path_out : string + Path to the folder for storing the sparse data structure. If not specified (default), + a folder named "COMMIT" will be created in the same folder as the tractogram. + + filename_mask : string + Path to a binary mask for restricting the analysis to specific areas. + Segments outside this mask are discarded. If not specified (default), + the mask is created from all voxels intersected by the tracts. + + do_intersect : boolean + If True then fiber segments that intersect voxel boundaries are split (default). + If False then the centroid of the segment is used as its voxel position. + + fiber_shift : float or list of three float + If necessary, apply a translation to fiber coordinates (default : 0) to account + for differences between the reference system of the tracking algorithm and COMMIT. + The value is specified in voxel units, e.g. 0.5 translates by half a voxel. + + min_seg_len : float + Discard segments <= this length in mm (default : 1e-3). + + min_fiber_len : float + Discard streamlines <= this length in mm (default : 0.0). + + max_fiber_len : float + Discard streamlines >= this length in mm (default : 250.0). + + points_to_skip : integer + If necessary, discard the first points at the beginning/end of a fiber (default : 0). + + filename_peaks : string + Path to the NIFTI file containing the peaks to use as extra-cellular contributions. + The data matrix should be 4D with last dimension 3*N, where N is the number + of peaks in each voxel (default : no extra-cellular contributions). + + peaks_use_affine : boolean + Whether to rotate the peaks according to the affine matrix (default : False). + + vf_THR : float + Discard peaks smaller than vf_THR * max peak (default : 0.1). + + flip_peaks : list of three boolean + If necessary, flips peak orientations along each axis (default : no flipping). + + blur_radii : list of float + Translate each segment to the given radii to assign a broader fiber contribution (default : []). + + blur_samples : list of integer + Segments are duplicated along a circle at a given radius; this parameter controls the + number of samples to take over a given circle (default : []). + + blur_sigma : float + The contributions of the segments at different radii are damped as a Gaussian (default : 0.0). + + ndirs : int + Number of orientations on the sphere used to discretize the orientation of + each segment in a streamline (default : 32761). + + filename_trk : string + DEPRECATED. Use filename_tractogram instead. + + gen_trk : string + DEPRECATED. No tractogram will be saved any more, but the returned coefficients will account + for the streamlines that were pre-filtered in this function. 
+ """ + + # check the value of ndirs + if not amico.lut.is_valid(ndirs): + ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + # check conflicts of fiber_shift + if np.isscalar(fiber_shift) : + fiber_shiftX = fiber_shift + fiber_shiftY = fiber_shift + fiber_shiftZ = fiber_shift + elif len(fiber_shift) == 3 : + fiber_shiftX = fiber_shift[0] + fiber_shiftY = fiber_shift[1] + fiber_shiftZ = fiber_shift[2] + else : + ERROR( '"fiber_shift" must be a scalar or a vector with 3 elements' ) + + # check for invalid parameters in the blur + if type(blur_radii)==list: + blur_radii = np.ndarray(blur_radii, np.double) + if type(blur_samples)==list: + blur_samples = np.ndarray(blur_samples, np.int32) + + if blur_sigma > 0 : + if blur_radii.size != blur_samples.size : + ERROR( 'The number of blur radii and blur samples must match' ) + + if np.count_nonzero( blur_radii<=0 ): + ERROR( 'A blur radius was <= 0; only positive radii can be used' ) + + if np.count_nonzero( blur_samples<1 ): + ERROR( 'Please specify at least 1 sample per blur radius' ) + + tic = time.time() + LOG( '\n-> Creating the dictionary from tractogram:' ) + + LOG( '\n * Configuration:' ) + print( '\t- Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) + print( '\t- Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) + print( '\t- Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) + print( '\t- Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) + print( '\t- Points to skip = %d' % points_to_skip ) + if min_seg_len >= 1e-3: + print( '\t- Min segment len = %.3f mm' % min_seg_len ) + else: + print( '\t- Min segment len = %.2e mm' % min_seg_len ) + print( '\t- Min fiber len = %.2f mm' % min_fiber_len ) + print( '\t- Max fiber len = %.2f mm' % max_fiber_len ) + + # check blur params + cdef : + double [:] blurRadii + int [:] blurSamples + double [:] blurWeights + double* ptrBlurRadii + int* ptrBlurSamples + double* ptrBlurWeights + int nBlurRadii + float [:] ArrayInvM + float* ptrArrayInvM + + # convert to numpy arrays (and add fake radius for original segment) + if blur_sigma == 0: + nBlurRadii = 1 + blurRadii = np.array( [0.0], np.double ) + blurSamples = np.array( [1], np.int32 ) + blurWeights = np.array( [1], np.double ) + else: + nBlurRadii = len(blur_radii)+1 + blurRadii = np.insert( blur_radii, 0, 0.0 ).astype(np.double) + blurSamples = np.insert( blur_samples, 0, 1 ).astype(np.int32) + + # compute weights for gaussian damping + blurWeights = np.empty_like( blurRadii ) + for i in xrange(nBlurRadii): + blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) + + if nBlurRadii == 1 : + print( '\t- Do not blur fibers' ) + else : + print( '\t- Blur fibers:' ) + print( '\t\t- sigma = %.3f' % blur_sigma ) + print( '\t\t- radii = [ ', end="" ) + for i in xrange( 1, blurRadii.size ) : + print( '%.3f ' % blurRadii[i], end="" ) + print( ']' ) + print( '\t\t- weights = [ ', end="" ) + for i in xrange( 1, blurWeights.size ) : + print( '%.3f ' % blurWeights[i], end="" ) + print( ']' ) + print( '\t\t- samples = [ ', end="" ) + for i in xrange( 1, blurSamples.size ) : + print( '%5d ' % blurSamples[i], end="" ) + print( ']' ) + + ptrBlurRadii = &blurRadii[0] + ptrBlurSamples = &blurSamples[0] + ptrBlurWeights = &blurWeights[0] + + if min_seg_len < 0 : + ERROR( '"min_seg_len" must be >= 0' ) + if min_fiber_len < 0 : + ERROR( 
'"min_fiber_len" must be >= 0' ) + if max_fiber_len < min_fiber_len : + ERROR( '"max_fiber_len" must be >= "min_fiber_len"' ) + + if filename_trk is None and filename_tractogram is None: + ERROR( '"filename_tractogram" not defined' ) + + if filename_trk is not None and filename_tractogram is not None: + WARNING('"filename_trk" will not be considered, "filename_tractogram" will be used') + + if filename_trk is not None and filename_tractogram is None: + filename_tractogram = filename_trk + WARNING('"filename_trk" parameter is deprecated, use "filename_tractogram" instead') + + if path_out is None: + path_out = dirname(filename_tractogram) + if path_out == '': + path_out = '.' + if not isdir(path_out): + ERROR( '"path_out" cannot be inferred from "filename_tractogram"' ) + path_out = join(path_out,'COMMIT') + + if gen_trk is not None: + WARNING('"gen_trk" parameter is deprecated') + + # create output path + print( '\t- Output written to "%s"' % path_out ) + if not exists( path_out ): + makedirs( path_out ) + + # Load data from files + LOG( '\n * Loading data:' ) + cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs) + cdef short* ptrHashTable = &htable[0] + + # Streamlines from tractogram + print( '\t- Tractogram' ) + + extension = splitext(filename_tractogram)[1] + if extension != ".trk" and extension != ".tck": + ERROR( 'Invalid input file: only .trk and .tck are supported' ) + try : + hdr = nibabel.streamlines.load( filename_tractogram, lazy_load=True ).header + except : + ERROR( 'Tractogram file not found' ) + + if extension == ".trk": + Nx = hdr['dimensions'][0] + Ny = hdr['dimensions'][1] + Nz = hdr['dimensions'][2] + Px = hdr['voxel_sizes'][0] + Py = hdr['voxel_sizes'][1] + Pz = hdr['voxel_sizes'][2] + + data_offset = 1000 + n_count = hdr['nb_streamlines'] + n_scalars = hdr['nb_scalars_per_point'] + n_properties = hdr['nb_properties_per_streamline'] + + if extension == ".tck": + if TCK_ref_image is None: + if filename_peaks is not None: + TCK_ref_image = filename_peaks + elif filename_mask is not None: + TCK_ref_image = filename_mask + else: + ERROR( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that' ) + + print ('\t\t- geometry taken from "%s"' %TCK_ref_image) + + nii_image = nibabel.load(TCK_ref_image) + nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header() + Nx = nii_image.shape[0] + Ny = nii_image.shape[1] + Nz = nii_image.shape[2] + Px = nii_hdr['pixdim'][1] + Py = nii_hdr['pixdim'][2] + Pz = nii_hdr['pixdim'][3] + data_offset = int(hdr['_offset_data']) #set offset + n_count = int(hdr['count']) #set number of fibers + n_scalars = 0 + n_properties = 0 + + print( '\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) ) + print( '\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) ) + print( '\t\t- %d fibers' % n_count ) + if Nx >= 2**16 or Nz >= 2**16 or Nz >= 2**16 : + ERROR( 'The max dim size is 2^16 voxels' ) + + # get the affine matrix + if extension == ".tck": + scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz])) + M = nii_hdr.get_best_affine() + + # Affine matrix without scaling, i.e. 
diagonal is 1 + M[:3, :3] = np.dot(scaleMat, M[:3, :3]) + M = M.astype('= '2.0.0' else niiMASK.get_header() + print( '\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) ) + print( '\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) ) + if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or + abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) : + WARNING( 'Dataset does not have the same geometry as the tractogram' ) + niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) ) + ptrMASK = &niiMASK_img[0,0,0] + else : + print( '\t- No mask specified to filter IC compartments' ) + ptrMASK = NULL + + # peaks file for EC contributions + cdef float* ptrPEAKS + cdef float [:, :, :, ::1] niiPEAKS_img + cdef int Np + cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) ) + cdef float* ptrTDI = &niiTDI_img[0,0,0] + cdef double [:, ::1] affine + cdef double* ptrAFFINE + if filename_peaks is not None : + print( '\t- EC orientations' ) + niiPEAKS = nibabel.load( filename_peaks ) + niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header() + print( '\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) ) + print( '\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) ) + print( '\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR ) + print( '\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) ) + print( '\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) ) + if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or + abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) : + WARNING( "Dataset does not have the same geometry as the tractogram" ) + if niiPEAKS.shape[3] % 3 : + ERROR( 'PEAKS dataset must have 3*k volumes' ) + if vf_THR < 0 or vf_THR > 1 : + ERROR( '"vf_THR" must be between 0 and 1' ) + niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) ) + ptrPEAKS = &niiPEAKS_img[0,0,0,0] + Np = niiPEAKS.shape[3]/3 + + # affine matrix to rotate gradien directions (if required) + if peaks_use_affine : + affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T ) + else : + affine = np.ascontiguousarray( np.eye(3) ) + ptrAFFINE = &affine[0,0] + else : + print( '\t- No dataset specified for EC compartments' ) + Np = 0 + ptrPEAKS = NULL + ptrAFFINE = NULL + + # write dictionary information info file + dictionary_info = {} + dictionary_info['filename_tractogram'] = filename_tractogram + dictionary_info['TCK_ref_image'] = TCK_ref_image + dictionary_info['path_out'] = path_out + dictionary_info['filename_peaks'] = filename_peaks + dictionary_info['filename_mask'] = filename_mask + dictionary_info['do_intersect'] = do_intersect + dictionary_info['fiber_shift'] = fiber_shift + dictionary_info['min_seg_len'] = min_seg_len + dictionary_info['min_fiber_len'] = min_fiber_len + dictionary_info['max_fiber_len'] = max_fiber_len + dictionary_info['points_to_skip'] = points_to_skip + dictionary_info['vf_THR'] = vf_THR + dictionary_info['peaks_use_affine'] = peaks_use_affine + dictionary_info['flip_peaks'] = flip_peaks + dictionary_info['blur_radii'] = blur_radii + 
dictionary_info['blur_samples'] = blur_samples + dictionary_info['blur_sigma'] = blur_sigma + dictionary_info['ndirs'] = ndirs + with open( join(path_out,'dictionary_info.pickle'), 'wb+' ) as dictionary_info_file: + pickle.dump(dictionary_info, dictionary_info_file, protocol=2) + + # calling actual C code + ret = trk2dictionary( filename_tractogram, data_offset, + Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, + fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, min_fiber_len, max_fiber_len, + ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, + ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, + nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); + if ret == 0 : + WARNING( 'DICTIONARY not generated' ) + return None + + # save TDI and MASK maps + if filename_mask is not None : + affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() + elif filename_peaks is not None : + affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() + else : + affine = np.diag( [Px, Py, Pz, 1] ) + + niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) + nii_hdr = niiTDI.header if nibabel.__version__ >= '2.0.0' else niiTDI.get_header() + nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version + nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) + + if filename_mask is not None : + niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) + else : + niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) + nii_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() + nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version + nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) + LOG( '\n [ %.1f seconds ]' % ( time.time() - tic ) ) \ No newline at end of file diff --git a/commit/trk2dictionary/trk2dictionary_c.cpp b/commit/trk2dictionary/trk2dictionary_c.cpp index 7a295102..c8991b1c 100644 --- a/commit/trk2dictionary/trk2dictionary_c.cpp +++ b/commit/trk2dictionary/trk2dictionary_c.cpp @@ -1,598 +1,598 @@ -#include -#include -#include -#include -#include -#include "Vector.h" -#include "ProgressBar.h" -#include -#include - -#define MAX_FIB_LEN 10000 - - -// CLASS to store the segments of one fiber -class segKey -{ - public: - unsigned short x, y, z; - unsigned short o; - segKey(){} - - void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) - { - x = _x; - y = _y; - z = _z; - o = _o; - } - - bool const operator <(const segKey& seg) const - { - return o < seg.o || (o==seg.o && z FiberSegments; -float FiberLen; - -Vector dim; -Vector pixdim; -float* ptrMASK; -unsigned int nPointsToSkip; -float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; -bool doIntersect; -float minSegLen, minFiberLen, maxFiberLen; - -std::vector radii; // radii for the extrusion -std::vector weights; // damping weight -std::vector sectors; // number of duplicates across the extrusion circle -double radiusSigma; // modulates the impact of each segment as function of radius - - -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); -void segmentForwardModel( const Vector& P1, const 
Vector& P2, int k, double w, short* ptrHashTable ); -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); -unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); - - -// ========================= -// Function called by CYTHON -// ========================= -int trk2dictionary( - char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, - float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* ptrHashTable -) -{ - /*=========================*/ - /* IC compartments */ - /*=========================*/ - float fiber[3][MAX_FIB_LEN]; - float fiberNorm; - unsigned int N, totICSegments = 0, totFibers = 0, v; - unsigned short o; - unsigned char kept; - Vector P; - std::string filename; - std::string OUTPUT_path(path_out); - std::map::iterator it; - - std::map FiberNorm; - std::map::iterator itNorm; - segInVoxKey inVoxKey; - - printf( "\n \033[0;32m* Exporting IC compartments:\033[0m\n" ); - - int isTRK; // var to check - - char *ext = strrchr(str_filename, '.'); //get the extension of input file - - if (strcmp(ext,".trk")==0) //for .trk file - isTRK = 1; - else if (strcmp(ext,".tck")==0)// for .tck file - isTRK = 0; - else - return 0; - - FILE* fpTractogram = fopen(str_filename,"rb"); //open - if (fpTractogram == NULL) return 0; - fseek(fpTractogram,data_offset,SEEK_SET); //skip header - - // set global variables - dim.Set( Nx, Ny, Nz ); - pixdim.Set( Px, Py, Pz ); - nPointsToSkip = points_to_skip; - fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates - fiberShiftYmm = fiber_shiftY * pixdim.y; - fiberShiftZmm = fiber_shiftZ * pixdim.z; - ptrMASK = _ptrMASK; - doIntersect = c > 0; - minSegLen = min_seg_len; - minFiberLen = min_fiber_len; - maxFiberLen = max_fiber_len; - - radii.clear(); - sectors.clear(); - weights.clear(); - for(int i=0; i 0 ) - { - if ( FiberLen > minFiberLen && FiberLen < maxFiberLen ) - { - // add segments to files - for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) - { - // NB: plese note inverted ordering for 'v' - v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); - o = it->first.o; - fwrite( &totFibers, 4, 1, pDict_IC_f ); - fwrite( &v, 4, 1, pDict_IC_v ); - fwrite( &o, 2, 1, pDict_IC_o ); - fwrite( &(it->second), 4, 1, pDict_IC_len ); - ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; - inVoxKey.set( it->first.x, it->first.y, it->first.z ); - FiberNorm[inVoxKey] += it->second; - } - for (fiberNorm=0, itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) - fiberNorm += pow(itNorm->second,2); - fiberNorm = sqrt(fiberNorm); - FiberNorm.clear(); - fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // actual length considered in optimization - fwrite( &FiberLen, 1, 4, pDict_TRK_len ); - totICSegments += FiberSegments.size(); - totFibers++; - kept = 1; - } - } - fwrite( &kept, 1, 1, pDict_TRK_kept ); - } - PROGRESS.close(); - - fclose( fpTractogram ); - fclose( pDict_TRK_norm ); - fclose( pDict_IC_f ); - fclose( pDict_IC_v ); - fclose( pDict_IC_o ); - fclose( 
pDict_IC_len ); - fclose( pDict_TRK_len ); - fclose( pDict_TRK_kept ); - - printf(" [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); - - - /*=========================*/ - /* EC compartments */ - /*=========================*/ - unsigned int totECSegments = 0, totECVoxels = 0; - - printf( "\n \033[0;32m* Exporting EC compartments:\033[0m\n" ); - - filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); - filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); - - if ( ptrPEAKS != NULL ) - { - Vector dir; - double longitude, colatitude; - segKey ec_seg; - int ix, iy, iz, id, atLeastOne; - float peakMax; - float norms[ Np ]; - float *ptr; - int ox, oy; - - PROGRESS.reset( dim.z ); - for(iz=0; iz peakMax ) - peakMax = norms[id]; - } - - if ( peakMax > 0 ) - { - ec_seg.x = ix; - ec_seg.y = iy; - ec_seg.z = iz; - atLeastOne = 0; - for(id=0; id0 ) - totECVoxels++; - } - } - } - PROGRESS.close(); - } - - fclose( pDict_EC_v ); - fclose( pDict_EC_o ); - - printf(" [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); - - return 1; -} - - -/********************************************************************************************************************/ -/* fiberForwardModel */ -/********************************************************************************************************************/ -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) -{ - static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; - static Vector vox, vmin, vmax, dir; - static double len, t, alpha, w, R; - static int i, j, k; - - FiberLen = 0.0; - FiberSegments.clear(); - if ( pts <= 2*nPointsToSkip ) - return; - - for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, int k, double w, short* ptrHashTable ) -{ - static Vector vox; - static Vector dir, dirTrue; - static double longitude, colatitude, len; - static segKey key; - static int ox, oy; - - // direction of the segment - dir.y = P2.y-P1.y; - if ( dir.y >= 0 ) - { - dir.x = P2.x-P1.x; - dir.z = P2.z-P1.z; - } - else - { - dir.x = P1.x-P2.x; - dir.y = P1.y-P2.y; - dir.z = P1.z-P2.z; - } - - // length of the segment - len = dir.norm(); - if ( len <= minSegLen ) - return; - dir.Normalize(); - - // voxel of the segment is the centroid - vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); - vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); - vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); - if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) - return; - if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) - return; - - // add the segment to the data structure - longitude = atan2(dir.y, dir.x); - colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); - ox = (int)round(colatitude/M_PI*180.0); // theta // i1 - oy = (int)round(longitude/M_PI*180.0); // phi // i2 - key.set( vox.x, vox.y, vox.z, (unsigned short) ptrHashTable[ox*181 + oy] ); - FiberSegments[key] += w * len; - if ( k==0 ) // fiber length computed only from origianl segments - FiberLen += len; -} - - -/********************************************************************************************************************/ -/* rayBoxIntersection */ -/********************************************************************************************************************/ -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& 
vmax, double & t) -{ - static double tmin, tmax, tymin, tymax, tzmin, tzmax; - static Vector invrd; - - // inverse direction to catch float problems - invrd.x = 1.0 / direction.x; - invrd.y = 1.0 / direction.y; - invrd.z = 1.0 / direction.z; - - if (invrd.x >= 0) - { - tmin = (vmin.x - origin.x) * invrd.x; - tmax = (vmax.x - origin.x) * invrd.x; - } - else - { - tmin = (vmax.x - origin.x) * invrd.x; - tmax = (vmin.x - origin.x) * invrd.x; - } - - if (invrd.y >= 0) - { - tymin = (vmin.y - origin.y) * invrd.y; - tymax = (vmax.y - origin.y) * invrd.y; - } - else - { - tymin = (vmax.y - origin.y) * invrd.y; - tymax = (vmin.y - origin.y) * invrd.y; - } - - if ( (tmin > tymax) || (tymin > tmax) ) return false; - if ( tymin > tmin) tmin = tymin; - if ( tymax < tmax) tmax = tymax; - - if (invrd.z >= 0) - { - tzmin = (vmin.z - origin.z) * invrd.z; - tzmax = (vmax.z - origin.z) * invrd.z; - }else - { - tzmin = (vmax.z - origin.z) * invrd.z; - tzmax = (vmin.z - origin.z) * invrd.z; - } - - if ( (tmin > tzmax) || (tzmin > tmax) ) return false; - if ( tzmin > tmin) tmin = tzmin; - if ( tzmax < tmax) tmax = tzmax; - - // check if values are valid - t = tmin; - if (t <= 0) t = tmax; - - return true; -} - - -// Read a fiber from file .trk -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) -{ - int N; - fread((char*)&N, 1, 4, fp); - - if ( N >= MAX_FIB_LEN || N <= 0 ) - return 0; - - float tmp[3]; - for(int i=0; i +#include +#include +#include +#include +#include "Vector.h" +#include "ProgressBar.h" +#include +#include + +#define MAX_FIB_LEN 10000 + + +// CLASS to store the segments of one fiber +class segKey +{ + public: + unsigned short x, y, z; + unsigned short o; + segKey(){} + + void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) + { + x = _x; + y = _y; + z = _z; + o = _o; + } + + bool const operator <(const segKey& seg) const + { + return o < seg.o || (o==seg.o && z FiberSegments; +float FiberLen; + +Vector dim; +Vector pixdim; +float* ptrMASK; +unsigned int nPointsToSkip; +float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; +bool doIntersect; +float minSegLen, minFiberLen, maxFiberLen; + +std::vector radii; // radii for the extrusion +std::vector weights; // damping weight +std::vector sectors; // number of duplicates across the extrusion circle +double radiusSigma; // modulates the impact of each segment as function of radius + + +bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); +void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); +void segmentForwardModel( const Vector& P1, const Vector& P2, int k, double w, short* ptrHashTable ); +unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); +unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); + + +// ========================= +// Function called by CYTHON +// ========================= +int trk2dictionary( + char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, + float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, + float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, + float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, + int nBlurRadii, double blurSigma, 
double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* ptrHashTable +) +{ + /*=========================*/ + /* IC compartments */ + /*=========================*/ + float fiber[3][MAX_FIB_LEN]; + float fiberNorm; + unsigned int N, totICSegments = 0, totFibers = 0, v; + unsigned short o; + unsigned char kept; + Vector P; + std::string filename; + std::string OUTPUT_path(path_out); + std::map::iterator it; + + std::map FiberNorm; + std::map::iterator itNorm; + segInVoxKey inVoxKey; + + printf( "\n \033[0;32m* Exporting IC compartments:\033[0m\n" ); + + int isTRK; // var to check + + char *ext = strrchr(str_filename, '.'); //get the extension of input file + + if (strcmp(ext,".trk")==0) //for .trk file + isTRK = 1; + else if (strcmp(ext,".tck")==0)// for .tck file + isTRK = 0; + else + return 0; + + FILE* fpTractogram = fopen(str_filename,"rb"); //open + if (fpTractogram == NULL) return 0; + fseek(fpTractogram,data_offset,SEEK_SET); //skip header + + // set global variables + dim.Set( Nx, Ny, Nz ); + pixdim.Set( Px, Py, Pz ); + nPointsToSkip = points_to_skip; + fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates + fiberShiftYmm = fiber_shiftY * pixdim.y; + fiberShiftZmm = fiber_shiftZ * pixdim.z; + ptrMASK = _ptrMASK; + doIntersect = c > 0; + minSegLen = min_seg_len; + minFiberLen = min_fiber_len; + maxFiberLen = max_fiber_len; + + radii.clear(); + sectors.clear(); + weights.clear(); + for(int i=0; i 0 ) + { + if ( FiberLen > minFiberLen && FiberLen < maxFiberLen ) + { + // add segments to files + for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) + { + // NB: plese note inverted ordering for 'v' + v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); + o = it->first.o; + fwrite( &totFibers, 4, 1, pDict_IC_f ); + fwrite( &v, 4, 1, pDict_IC_v ); + fwrite( &o, 2, 1, pDict_IC_o ); + fwrite( &(it->second), 4, 1, pDict_IC_len ); + ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; + inVoxKey.set( it->first.x, it->first.y, it->first.z ); + FiberNorm[inVoxKey] += it->second; + } + for (fiberNorm=0, itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) + fiberNorm += pow(itNorm->second,2); + fiberNorm = sqrt(fiberNorm); + FiberNorm.clear(); + fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // actual length considered in optimization + fwrite( &FiberLen, 1, 4, pDict_TRK_len ); + totICSegments += FiberSegments.size(); + totFibers++; + kept = 1; + } + } + fwrite( &kept, 1, 1, pDict_TRK_kept ); + } + PROGRESS.close(); + + fclose( fpTractogram ); + fclose( pDict_TRK_norm ); + fclose( pDict_IC_f ); + fclose( pDict_IC_v ); + fclose( pDict_IC_o ); + fclose( pDict_IC_len ); + fclose( pDict_TRK_len ); + fclose( pDict_TRK_kept ); + + printf(" [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); + + + /*=========================*/ + /* EC compartments */ + /*=========================*/ + unsigned int totECSegments = 0, totECVoxels = 0; + + printf( "\n \033[0;32m* Exporting EC compartments:\033[0m\n" ); + + filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); + filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); + + if ( ptrPEAKS != NULL ) + { + Vector dir; + double longitude, colatitude; + segKey ec_seg; + int ix, iy, iz, id, atLeastOne; + float peakMax; + float norms[ Np ]; + float *ptr; + int ox, oy; + + PROGRESS.reset( dim.z ); + for(iz=0; iz peakMax ) 
+ peakMax = norms[id]; + } + + if ( peakMax > 0 ) + { + ec_seg.x = ix; + ec_seg.y = iy; + ec_seg.z = iz; + atLeastOne = 0; + for(id=0; id0 ) + totECVoxels++; + } + } + } + PROGRESS.close(); + } + + fclose( pDict_EC_v ); + fclose( pDict_EC_o ); + + printf(" [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); + + return 1; +} + + +/********************************************************************************************************************/ +/* fiberForwardModel */ +/********************************************************************************************************************/ +void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) +{ + static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; + static Vector vox, vmin, vmax, dir; + static double len, t, alpha, w, R; + static int i, j, k; + + FiberLen = 0.0; + FiberSegments.clear(); + if ( pts <= 2*nPointsToSkip ) + return; + + for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, int k, double w, short* ptrHashTable ) +{ + static Vector vox; + static Vector dir, dirTrue; + static double longitude, colatitude, len; + static segKey key; + static int ox, oy; + + // direction of the segment + dir.y = P2.y-P1.y; + if ( dir.y >= 0 ) + { + dir.x = P2.x-P1.x; + dir.z = P2.z-P1.z; + } + else + { + dir.x = P1.x-P2.x; + dir.y = P1.y-P2.y; + dir.z = P1.z-P2.z; + } + + // length of the segment + len = dir.norm(); + if ( len <= minSegLen ) + return; + dir.Normalize(); + + // voxel of the segment is the centroid + vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); + vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); + vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); + if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) + return; + if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) + return; + + // add the segment to the data structure + longitude = atan2(dir.y, dir.x); + colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); + ox = (int)round(colatitude/M_PI*180.0); // theta // i1 + oy = (int)round(longitude/M_PI*180.0); // phi // i2 + key.set( vox.x, vox.y, vox.z, (unsigned short) ptrHashTable[ox*181 + oy] ); + FiberSegments[key] += w * len; + if ( k==0 ) // fiber length computed only from origianl segments + FiberLen += len; +} + + +/********************************************************************************************************************/ +/* rayBoxIntersection */ +/********************************************************************************************************************/ +bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t) +{ + static double tmin, tmax, tymin, tymax, tzmin, tzmax; + static Vector invrd; + + // inverse direction to catch float problems + invrd.x = 1.0 / direction.x; + invrd.y = 1.0 / direction.y; + invrd.z = 1.0 / direction.z; + + if (invrd.x >= 0) + { + tmin = (vmin.x - origin.x) * invrd.x; + tmax = (vmax.x - origin.x) * invrd.x; + } + else + { + tmin = (vmax.x - origin.x) * invrd.x; + tmax = (vmin.x - origin.x) * invrd.x; + } + + if (invrd.y >= 0) + { + tymin = (vmin.y - origin.y) * invrd.y; + tymax = (vmax.y - origin.y) * invrd.y; + } + else + { + tymin = (vmax.y - origin.y) * invrd.y; + tymax = (vmin.y - origin.y) * invrd.y; + } + + if ( (tmin > tymax) || (tymin > tmax) ) return false; + if ( tymin > tmin) tmin = tymin; + if ( tymax < tmax) tmax = tymax; + + if (invrd.z >= 0) + { + tzmin = 
(vmin.z - origin.z) * invrd.z; + tzmax = (vmax.z - origin.z) * invrd.z; + }else + { + tzmin = (vmax.z - origin.z) * invrd.z; + tzmax = (vmin.z - origin.z) * invrd.z; + } + + if ( (tmin > tzmax) || (tzmin > tmax) ) return false; + if ( tzmin > tmin) tmin = tzmin; + if ( tzmax < tmax) tmax = tzmax; + + // check if values are valid + t = tmin; + if (t <= 0) t = tmax; + + return true; +} + + +// Read a fiber from file .trk +unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) +{ + int N; + fread((char*)&N, 1, 4, fp); + + if ( N >= MAX_FIB_LEN || N <= 0 ) + return 0; + + float tmp[3]; + for(int i=0; i - #include - #include -#else - #include - #include - #include -#endif - -#include "OPENGL_utils.h" -using namespace OPENGL_utils; - -/* global variables */ -GLfloat id[16], rot[16], rot1[16], rot2[16], rot3[16]; -Vec3Df translation; -Vec3Di start; -GLint moving; -GLfloat zoom; - -float ScreenX, ScreenY; - - -void drawString( const char *string ) -{ - static int y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; - if ( string=="" ) - y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; - else - { - glRasterPos2i(10, y); - for (const char* c=string; *c != '\0'; c++) - glutBitmapCharacter(GLUT_BITMAP_9_BY_15, *c); - y -= 18; - } -} - - -void PrintConfig() -{ - if ( !showConfig ) - return; - - glMatrixMode(GL_PROJECTION); - glPushMatrix(); - glLoadIdentity(); - glMatrixMode( GL_MODELVIEW ) ; - glPushMatrix() ; - glLoadIdentity() ; - int w = glutGet( GLUT_WINDOW_WIDTH ); - int h = glutGet( GLUT_WINDOW_HEIGHT ); - glOrtho( 0, w, 0, h, -1, 1 ); - glDisable( GL_DEPTH_TEST ); - - char s[1024]; - glColor3f(1, 1, 0); - drawString( "" ); // reset initial position - - drawString( "MAP" ); - sprintf( s, " - value(%d,%d,%d) = %.2f", VOXEL.x, VOXEL.y, VOXEL.z, MAP(VOXEL.x, VOXEL.y, VOXEL.z) ); - drawString( s ); - sprintf( s, " - range = [ %.1f ... 
%.1f ]", MAP_min_view, MAP_max_view ); - drawString( s ); - sprintf( s, " - opacity = %.1f", MAP_opacity ); - drawString( s ); - - drawString( "SIGNAL" ); - sprintf( s, " - shell = %d/%d (b=%.1f)", GLYPHS_shell+1, SCHEME_shells_b.size(), SCHEME_shells_b[GLYPHS_shell] ); - drawString( s ); - sprintf( s, " - use affine = %s", GLYPHS_use_affine?"true":"false" ); - drawString( s ); - sprintf( s, " - flip = [ %d, %d, %d ]", GLYPHS_flip[0], GLYPHS_flip[1], GLYPHS_flip[2] ); - drawString( s ); - sprintf( s, " - b0 thr = %.1f", GLYPHS_b0_thr ); - drawString( s ); - - if ( PEAKS_n>0 ) - { - drawString( "PEAKS" ); - sprintf( s, " - use affine = %s", PEAKS_use_affine?"true":"false" ); - drawString( s ); - sprintf( s, " - flip = [ %d, %d, %d ]", PEAKS_flip[0], PEAKS_flip[1], PEAKS_flip[2] ); - drawString( s ); - sprintf( s, " - thr = %.1f", PEAKS_thr ); - drawString( s ); - sprintf( s, " - normalize = %s", PEAKS_doNormalize?"true":"false" ); - drawString( s ); - } - - if ( TRK_nTractsPlotted>0 ) - { - drawString( "FIBERS" ); - sprintf( s, " - shift = [ %.1f %.1f %.1f ] (voxels)", TRK_offset.x, TRK_offset.y, TRK_offset.z ); - drawString( s ); - sprintf( s, " - slab thickness = %.1f (voxels)", TRK_crop ); - drawString( s ); - } - - glEnable (GL_DEPTH_TEST); - glMatrixMode(GL_PROJECTION); - glPopMatrix(); - glMatrixMode(GL_MODELVIEW); - glPopMatrix(); -} - - -// KEYBOARD callback -// ----------------- -void GLUT__keyboard( unsigned char key, GLint x=0, GLint y=0 ) -{ - bool doRedraw = true; - - switch( key ) - { - case 'l': showConfig = 1 - showConfig; break; - - case '1': showPlane[0] = 1 - showPlane[0]; break; - case '2': showPlane[1] = 1 - showPlane[1]; break; - case '3': showPlane[2] = 1 - showPlane[2]; break; - case '4': - showPlane[0] = 1; - showPlane[1] = 0; - showPlane[2] = 0; - translation.x = translation.y = 0; - OPENGL_utils::identity(rot1); - OPENGL_utils::rotateX(rot1, 90.0, rot2); - OPENGL_utils::rotateZ(rot2, 90.0, rot); - break; - case '5': - showPlane[0] = 0; - showPlane[1] = 1; - showPlane[2] = 0; - translation.x = translation.y = 0; - OPENGL_utils::identity(rot1); - OPENGL_utils::rotateX(rot1, 90.0, rot); - break; - case '6': - showPlane[0] = 0; - showPlane[1] = 0; - showPlane[2] = 1; - translation.x = translation.y = 0; - OPENGL_utils::identity( rot ); - break; - - case '0': showAxes = 1 - showAxes; break; - case '-': zoom += 10.0; break; - case '+': zoom -= 10.0; break; - case 'm': MAP_max_view = fmaxf(0.0,MAP_max_view-MAP_max*0.05); break; - case 'M': MAP_max_view = fminf(MAP_max,MAP_max_view+MAP_max*0.05); break; - case 'o': MAP_opacity = fmaxf(0.0,MAP_opacity-0.1); break; - case 'O': MAP_opacity = fminf(1.0,MAP_opacity+0.1); break; - case 'w': LINE_width = fmaxf( 1,LINE_width-1); break; - case 'W': LINE_width = fminf(10,LINE_width+1); break; - case 'r': - showPlane[0] = showPlane[1] = showPlane[2] = 1; - translation.x = translation.y = 0; - zoom = 0; - OPENGL_utils::identity( rot ); - break; - - case 's': GLYPHS_show = 1 - GLYPHS_show; break; - case 'S': GLYPHS_shell = (GLYPHS_shell+1) % SCHEME_shells_idx.size(); break; - case 'a': GLYPHS_use_affine = 1 - GLYPHS_use_affine; break; - case 'x': GLYPHS_flip[0] = 1 - GLYPHS_flip[0]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].x *= -1; break; - case 'y': GLYPHS_flip[1] = 1 - GLYPHS_flip[1]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].y *= -1; break; - case 'z': GLYPHS_flip[2] = 1 - GLYPHS_flip[2]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].z *= -1; break; - case 'b': GLYPHS_b0_thr = 
fmaxf(0.0,GLYPHS_b0_thr-10.0); break; - case 'B': GLYPHS_b0_thr = fminf(MAP_max,GLYPHS_b0_thr+10.0); break; - - case 'p': if ( PEAKS_n>0 ) PEAKS_show = 1 - PEAKS_show; break; - case 'A': PEAKS_use_affine = 1 - PEAKS_use_affine; break; - case 'X': PEAKS_flip[0] = 1 - PEAKS_flip[0]; break; - case 'Y': PEAKS_flip[1] = 1 - PEAKS_flip[1]; break; - case 'Z': PEAKS_flip[2] = 1 - PEAKS_flip[2]; break; - case 't': PEAKS_thr = fmaxf(PEAKS_thr - 0.1, 0.0); break; - case 'T': PEAKS_thr = fminf(PEAKS_thr + 0.1, 1.0); break; - case 'n': PEAKS_doNormalize = 1 - PEAKS_doNormalize; break; - - case 'f': if ( TRK_nTractsPlotted>0 ) TRK_show = 1 - TRK_show; break; - case 'c': TRK_crop = fmaxf( 0.0,TRK_crop-0.5); break; - case 'C': TRK_crop = fminf(max(dim.x,max(dim.y,dim.z)),TRK_crop+0.5); break; - case ' ': TRK_crop_mode = 1 - TRK_crop_mode; break; - - case 'q': - case 27 : exit(0); break; - - default: doRedraw = false; - } - - if ( doRedraw ) - glutPostRedisplay(); -} - - -// MENU callback -// ------------- -void GLUT__menu( int id ) -{ - switch( id ) - { - case 0: GLUT__keyboard('q'); break; - - case 101: GLUT__keyboard('s'); break; - case 102: GLUT__keyboard('S'); break; - case 103: GLUT__keyboard('a'); break; - case 104: GLUT__keyboard('x'); break; - case 105: GLUT__keyboard('y'); break; - case 106: GLUT__keyboard('z'); break; - case 107: GLUT__keyboard('b'); break; - case 108: GLUT__keyboard('B'); break; - - case 201: GLUT__keyboard('p'); break; - case 202: GLUT__keyboard('A'); break; - case 203: GLUT__keyboard('X'); break; - case 204: GLUT__keyboard('Y'); break; - case 205: GLUT__keyboard('Z'); break; - case 206: GLUT__keyboard('t'); break; - case 207: GLUT__keyboard('T'); break; - case 208: GLUT__keyboard('n'); break; - - case 301: GLUT__keyboard('f'); break; - case 302: GLUT__keyboard('c'); break; - case 303: GLUT__keyboard('C'); break; - case 304: GLUT__keyboard(' '); break; - - case 401: GLUT__keyboard('1'); break; - case 402: GLUT__keyboard('2'); break; - case 403: GLUT__keyboard('3'); break; - case 404: GLUT__keyboard('4'); break; - case 405: GLUT__keyboard('5'); break; - case 406: GLUT__keyboard('6'); break; - case 407: GLUT__keyboard('0'); break; - case 408: GLUT__keyboard('-'); break; - case 409: GLUT__keyboard('+'); break; - case 410: GLUT__keyboard('m'); break; - case 411: GLUT__keyboard('M'); break; - case 412: GLUT__keyboard('o'); break; - case 413: GLUT__keyboard('O'); break; - case 414: GLUT__keyboard('w'); break; - case 415: GLUT__keyboard('W'); break; - case 416: GLUT__keyboard('r'); break; - case 417: GLUT__keyboard('l'); break; - } -} - - -// Create the dropdown MENU -// ------------------------ -void GLUT__createMenu() -{ - int submenu_SIGNAL_id, submenu_PEAKS_id, submenu_FIBERS_id, submenu_VIEW_id; - - submenu_SIGNAL_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[s] Show/hide", 101); - glutAddMenuEntry("[S] Change shell", 102); - glutAddMenuEntry("[a] Use affine", 103); - glutAddMenuEntry("[x] Flip X axis", 104); - glutAddMenuEntry("[y] Flip Y axis", 105); - glutAddMenuEntry("[z] Flip Z axis", 106); - glutAddMenuEntry("[b] Decrease b0 thr", 107); - glutAddMenuEntry("[B] Increase b0 thr", 108); - - if ( PEAKS_n>0 ) - { - submenu_PEAKS_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[p] Show/hide", 201); - glutAddMenuEntry("[A] Use affine", 202); - glutAddMenuEntry("[X] Flip X axis", 203); - glutAddMenuEntry("[Y] Flip Y axis", 204); - glutAddMenuEntry("[Z] Flip Z axis", 205); - glutAddMenuEntry("[t] Decrease threshold",206); - glutAddMenuEntry("[T] Increase 
threshold",207); - glutAddMenuEntry("[n] Normalize length", 208); - } - - if ( TRK_nTractsPlotted>0 ) - { - submenu_FIBERS_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[f] Show/hide", 301); - glutAddMenuEntry("[c] Decrease crop size",302); - glutAddMenuEntry("[C] Increase crop size",303); - glutAddMenuEntry("[ ] Change crop mode", 304); - } - - submenu_VIEW_id = glutCreateMenu( GLUT__menu ); - glutAddMenuEntry("[1] Show/hide YZ plane", 401); - glutAddMenuEntry("[2] Show/hide XZ plane", 402); - glutAddMenuEntry("[3] Show/hide XY plane", 403); - glutAddMenuEntry("[4] Reset to YZ plane", 404); - glutAddMenuEntry("[5] Reset to XZ plane", 405); - glutAddMenuEntry("[6] Reset to XY plane", 406); - glutAddMenuEntry("[0] Show/hide axes", 407); - glutAddMenuEntry("[-] Decrease zoom", 408); - glutAddMenuEntry("[+] Increase zoom", 409); - glutAddMenuEntry("[m] Decrease max value", 410); - glutAddMenuEntry("[M] Increase max value", 411); - glutAddMenuEntry("[o] Decrease opacity", 412); - glutAddMenuEntry("[O] Increase opacity", 413); - glutAddMenuEntry("[t] Decrease line width",414); - glutAddMenuEntry("[T] Increase line width",415); - glutAddMenuEntry("[r] Reset view", 416); - glutAddMenuEntry("[l] Show/hide log", 417); - - int menu_id = glutCreateMenu( GLUT__menu ); - glutAddSubMenu("Signal", submenu_SIGNAL_id); - if ( PEAKS_n>0 ) - glutAddSubMenu("Peaks", submenu_PEAKS_id); - if ( TRK_nTractsPlotted>0 ) - glutAddSubMenu("Fibers", submenu_FIBERS_id); - glutAddSubMenu("View options", submenu_VIEW_id); - glutAddMenuEntry("Quit", 0); - glutAttachMenu(GLUT_RIGHT_BUTTON); -} - - -// RESHAPE callback -// ---------------- -void GLUT__reshape( GLint w, GLint h ) -{ - ScreenX = w; - ScreenY = h; - - glViewport( 0, 0, w, h ); - - glMatrixMode( GL_PROJECTION ); - glLoadIdentity(); - gluPerspective( 45.0f, ScreenX/ScreenY, 1.0f, 5000.0f ); - - glMatrixMode( GL_MODELVIEW ); - glLoadIdentity(); - gluLookAt( - 0.0, 0.0, 2.0 * max(pixdim.x*dim.x,pixdim.y*dim.y) * ScreenY/ScreenX, // eye point - 0.0, 0.0, 0.0, // reference point - 0.0, 1.0, 0.0 // up vector - ); -} - - -// SPECIALKEY callback -// ------------------- -void GLUT__specialkey( GLint key, GLint x, GLint y ) -{ - bool doRedraw = true; - GLint modif = glutGetModifiers(); - GLint ALT = modif & GLUT_ACTIVE_ALT; - GLint CTRL = modif & GLUT_ACTIVE_CTRL; - - switch( key ) - { - case GLUT_KEY_LEFT: - if ( ALT ) - TRK_offset.x -= 0.5; - else if ( CTRL ) - translation.x -= 2.0; - else - VOXEL.x--; - break; - case GLUT_KEY_RIGHT: - if ( ALT ) - TRK_offset.x += 0.5; - else if ( CTRL ) - translation.x += 2.0; - else - VOXEL.x++; - break; - case GLUT_KEY_DOWN: - if ( ALT ) - TRK_offset.y -= 0.5; - else if ( CTRL ) - translation.y -= 2.0; - else - VOXEL.y--; - break; - case GLUT_KEY_UP: - if ( ALT ) - TRK_offset.y += 0.5; - else if ( CTRL ) - translation.y += 2.0; - else - VOXEL.y++; - break; - case GLUT_KEY_PAGE_DOWN: - if ( ALT ) - TRK_offset.z -= 0.5; - else - VOXEL.z--; - break; - case GLUT_KEY_PAGE_UP: - if ( ALT ) - TRK_offset.z += 0.5; - else - VOXEL.z++; - break; - - default: - doRedraw = false; - } - - // check the bounds - VOXEL.x = max( VOXEL.x, 0 ); - VOXEL.y = max( VOXEL.y, 0 ); - VOXEL.z = max( VOXEL.z, 0 ); - VOXEL.x = min( VOXEL.x, dim.x-1 ); - VOXEL.y = min( VOXEL.y, dim.y-1 ); - VOXEL.z = min( VOXEL.z, dim.z-1 ); - - if ( doRedraw ) - glutPostRedisplay(); -} - - -// MOUSE callback -// -------------- -void GLUT__mouse( GLint button, GLint state, GLint x, GLint y ) -{ - if (state == GLUT_DOWN) - { - if ( button == GLUT_LEFT_BUTTON && 
glutGetModifiers() != GLUT_ACTIVE_CTRL ) - { - moving = 1; - start.x = x; - start.y = y; - } - // NOTE: does not work, issue with glutGetModifiers not getting CTRL - // else if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_CTRL ) - // { - // moving = 2; - // start.x = x; - // start.y = y; - // } - else if ( (button == GLUT_MIDDLE_BUTTON) || (button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_ALT) ) - { - moving = 3; - start.x = x; - start.y = y; - } - } - else if (state == GLUT_UP) - { - moving = 0; - } -} - - -// MOTION callback -// --------------- -void GLUT__motion( GLint x, GLint y ) -{ - if (moving==1) - { - OPENGL_utils::translate(id, 0,0,0, rot1); - - OPENGL_utils::rotateY(id,start.x-x,rot3); - OPENGL_utils::matXMat(rot,rot1,rot2); - OPENGL_utils::rotateX(id,start.y-y,rot1); - OPENGL_utils::matXMat(rot2,rot1,rot); - OPENGL_utils::matXMat(rot,rot3,rot2); - - OPENGL_utils::translate(id, 0,0,0, rot1); - OPENGL_utils::matXMat(rot2,rot1,rot); - - start.x = x; - start.y = y; - } - - else if (moving==2) - { - zoom = zoom + (y-start.y)/2.0; - start.y = y; - } - - else if (moving==3) - { - translation.x = translation.x - (start.x-x)/3.0; - translation.y = translation.y + (start.y-y)/3.0; - start.x = x; - start.y = y; - } - - glutPostRedisplay(); -} - - -// DISPLAY callback -// ---------------- -void GLUT__display( void ) -{ - glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); - - glPushMatrix(); - glTranslatef(translation.x, translation.y, -zoom); // mouse translation + zoom - glMultMatrixf(rot); // mouse rotation - glTranslatef( -pixdim.x*dim.x/2.0, -pixdim.y*dim.y/2.0, -pixdim.z*dim.z/2.0 ); // center the FOV - glScalef( pixdim.x, pixdim.y, pixdim.z ); // account for voxel size - - glEnable(GL_MULTISAMPLE_ARB); - - /* ============= */ - /* Draw the AXES */ - /* ============= */ - if ( showAxes ) - { - glLineWidth(2); - glBegin(GL_LINES); - glColor4f( 1,0,0,1); glVertex3f( 0,0,0 ); glVertex3f( 10, 0, 0 ); - glColor4f( 0,1,0,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 10, 0 ); - glColor4f( 0,0,1,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 0, 10 ); - glEnd(); - } - - /* =============== */ - /* Draw the TRACTS */ - /* =============== */ - if ( TRK_show ) - { - glPushMatrix(); - glTranslatef(TRK_offset.x, TRK_offset.y, TRK_offset.z); - - glLineWidth(1.0f); - - float *ptr = TRK_coords, *ptrc = TRK_colors; - VECTOR Vc( VOXEL.x+0.5, VOXEL.y+0.5, VOXEL.z+0.5 ); // voxel center - float thr = 0.5*TRK_crop; - for(int f=0; f < TRK_nTractsPlotted; f++) - { - glBegin(GL_LINE_STRIP); - for(int i=0; i < TRK_nPoints[f]; i++) - { - // plot segment only if it's close to center of VOXEL - if ( - ( - TRK_crop_mode && ( - ( showPlane[0] && abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) || - ( showPlane[1] && abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) || - ( showPlane[2] && abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) - ) - || - ( - !TRK_crop_mode && ( - ( abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) && - ( abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) && - ( abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) - ) - ) - { - glColor3f( ptrc[0], ptrc[1], ptrc[2] ); - glVertex3f( ptr[0], ptr[1], ptr[2] ); - } - else - { - glEnd(); - glBegin(GL_LINE_STRIP); - } - ptr += 3; - ptrc += 3; - } - glEnd(); - } - - glPopMatrix(); - } - - /* ============== */ - /* Draw the PEAKS */ - /* ============== */ - if ( PEAKS_show || GLYPHS_show ) - { - glDisable( GL_BLEND ); - glLineWidth( LINE_width ); - glPointSize( LINE_width ); - - glPushMatrix(); - glTranslatef(.5,.5,.5); - - Vec3Df dir, col; - int 
x,y,z,d,idx; - float norms[PEAKS_n], normMax, b0, w; - - // plane YZ - if ( showPlane[0] ) - { - x = (int)VOXEL.x; - for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - if ( GLYPHS_show ) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - // plane XZ - if ( showPlane[1] ) - { - y = (int)VOXEL.y; - for(x=0; ximg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = 
col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - - if ( GLYPHS_show ) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - // plane XY - if ( showPlane[2] ) - { - z = (int)VOXEL.z; - for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - norms[d] = dir.norm(); - if ( norms[d] > normMax ) - normMax = norms[d]; - } - - for(d=0; dimg)(x,y,z,3*d+0); // use 
"col" as tmp variable - col.y = (*niiPEAKS->img)(x,y,z,3*d+1); - col.z = (*niiPEAKS->img)(x,y,z,3*d+2); - if ( PEAKS_use_affine ) - { - dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; - dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; - dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; - } - else - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; - col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; - col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; - - if ( PEAKS_doNormalize ) - { - dir.x = col.x; - dir.y = col.y; - dir.z = col.z; - } - else - { - dir.x = col.x * norms[d] / normMax; - dir.y = col.y * norms[d] / normMax; - dir.z = col.z * norms[d] / normMax; - } - - glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); - glBegin(GL_LINES); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glEnd(); - } - } - - if( GLYPHS_show) - { - b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); - if ( b0 > GLYPHS_b0_thr ) - { - glBegin(GL_POINTS); - for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) - { - idx = SCHEME_shells_idx[GLYPHS_shell][d]; - w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; - if ( GLYPHS_use_affine ) - { - dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; - dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; - dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; - normMax = dir.norm(); - dir.x *= w / normMax; - dir.y *= w / normMax; - dir.z *= w / normMax; - } - else - { - dir.x = w * SCHEME_dirs[idx].x; - dir.y = w * SCHEME_dirs[idx].y; - dir.z = w * SCHEME_dirs[idx].z; - } - - normMax = dir.norm(); - glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); - glVertex3f( x+dir.x, y+dir.y, z+dir.z ); - glVertex3f( x-dir.x, y-dir.y, z-dir.z ); - } - glEnd(); - } - } - } - } - - glPopMatrix(); - } - - /* =================== */ - /* Draw the SCALAR MAP */ - /* =================== */ - if ( showPlane[0] || showPlane[1] || showPlane[2] ) - { - glDisable( GL_CULL_FACE ); - glEnable( GL_BLEND ); - glBlendFunc( GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA ); - - // to avoid z-fighting - glPolygonOffset( 1.0, 1.0 ); - glEnable(GL_POLYGON_OFFSET_FILL); - glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); - - glLineWidth( 3 ); - - int x, y, z; // voxel coordinates NB: (0,0,0) -> corner of voxel - float color; - - // plane YZ - if ( showPlane[0] ) - { - glPushMatrix(); - glTranslatef(0.5,0,0); - - x = (int)VOXEL.x; - for(y=0; y + #include + #include +#else + #include + #include + #include +#endif + +#include "OPENGL_utils.h" +using namespace OPENGL_utils; + +/* global variables */ +GLfloat id[16], rot[16], rot1[16], rot2[16], rot3[16]; +Vec3Df translation; +Vec3Di start; +GLint moving; +GLfloat zoom; + +float ScreenX, ScreenY; + + +void drawString( const char *string ) +{ + static int y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; + if ( string=="" ) + y = glutGet( GLUT_WINDOW_HEIGHT ) - 50; + else + { + glRasterPos2i(10, y); + for (const char* c=string; *c != '\0'; c++) 
+ glutBitmapCharacter(GLUT_BITMAP_9_BY_15, *c); + y -= 18; + } +} + + +void PrintConfig() +{ + if ( !showConfig ) + return; + + glMatrixMode(GL_PROJECTION); + glPushMatrix(); + glLoadIdentity(); + glMatrixMode( GL_MODELVIEW ) ; + glPushMatrix() ; + glLoadIdentity() ; + int w = glutGet( GLUT_WINDOW_WIDTH ); + int h = glutGet( GLUT_WINDOW_HEIGHT ); + glOrtho( 0, w, 0, h, -1, 1 ); + glDisable( GL_DEPTH_TEST ); + + char s[1024]; + glColor3f(1, 1, 0); + drawString( "" ); // reset initial position + + drawString( "MAP" ); + sprintf( s, " - value(%d,%d,%d) = %.2f", VOXEL.x, VOXEL.y, VOXEL.z, MAP(VOXEL.x, VOXEL.y, VOXEL.z) ); + drawString( s ); + sprintf( s, " - range = [ %.1f ... %.1f ]", MAP_min_view, MAP_max_view ); + drawString( s ); + sprintf( s, " - opacity = %.1f", MAP_opacity ); + drawString( s ); + + drawString( "SIGNAL" ); + sprintf( s, " - shell = %d/%d (b=%.1f)", GLYPHS_shell+1, SCHEME_shells_b.size(), SCHEME_shells_b[GLYPHS_shell] ); + drawString( s ); + sprintf( s, " - use affine = %s", GLYPHS_use_affine?"true":"false" ); + drawString( s ); + sprintf( s, " - flip = [ %d, %d, %d ]", GLYPHS_flip[0], GLYPHS_flip[1], GLYPHS_flip[2] ); + drawString( s ); + sprintf( s, " - b0 thr = %.1f", GLYPHS_b0_thr ); + drawString( s ); + + if ( PEAKS_n>0 ) + { + drawString( "PEAKS" ); + sprintf( s, " - use affine = %s", PEAKS_use_affine?"true":"false" ); + drawString( s ); + sprintf( s, " - flip = [ %d, %d, %d ]", PEAKS_flip[0], PEAKS_flip[1], PEAKS_flip[2] ); + drawString( s ); + sprintf( s, " - thr = %.1f", PEAKS_thr ); + drawString( s ); + sprintf( s, " - normalize = %s", PEAKS_doNormalize?"true":"false" ); + drawString( s ); + } + + if ( TRK_nTractsPlotted>0 ) + { + drawString( "FIBERS" ); + sprintf( s, " - shift = [ %.1f %.1f %.1f ] (voxels)", TRK_offset.x, TRK_offset.y, TRK_offset.z ); + drawString( s ); + sprintf( s, " - slab thickness = %.1f (voxels)", TRK_crop ); + drawString( s ); + } + + glEnable (GL_DEPTH_TEST); + glMatrixMode(GL_PROJECTION); + glPopMatrix(); + glMatrixMode(GL_MODELVIEW); + glPopMatrix(); +} + + +// KEYBOARD callback +// ----------------- +void GLUT__keyboard( unsigned char key, GLint x=0, GLint y=0 ) +{ + bool doRedraw = true; + + switch( key ) + { + case 'l': showConfig = 1 - showConfig; break; + + case '1': showPlane[0] = 1 - showPlane[0]; break; + case '2': showPlane[1] = 1 - showPlane[1]; break; + case '3': showPlane[2] = 1 - showPlane[2]; break; + case '4': + showPlane[0] = 1; + showPlane[1] = 0; + showPlane[2] = 0; + translation.x = translation.y = 0; + OPENGL_utils::identity(rot1); + OPENGL_utils::rotateX(rot1, 90.0, rot2); + OPENGL_utils::rotateZ(rot2, 90.0, rot); + break; + case '5': + showPlane[0] = 0; + showPlane[1] = 1; + showPlane[2] = 0; + translation.x = translation.y = 0; + OPENGL_utils::identity(rot1); + OPENGL_utils::rotateX(rot1, 90.0, rot); + break; + case '6': + showPlane[0] = 0; + showPlane[1] = 0; + showPlane[2] = 1; + translation.x = translation.y = 0; + OPENGL_utils::identity( rot ); + break; + + case '0': showAxes = 1 - showAxes; break; + case '-': zoom += 10.0; break; + case '+': zoom -= 10.0; break; + case 'm': MAP_max_view = fmaxf(0.0,MAP_max_view-MAP_max*0.05); break; + case 'M': MAP_max_view = fminf(MAP_max,MAP_max_view+MAP_max*0.05); break; + case 'o': MAP_opacity = fmaxf(0.0,MAP_opacity-0.1); break; + case 'O': MAP_opacity = fminf(1.0,MAP_opacity+0.1); break; + case 'w': LINE_width = fmaxf( 1,LINE_width-1); break; + case 'W': LINE_width = fminf(10,LINE_width+1); break; + case 'r': + showPlane[0] = showPlane[1] = showPlane[2] = 1; + 
translation.x = translation.y = 0; + zoom = 0; + OPENGL_utils::identity( rot ); + break; + + case 's': GLYPHS_show = 1 - GLYPHS_show; break; + case 'S': GLYPHS_shell = (GLYPHS_shell+1) % SCHEME_shells_idx.size(); break; + case 'a': GLYPHS_use_affine = 1 - GLYPHS_use_affine; break; + case 'x': GLYPHS_flip[0] = 1 - GLYPHS_flip[0]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].x *= -1; break; + case 'y': GLYPHS_flip[1] = 1 - GLYPHS_flip[1]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].y *= -1; break; + case 'z': GLYPHS_flip[2] = 1 - GLYPHS_flip[2]; for(int d=0; d < SCHEME_dirs.size() ;d++) SCHEME_dirs[d].z *= -1; break; + case 'b': GLYPHS_b0_thr = fmaxf(0.0,GLYPHS_b0_thr-10.0); break; + case 'B': GLYPHS_b0_thr = fminf(MAP_max,GLYPHS_b0_thr+10.0); break; + + case 'p': if ( PEAKS_n>0 ) PEAKS_show = 1 - PEAKS_show; break; + case 'A': PEAKS_use_affine = 1 - PEAKS_use_affine; break; + case 'X': PEAKS_flip[0] = 1 - PEAKS_flip[0]; break; + case 'Y': PEAKS_flip[1] = 1 - PEAKS_flip[1]; break; + case 'Z': PEAKS_flip[2] = 1 - PEAKS_flip[2]; break; + case 't': PEAKS_thr = fmaxf(PEAKS_thr - 0.1, 0.0); break; + case 'T': PEAKS_thr = fminf(PEAKS_thr + 0.1, 1.0); break; + case 'n': PEAKS_doNormalize = 1 - PEAKS_doNormalize; break; + + case 'f': if ( TRK_nTractsPlotted>0 ) TRK_show = 1 - TRK_show; break; + case 'c': TRK_crop = fmaxf( 0.0,TRK_crop-0.5); break; + case 'C': TRK_crop = fminf(max(dim.x,max(dim.y,dim.z)),TRK_crop+0.5); break; + case ' ': TRK_crop_mode = 1 - TRK_crop_mode; break; + + case 'q': + case 27 : exit(0); break; + + default: doRedraw = false; + } + + if ( doRedraw ) + glutPostRedisplay(); +} + + +// MENU callback +// ------------- +void GLUT__menu( int id ) +{ + switch( id ) + { + case 0: GLUT__keyboard('q'); break; + + case 101: GLUT__keyboard('s'); break; + case 102: GLUT__keyboard('S'); break; + case 103: GLUT__keyboard('a'); break; + case 104: GLUT__keyboard('x'); break; + case 105: GLUT__keyboard('y'); break; + case 106: GLUT__keyboard('z'); break; + case 107: GLUT__keyboard('b'); break; + case 108: GLUT__keyboard('B'); break; + + case 201: GLUT__keyboard('p'); break; + case 202: GLUT__keyboard('A'); break; + case 203: GLUT__keyboard('X'); break; + case 204: GLUT__keyboard('Y'); break; + case 205: GLUT__keyboard('Z'); break; + case 206: GLUT__keyboard('t'); break; + case 207: GLUT__keyboard('T'); break; + case 208: GLUT__keyboard('n'); break; + + case 301: GLUT__keyboard('f'); break; + case 302: GLUT__keyboard('c'); break; + case 303: GLUT__keyboard('C'); break; + case 304: GLUT__keyboard(' '); break; + + case 401: GLUT__keyboard('1'); break; + case 402: GLUT__keyboard('2'); break; + case 403: GLUT__keyboard('3'); break; + case 404: GLUT__keyboard('4'); break; + case 405: GLUT__keyboard('5'); break; + case 406: GLUT__keyboard('6'); break; + case 407: GLUT__keyboard('0'); break; + case 408: GLUT__keyboard('-'); break; + case 409: GLUT__keyboard('+'); break; + case 410: GLUT__keyboard('m'); break; + case 411: GLUT__keyboard('M'); break; + case 412: GLUT__keyboard('o'); break; + case 413: GLUT__keyboard('O'); break; + case 414: GLUT__keyboard('w'); break; + case 415: GLUT__keyboard('W'); break; + case 416: GLUT__keyboard('r'); break; + case 417: GLUT__keyboard('l'); break; + } +} + + +// Create the dropdown MENU +// ------------------------ +void GLUT__createMenu() +{ + int submenu_SIGNAL_id, submenu_PEAKS_id, submenu_FIBERS_id, submenu_VIEW_id; + + submenu_SIGNAL_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[s] Show/hide", 101); + glutAddMenuEntry("[S] 
Change shell", 102); + glutAddMenuEntry("[a] Use affine", 103); + glutAddMenuEntry("[x] Flip X axis", 104); + glutAddMenuEntry("[y] Flip Y axis", 105); + glutAddMenuEntry("[z] Flip Z axis", 106); + glutAddMenuEntry("[b] Decrease b0 thr", 107); + glutAddMenuEntry("[B] Increase b0 thr", 108); + + if ( PEAKS_n>0 ) + { + submenu_PEAKS_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[p] Show/hide", 201); + glutAddMenuEntry("[A] Use affine", 202); + glutAddMenuEntry("[X] Flip X axis", 203); + glutAddMenuEntry("[Y] Flip Y axis", 204); + glutAddMenuEntry("[Z] Flip Z axis", 205); + glutAddMenuEntry("[t] Decrease threshold",206); + glutAddMenuEntry("[T] Increase threshold",207); + glutAddMenuEntry("[n] Normalize length", 208); + } + + if ( TRK_nTractsPlotted>0 ) + { + submenu_FIBERS_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[f] Show/hide", 301); + glutAddMenuEntry("[c] Decrease crop size",302); + glutAddMenuEntry("[C] Increase crop size",303); + glutAddMenuEntry("[ ] Change crop mode", 304); + } + + submenu_VIEW_id = glutCreateMenu( GLUT__menu ); + glutAddMenuEntry("[1] Show/hide YZ plane", 401); + glutAddMenuEntry("[2] Show/hide XZ plane", 402); + glutAddMenuEntry("[3] Show/hide XY plane", 403); + glutAddMenuEntry("[4] Reset to YZ plane", 404); + glutAddMenuEntry("[5] Reset to XZ plane", 405); + glutAddMenuEntry("[6] Reset to XY plane", 406); + glutAddMenuEntry("[0] Show/hide axes", 407); + glutAddMenuEntry("[-] Decrease zoom", 408); + glutAddMenuEntry("[+] Increase zoom", 409); + glutAddMenuEntry("[m] Decrease max value", 410); + glutAddMenuEntry("[M] Increase max value", 411); + glutAddMenuEntry("[o] Decrease opacity", 412); + glutAddMenuEntry("[O] Increase opacity", 413); + glutAddMenuEntry("[t] Decrease line width",414); + glutAddMenuEntry("[T] Increase line width",415); + glutAddMenuEntry("[r] Reset view", 416); + glutAddMenuEntry("[l] Show/hide log", 417); + + int menu_id = glutCreateMenu( GLUT__menu ); + glutAddSubMenu("Signal", submenu_SIGNAL_id); + if ( PEAKS_n>0 ) + glutAddSubMenu("Peaks", submenu_PEAKS_id); + if ( TRK_nTractsPlotted>0 ) + glutAddSubMenu("Fibers", submenu_FIBERS_id); + glutAddSubMenu("View options", submenu_VIEW_id); + glutAddMenuEntry("Quit", 0); + glutAttachMenu(GLUT_RIGHT_BUTTON); +} + + +// RESHAPE callback +// ---------------- +void GLUT__reshape( GLint w, GLint h ) +{ + ScreenX = w; + ScreenY = h; + + glViewport( 0, 0, w, h ); + + glMatrixMode( GL_PROJECTION ); + glLoadIdentity(); + gluPerspective( 45.0f, ScreenX/ScreenY, 1.0f, 5000.0f ); + + glMatrixMode( GL_MODELVIEW ); + glLoadIdentity(); + gluLookAt( + 0.0, 0.0, 2.0 * max(pixdim.x*dim.x,pixdim.y*dim.y) * ScreenY/ScreenX, // eye point + 0.0, 0.0, 0.0, // reference point + 0.0, 1.0, 0.0 // up vector + ); +} + + +// SPECIALKEY callback +// ------------------- +void GLUT__specialkey( GLint key, GLint x, GLint y ) +{ + bool doRedraw = true; + GLint modif = glutGetModifiers(); + GLint ALT = modif & GLUT_ACTIVE_ALT; + GLint CTRL = modif & GLUT_ACTIVE_CTRL; + + switch( key ) + { + case GLUT_KEY_LEFT: + if ( ALT ) + TRK_offset.x -= 0.5; + else if ( CTRL ) + translation.x -= 2.0; + else + VOXEL.x--; + break; + case GLUT_KEY_RIGHT: + if ( ALT ) + TRK_offset.x += 0.5; + else if ( CTRL ) + translation.x += 2.0; + else + VOXEL.x++; + break; + case GLUT_KEY_DOWN: + if ( ALT ) + TRK_offset.y -= 0.5; + else if ( CTRL ) + translation.y -= 2.0; + else + VOXEL.y--; + break; + case GLUT_KEY_UP: + if ( ALT ) + TRK_offset.y += 0.5; + else if ( CTRL ) + translation.y += 2.0; + else + VOXEL.y++; + break; + case 
GLUT_KEY_PAGE_DOWN: + if ( ALT ) + TRK_offset.z -= 0.5; + else + VOXEL.z--; + break; + case GLUT_KEY_PAGE_UP: + if ( ALT ) + TRK_offset.z += 0.5; + else + VOXEL.z++; + break; + + default: + doRedraw = false; + } + + // check the bounds + VOXEL.x = max( VOXEL.x, 0 ); + VOXEL.y = max( VOXEL.y, 0 ); + VOXEL.z = max( VOXEL.z, 0 ); + VOXEL.x = min( VOXEL.x, dim.x-1 ); + VOXEL.y = min( VOXEL.y, dim.y-1 ); + VOXEL.z = min( VOXEL.z, dim.z-1 ); + + if ( doRedraw ) + glutPostRedisplay(); +} + + +// MOUSE callback +// -------------- +void GLUT__mouse( GLint button, GLint state, GLint x, GLint y ) +{ + if (state == GLUT_DOWN) + { + if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() != GLUT_ACTIVE_CTRL ) + { + moving = 1; + start.x = x; + start.y = y; + } + // NOTE: does not work, issue with glutGetModifiers not getting CTRL + // else if ( button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_CTRL ) + // { + // moving = 2; + // start.x = x; + // start.y = y; + // } + else if ( (button == GLUT_MIDDLE_BUTTON) || (button == GLUT_LEFT_BUTTON && glutGetModifiers() == GLUT_ACTIVE_ALT) ) + { + moving = 3; + start.x = x; + start.y = y; + } + } + else if (state == GLUT_UP) + { + moving = 0; + } +} + + +// MOTION callback +// --------------- +void GLUT__motion( GLint x, GLint y ) +{ + if (moving==1) + { + OPENGL_utils::translate(id, 0,0,0, rot1); + + OPENGL_utils::rotateY(id,start.x-x,rot3); + OPENGL_utils::matXMat(rot,rot1,rot2); + OPENGL_utils::rotateX(id,start.y-y,rot1); + OPENGL_utils::matXMat(rot2,rot1,rot); + OPENGL_utils::matXMat(rot,rot3,rot2); + + OPENGL_utils::translate(id, 0,0,0, rot1); + OPENGL_utils::matXMat(rot2,rot1,rot); + + start.x = x; + start.y = y; + } + + else if (moving==2) + { + zoom = zoom + (y-start.y)/2.0; + start.y = y; + } + + else if (moving==3) + { + translation.x = translation.x - (start.x-x)/3.0; + translation.y = translation.y + (start.y-y)/3.0; + start.x = x; + start.y = y; + } + + glutPostRedisplay(); +} + + +// DISPLAY callback +// ---------------- +void GLUT__display( void ) +{ + glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT ); + + glPushMatrix(); + glTranslatef(translation.x, translation.y, -zoom); // mouse translation + zoom + glMultMatrixf(rot); // mouse rotation + glTranslatef( -pixdim.x*dim.x/2.0, -pixdim.y*dim.y/2.0, -pixdim.z*dim.z/2.0 ); // center the FOV + glScalef( pixdim.x, pixdim.y, pixdim.z ); // account for voxel size + + glEnable(GL_MULTISAMPLE_ARB); + + /* ============= */ + /* Draw the AXES */ + /* ============= */ + if ( showAxes ) + { + glLineWidth(2); + glBegin(GL_LINES); + glColor4f( 1,0,0,1); glVertex3f( 0,0,0 ); glVertex3f( 10, 0, 0 ); + glColor4f( 0,1,0,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 10, 0 ); + glColor4f( 0,0,1,1); glVertex3f( 0,0,0 ); glVertex3f( 0, 0, 10 ); + glEnd(); + } + + /* =============== */ + /* Draw the TRACTS */ + /* =============== */ + if ( TRK_show ) + { + glPushMatrix(); + glTranslatef(TRK_offset.x, TRK_offset.y, TRK_offset.z); + + glLineWidth(1.0f); + + float *ptr = TRK_coords, *ptrc = TRK_colors; + VECTOR Vc( VOXEL.x+0.5, VOXEL.y+0.5, VOXEL.z+0.5 ); // voxel center + float thr = 0.5*TRK_crop; + for(int f=0; f < TRK_nTractsPlotted; f++) + { + glBegin(GL_LINE_STRIP); + for(int i=0; i < TRK_nPoints[f]; i++) + { + // plot segment only if it's close to center of VOXEL + if ( + ( + TRK_crop_mode && ( + ( showPlane[0] && abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) || + ( showPlane[1] && abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) || + ( showPlane[2] && abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) + ) + || + ( + 
!TRK_crop_mode && ( + ( abs( (ptr[0]+TRK_offset.x) - Vc.x ) <= thr ) && + ( abs( (ptr[1]+TRK_offset.y) - Vc.y ) <= thr ) && + ( abs( (ptr[2]+TRK_offset.z) - Vc.z ) <= thr ) ) + ) + ) + { + glColor3f( ptrc[0], ptrc[1], ptrc[2] ); + glVertex3f( ptr[0], ptr[1], ptr[2] ); + } + else + { + glEnd(); + glBegin(GL_LINE_STRIP); + } + ptr += 3; + ptrc += 3; + } + glEnd(); + } + + glPopMatrix(); + } + + /* ============== */ + /* Draw the PEAKS */ + /* ============== */ + if ( PEAKS_show || GLYPHS_show ) + { + glDisable( GL_BLEND ); + glLineWidth( LINE_width ); + glPointSize( LINE_width ); + + glPushMatrix(); + glTranslatef(.5,.5,.5); + + Vec3Df dir, col; + int x,y,z,d,idx; + float norms[PEAKS_n], normMax, b0, w; + + // plane YZ + if ( showPlane[0] ) + { + x = (int)VOXEL.x; + for(y=0; yimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + if ( GLYPHS_show ) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * 
SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + // plane XZ + if ( showPlane[1] ) + { + y = (int)VOXEL.y; + for(x=0; ximg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + + if ( GLYPHS_show ) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + // plane XY + if ( showPlane[2] ) + { + z = (int)VOXEL.z; + for(y=0; yimg)(x,y,z,3*d+0); // use 
"col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + norms[d] = dir.norm(); + if ( norms[d] > normMax ) + normMax = norms[d]; + } + + for(d=0; dimg)(x,y,z,3*d+0); // use "col" as tmp variable + col.y = (*niiPEAKS->img)(x,y,z,3*d+1); + col.z = (*niiPEAKS->img)(x,y,z,3*d+2); + if ( PEAKS_use_affine ) + { + dir.x = col.x * ((float*)PEAKS_affine)[0] + col.y * ((float*)PEAKS_affine)[1] + col.z * ((float*)PEAKS_affine)[2]; + dir.y = col.x * ((float*)PEAKS_affine)[3] + col.y * ((float*)PEAKS_affine)[4] + col.z * ((float*)PEAKS_affine)[5]; + dir.z = col.x * ((float*)PEAKS_affine)[6] + col.y * ((float*)PEAKS_affine)[7] + col.z * ((float*)PEAKS_affine)[8]; + } + else + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + col.x = 0.5 * (PEAKS_flip[0]?-1:1) * dir.x / norms[d]; + col.y = 0.5 * (PEAKS_flip[1]?-1:1) * dir.y / norms[d]; + col.z = 0.5 * (PEAKS_flip[2]?-1:1) * dir.z / norms[d]; + + if ( PEAKS_doNormalize ) + { + dir.x = col.x; + dir.y = col.y; + dir.z = col.z; + } + else + { + dir.x = col.x * norms[d] / normMax; + dir.y = col.y * norms[d] / normMax; + dir.z = col.z * norms[d] / normMax; + } + + glColor3f( fabs(2.0*col.x), fabs(2.0*col.y), fabs(2.0*col.z) ); + glBegin(GL_LINES); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glEnd(); + } + } + + if( GLYPHS_show) + { + b0 = (*niiDWI->img)(x,y,z,SCHEME_idxB0[0]); + if ( b0 > GLYPHS_b0_thr ) + { + glBegin(GL_POINTS); + for(d=0; d < SCHEME_shells_idx[GLYPHS_shell].size() ;d++) + { + idx = SCHEME_shells_idx[GLYPHS_shell][d]; + w = 0.5 * (float)(*niiDWI->img)(x,y,z,idx) / b0; + if ( GLYPHS_use_affine ) + { + dir.x = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[0] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[1] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[2]; + dir.y = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[3] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[4] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[5]; + dir.z = SCHEME_dirs[idx].x * ((float*)GLYPHS_affine)[6] + SCHEME_dirs[idx].y * ((float*)GLYPHS_affine)[7] + SCHEME_dirs[idx].z * ((float*)GLYPHS_affine)[8]; + normMax = dir.norm(); + dir.x *= w / normMax; + dir.y *= w / normMax; + dir.z *= w / normMax; + } + else + { + dir.x = w * SCHEME_dirs[idx].x; + dir.y = w * SCHEME_dirs[idx].y; + dir.z = w * SCHEME_dirs[idx].z; + } + + normMax = dir.norm(); + glColor3f( fabs(dir.x)/normMax, fabs(dir.y)/normMax, fabs(dir.z)/normMax ); + glVertex3f( x+dir.x, y+dir.y, z+dir.z ); + glVertex3f( x-dir.x, y-dir.y, z-dir.z ); + } + glEnd(); + } + } + } + } + + glPopMatrix(); + } + + /* =================== */ + /* Draw the SCALAR MAP */ + /* =================== */ + if ( showPlane[0] || showPlane[1] || showPlane[2] ) + { + glDisable( GL_CULL_FACE ); + glEnable( GL_BLEND ); + glBlendFunc( GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA ); + + // to avoid z-fighting + glPolygonOffset( 1.0, 1.0 ); + glEnable(GL_POLYGON_OFFSET_FILL); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + + glLineWidth( 3 ); + + int x, y, z; // voxel coordinates NB: (0,0,0) -> corner of voxel + float 
color; + + // plane YZ + if ( showPlane[0] ) + { + glPushMatrix(); + glTranslatef(0.5,0,0); + + x = (int)VOXEL.x; + for(y=0; y - -#include "VECTOR.h" -typedef VECTOR Vec3Di; -typedef VECTOR Vec3Df; - - -namespace OPENGL_utils -{ - -void identity(GLfloat* result) -{ - for (int i=0; i<4; i++) - for (int j=0; j<4; j++) - if (i==j) result[4*i+j]=1; else result[4*i+j]=0; -} - - -void matXMat(GLfloat* m, GLfloat* m1, GLfloat* result) -{ - for (int i=0; i<4; i++) - for (int j=0; j<4; j++) - { - result[4*i+j]=0; - for (int t=0; t<4; t++) - result[4*i+j]=result[4*i+j]+m[4*i+t]*m1[4*t+j]; - } -} - - -void rotateZ(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = cos(ang/180*3.1415); - matrix[5] = cos(ang/180*3.1415); - matrix[1] = -sin(ang/180*3.1415); - matrix[4] = sin(ang/180*3.1415); - matrix[10] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void rotateY(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = cos(ang/180*3.1415); - matrix[10] = cos(ang/180*3.1415); - matrix[8] = -sin(ang/180*3.1415); - matrix[2] = sin(ang/180*3.1415); - matrix[5] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void rotateX(GLfloat* m, GLfloat ang, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[5] = cos(ang/180*3.1415); - matrix[10] = cos(ang/180*3.1415); - matrix[6] = -sin(ang/180*3.1415); - matrix[9] = sin(ang/180*3.1415); - matrix[0] = 1; - matrix[15] = 1; - matXMat(matrix,m,result); -} - - -void translate(GLfloat* m, GLfloat x,GLfloat y,GLfloat z, GLfloat* result) -{ - static GLfloat matrix[16]; - - for (int i=0; i<16 ; i++) matrix[i] = 0; - matrix[0] = 1; - matrix[5] = 1; - matrix[10] = 1; - matrix[15] = 1; - matrix[12] = x; - matrix[13] = y; - matrix[14] = z; - matXMat(matrix,m,result); -} - -} -#endif +#ifndef __OPENGL_UTILS_H__ +#define __OPENGL_UTILS_H__ + +#include + +#include "VECTOR.h" +typedef VECTOR Vec3Di; +typedef VECTOR Vec3Df; + + +namespace OPENGL_utils +{ + +void identity(GLfloat* result) +{ + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) + if (i==j) result[4*i+j]=1; else result[4*i+j]=0; +} + + +void matXMat(GLfloat* m, GLfloat* m1, GLfloat* result) +{ + for (int i=0; i<4; i++) + for (int j=0; j<4; j++) + { + result[4*i+j]=0; + for (int t=0; t<4; t++) + result[4*i+j]=result[4*i+j]+m[4*i+t]*m1[4*t+j]; + } +} + + +void rotateZ(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = cos(ang/180*3.1415); + matrix[5] = cos(ang/180*3.1415); + matrix[1] = -sin(ang/180*3.1415); + matrix[4] = sin(ang/180*3.1415); + matrix[10] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void rotateY(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = cos(ang/180*3.1415); + matrix[10] = cos(ang/180*3.1415); + matrix[8] = -sin(ang/180*3.1415); + matrix[2] = sin(ang/180*3.1415); + matrix[5] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void rotateX(GLfloat* m, GLfloat ang, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[5] = cos(ang/180*3.1415); + matrix[10] = cos(ang/180*3.1415); + matrix[6] = -sin(ang/180*3.1415); + matrix[9] = sin(ang/180*3.1415); + matrix[0] = 1; + matrix[15] = 1; + matXMat(matrix,m,result); +} + + +void translate(GLfloat* m, 
GLfloat x,GLfloat y,GLfloat z, GLfloat* result) +{ + static GLfloat matrix[16]; + + for (int i=0; i<16 ; i++) matrix[i] = 0; + matrix[0] = 1; + matrix[5] = 1; + matrix[10] = 1; + matrix[15] = 1; + matrix[12] = x; + matrix[13] = y; + matrix[14] = z; + matXMat(matrix,m,result); +} + +} +#endif diff --git a/extras/COMMIT_debugger/main.cxx b/extras/COMMIT_debugger/main.cxx index 7d4eb823..2a4e2e62 100755 --- a/extras/COMMIT_debugger/main.cxx +++ b/extras/COMMIT_debugger/main.cxx @@ -1,652 +1,652 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "tclap/CmdLine.h" -#include -using namespace std; - -#include "colormaps.h" - -NIFTI* niiDWI; -VECTOR dim; -VECTOR pixdim; - -int SCHEME_version; -vector< VECTOR > SCHEME_dirs; -vector SCHEME_b; -vector SCHEME_idxB0; -vector SCHEME_idxDWI; -vector SCHEME_shells_b; -vector< vector > SCHEME_shells_idx; - -blitz::Array MAP; -VECTOR VOXEL; -float MAP_min, MAP_min_view, MAP_max, MAP_max_view; -float MAP_opacity = 0.5; -bool showPlane[3] = { true, true, true }; -bool showAxes = true; -bool showConfig = true; -float LINE_width = 2.0; - -NIFTI* niiPEAKS; -int PEAKS_n; -bool PEAKS_show = false; -float PEAKS_thr = 0.0; -bool PEAKS_doNormalize = false; -bool PEAKS_flip[3] = {false, false, false}; -bool PEAKS_use_affine = false; -float PEAKS_affine[3][3]; - -TrackVis TRK_file; -int TRK_skip; -int TRK_nTractsPlotted; -int* TRK_nPoints; -float* TRK_coords; -float* TRK_colors; -float TRK_crop = 1.0; -bool TRK_crop_mode = true; -bool TRK_show = false; -VECTOR TRK_offset; - -bool GLYPHS_show = false; -int GLYPHS_shell = 0; -bool GLYPHS_flip[3] = {false, false, false}; -float GLYPHS_b0_thr = 50.0; -bool GLYPHS_use_affine = false; -float GLYPHS_affine[3][3]; - -#include "OPENGL_callbacks.cxx" - - -/*----------------------------------------------------------------------------------------------------------------------------------*/ -int main(int argc, char** argv) -{ - TCLAP::CmdLine cmd("This tool allows one to display in a common 3D space all the objects (DWI data, streamlines etc...) used by COMMIT in order to spot possible incosistencies between the conventions of COMMIT and the software that generated the data, e.g. 
flip in some axes in the DWI data or in the peaks, spatial shift in the streamlines, whether the affine transformation was already applied to the data etc..", ' ', "1.2"); - - TCLAP::UnlabeledValueArg argDWI( "dwi","Filename of the DWI dataset [4D NIFTI]", true, "", "DWI", cmd ); - TCLAP::ValueArg argMAP( "m", "map", "Background map [3D NIFTI]", false, "", "map", cmd ); - TCLAP::ValueArg argPEAKS( "p", "peaks", "Main diffusion directions for the extra-axonal part in each voxel [4D NIFTI]", false, "", "peaks", cmd ); - TCLAP::ValueArg argTRK( "f", "fibers", "Streamlines for the intra-axonal part [.TRK format]", false, "", "fibers", cmd ); - TCLAP::UnlabeledValueArg argSCHEME( "scheme","Acquisition scheme [text]", true, "", "scheme", cmd ); - - try { cmd.parse( argc, argv ); } - catch (TCLAP::ArgException &e) { cerr << "error: " << e.error() << " for arg " << e.argId() << endl; } - - string DWI_filename( argDWI.getValue() ); - string SCHEME_filename( argSCHEME.getValue() ); - string PEAKS_filename( argPEAKS.getValue() ); - string TRK_filename( argTRK.getValue() ); - string MAP_filename( argMAP.getValue() ); - - - // =================== - // Reading DWI dataset - // =================== - COLOR_msg( "-> Reading 'DWI' dataset:", "\n" ); - - niiDWI = new NIFTI; - niiDWI->open( DWI_filename, true ); - if ( !niiDWI->isValid() ) - { - COLOR_error( "Unable to open file", "\t" ); - return EXIT_FAILURE; - } - dim.x = niiDWI->hdr->dim[1]; - dim.y = niiDWI->hdr->dim[2]; - dim.z = niiDWI->hdr->dim[3]; - pixdim.x = niiDWI->hdr->pixdim[1]; - pixdim.y = niiDWI->hdr->pixdim[2]; - pixdim.z = niiDWI->hdr->pixdim[3]; - printf( "\tdim : %d x %d x %d x %d\n", dim.x, dim.y, dim.z, niiDWI->hdr->dim[4] ); - printf( "\tpixdim : %.4f x %.4f x %.4f\n", pixdim.x, pixdim.y, pixdim.z ); - printf( "\tqform : %d\n", niiDWI->hdr->qform_code ); - mat44 DWI_qform = niiDWI->hdr->qto_xyz; - if ( niiDWI->hdr->qform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", DWI_qform.m[i][j] ); - printf( "|\n" ); - } - } - else - { - COLOR_warning( "This should never happen!", "\t\t" ); - } - printf( "\tsform : %d\n", niiDWI->hdr->sform_code ); - mat44 DWI_sform = niiDWI->hdr->sto_xyz; - if ( niiDWI->hdr->sform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", DWI_sform.m[i][j] ); - printf( "|\n" ); - } - } - - // Read the affine matrix to rotate the vectors - // NB: we need the inverse, but in this case inv=transpose - if ( niiDWI->hdr->qform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - GLYPHS_affine[i][j] = DWI_qform.m[j][i]; - } - else if ( niiDWI->hdr->sform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - GLYPHS_affine[i][j] = DWI_sform.m[j][i]; - } - else { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - GLYPHS_affine[i][j] = 0; - for(int i=0; i<3 ;i++) - GLYPHS_affine[i][i] = 1; - } - - mat33 tmp; - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - tmp.m[i][j] = GLYPHS_affine[i][j]; - printf( "\tAffine used (%s):\n", nifti_mat33_determ(tmp)<0?"RADIOLOGICAL":"NEUROLOGICAL" ); - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<3 ;j++) - printf( "%9.4f ", GLYPHS_affine[i][j] ); - printf( "|\n" ); - } - - COLOR_msg( " [OK]" ); - - - // =================== - // Reading SCHEME file - // =================== - COLOR_msg( "-> Reading 'SCHEME' file:", "\n" ); - - char line[1000]; - FILE* pFile = fopen( SCHEME_filename.c_str(), "rt" ); - - // read the version - 
// ---------------- - try - { - while( fgets(line, 1000, pFile) ) - if ( line[0]!='#' ) - break; - - std::regex reVersion("^VERSION: (.*)\\s*$"); - std::smatch reMatches; - - std::string str_line = string(line); - if ( !std::regex_match(str_line, reMatches, reVersion) ) - { - // no header found, assume standards BVECTOR format - SCHEME_version = 0; - fseek(pFile, -strlen(line), SEEK_CUR); - } - else - { - if( strcmp(reMatches[1].str().c_str(),"0")==0 || strcmp(reMatches[1].str().c_str(),"BVECTOR")==0 ) - SCHEME_version = 0; - else if( strcmp(reMatches[1].str().c_str(),"1")==0 || strcmp(reMatches[1].str().c_str(),"STEJSKALTANNER")==0 ) - SCHEME_version = 1; - else - throw "Version not recognized"; - } - } - catch( const char* msg ) - { - COLOR_error( msg, "\t" ); - return EXIT_FAILURE; - } - printf( "\tversion : %s\n", SCHEME_version==0?"BVECTOR":"STEJSKALTANNER" ); - - // read the data - // ------------- - try - { - string reFLOAT( "[-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?" ); - std::regex reVERSION0( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); - std::regex reVERSION1( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); - std::regex reEMPTY( "^\\s*$" ); - std::smatch reMatches; - int Ns = 0; - float x, y, z, b, G, D, d; - while( fgets(line, 1000, pFile) ) - { - std::string str_line = string(line); - if( std::regex_match(str_line, reMatches, reEMPTY) ) - continue; // skip empty lines - - if( SCHEME_version == 0 ) - { - if ( !std::regex_match(str_line, reMatches, reVERSION0) ) - throw "Wrong row format"; - x = std::atof( reMatches[1].str().c_str() ); - y = std::atof( reMatches[2].str().c_str() ); - z = std::atof( reMatches[3].str().c_str() ); - b = std::atof( reMatches[4].str().c_str() ); // in mm^2/s - VECTOR tmp( x, y, z ); - tmp.Normalize(); - SCHEME_dirs.push_back( tmp ); - SCHEME_b.push_back( b ); - } - else - { - if ( !std::regex_match(str_line, reMatches, reVERSION1) ) - throw "Wrong row format"; - x = std::atof( reMatches[1].str().c_str() ); - y = std::atof( reMatches[2].str().c_str() ); - z = std::atof( reMatches[3].str().c_str() ); - G = std::atof( reMatches[4].str().c_str() ); - D = std::atof( reMatches[5].str().c_str() ); - d = std::atof( reMatches[6].str().c_str() ); - VECTOR tmp( x, y, z ); - tmp.Normalize(); - SCHEME_dirs.push_back( tmp ); - b = std::pow( 267.513e6*G*d, 2 ) * (D-d/3.0) * 1e-6; // in mm^2/s - SCHEME_b.push_back( b ); - } - - if ( b<5.0 ) - { - SCHEME_idxB0.push_back( Ns ); - } - else - { - SCHEME_idxDWI.push_back( Ns ); - if ( std::find(SCHEME_shells_b.begin(), SCHEME_shells_b.end(), b) == SCHEME_shells_b.end() ) - { - SCHEME_shells_b.push_back( b ) ; - vector tmp; - SCHEME_shells_idx.push_back( tmp ) ; - } - } - Ns++; - } - } - catch( const char* msg ) - { - COLOR_error( msg, "\t" ); - return EXIT_FAILURE; - } - fclose(pFile); - - printf( "\tgradients : %d\n", SCHEME_b.size() ); - if ( niiDWI->hdr->dim[4] != SCHEME_b.size() ) - { - COLOR_error( "The scheme does not match the DWI dataset", "\t" ); - return EXIT_FAILURE; - } - - // fill data structure about the SCHEME - // ------------------------------------ - for(int i=0; i < SCHEME_b.size() ;i++) - { - if ( SCHEME_b[i] < 5 ) - continue; - int s = std::find( SCHEME_shells_b.begin(), SCHEME_shells_b.end(), SCHEME_b[i] ) - SCHEME_shells_b.begin(); - SCHEME_shells_idx[s].push_back( i ); - } - - printf( "\tscheme : %d b0 and %d shells (", SCHEME_idxB0.size(), SCHEME_shells_idx.size() ); - for(int 
i=0; i < SCHEME_shells_b.size() ;i++) - printf( " [%d @ b=%.1f]", SCHEME_shells_idx[i].size(), SCHEME_shells_b[i] ); - printf( " )\n" ); - - COLOR_msg( " [OK]" ); - - - - // ======================= - // Creating BACKGROUND map - // ======================= - COLOR_msg( "-> Preparing 'BACKGROUND' map:", "\n" ); - MAP.resize(dim.x,dim.y,dim.z); - if ( !MAP_filename.empty() ) - { - printf( "\tdata : reading from file\n" ); - NIFTI* niiMAP = new NIFTI; - niiMAP->open( MAP_filename, true ); - if ( !niiMAP->isValid() ) - { - COLOR_error( "Unable to open the file", "\t" ); - return EXIT_FAILURE; - } - - printf( "\tdim : %d x %d x %d x %d\n" , niiMAP->hdr->dim[1], niiMAP->hdr->dim[2], niiMAP->hdr->dim[3], niiMAP->hdr->dim[4] ); - printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiMAP->hdr->pixdim[1], niiMAP->hdr->pixdim[2], niiMAP->hdr->pixdim[3] ); - - if ( niiMAP->hdr->dim[1] != dim.x || niiMAP->hdr->dim[2] != dim.y || niiMAP->hdr->dim[3] != dim.z ) - { - COLOR_error( "The DIMENSIONS do not match those of DWI images", "\t" ); - return EXIT_FAILURE; - } - if ( abs(niiMAP->hdr->pixdim[1]-pixdim.x) > 1e-4 || abs(niiMAP->hdr->pixdim[2]-pixdim.y) > 1e-4 || abs(niiMAP->hdr->pixdim[3]-pixdim.z) > 1e-4 ) - { - COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); - } - - FLOAT32 MIN = 0;//(*niiMAP->img)(0,0,0); - FLOAT32 MAX = MIN; - - for(int i=0; iimg)(i,j,k); - if ( MAP(i,j,k) > MAX ) - MAX = MAP(i,j,k); - if ( MAP(i,j,k) < MIN ) - MIN = MAP(i,j,k); - } - if ( MAX - MIN <= 0 ) - { - COLOR_error( "The dynamic range is zero", "\t" ); - return EXIT_FAILURE; - } - MAP_min = MIN; - MAP_min_view = 0; - MAP_max = MAP_max_view = MAX; - - printf( "\tvalues : [%.2e ... %.2e]\n", MAP_min, MAP_max ); - COLOR_msg( " [OK]" ); - } - else - { - printf( "\tdata : " ); - - if ( SCHEME_idxB0.size() > 0 ) - { - printf( "taking first b0 image\n" ); - FLOAT32 MIN = (*niiDWI->img)(0,0,0,SCHEME_idxB0[0]); - FLOAT32 MAX = MIN; - - for(int i=0; iimg)(i,j,k,SCHEME_idxB0[0]); - if ( MAP(i,j,k) > MAX ) - MAX = MAP(i,j,k); - if ( MAP(i,j,k) < MIN ) - MIN = MAP(i,j,k); - } - if ( MAX - MIN <= 0 ) - { - COLOR_error( "The dynamic range is zero", "\t" ); - return EXIT_FAILURE; - } - MAP_min = MIN; - MAP_min_view = 0; - MAP_max = MAP_max_view = MAX; - } - else - { - printf( "no b0 found\n" ); - MAP = 0; - MAP_min = MAP_min_view = 0; - MAP_max = MAP_max_view = 1; - } - printf( "\tvalues : [%.2e ... 
%.2e]\n", MAP_min, MAP_max ); - COLOR_msg( " [OK]" ); - } - - - // ================== - // Reading PEAKS file - // ================== - COLOR_msg( "-> Reading 'PEAKS' dataset:", "\n" ); - - if ( !PEAKS_filename.empty() ) - { - niiPEAKS = new NIFTI; - niiPEAKS->open( PEAKS_filename, true ); - if ( !niiPEAKS->isValid() ) - { - COLOR_error( "Unable to open the file", "\t" ); - return false; - } - - if ( niiPEAKS->hdr->dim[0] != 4 || niiPEAKS->hdr->dim[4]%3 != 0 ) - { - COLOR_error( "The size must be (*,*,*,3*k)", "\t" ); - return EXIT_FAILURE; - } - PEAKS_n = niiPEAKS->hdr->dim[4]/3; - - printf( "\tdim : %d x %d x %d (%d peaks per voxel)\n" , niiPEAKS->hdr->dim[1], niiPEAKS->hdr->dim[2], niiPEAKS->hdr->dim[3], PEAKS_n ); - printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiPEAKS->hdr->pixdim[1], niiPEAKS->hdr->pixdim[2], niiPEAKS->hdr->pixdim[3] ); - - printf( "\tqform : %d\n", niiPEAKS->hdr->qform_code ); - mat44 PEAKS_qform = niiPEAKS->hdr->qto_xyz; - if ( niiPEAKS->hdr->qform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", PEAKS_qform.m[i][j] ); - printf( "|\n" ); - } - } - else - { - COLOR_warning( "This should never happen!", "\t\t" ); - } - - printf( "\tsform : %d\n", niiPEAKS->hdr->sform_code ); - mat44 PEAKS_sform = niiPEAKS->hdr->sto_xyz; - if ( niiPEAKS->hdr->sform_code > 0 ) - { - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<4 ;j++) - printf( "%9.4f ", PEAKS_sform.m[i][j] ); - printf( "|\n" ); - } - } - - if ( niiPEAKS->hdr->dim[1] != dim.x || niiPEAKS->hdr->dim[2] != dim.y || niiPEAKS->hdr->dim[3] != dim.z ) - { - COLOR_error( "The DIMENSIONS do not match those of DWI images", "\t" ); - return EXIT_FAILURE; - } - if ( abs(niiPEAKS->hdr->pixdim[1]-pixdim.x) > 1e-3 || abs(niiPEAKS->hdr->pixdim[2]-pixdim.y) > 1e-3 || abs(niiPEAKS->hdr->pixdim[3]-pixdim.z) > 1e-3 ) - { - COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); - } - if ( - niiPEAKS->hdr->sform_code != niiDWI->hdr->sform_code || niiPEAKS->hdr->qform_code != niiDWI->hdr->qform_code || niiPEAKS->hdr->pixdim[0] != niiDWI->hdr->pixdim[0] || - niiPEAKS->hdr->quatern_b != niiDWI->hdr->quatern_b || niiPEAKS->hdr->quatern_c != niiDWI->hdr->quatern_c || niiPEAKS->hdr->quatern_d != niiDWI->hdr->quatern_d || - niiPEAKS->hdr->qoffset_x != niiDWI->hdr->qoffset_x || niiPEAKS->hdr->qoffset_y != niiDWI->hdr->qoffset_y || niiPEAKS->hdr->qoffset_z != niiDWI->hdr->qoffset_z - ) - { - COLOR_warning( "The GEOMETRY does not match that of DWI images", "\t" ); - } - - // Read the affine matrix to rotate the vectors - // NB: we need the inverse, but in this case inv=transpose - if ( niiPEAKS->hdr->qform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - PEAKS_affine[i][j] = PEAKS_qform.m[j][i]; - } - else if ( niiPEAKS->hdr->sform_code != 0 ) - { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - PEAKS_affine[i][j] = PEAKS_sform.m[j][i]; - } - else { - for(int i=0; i<3 ;i++) - for(int j=0; j<3 ;j++) - PEAKS_affine[i][j] = 0; - for(int i=0; i<3 ;i++) - PEAKS_affine[i][i] = 1; - } - - printf( "\tAffine used :\n" ); - for(int i=0; i<3 ;i++) - { - printf( "\t\t| " ); - for(int j=0; j<3 ;j++) - printf( "%9.4f ", PEAKS_affine[i][j] ); - printf( "|\n" ); - } - - COLOR_msg( " [OK]" ); - } - else { - // no peaks are passed and won't be showed - COLOR_msg( " [no peaks specified]" ); - PEAKS_n = 0; - } - - - // =================== - // Reading TRACTS file - // =================== - COLOR_msg( "-> Reading 'TRACTOGRAM':", "\n" ); - - if ( 
!TRK_filename.empty() ) - { - TRK_file = TrackVis(); - if ( !TRK_file.open( TRK_filename ) ) - { - COLOR_error( "Unable to open the file", "\t" ); - return false; - } - - printf("\tcount : %d\n" , TRK_file.hdr.n_count ); - printf("\tdim : %d x %d x %d\n" , TRK_file.hdr.dim[0], TRK_file.hdr.dim[1], TRK_file.hdr.dim[2] ); - printf("\tpixdim : %.4f x %.4f x %.4f\n", TRK_file.hdr.voxel_size[0], TRK_file.hdr.voxel_size[1], TRK_file.hdr.voxel_size[2] ); - printf("\tscalars : %d\n" , TRK_file.hdr.n_scalars ); - printf("\tproperties : %d\n" , TRK_file.hdr.n_properties ); - - if ( TRK_file.hdr.dim[0] != dim.x || TRK_file.hdr.dim[1] != dim.y || TRK_file.hdr.dim[2] != dim.z || - abs(TRK_file.hdr.voxel_size[0]-pixdim.x) > 1e-4 || abs(TRK_file.hdr.voxel_size[1]-pixdim.y) > 1e-4 || abs(TRK_file.hdr.voxel_size[2]-pixdim.z) > 1e-4 ) - { - COLOR_warning( "The GEOMETRY does not match those of DWI images", "\t" ); - } - - TRK_skip = ceil( TRK_file.hdr.n_count / 25000.0 ); - int N, n_s = TRK_file.hdr.n_scalars, n_p = TRK_file.hdr.n_properties; - FILE* fp = TRK_file.getFilePtr(); - - // count how many points I need to store in memory - int TractsRead = 0, CoordsRead = 0; - fseek(fp, 1000, SEEK_SET); - for(int f=0; f < TRK_file.hdr.n_count ; f++) - { - fread( (char*)&N, 1, 4, fp ); - fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); - if ( f%TRK_skip==0 ) - { - TractsRead++; - CoordsRead += N; - } - } - printf("\tin memory : %d (%d points)\n" , TractsRead, CoordsRead ); - - // create data structure for drawing the tracts - TRK_nTractsPlotted = TractsRead; - TRK_nPoints = new int[TRK_nTractsPlotted]; - TRK_coords = new float[3*CoordsRead]; - TRK_colors = new float[3*CoordsRead]; - - float* ptr = TRK_coords; - float* ptrc = TRK_colors; - float norm; - VECTOR dir; - TractsRead = 0; - fseek(fp, 1000, SEEK_SET); - for(int f=0; f < TRK_file.hdr.n_count ; f++) - { - if ( f%TRK_skip==0 ) - { - fread( (char*)&N, 1, 4, fp ); - TRK_nPoints[TractsRead] = N; - - for(int i=0; i 0 ) - { - dir.x = *(ptr ) - *(ptr-3); - dir.y = *(ptr+1) - *(ptr-2); - dir.z = *(ptr+2) - *(ptr-1); - norm = dir.norm(); - ptrc[0] = abs( dir.x / norm ); - ptrc[1] = abs( dir.y / norm ); - ptrc[2] = abs( dir.z / norm ); - } - else - { - ptrc[0] = 0; - ptrc[1] = 0; - ptrc[2] = 0; - } - - ptr += 3; - ptrc += 3; - } - fseek( fp, n_p*4, SEEK_CUR ); - TractsRead++; - } - else - { - fread( (char*)&N, 1, 4, fp ); - fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); - } - } - - COLOR_msg( " [OK]" ); - printf( "\n\n" ); - } - else - { - // no fibers are passed and won't be showed - COLOR_msg( " [no streamlines specified]" ); - TRK_nTractsPlotted = 0; - } - - TRK_offset.x = 0; - TRK_offset.y = 0; - TRK_offset.z = 0; - - - // ============ - // SETUP OpenGL - // ============ - VOXEL.x = round( dim.x / 2.0 ); - VOXEL.y = round( dim.y / 2.0 ); - VOXEL.z = round( dim.z / 2.0 ); - OpenGL_init( argc, argv ); - - return EXIT_SUCCESS; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include "tclap/CmdLine.h" +#include +using namespace std; + +#include "colormaps.h" + +NIFTI* niiDWI; +VECTOR dim; +VECTOR pixdim; + +int SCHEME_version; +vector< VECTOR > SCHEME_dirs; +vector SCHEME_b; +vector SCHEME_idxB0; +vector SCHEME_idxDWI; +vector SCHEME_shells_b; +vector< vector > SCHEME_shells_idx; + +blitz::Array MAP; +VECTOR VOXEL; +float MAP_min, MAP_min_view, MAP_max, MAP_max_view; +float MAP_opacity = 0.5; +bool showPlane[3] = { true, true, true }; +bool showAxes = true; +bool showConfig = true; +float LINE_width = 2.0; + +NIFTI* niiPEAKS; +int PEAKS_n; 
+bool PEAKS_show = false; +float PEAKS_thr = 0.0; +bool PEAKS_doNormalize = false; +bool PEAKS_flip[3] = {false, false, false}; +bool PEAKS_use_affine = false; +float PEAKS_affine[3][3]; + +TrackVis TRK_file; +int TRK_skip; +int TRK_nTractsPlotted; +int* TRK_nPoints; +float* TRK_coords; +float* TRK_colors; +float TRK_crop = 1.0; +bool TRK_crop_mode = true; +bool TRK_show = false; +VECTOR TRK_offset; + +bool GLYPHS_show = false; +int GLYPHS_shell = 0; +bool GLYPHS_flip[3] = {false, false, false}; +float GLYPHS_b0_thr = 50.0; +bool GLYPHS_use_affine = false; +float GLYPHS_affine[3][3]; + +#include "OPENGL_callbacks.cxx" + + +/*----------------------------------------------------------------------------------------------------------------------------------*/ +int main(int argc, char** argv) +{ + TCLAP::CmdLine cmd("This tool allows one to display in a common 3D space all the objects (DWI data, streamlines etc...) used by COMMIT in order to spot possible incosistencies between the conventions of COMMIT and the software that generated the data, e.g. flip in some axes in the DWI data or in the peaks, spatial shift in the streamlines, whether the affine transformation was already applied to the data etc..", ' ', "1.2"); + + TCLAP::UnlabeledValueArg argDWI( "dwi","Filename of the DWI dataset [4D NIFTI]", true, "", "DWI", cmd ); + TCLAP::ValueArg argMAP( "m", "map", "Background map [3D NIFTI]", false, "", "map", cmd ); + TCLAP::ValueArg argPEAKS( "p", "peaks", "Main diffusion directions for the extra-axonal part in each voxel [4D NIFTI]", false, "", "peaks", cmd ); + TCLAP::ValueArg argTRK( "f", "fibers", "Streamlines for the intra-axonal part [.TRK format]", false, "", "fibers", cmd ); + TCLAP::UnlabeledValueArg argSCHEME( "scheme","Acquisition scheme [text]", true, "", "scheme", cmd ); + + try { cmd.parse( argc, argv ); } + catch (TCLAP::ArgException &e) { cerr << "error: " << e.error() << " for arg " << e.argId() << endl; } + + string DWI_filename( argDWI.getValue() ); + string SCHEME_filename( argSCHEME.getValue() ); + string PEAKS_filename( argPEAKS.getValue() ); + string TRK_filename( argTRK.getValue() ); + string MAP_filename( argMAP.getValue() ); + + + // =================== + // Reading DWI dataset + // =================== + COLOR_msg( "-> Reading 'DWI' dataset:", "\n" ); + + niiDWI = new NIFTI; + niiDWI->open( DWI_filename, true ); + if ( !niiDWI->isValid() ) + { + COLOR_error( "Unable to open file", "\t" ); + return EXIT_FAILURE; + } + dim.x = niiDWI->hdr->dim[1]; + dim.y = niiDWI->hdr->dim[2]; + dim.z = niiDWI->hdr->dim[3]; + pixdim.x = niiDWI->hdr->pixdim[1]; + pixdim.y = niiDWI->hdr->pixdim[2]; + pixdim.z = niiDWI->hdr->pixdim[3]; + printf( "\tdim : %d x %d x %d x %d\n", dim.x, dim.y, dim.z, niiDWI->hdr->dim[4] ); + printf( "\tpixdim : %.4f x %.4f x %.4f\n", pixdim.x, pixdim.y, pixdim.z ); + printf( "\tqform : %d\n", niiDWI->hdr->qform_code ); + mat44 DWI_qform = niiDWI->hdr->qto_xyz; + if ( niiDWI->hdr->qform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", DWI_qform.m[i][j] ); + printf( "|\n" ); + } + } + else + { + COLOR_warning( "This should never happen!", "\t\t" ); + } + printf( "\tsform : %d\n", niiDWI->hdr->sform_code ); + mat44 DWI_sform = niiDWI->hdr->sto_xyz; + if ( niiDWI->hdr->sform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", DWI_sform.m[i][j] ); + printf( "|\n" ); + } + } + + // Read the affine matrix to rotate the vectors + // NB: we need the 
inverse, but in this case inv=transpose + if ( niiDWI->hdr->qform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + GLYPHS_affine[i][j] = DWI_qform.m[j][i]; + } + else if ( niiDWI->hdr->sform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + GLYPHS_affine[i][j] = DWI_sform.m[j][i]; + } + else { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + GLYPHS_affine[i][j] = 0; + for(int i=0; i<3 ;i++) + GLYPHS_affine[i][i] = 1; + } + + mat33 tmp; + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + tmp.m[i][j] = GLYPHS_affine[i][j]; + printf( "\tAffine used (%s):\n", nifti_mat33_determ(tmp)<0?"RADIOLOGICAL":"NEUROLOGICAL" ); + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<3 ;j++) + printf( "%9.4f ", GLYPHS_affine[i][j] ); + printf( "|\n" ); + } + + COLOR_msg( " [OK]" ); + + + // =================== + // Reading SCHEME file + // =================== + COLOR_msg( "-> Reading 'SCHEME' file:", "\n" ); + + char line[1000]; + FILE* pFile = fopen( SCHEME_filename.c_str(), "rt" ); + + // read the version + // ---------------- + try + { + while( fgets(line, 1000, pFile) ) + if ( line[0]!='#' ) + break; + + std::regex reVersion("^VERSION: (.*)\\s*$"); + std::smatch reMatches; + + std::string str_line = string(line); + if ( !std::regex_match(str_line, reMatches, reVersion) ) + { + // no header found, assume standards BVECTOR format + SCHEME_version = 0; + fseek(pFile, -strlen(line), SEEK_CUR); + } + else + { + if( strcmp(reMatches[1].str().c_str(),"0")==0 || strcmp(reMatches[1].str().c_str(),"BVECTOR")==0 ) + SCHEME_version = 0; + else if( strcmp(reMatches[1].str().c_str(),"1")==0 || strcmp(reMatches[1].str().c_str(),"STEJSKALTANNER")==0 ) + SCHEME_version = 1; + else + throw "Version not recognized"; + } + } + catch( const char* msg ) + { + COLOR_error( msg, "\t" ); + return EXIT_FAILURE; + } + printf( "\tversion : %s\n", SCHEME_version==0?"BVECTOR":"STEJSKALTANNER" ); + + // read the data + // ------------- + try + { + string reFLOAT( "[-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?" 
); + std::regex reVERSION0( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); + std::regex reVERSION1( "^\\s*("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s+("+reFLOAT+")\\s*$" ); + std::regex reEMPTY( "^\\s*$" ); + std::smatch reMatches; + int Ns = 0; + float x, y, z, b, G, D, d; + while( fgets(line, 1000, pFile) ) + { + std::string str_line = string(line); + if( std::regex_match(str_line, reMatches, reEMPTY) ) + continue; // skip empty lines + + if( SCHEME_version == 0 ) + { + if ( !std::regex_match(str_line, reMatches, reVERSION0) ) + throw "Wrong row format"; + x = std::atof( reMatches[1].str().c_str() ); + y = std::atof( reMatches[2].str().c_str() ); + z = std::atof( reMatches[3].str().c_str() ); + b = std::atof( reMatches[4].str().c_str() ); // in mm^2/s + VECTOR tmp( x, y, z ); + tmp.Normalize(); + SCHEME_dirs.push_back( tmp ); + SCHEME_b.push_back( b ); + } + else + { + if ( !std::regex_match(str_line, reMatches, reVERSION1) ) + throw "Wrong row format"; + x = std::atof( reMatches[1].str().c_str() ); + y = std::atof( reMatches[2].str().c_str() ); + z = std::atof( reMatches[3].str().c_str() ); + G = std::atof( reMatches[4].str().c_str() ); + D = std::atof( reMatches[5].str().c_str() ); + d = std::atof( reMatches[6].str().c_str() ); + VECTOR tmp( x, y, z ); + tmp.Normalize(); + SCHEME_dirs.push_back( tmp ); + b = std::pow( 267.513e6*G*d, 2 ) * (D-d/3.0) * 1e-6; // in mm^2/s + SCHEME_b.push_back( b ); + } + + if ( b<5.0 ) + { + SCHEME_idxB0.push_back( Ns ); + } + else + { + SCHEME_idxDWI.push_back( Ns ); + if ( std::find(SCHEME_shells_b.begin(), SCHEME_shells_b.end(), b) == SCHEME_shells_b.end() ) + { + SCHEME_shells_b.push_back( b ) ; + vector tmp; + SCHEME_shells_idx.push_back( tmp ) ; + } + } + Ns++; + } + } + catch( const char* msg ) + { + COLOR_error( msg, "\t" ); + return EXIT_FAILURE; + } + fclose(pFile); + + printf( "\tgradients : %d\n", SCHEME_b.size() ); + if ( niiDWI->hdr->dim[4] != SCHEME_b.size() ) + { + COLOR_error( "The scheme does not match the DWI dataset", "\t" ); + return EXIT_FAILURE; + } + + // fill data structure about the SCHEME + // ------------------------------------ + for(int i=0; i < SCHEME_b.size() ;i++) + { + if ( SCHEME_b[i] < 5 ) + continue; + int s = std::find( SCHEME_shells_b.begin(), SCHEME_shells_b.end(), SCHEME_b[i] ) - SCHEME_shells_b.begin(); + SCHEME_shells_idx[s].push_back( i ); + } + + printf( "\tscheme : %d b0 and %d shells (", SCHEME_idxB0.size(), SCHEME_shells_idx.size() ); + for(int i=0; i < SCHEME_shells_b.size() ;i++) + printf( " [%d @ b=%.1f]", SCHEME_shells_idx[i].size(), SCHEME_shells_b[i] ); + printf( " )\n" ); + + COLOR_msg( " [OK]" ); + + + + // ======================= + // Creating BACKGROUND map + // ======================= + COLOR_msg( "-> Preparing 'BACKGROUND' map:", "\n" ); + MAP.resize(dim.x,dim.y,dim.z); + if ( !MAP_filename.empty() ) + { + printf( "\tdata : reading from file\n" ); + NIFTI* niiMAP = new NIFTI; + niiMAP->open( MAP_filename, true ); + if ( !niiMAP->isValid() ) + { + COLOR_error( "Unable to open the file", "\t" ); + return EXIT_FAILURE; + } + + printf( "\tdim : %d x %d x %d x %d\n" , niiMAP->hdr->dim[1], niiMAP->hdr->dim[2], niiMAP->hdr->dim[3], niiMAP->hdr->dim[4] ); + printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiMAP->hdr->pixdim[1], niiMAP->hdr->pixdim[2], niiMAP->hdr->pixdim[3] ); + + if ( niiMAP->hdr->dim[1] != dim.x || niiMAP->hdr->dim[2] != dim.y || niiMAP->hdr->dim[3] != dim.z ) + { + COLOR_error( "The DIMENSIONS do not 
match those of DWI images", "\t" ); + return EXIT_FAILURE; + } + if ( abs(niiMAP->hdr->pixdim[1]-pixdim.x) > 1e-4 || abs(niiMAP->hdr->pixdim[2]-pixdim.y) > 1e-4 || abs(niiMAP->hdr->pixdim[3]-pixdim.z) > 1e-4 ) + { + COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); + } + + FLOAT32 MIN = 0;//(*niiMAP->img)(0,0,0); + FLOAT32 MAX = MIN; + + for(int i=0; iimg)(i,j,k); + if ( MAP(i,j,k) > MAX ) + MAX = MAP(i,j,k); + if ( MAP(i,j,k) < MIN ) + MIN = MAP(i,j,k); + } + if ( MAX - MIN <= 0 ) + { + COLOR_error( "The dynamic range is zero", "\t" ); + return EXIT_FAILURE; + } + MAP_min = MIN; + MAP_min_view = 0; + MAP_max = MAP_max_view = MAX; + + printf( "\tvalues : [%.2e ... %.2e]\n", MAP_min, MAP_max ); + COLOR_msg( " [OK]" ); + } + else + { + printf( "\tdata : " ); + + if ( SCHEME_idxB0.size() > 0 ) + { + printf( "taking first b0 image\n" ); + FLOAT32 MIN = (*niiDWI->img)(0,0,0,SCHEME_idxB0[0]); + FLOAT32 MAX = MIN; + + for(int i=0; iimg)(i,j,k,SCHEME_idxB0[0]); + if ( MAP(i,j,k) > MAX ) + MAX = MAP(i,j,k); + if ( MAP(i,j,k) < MIN ) + MIN = MAP(i,j,k); + } + if ( MAX - MIN <= 0 ) + { + COLOR_error( "The dynamic range is zero", "\t" ); + return EXIT_FAILURE; + } + MAP_min = MIN; + MAP_min_view = 0; + MAP_max = MAP_max_view = MAX; + } + else + { + printf( "no b0 found\n" ); + MAP = 0; + MAP_min = MAP_min_view = 0; + MAP_max = MAP_max_view = 1; + } + printf( "\tvalues : [%.2e ... %.2e]\n", MAP_min, MAP_max ); + COLOR_msg( " [OK]" ); + } + + + // ================== + // Reading PEAKS file + // ================== + COLOR_msg( "-> Reading 'PEAKS' dataset:", "\n" ); + + if ( !PEAKS_filename.empty() ) + { + niiPEAKS = new NIFTI; + niiPEAKS->open( PEAKS_filename, true ); + if ( !niiPEAKS->isValid() ) + { + COLOR_error( "Unable to open the file", "\t" ); + return false; + } + + if ( niiPEAKS->hdr->dim[0] != 4 || niiPEAKS->hdr->dim[4]%3 != 0 ) + { + COLOR_error( "The size must be (*,*,*,3*k)", "\t" ); + return EXIT_FAILURE; + } + PEAKS_n = niiPEAKS->hdr->dim[4]/3; + + printf( "\tdim : %d x %d x %d (%d peaks per voxel)\n" , niiPEAKS->hdr->dim[1], niiPEAKS->hdr->dim[2], niiPEAKS->hdr->dim[3], PEAKS_n ); + printf( "\tpixdim : %.4f x %.4f x %.4f\n", niiPEAKS->hdr->pixdim[1], niiPEAKS->hdr->pixdim[2], niiPEAKS->hdr->pixdim[3] ); + + printf( "\tqform : %d\n", niiPEAKS->hdr->qform_code ); + mat44 PEAKS_qform = niiPEAKS->hdr->qto_xyz; + if ( niiPEAKS->hdr->qform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", PEAKS_qform.m[i][j] ); + printf( "|\n" ); + } + } + else + { + COLOR_warning( "This should never happen!", "\t\t" ); + } + + printf( "\tsform : %d\n", niiPEAKS->hdr->sform_code ); + mat44 PEAKS_sform = niiPEAKS->hdr->sto_xyz; + if ( niiPEAKS->hdr->sform_code > 0 ) + { + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<4 ;j++) + printf( "%9.4f ", PEAKS_sform.m[i][j] ); + printf( "|\n" ); + } + } + + if ( niiPEAKS->hdr->dim[1] != dim.x || niiPEAKS->hdr->dim[2] != dim.y || niiPEAKS->hdr->dim[3] != dim.z ) + { + COLOR_error( "The DIMENSIONS do not match those of DWI images", "\t" ); + return EXIT_FAILURE; + } + if ( abs(niiPEAKS->hdr->pixdim[1]-pixdim.x) > 1e-3 || abs(niiPEAKS->hdr->pixdim[2]-pixdim.y) > 1e-3 || abs(niiPEAKS->hdr->pixdim[3]-pixdim.z) > 1e-3 ) + { + COLOR_warning( "The VOXEL SIZE does not match that of DWI images", "\t" ); + } + if ( + niiPEAKS->hdr->sform_code != niiDWI->hdr->sform_code || niiPEAKS->hdr->qform_code != niiDWI->hdr->qform_code || niiPEAKS->hdr->pixdim[0] != niiDWI->hdr->pixdim[0] 
|| + niiPEAKS->hdr->quatern_b != niiDWI->hdr->quatern_b || niiPEAKS->hdr->quatern_c != niiDWI->hdr->quatern_c || niiPEAKS->hdr->quatern_d != niiDWI->hdr->quatern_d || + niiPEAKS->hdr->qoffset_x != niiDWI->hdr->qoffset_x || niiPEAKS->hdr->qoffset_y != niiDWI->hdr->qoffset_y || niiPEAKS->hdr->qoffset_z != niiDWI->hdr->qoffset_z + ) + { + COLOR_warning( "The GEOMETRY does not match that of DWI images", "\t" ); + } + + // Read the affine matrix to rotate the vectors + // NB: we need the inverse, but in this case inv=transpose + if ( niiPEAKS->hdr->qform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + PEAKS_affine[i][j] = PEAKS_qform.m[j][i]; + } + else if ( niiPEAKS->hdr->sform_code != 0 ) + { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + PEAKS_affine[i][j] = PEAKS_sform.m[j][i]; + } + else { + for(int i=0; i<3 ;i++) + for(int j=0; j<3 ;j++) + PEAKS_affine[i][j] = 0; + for(int i=0; i<3 ;i++) + PEAKS_affine[i][i] = 1; + } + + printf( "\tAffine used :\n" ); + for(int i=0; i<3 ;i++) + { + printf( "\t\t| " ); + for(int j=0; j<3 ;j++) + printf( "%9.4f ", PEAKS_affine[i][j] ); + printf( "|\n" ); + } + + COLOR_msg( " [OK]" ); + } + else { + // no peaks are passed and won't be showed + COLOR_msg( " [no peaks specified]" ); + PEAKS_n = 0; + } + + + // =================== + // Reading TRACTS file + // =================== + COLOR_msg( "-> Reading 'TRACTOGRAM':", "\n" ); + + if ( !TRK_filename.empty() ) + { + TRK_file = TrackVis(); + if ( !TRK_file.open( TRK_filename ) ) + { + COLOR_error( "Unable to open the file", "\t" ); + return false; + } + + printf("\tcount : %d\n" , TRK_file.hdr.n_count ); + printf("\tdim : %d x %d x %d\n" , TRK_file.hdr.dim[0], TRK_file.hdr.dim[1], TRK_file.hdr.dim[2] ); + printf("\tpixdim : %.4f x %.4f x %.4f\n", TRK_file.hdr.voxel_size[0], TRK_file.hdr.voxel_size[1], TRK_file.hdr.voxel_size[2] ); + printf("\tscalars : %d\n" , TRK_file.hdr.n_scalars ); + printf("\tproperties : %d\n" , TRK_file.hdr.n_properties ); + + if ( TRK_file.hdr.dim[0] != dim.x || TRK_file.hdr.dim[1] != dim.y || TRK_file.hdr.dim[2] != dim.z || + abs(TRK_file.hdr.voxel_size[0]-pixdim.x) > 1e-4 || abs(TRK_file.hdr.voxel_size[1]-pixdim.y) > 1e-4 || abs(TRK_file.hdr.voxel_size[2]-pixdim.z) > 1e-4 ) + { + COLOR_warning( "The GEOMETRY does not match those of DWI images", "\t" ); + } + + TRK_skip = ceil( TRK_file.hdr.n_count / 25000.0 ); + int N, n_s = TRK_file.hdr.n_scalars, n_p = TRK_file.hdr.n_properties; + FILE* fp = TRK_file.getFilePtr(); + + // count how many points I need to store in memory + int TractsRead = 0, CoordsRead = 0; + fseek(fp, 1000, SEEK_SET); + for(int f=0; f < TRK_file.hdr.n_count ; f++) + { + fread( (char*)&N, 1, 4, fp ); + fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); + if ( f%TRK_skip==0 ) + { + TractsRead++; + CoordsRead += N; + } + } + printf("\tin memory : %d (%d points)\n" , TractsRead, CoordsRead ); + + // create data structure for drawing the tracts + TRK_nTractsPlotted = TractsRead; + TRK_nPoints = new int[TRK_nTractsPlotted]; + TRK_coords = new float[3*CoordsRead]; + TRK_colors = new float[3*CoordsRead]; + + float* ptr = TRK_coords; + float* ptrc = TRK_colors; + float norm; + VECTOR dir; + TractsRead = 0; + fseek(fp, 1000, SEEK_SET); + for(int f=0; f < TRK_file.hdr.n_count ; f++) + { + if ( f%TRK_skip==0 ) + { + fread( (char*)&N, 1, 4, fp ); + TRK_nPoints[TractsRead] = N; + + for(int i=0; i 0 ) + { + dir.x = *(ptr ) - *(ptr-3); + dir.y = *(ptr+1) - *(ptr-2); + dir.z = *(ptr+2) - *(ptr-1); + norm = dir.norm(); + ptrc[0] = abs( dir.x / norm ); + ptrc[1] = abs( 
dir.y / norm ); + ptrc[2] = abs( dir.z / norm ); + } + else + { + ptrc[0] = 0; + ptrc[1] = 0; + ptrc[2] = 0; + } + + ptr += 3; + ptrc += 3; + } + fseek( fp, n_p*4, SEEK_CUR ); + TractsRead++; + } + else + { + fread( (char*)&N, 1, 4, fp ); + fseek( fp, N*(3+n_s)*4 + n_p*4, SEEK_CUR ); + } + } + + COLOR_msg( " [OK]" ); + printf( "\n\n" ); + } + else + { + // no fibers are passed and won't be showed + COLOR_msg( " [no streamlines specified]" ); + TRK_nTractsPlotted = 0; + } + + TRK_offset.x = 0; + TRK_offset.y = 0; + TRK_offset.z = 0; + + + // ============ + // SETUP OpenGL + // ============ + VOXEL.x = round( dim.x / 2.0 ); + VOXEL.y = round( dim.y / 2.0 ); + VOXEL.z = round( dim.z / 2.0 ); + OpenGL_init( argc, argv ); + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/extras/include/COLOR_ui.h b/extras/include/COLOR_ui.h index 54bdd5b4..83d3aab0 100644 --- a/extras/include/COLOR_ui.h +++ b/extras/include/COLOR_ui.h @@ -1,73 +1,73 @@ -#ifndef __UI_H__ -#define __UI_H__ - - -#include -#include -#include -#include -using namespace std; - - -/* COLOR constants (abckground is foreground+10) */ -#define COLOR_black 30 -#define COLOR_red 31 -#define COLOR_green 32 -#define COLOR_yellow 33 -#define COLOR_blue 34 -#define COLOR_magenta 35 -#define COLOR_cyan 36 -#define COLOR_white 37 - -#define COLOR_normal 0 -#define COLOR_bold 1 -#define COLOR_underline 4 -#define COLOR_blink 5 - -#define COLOR(FG,BG,FONT) "\033["#FONT";"#FG";"#BG"m" -#define COLOR_reset "\033[0m" -#define COLOR_strERR COLOR(31,48,7) "[ERROR]" COLOR(31,48,0) " " -#define COLOR_strWAR COLOR(33,48,7) "[WARNING]" COLOR(33,48,0) " " - - -void COLOR_print(string str, short int FG=COLOR_white, short int BG=COLOR_black, short int FONT=COLOR_normal) -{ - printf("\033[%d;%d;%dm%s\033[0m", FONT,FG,BG+10, str.c_str()); -} - - -void COLOR_log(string str, short int FG=COLOR_green, short int BG=COLOR_black, short int FONT=COLOR_normal) -{ - char buffer [80]; - time_t rawtime = time(0); - struct tm * timeinfo = localtime ( &rawtime ); - strftime (buffer,80,"%H:%M:%S",timeinfo); - - printf("\n\033[0;%d;%dm[ %s ]\033[%d;%d;%dm %s\033[0m\n", BG,FG+10,buffer, FONT,FG,BG+10,str.c_str()); -} - - -void COLOR_msg( string msg, string prefix="" ) -{ - if ( !prefix.empty() ) - cerr << prefix; - cerr << "\033[0;34m "<< msg.c_str() <<"\033[0m\n"; -} - - -void COLOR_error( string msg, string prefix="" ) -{ - if ( !prefix.empty() ) - cerr << prefix; - cerr << "\033[0;30;41m[ ERROR ]\033[0;31m "<< msg.c_str() <<"\033[0m\n"; -} - - -void COLOR_warning( string msg, string prefix="" ) -{ - if ( !prefix.empty() ) - cerr << prefix; - cerr << "\033[0;30;43m[ WARNING ]\033[0;33m "<< msg.c_str() <<"\033[0m\n"; -} - -#endif +#ifndef __UI_H__ +#define __UI_H__ + + +#include +#include +#include +#include +using namespace std; + + +/* COLOR constants (abckground is foreground+10) */ +#define COLOR_black 30 +#define COLOR_red 31 +#define COLOR_green 32 +#define COLOR_yellow 33 +#define COLOR_blue 34 +#define COLOR_magenta 35 +#define COLOR_cyan 36 +#define COLOR_white 37 + +#define COLOR_normal 0 +#define COLOR_bold 1 +#define COLOR_underline 4 +#define COLOR_blink 5 + +#define COLOR(FG,BG,FONT) "\033["#FONT";"#FG";"#BG"m" +#define COLOR_reset "\033[0m" +#define COLOR_strERR COLOR(31,48,7) "[ERROR]" COLOR(31,48,0) " " +#define COLOR_strWAR COLOR(33,48,7) "[WARNING]" COLOR(33,48,0) " " + + +void COLOR_print(string str, short int FG=COLOR_white, short int BG=COLOR_black, short int FONT=COLOR_normal) +{ + printf("\033[%d;%d;%dm%s\033[0m", FONT,FG,BG+10, 
str.c_str()); +} + + +void COLOR_log(string str, short int FG=COLOR_green, short int BG=COLOR_black, short int FONT=COLOR_normal) +{ + char buffer [80]; + time_t rawtime = time(0); + struct tm * timeinfo = localtime ( &rawtime ); + strftime (buffer,80,"%H:%M:%S",timeinfo); + + printf("\n\033[0;%d;%dm[ %s ]\033[%d;%d;%dm %s\033[0m\n", BG,FG+10,buffer, FONT,FG,BG+10,str.c_str()); +} + + +void COLOR_msg( string msg, string prefix="" ) +{ + if ( !prefix.empty() ) + cerr << prefix; + cerr << "\033[0;34m "<< msg.c_str() <<"\033[0m\n"; +} + + +void COLOR_error( string msg, string prefix="" ) +{ + if ( !prefix.empty() ) + cerr << prefix; + cerr << "\033[0;30;41m[ ERROR ]\033[0;31m "<< msg.c_str() <<"\033[0m\n"; +} + + +void COLOR_warning( string msg, string prefix="" ) +{ + if ( !prefix.empty() ) + cerr << prefix; + cerr << "\033[0;30;43m[ WARNING ]\033[0;33m "<< msg.c_str() <<"\033[0m\n"; +} + +#endif diff --git a/requirements.txt b/requirements.txt index 1c03d182..9234880c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -Cython>=0.29 -dipy>=1.0 -dmri-amico>=1.2.3 -numpy>=1.12 -setuptools>=46.1 +Cython>=0.29 +dipy>=1.0 +dmri-amico>=1.2.3 +numpy>=1.12 +setuptools>=46.1 diff --git a/setup.cfg b/setup.cfg index 3463cc53..a96a1715 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ -# Inside of setup.cfg -[metadata] -description-file = README.md - -[bdist_wheel] +# Inside of setup.cfg +[metadata] +description-file = README.md + +[bdist_wheel] universal = 1 \ No newline at end of file diff --git a/setup.py b/setup.py index b0a29135..515e988d 100644 --- a/setup.py +++ b/setup.py @@ -1,205 +1,205 @@ -from setuptools import Extension, setup -from setuptools.command.build_ext import build_ext -import os -from os.path import join as pjoin - -# taken from https://github.com/rmcgibbo/npcuda-example/blob/master/cython/setup.py -def find_in_path(name, path): - """Find a file in a search path""" - - # Adapted fom http://code.activestate.com/recipes/52224 - for dir in path.split(os.pathsep): - binpath = pjoin(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - -def locate_cuda(): - """Locate the CUDA environment on the system - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - Starts by looking for the CUDAHOME env variable. If not found, - everything is based on finding 'nvcc' in the PATH. - """ - - # First check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = pjoin(home, 'bin', 'nvcc') - else: - # Otherwise, search the PATH for NVCC - nvcc = find_in_path('nvcc', os.environ['PATH']) - if nvcc is None: - return None - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home': home, 'nvcc': nvcc, - 'include': pjoin(home, 'include'), - 'lib64': pjoin(home, 'lib64')} - for k, v in iter(cudaconfig.items()): - if not os.path.exists(v): - return None - - return cudaconfig - -def customize_compiler_for_nvcc(self): - """Inject deep into distutils to customize how the dispatch - to gcc/nvcc works. - If you subclass UnixCCompiler, it's not trivial to get your subclass - injected in, and still have the right customizations (i.e. - distutils.sysconfig.customize_compiler) run on it. So instead of going - the OO route, I have this. Note, it's kindof like a wierd functional - subclassing going on. 
- """ - - # Tell the compiler it can processes .cu - self.src_extensions.append('.cu') - - # Save references to the default compiler_so and _comple methods - default_compiler_so = self.compiler_so - super = self._compile - - # Now redefine the _compile method. This gets executed for each - # object but distutils doesn't have the ability to change compilers - # based on source extension: we add it. - def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): - if os.path.splitext(src)[1] == '.cu': - # use the cuda for .cu files - self.set_executable('compiler_so', CUDA['nvcc']) - # use only a subset of the extra_postargs, which are 1-1 - # translated from the extra_compile_args in the Extension class - print(type(extra_postargs)) - print(extra_postargs) - postargs = extra_postargs['nvcc'] - else: - print(type(extra_postargs)) - print(extra_postargs) - postargs = extra_postargs['gcc'] - - super(obj, src, ext, cc_args, postargs, pp_opts) - # Reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - # Inject our redefined _compile method into the class - self._compile = _compile - -# Locate CUDA -CUDA = locate_cuda() - -def get_extensions(): - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - ext1 = Extension(name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - extra_compile_args=['-w'], - language='c++') - - ext2 = Extension(name='commit.core', - sources=['commit/core.pyx'], - extra_compile_args=['-w'], - language='c++') - - ext3 = Extension(name='commit.proximals', - sources=['commit/proximals.pyx'], - extra_compile_args=['-w'], - language='c++') - - return [ext1, ext2, ext3] - -def get_extensions_with_cuda(): - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - - ext1 = Extension(name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - extra_link_args=[], - language='c++') - - ext2 = Extension(name='commit.core', - sources=['commit/core.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - extra_link_args=[], - language='c++') - - ext3 = Extension(name='commit.proximals', - sources=['commit/proximals.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - extra_link_args=[], - language='c++') - - ext4 = Extension(name='commit.cudaoperator.operator', - sources = ['commit/cudaoperator/operator_withCUDA.cu', 'commit/cudaoperator/operator.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - language = 'c++', - library_dirs = [CUDA['lib64']], - libraries = ['cudart'], - runtime_library_dirs = [CUDA['lib64']]) - - return [ext1, ext2, ext3, ext4] - -if CUDA == None: - extensions = get_extensions() -else: - extensions = get_extensions_with_cuda() - -if CUDA == None: - class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. 
""" - - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None - self.include_dirs = [get_include()] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) -else: - class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. """ - - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) - - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None - self.include_dirs = [get_include(), CUDA['include'], 'commit/cudaoperator'] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) - -description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' - -opts = dict(name='dmri-commit', - version='1.5.0', - description=description, - long_description=description, - author='Alessandro Daducci', - author_email='alessandro.daducci@univr.it', - url='https://github.com/daducci/COMMIT', - packages=['commit', 'commit.operator'], - cmdclass={'build_ext': CustomBuildExtCommand}, - ext_modules=extensions, - setup_requires=['Cython>=0.29', 'numpy>=1.12'], - install_requires=['Cython>=0.29', 'dmri-amico>=1.2.6', 'dipy>=1.0', 'numpy>=1.12'], - package_data={'commit.operator': ["*.*"]}) - +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext +import os +from os.path import join as pjoin + +# taken from https://github.com/rmcgibbo/npcuda-example/blob/master/cython/setup.py +def find_in_path(name, path): + """Find a file in a search path""" + + # Adapted fom http://code.activestate.com/recipes/52224 + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + +def locate_cuda(): + """Locate the CUDA environment on the system + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + Starts by looking for the CUDAHOME env variable. If not found, + everything is based on finding 'nvcc' in the PATH. + """ + + # First check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # Otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + return None + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = {'home': home, 'nvcc': nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64')} + for k, v in iter(cudaconfig.items()): + if not os.path.exists(v): + return None + + return cudaconfig + +def customize_compiler_for_nvcc(self): + """Inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. 
Note, it's kindof like a wierd functional + subclassing going on. + """ + + # Tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # Save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # Now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. + def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 + # translated from the extra_compile_args in the Extension class + print(type(extra_postargs)) + print(extra_postargs) + postargs = extra_postargs['nvcc'] + else: + print(type(extra_postargs)) + print(extra_postargs) + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # Reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # Inject our redefined _compile method into the class + self._compile = _compile + +# Locate CUDA +CUDA = locate_cuda() + +def get_extensions(): + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + ext1 = Extension(name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + extra_compile_args=['-w'], + language='c++') + + ext2 = Extension(name='commit.core', + sources=['commit/core.pyx'], + extra_compile_args=['-w'], + language='c++') + + ext3 = Extension(name='commit.proximals', + sources=['commit/proximals.pyx'], + extra_compile_args=['-w'], + language='c++') + + return [ext1, ext2, ext3] + +def get_extensions_with_cuda(): + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + + ext1 = Extension(name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + extra_link_args=[], + language='c++') + + ext2 = Extension(name='commit.core', + sources=['commit/core.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + extra_link_args=[], + language='c++') + + ext3 = Extension(name='commit.proximals', + sources=['commit/proximals.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + extra_link_args=[], + language='c++') + + ext4 = Extension(name='commit.cudaoperator.operator', + sources = ['commit/cudaoperator/operator_withCUDA.cu', 'commit/cudaoperator/operator.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + language = 'c++', + library_dirs = [CUDA['lib64']], + libraries = ['cudart'], + runtime_library_dirs = [CUDA['lib64']]) + + return [ext1, ext2, ext3, ext4] + +if CUDA == None: + extensions = get_extensions() +else: + extensions = get_extensions_with_cuda() + +if CUDA == None: + class CustomBuildExtCommand(build_ext): + """ build_ext command to use when numpy headers are needed. 
""" + + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + self.include_dirs = [get_include()] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) +else: + class CustomBuildExtCommand(build_ext): + """ build_ext command to use when numpy headers are needed. """ + + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + self.include_dirs = [get_include(), CUDA['include'], 'commit/cudaoperator'] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) + +description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' + +opts = dict(name='dmri-commit', + version='1.5.0', + description=description, + long_description=description, + author='Alessandro Daducci', + author_email='alessandro.daducci@univr.it', + url='https://github.com/daducci/COMMIT', + packages=['commit', 'commit.operator'], + cmdclass={'build_ext': CustomBuildExtCommand}, + ext_modules=extensions, + setup_requires=['Cython>=0.29', 'numpy>=1.12'], + install_requires=['Cython>=0.29', 'dmri-amico>=1.2.6', 'dipy>=1.0', 'numpy>=1.12'], + package_data={'commit.operator': ["*.*"]}) + setup(**opts) \ No newline at end of file From 867c4c51c557cfbbcb18c035a35e9b7a1b46cf1d Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Sat, 3 Jul 2021 23:31:35 -0500 Subject: [PATCH 188/190] Assign 1 thread per voxel --- commit/cudaoperator/operator_withCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 6ccd0363..4dff1ca3 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -484,7 +484,7 @@ __global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, for(int j=0; j Date: Tue, 28 Sep 2021 15:56:19 -0500 Subject: [PATCH 189/190] Change end of line from CRLF to LF --- .gitignore | 40 +- CHANGELOG.md | 310 +- LICENSE | 66 +- MANIFEST.in | 12 +- README.md | 60 +- commit/__init__.py | 10 +- commit/core.pyx | 2058 ++++----- commit/cudaoperator/operator.pyx | 448 +- commit/cudaoperator/operator_withCUDA.cu | 1388 +++--- commit/cudaoperator/operator_withCUDA.cuh | 350 +- commit/operator/config.py | 12 +- commit/operator/operator.pyx | 384 +- commit/operator/operator.pyxbld | 78 +- commit/operator/operator_noLUT.c | 374 +- commit/operator/operator_withLUT.c | 4494 ++++++++++---------- commit/proximals.pyx | 280 +- commit/solvers.py | 806 ++-- commit/trk2dictionary/trk2dictionary.pyx | 858 ++-- commit/trk2dictionary/trk2dictionary_c.cpp | 1196 +++--- extras/CMakeLists.txt | 22 +- extras/include/COLOR_ui.h | 146 +- requirements.txt | 10 +- setup.cfg | 10 +- setup.py | 408 +- 24 files changed, 6910 insertions(+), 6910 deletions(-) diff --git a/.gitignore b/.gitignore index 7a67b8de..de91de1c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,21 +1,21 @@ -build -.ipynb_checkpoints -.DS_Store -.DS_Store? 
-._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db -__pycache__/ -.vscode/ -.eggs/ -*.egg-info/ -*.so -*.cpp -dist/ - -trk2dictionary.c - -# Never modify line endings of our bash scripts +build +.ipynb_checkpoints +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db +__pycache__/ +.vscode/ +.eggs/ +*.egg-info/ +*.so +*.cpp +dist/ + +trk2dictionary.c + +# Never modify line endings of our bash scripts *.sh -lf \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index e6a263b0..cb5a637d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,155 +1,155 @@ -# Change Log -All notable changes to COMMIT will be documented in this file. - -## [1.5.0] - 2021-01-04 - -### Changed -- setup.py: Add compilation for .cu files - -### Added -- GPU acceleration with CUDA for faster model fitting - -## [1.4.5] - 2020-12-29 - -### Fixed -- operator.pyxbld: Changed the condition to create a new operator - -### Added -- core.pyx: Add to the function build_operator the parameter build_dir - -### Changed -- core.pyx: The function build_operator checks if the LUT configuration - changed before build a new operator - -## [1.4.4] - 2020-10-28 - -### Changed -- Option to set one single direction in the resolution of the LUT - -## [1.4.3] - 2020-10-22 - -### Added -- store model parameters to results.pickle - -## [1.4.2] - 2020-10-22 - -### Fixed -- trk2dictionary.run(): check for invalid parameters passed to the blur - -## [1.4.1] - 2020-10-21 - -### Fixed -- operator.pyxbld: Changed the condition to create a new operator - -### Added -- COMMIT version is stored in results.pickle -- COMMIT version is stored in output NIFTI files - -## [1.4.0.4] - 2020-09-24 - -### Fixed -- trk2dictionary.run(): bug in the blurring functionality -- trk2dictionary.run(): 'blur_sigma' defaults to 0 - -## [1.4.0.3] - 2020-08-07 - -### Fixed -- COMMIT_debugger: compilation problem -- COMMIT_debugger: wrong visualization in Linux - -## [1.4.0.2] - 2020-08-07 - -### Changed -- Moved the documentation to the Wiki - -## [1.4.0.1] - 2020-08-03 - -### Changed -- Updated the installation guide - -## [1.4.0.0] - 2020-07-30 - -### Changed -- trk2dictionary.run(): removed 'gen_trk' option -- save_results(): removed 'save_coeff' and 'save_opt_details' parameters -- save_results(): now saving only streamline_weights.txt (not anymore xic.txt, xec.txt, xiso.txt) -- load_dictionary(): renamed 'use_mask' to 'use_all_voxels_in_mask' -- Removed unused 'dictionary_ndirs.dict' file -- trk2dictionary.run(): 'min_fiber_len' defaults to 0.0 for backward compatibility - -### Added -- added 'get_coeffs()' function to get all estimated coefficients -- save_results(): added 'stat_coeffs' parameter for saving streamline weights -- trk2dictionary.run(): added 'max_fiber_len' parameter to discard long streamlines -- load_data(): added 'b0_min_signal' to discard voxels with very low signal - -## [1.3.9] - 2020-06-09 - -### Changed -- Modify setup.py and fix spams dependencies - -## [1.3.8] - 2020-05-12 - -### Changed -- Improvements to the COMMIT_debugger. - -## [1.3.7] - 2020-04-25 - -### Changed -- Adapt demos to use d_perps instead of ICVFs for setting model parameters. - -## [1.3.6] - 2020-04-22 - -### Fixed -- Bug when the selected model has EC compartments but no peaks are provided (in trk2dictionary). - -## [1.3.5] - 2020-04-08 - -### Added -- Parameter 'min_fiber_len' in trk2dictionary to discard streamlines shorter than a given length in mm. - -### Fixed -- Bug when 'points_to_skip' was higher then streamline length. 
-- Few corrections to docstring of trk2dictionary. - -## [1.3.4] - 2020-04-02 - -### Changed -- Added colorized output. NB: needs AMICO 1.2.0 or above. - -## [1.3.3] - 2020-03-31 - -### Added -- Added possibility to save the predicted DW-MR signal in save_results. - -### Fixed -- Minor cleanup. - - -## [1.3.2] - 2020-03-27 - -### Added -- Check if dictionary (upon loading) and data have the same geometry. - -### Fixed -- Bug while saving coefficients in save_results. - - -## [1.3.1] - 2020-03-27 - -### Fixed -- Improved the loading of the streamlines in trk2dictionary - - -## [1.3] - 2019-10-30 - -This version of COMMIT *is not compatible* with [AMICO](https://github.com/daducci/AMICO) v1.0.1 of below. If you update COMMIT to this version, please update AMICO to version 1.1.0 or above. - -### Added -- Changelog file to keep tracking of the COMMIT versions. - -### Changed -- Added compatibility with low resolution LUTs. - -### Fixed -- Nothing. +# Change Log +All notable changes to COMMIT will be documented in this file. + +## [1.5.0] - 2021-01-04 + +### Changed +- setup.py: Add compilation for .cu files + +### Added +- GPU acceleration with CUDA for faster model fitting + +## [1.4.5] - 2020-12-29 + +### Fixed +- operator.pyxbld: Changed the condition to create a new operator + +### Added +- core.pyx: Add to the function build_operator the parameter build_dir + +### Changed +- core.pyx: The function build_operator checks if the LUT configuration + changed before build a new operator + +## [1.4.4] - 2020-10-28 + +### Changed +- Option to set one single direction in the resolution of the LUT + +## [1.4.3] - 2020-10-22 + +### Added +- store model parameters to results.pickle + +## [1.4.2] - 2020-10-22 + +### Fixed +- trk2dictionary.run(): check for invalid parameters passed to the blur + +## [1.4.1] - 2020-10-21 + +### Fixed +- operator.pyxbld: Changed the condition to create a new operator + +### Added +- COMMIT version is stored in results.pickle +- COMMIT version is stored in output NIFTI files + +## [1.4.0.4] - 2020-09-24 + +### Fixed +- trk2dictionary.run(): bug in the blurring functionality +- trk2dictionary.run(): 'blur_sigma' defaults to 0 + +## [1.4.0.3] - 2020-08-07 + +### Fixed +- COMMIT_debugger: compilation problem +- COMMIT_debugger: wrong visualization in Linux + +## [1.4.0.2] - 2020-08-07 + +### Changed +- Moved the documentation to the Wiki + +## [1.4.0.1] - 2020-08-03 + +### Changed +- Updated the installation guide + +## [1.4.0.0] - 2020-07-30 + +### Changed +- trk2dictionary.run(): removed 'gen_trk' option +- save_results(): removed 'save_coeff' and 'save_opt_details' parameters +- save_results(): now saving only streamline_weights.txt (not anymore xic.txt, xec.txt, xiso.txt) +- load_dictionary(): renamed 'use_mask' to 'use_all_voxels_in_mask' +- Removed unused 'dictionary_ndirs.dict' file +- trk2dictionary.run(): 'min_fiber_len' defaults to 0.0 for backward compatibility + +### Added +- added 'get_coeffs()' function to get all estimated coefficients +- save_results(): added 'stat_coeffs' parameter for saving streamline weights +- trk2dictionary.run(): added 'max_fiber_len' parameter to discard long streamlines +- load_data(): added 'b0_min_signal' to discard voxels with very low signal + +## [1.3.9] - 2020-06-09 + +### Changed +- Modify setup.py and fix spams dependencies + +## [1.3.8] - 2020-05-12 + +### Changed +- Improvements to the COMMIT_debugger. + +## [1.3.7] - 2020-04-25 + +### Changed +- Adapt demos to use d_perps instead of ICVFs for setting model parameters. 
+ +## [1.3.6] - 2020-04-22 + +### Fixed +- Bug when the selected model has EC compartments but no peaks are provided (in trk2dictionary). + +## [1.3.5] - 2020-04-08 + +### Added +- Parameter 'min_fiber_len' in trk2dictionary to discard streamlines shorter than a given length in mm. + +### Fixed +- Bug when 'points_to_skip' was higher then streamline length. +- Few corrections to docstring of trk2dictionary. + +## [1.3.4] - 2020-04-02 + +### Changed +- Added colorized output. NB: needs AMICO 1.2.0 or above. + +## [1.3.3] - 2020-03-31 + +### Added +- Added possibility to save the predicted DW-MR signal in save_results. + +### Fixed +- Minor cleanup. + + +## [1.3.2] - 2020-03-27 + +### Added +- Check if dictionary (upon loading) and data have the same geometry. + +### Fixed +- Bug while saving coefficients in save_results. + + +## [1.3.1] - 2020-03-27 + +### Fixed +- Improved the loading of the streamlines in trk2dictionary + + +## [1.3] - 2019-10-30 + +This version of COMMIT *is not compatible* with [AMICO](https://github.com/daducci/AMICO) v1.0.1 of below. If you update COMMIT to this version, please update AMICO to version 1.1.0 or above. + +### Added +- Changelog file to keep tracking of the COMMIT versions. + +### Changed +- Added compatibility with low resolution LUTs. + +### Fixed +- Nothing. diff --git a/LICENSE b/LICENSE index 04e0c652..70808f61 100644 --- a/LICENSE +++ b/LICENSE @@ -1,33 +1,33 @@ -Unless otherwise specified by LICENSE.txt files in individual -directories, or within individual files or functions, all code is: - -Copyright (c) 2008-2020, COMMIT developers -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the COMMIT developers nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Unless otherwise specified by LICENSE.txt files in individual +directories, or within individual files or functions, all code is: + +Copyright (c) 2008-2020, COMMIT developers +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the COMMIT developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in index fa48479d..d3b5c5b7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,7 @@ -include README.md -include LICENSE - -recursive-include commit *.h -recursive-include commit *.cpp -recursive-include commit *.pyx +include README.md +include LICENSE + +recursive-include commit *.h +recursive-include commit *.cpp +recursive-include commit *.pyx recursive-include commit *.c \ No newline at end of file diff --git a/README.md b/README.md index cdd2cb13..78bc5128 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,30 @@ -# COMMIT - -The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. - -Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. - -These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. - - -## Main features - -- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). 
-- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. -- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. -- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). -- **Soon**: **GPU implementation** for even faster model fitting. - - -## Documentation - -More information/documentation, as well as a series of tutorials, can be found in the [wiki pages](https://github.com/daducci/COMMIT/wiki/Home). - -### Installation - -To install COMMIT, refer to the [installation guide](https://github.com/daducci/COMMIT/wiki/Installation). - -### Getting started - -To get started with the COMMIT framework, have a look at [this tutorial](https://github.com/daducci/COMMIT/wiki/Getting-started), which will guide you through the main steps of the processing. - +# COMMIT + +The reconstructions recovered with existing tractography algorithms are *not really quantitative* even though diffusion MRI is a quantitative modality by nature. As a matter of fact, several techniques have been proposed in recent years to estimate, at the voxel level, intrinsic micro-structural features of the tissue, such as axonal density and diameter, by using multi-compartment models. COMMIT implements a novel framework to **re-establish the link between tractography and tissue micro-structure**. + +Starting from an input set of candidate fiber-tracts, which can be estimated using standard fiber-tracking techniques, COMMIT models the diffusion MRI signal in each voxel of the image as a *linear combination* of the restricted and hindered contributions generated in every location of the brain by these candidate tracts. Then, COMMIT seeks for the effective contribution of each of them such that they globally fit the measured signal at best. + +These weights can be easily estimated by solving a convenient **global convex optimization problem** and using efficient algorithms. Results clearly demonstrated the benefits of the proposed formulation, opening new perspectives for a more quantitative and biologically-plausible assessment of the structural connectivity in the brain. + + +## Main features + +- Accepts and works with **any input tractogram** (i.e. set of fiber tracts). +- Can easily implement and consider **any multi-compartment model** available in the literature: possibility to account for restricted, hindered as well as isotropic contributions into the signal forward model. +- Very efficient: the core of the algorithm is implemented in C++ and using **multi-threading programming** for efficient parallel computation. +- **Low memory** consumption using optimized sparse data structures, e.g. it can easily run on a standard laptop with 8GB RAM a full-brain tractogram from the HCP data (1M fibers, 3 shells, 1.25 mm^3 resolution). +- **Soon**: **GPU implementation** for even faster model fitting. + + +## Documentation + +More information/documentation, as well as a series of tutorials, can be found in the [wiki pages](https://github.com/daducci/COMMIT/wiki/Home). + +### Installation + +To install COMMIT, refer to the [installation guide](https://github.com/daducci/COMMIT/wiki/Installation). 
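+
+A minimal sketch of a typical workflow is shown below. It is illustrative only: the folder names are placeholders, `'StickZeppelinBall'` is assumed to be one of the models available in `commit.models`, and the no-argument calls to `fit()` and `save_results()` rely on their default settings; the tutorial linked below is the authoritative reference.
+
+```python
+import commit
+commit.core.setup()                                     # precompute the rotation matrices (LUTs)
+
+mit = commit.Evaluation( 'study_folder', 'subject_1' )  # placeholder study/subject folders
+mit.load_data( dwi_filename='DWI.nii', scheme_filename='DWI.scheme' )
+mit.set_model( 'StickZeppelinBall' )                    # any model defined in commit.models
+mit.generate_kernels( regenerate=True )
+mit.load_kernels()
+mit.load_dictionary( 'Tracking' )                       # folder previously created by trk2dictionary
+mit.set_threads()                                       # all CPU threads; nthreads=0 enables the CUDA operator
+mit.fit()
+mit.save_results()
+```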
+ +### Getting started + +To get started with the COMMIT framework, have a look at [this tutorial](https://github.com/daducci/COMMIT/wiki/Getting-started), which will guide you through the main steps of the processing. + diff --git a/commit/__init__.py b/commit/__init__.py index e7e71d6c..3ab179d3 100755 --- a/commit/__init__.py +++ b/commit/__init__.py @@ -1,5 +1,5 @@ -from .core import Evaluation -__all__ = ['core','models','solvers','trk2dictionary'] - -from pkg_resources import get_distribution -__version__ = get_distribution('dmri-commit').version +from .core import Evaluation +__all__ = ['core','models','solvers','trk2dictionary'] + +from pkg_resources import get_distribution +__version__ = get_distribution('dmri-commit').version diff --git a/commit/core.pyx b/commit/core.pyx index c3606410..0e3028c3 100755 --- a/commit/core.pyx +++ b/commit/core.pyx @@ -1,1029 +1,1029 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False -from __future__ import print_function -cimport cython -import numpy as np -cimport numpy as np - -import time -import glob -import sys -from os import makedirs, remove, getcwd, listdir -from os.path import exists, join as pjoin, isfile, isdir -import nibabel -import pickle -import commit.models -import commit.solvers -import amico.scheme -import amico.lut -import pyximport -from pkg_resources import get_distribution - -from amico.util import LOG, NOTE, WARNING, ERROR - - -def setup( lmax=12, ndirs=32761 ) : - """General setup/initialization of the COMMIT framework. - - Parameters - ---------- - lmax : int - Maximum SH order to use for the rotation phase (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - - if not amico.lut.is_valid(ndirs): - ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - amico.lut.precompute_rotation_matrices( lmax, ndirs ) - - -def load_dictionary_info( filename ): - """Function to load dictionary info file - - Parameters - ---------- - filename : string - This value is always COMMIT_PATH + dictionary_info.pickle - """ - if not isfile( filename ): - ERROR( 'Dictionary is outdated or not found. Execute "trk2dictionary" script first' ) - with open( filename, 'rb' ) as dictionary_info_file: - if sys.version_info.major == 3: - aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) - # Pickle files written by Python 2 are loaded with byte - # keys, whereas those written by Python 3 are loaded with - # str keys, even when both are written using protocol=2 - result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} - return result_aux - else: - return pickle.load( dictionary_info_file ) - - -cdef class Evaluation : - """Class to hold all the information (data and parameters) when performing an - evaluation with the COMMIT framework. - """ - cdef public niiDWI - cdef public niiDWI_img - cdef public scheme - cdef public model - cdef public KERNELS - cdef public DICTIONARY - cdef public THREADS - cdef public A - cdef public x - cdef public CONFIG - - def __init__( self, study_path, subject ) : - """Setup the data structures with default values. 
- - Parameters - ---------- - study_path : string - The path to the folder containing all the subjects from one study - subject : string - The path (relative to previous folder) to the subject folder - """ - self.niiDWI = None # set by "load_data" method - self.scheme = None # set by "load_data" method - self.model = None # set by "set_model" method - self.KERNELS = None # set by "load_kernels" method - self.DICTIONARY = None # set by "load_dictionary" method - self.THREADS = None # set by "set_threads" method - self.A = None # set by "build_operator" method - self.x = None # set by "fit" method - - # store all the parameters of an evaluation with COMMIT - self.CONFIG = {} - self.set_config('version', get_distribution('dmri-commit').version) - self.set_config('study_path', study_path) - self.set_config('subject', subject) - self.set_config('DATA_path', pjoin( study_path, subject )) - - self.set_config('doNormalizeSignal', True) - self.set_config('doMergeB0', False) - self.set_config('doNormalizeKernels', True) - self.set_config('doDemean', False) - self.set_config('doNormalizeMaps', False) - - - def set_config( self, key, value ) : - self.CONFIG[ key ] = value - - - def get_config( self, key ) : - return self.CONFIG.get( key ) - - - def load_data( self, dwi_filename='DWI.nii', scheme_filename='DWI.scheme', b0_thr=0, b0_min_signal=0 ) : - """Load the diffusion signal and its corresponding acquisition scheme. - - Parameters - ---------- - dwi_filename : string - The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') - scheme_filename : string - The file name of the corresponding acquisition scheme (default : 'DWI.scheme') - b0_thr : float - The threshold below which a b-value is considered a b0 (default : 0) - b0_min_signal : float - Crop to zero the signal in voxels where the b0 <= b0_min_signal * mean(b0[b0>0]) (default : 0) - """ - - # Loading data and acquisition scheme - tic = time.time() - LOG( '\n-> Loading data:' ) - - print( '\t* DWI signal:' ) - self.set_config('dwi_filename', dwi_filename) - self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) - self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) - if self.niiDWI_img.ndim ==3 : - self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) - hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() - self.set_config('dim', self.niiDWI_img.shape[0:3]) - self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) - print( '\t\t- dim : %d x %d x %d x %d' % self.niiDWI_img.shape ) - print( '\t\t- pixdim : %.3f x %.3f x %.3f' % self.get_config('pixdim') ) - print( '\t\t- values : min=%.2f, max=%.2f, mean=%.2f' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) - - print( '\t* Acquisition scheme:' ) - self.set_config('scheme_filename', scheme_filename) - self.set_config('b0_thr', b0_thr) - self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) - print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) - print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end='' ) - for i in xrange(len(self.scheme.shells)) : - print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end='' ) - print() - - if self.scheme.nS != self.niiDWI_img.shape[3] : - ERROR( 'Scheme does not match with DWI data' ) - - if self.scheme.dwi_count == 0 : - ERROR( 'There are no DWI volumes in the data' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic 
) ) - - # Preprocessing - tic = time.time() - LOG( '\n-> Preprocessing:' ) - - if self.get_config('doNormalizeSignal') : - if self.scheme.b0_count > 0 : - print( '\t* Normalizing to b0... ', end='' ) - sys.stdout.flush() - b0 = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) - idx = b0 <= b0_min_signal * b0[b0>0].mean() - b0[ idx ] = 1 - b0 = 1.0 / b0 - b0[ idx ] = 0 - for i in xrange(self.scheme.nS) : - self.niiDWI_img[:,:,:,i] *= b0 - print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) - del idx, b0 - else : - WARNING( 'There are no b0 volumes for normalization' ) - - if self.scheme.b0_count > 1 : - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)... ', end='' ) - mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) - self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) - del mean - else : - print( '\t* Keeping all b0 volume(s)... ', end='' ) - print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) - - if self.get_config('doDemean') : - print( '\t* Demeaning signal... ', end='' ) - sys.stdout.flush() - mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) - self.niiDWI_img = self.niiDWI_img - mean - print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_model( self, model_name ) : - """Set the model to use to describe the signal contributions in each voxel. - - Parameters - ---------- - model_name : string - The name of the model (must match a class name in "commit.models" module) - """ - # Call the specific model constructor - if hasattr(commit.models, model_name ) : - self.model = getattr(commit.models,model_name)() - else : - ERROR( 'Model "%s" not recognized' % model_name ) - - self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) - - - def generate_kernels( self, regenerate=False, lmax=12, ndirs=32761 ) : - """Generate the high-resolution response functions for each compartment. - Dispatch to the proper function, depending on the model. - - Parameters - ---------- - regenerate : boolean - Regenerate kernels if they already exist (default : False) - lmax : int - Maximum SH order to use for the rotation procedure (default : 12) - ndirs : int - Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) - """ - if not amico.lut.is_valid(ndirs): - ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - if self.scheme is None : - ERROR( 'Scheme not loaded; call "load_data()" first' ) - if self.model is None : - ERROR( 'Model not set; call "set_model()" method first' ) - - # store some values for later use - self.set_config('lmax', lmax) - self.set_config('ndirs', ndirs) - self.set_config('model', self.model.get_params()) - self.model.scheme = self.scheme - - LOG( '\n-> Simulating with "%s" model:' % self.model.name ) - - # check if kernels were already generated - tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) - if len(tmp)>0 and not regenerate : - LOG( ' [ Kernels already computed. 
Use option "regenerate=True" to force regeneration ]' ) - return - - # create folder or delete existing files (if any) - if not exists( self.get_config('ATOMS_path') ) : - makedirs( self.get_config('ATOMS_path') ) - else : - for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : - remove( f ) - - # auxiliary data structures - aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) - idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) - - # Dispatch to the right handler for each model - tic = time.time() - self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def load_kernels( self ) : - """Load rotated kernels and project to the specific gradient scheme of this subject. - Dispatch to the proper function, depending on the model. - """ - if self.model is None : - ERROR( 'Model not set; call "set_model()" method first' ) - if self.scheme is None : - ERROR( 'Scheme not loaded; call "load_data()" first' ) - - tic = time.time() - LOG( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) - - # auxiliary data structures - idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) - - # Dispatch to the right handler for each model - if self.get_config('doMergeB0') : - print( '\t* Merging multiple b0 volume(s)...' ) - else : - print( '\t* Keeping all b0 volume(s)...' ) - self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) - nIC = self.KERNELS['wmr'].shape[0] - nEC = self.KERNELS['wmh'].shape[0] - nISO = self.KERNELS['iso'].shape[0] - print( '\t [ OK ]' ) - - # ensure contiguous arrays for C part - self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) - self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) - self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) - - # De-mean kernels - if self.get_config('doDemean') : - print( '\t* Demeaning signal...', end='' ) - for j in xrange(self.get_config('ndirs')) : - for i in xrange(nIC) : - self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() - for i in xrange(nEC) : - self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() - for i in xrange(nISO) : - self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() - print( '[ OK ]' ) - - # Normalize atoms - if self.get_config('doNormalizeKernels') : - print( '\t* Normalizing... ', end='' ) - - self.KERNELS['wmr_norm'] = np.zeros( nIC ) - for i in xrange(nIC) : - self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] - - self.KERNELS['wmh_norm'] = np.zeros( nEC ) - for i in xrange(nEC) : - self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) - for j in xrange(self.get_config('ndirs')) : - self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] - - self.KERNELS['iso_norm'] = np.zeros( nISO ) - for i in xrange(nISO) : - self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) - self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] - - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - cpdef load_dictionary( self, path, use_all_voxels_in_mask=False ) : - """Load the sparse structure previously created with "trk2dictionary" script. 
- - Parameters - ---------- - path : string - Folder containing the output of the trk2dictionary script (relative to subject path) - use_all_voxels_in_mask : boolean - If False (default) the optimization will be conducted only on the voxels actually - traversed by tracts. If True, then all voxels present in the mask specified in - trk2dictionary.run(), i.e. "filename_mask" parameter, will be used instead. - NB: if no mask was specified in trk2dictionary, this parameter is irrelevant. - """ - if self.niiDWI is None : - ERROR( 'Data not loaded; call "load_data()" first' ) - - tic = time.time() - LOG( '\n-> Loading the dictionary:' ) - self.DICTIONARY = {} - self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) - - # check that ndirs of dictionary matches with that of the kernels - dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) - if dictionary_info['ndirs'] != self.get_config('ndirs'): - ERROR( '"ndirs" of the dictionary (%d) does not match with the kernels (%d)' % (dictionary_info['ndirs'], self.get_config('ndirs')) ) - self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] - - # load mask - self.set_config('dictionary_mask', 'mask' if use_all_voxels_in_mask else 'tdi' ) - mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) - if not exists( mask_filename ) : - mask_filename += '.gz' - if not exists( mask_filename ) : - ERROR( 'Dictionary not found. Execute "trk2dictionary" script first' ); - niiMASK = nibabel.load( mask_filename ) - niiMASK_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() - if ( self.get_config('dim')[0]!=niiMASK.shape[0] or - self.get_config('dim')[1]!=niiMASK.shape[1] or - self.get_config('dim')[2]!=niiMASK.shape[2] or - abs(self.get_config('pixdim')[0]-niiMASK_hdr['pixdim'][1])>1e-3 or - abs(self.get_config('pixdim')[1]-niiMASK_hdr['pixdim'][2])>1e-3 or - abs(self.get_config('pixdim')[2]-niiMASK_hdr['pixdim'][3])>1e-3 ) : - WARNING( 'Dictionary does not have the same geometry as the dataset' ) - self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) - - # segments from the tracts - # ------------------------ - print( '\t* Segments from the tracts... 
', end='' ) - sys.stdout.flush() - - self.DICTIONARY['TRK'] = {} - self.DICTIONARY['TRK']['kept'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_kept.dict'), dtype=np.uint8 ) - self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 ) - self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 ) - - - self.DICTIONARY['IC'] = {} - self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 ) - self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size - self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size - - # reorder the segments based, first, on the "v" field and after based on the "o" field - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] ) - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - del idx - - # divide the length of each segment by the fiber length so that all the columns of the libear operator will have same length - # NB: it works in conjunction with the normalization of the kernels - cdef : - np.float32_t [:] sl = self.DICTIONARY['IC']['len'] - np.float32_t [:] tl = self.DICTIONARY['TRK']['norm'] - np.uint32_t [:] f = self.DICTIONARY['IC']['fiber'] - int s - if self.get_config('doNormalizeKernels') : - for s in xrange(self.DICTIONARY['IC']['n']) : - sl[s] /= tl[ f[s] ] - - print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) ) - - # segments from the peaks - # ----------------------- - print( '\t* Segments from the peaks... ', end='' ) - sys.stdout.flush() - - self.DICTIONARY['EC'] = {} - self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 ) - self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 ) - self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size - - # reorder the segments based, first, on the "v" field and after based on the "o" field - idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] ) - self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ] - self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ] - del idx - - print( '[ %d segments ]' % self.DICTIONARY['EC']['nE'] ) - - # isotropic compartments - # ---------------------- - print( '\t* Isotropic contributions... 
', end='' ) - sys.stdout.flush() - - self.DICTIONARY['ISO'] = {} - - self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() - - vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) - vx = vx.astype(np.int32) - vy = vy.astype(np.int32) - vz = vz.astype(np.int32) - self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) - del vx, vy, vz - - # reorder the segments based on the "v" field - idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) - self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] - del idx - - print( '[ %d voxels ]' % self.DICTIONARY['nV'] ) - - # post-processing - # --------------- - print( '\t* Post-processing... ', end='' ) - sys.stdout.flush() - - # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) - idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] - self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) - - lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() - for i in xrange(idx.size) : - lut[ idx[i] ] = i - self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] - self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] - self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] - - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def set_threads( self, n = None, nthreads = None, gpu_id = 0 ) : - """Set the number of threads to use for the matrix-vector operations with A and A'. - - Parameters - ---------- - n : integer - Same as nthreads. This remains just for compatibility with previous versions - - nthreads : integer - Number of threads to use (nthreads = None ---> all the CPU threads available in the system - nthreads = 0 ---> enable CUDA GPU acceleration) - gpu_id : integer - GPU ID of the Nvidia GPU where COMMIT will be executed, default=0 and it is only required if nthreads=0 - (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') - """ - if nthreads is None : - if n != None : - WARNING( '"n" parameter is deprecated, use "nthreads" instead' ) - nthreads = n - else: - # Set to the number of CPUs in the system - try : - import multiprocessing - nthreads = multiprocessing.cpu_count() - except : - nthreads = 1 - - if nthreads < 0 or nthreads > 255 : - ERROR( 'Number of threads must be between 0 and 255' ) - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.KERNELS is None : - ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) - - self.THREADS = {} - self.THREADS['n'] = nthreads - if nthreads == 0: - self.THREADS['gpu_id'] = gpu_id - LOG( '\n-> Checking CUDA GPU:' ) - - from commit.cudaoperator.operator import check_compatibility - #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) - error_id = check_compatibility(gpu_id) - if error_id == 1: - ERROR( 'The selected GPU is not detected' ) - elif error_id == 2: - ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) - elif error_id == 3: - ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) - elif error_id == 4: - ERROR( 'Compute 
capability must be at least 5.0' ) - - if gpu_id == 0: - LOG( ' [ Default GPU selected. Use option "gpu_id" in "set_threads()" to change selection ]' ) - - cdef : - long [:] C - long t, tot, i1, i2, N, c - int i - - tic = time.time() - - if nthreads > 0: - LOG( '\n-> Distributing workload to different threads:' ) - print( '\t* number of threads : %d' % nthreads ) - - # Distribute load for the computation of A*x product - print( '\t* A operator... ', end='' ) - sys.stdout.flush() - - self.THREADS['IC'] = None - self.THREADS['EC'] = None - self.THREADS['ISO'] = None - self.THREADS['ICt'] = None - self.THREADS['ECt'] = None - self.THREADS['ISOt'] = None - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - if nthreads > 1 : - N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) - t = 1 - tot = 0 - C = np.bincount( self.DICTIONARY['IC']['v'] ) - for c in C : - tot += c - if tot >= N : - self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot - t += 1 - tot = 0 - self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) - for i in xrange(nthreads) : - self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) - self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - # Distribute load for the computation of At*y product - print( '\t* A\' operator... 
', end="" ) - sys.stdout.flush() - - if self.DICTIONARY['IC']['n'] > 0 : - self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 ) - if nthreads > 1 : - idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' ) - C = np.bincount( self.DICTIONARY['IC']['fiber'] ) - t = tot = i1 = i2 = 0 - N = np.floor(self.DICTIONARY['IC']['n']/nthreads) - for c in C : - i2 += c - tot += c - if tot >= N : - self.THREADS['ICt'][ i1:i2 ] = t - t += 1 - if t==nthreads-1 : - break - i1 = i2 - tot = c - self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy() - - if self.DICTIONARY['EC']['nE'] > 0 : - self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['EC']['nE']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N - self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) - - if self.DICTIONARY['nV'] > 0 : - self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 ) - N = np.floor( self.DICTIONARY['nV']/nthreads ) - for i in xrange(1,nthreads) : - self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N - self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV'] - - # check if some threads are not assigned any segment - if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) : - self.THREADS = None - ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) - - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def build_operator( self, build_dir=None ) : - """Compile/build the operator for computing the matrix-vector multiplications by A and A' - using the informations from self.DICTIONARY, self.KERNELS and self.THREADS. - NB: needs to call this function to update pointers to data structures in case - the data is changed in self.DICTIONARY, self.KERNELS or self.THREADS. - - Parameters - ---------- - build_dir : string - The folder in which to store the compiled files. - If None (default), they will end up in the .pyxbld directory in the user’s home directory. - If using this option, it is recommended to use a temporary directory, quit your python - console between each build, and delete the content of the temporary directory. 
- """ - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.KERNELS is None : - ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) - if self.THREADS is None : - ERROR( 'Threads not set; call "set_threads()" first' ) - - if self.DICTIONARY['IC']['nF'] <= 0 : - ERROR( 'No streamline found in the dictionary; check your data' ) - if self.DICTIONARY['EC']['nE'] <= 0 and self.KERNELS['wmh'].shape[0] > 0 : - ERROR( 'The selected model has EC compartments, but no peaks have been provided; check your data' ) - - tic = time.time() - LOG( '\n-> Building linear operator A:' ) - - if self.THREADS['n'] > 0: - # need to pass these parameters at runtime for compiling the C code - from commit.operator import config - - compilation_is_needed = False - - if config.nTHREADS is None or config.nTHREADS != self.THREADS['n']: - compilation_is_needed = True - if config.nIC is None or config.nIC != self.KERNELS['wmr'].shape[0]: - compilation_is_needed = True - if config.model is None or config.model != self.model.id: - compilation_is_needed = True - if config.nEC is None or config.nEC != self.KERNELS['wmh'].shape[0]: - compilation_is_needed = True - if config.nISO is None or config.nISO != self.KERNELS['iso'].shape[0]: - compilation_is_needed = True - if config.build_dir != build_dir: - compilation_is_needed = True - - if compilation_is_needed or not 'commit.operator.operator' in sys.modules : - - if build_dir is not None: - if isdir(build_dir) and not len(listdir(build_dir)) == 0: - ERROR( '\nbuild_dir is not empty, unsafe build option.' ) - elif config.nTHREADS is not None: - ERROR( '\nThe parameter build_dir has changed, unsafe build option.' ) - else: - WARNING( '\nUsing build_dir, always quit your python console between COMMIT Evaluation.' ) - - config.nTHREADS = self.THREADS['n'] - config.model = self.model.id - config.nIC = self.KERNELS['wmr'].shape[0] - config.nEC = self.KERNELS['wmh'].shape[0] - config.nISO = self.KERNELS['iso'].shape[0] - config.build_dir = build_dir - - pyximport.install( reload_support=True, language_level=3, build_dir=build_dir, build_in_temp=True, inplace=False ) - - if not 'commit.operator.operator' in sys.modules : - import commit.operator.operator - else : - reload( sys.modules['commit.operator.operator'] ) - - self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - else: - import commit.cudaoperator.operator - self.A = commit.cudaoperator.operator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) - - - def get_y( self ): - """ - Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. - NB: this can be run only after having loaded the dictionary and the data. - """ - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.niiDWI is None : - ERROR( 'Data not loaded; call "load_data()" first' ) - return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) - - - def fit( self, tol_fun=1e-3, tol_x=1e-6, max_iter=100, verbose=1, x0=None, regularisation=None ) : - """Fit the model to the data. 
- - Parameters - ---------- - tol_fun : float - Tolerance on the objective function (default : 1e-3) - max_iter : integer - Maximum number of iterations (default : 100) - verbose : integer - Level of verbosity: 0=no print, 1=print progress (default : 1) - x0 : np.array - Initial guess for the solution of the problem (default : None) - regularisation : commit.solvers.init_regularisation object - Python dictionary that describes the wanted regularisation term. - Check the documentation of commit.solvers.init_regularisation to see - how to properly define the wanted mathematical formulation - ( default : None ) - """ - if self.niiDWI is None : - ERROR( 'Data not loaded; call "load_data()" first' ) - if self.DICTIONARY is None : - ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) - if self.KERNELS is None : - ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) - if self.THREADS is None : - ERROR( 'Threads not set; call "set_threads()" first' ) - if self.A is None : - ERROR( 'Operator not built; call "build_operator()" first' ) - - if x0 is not None : - if x0.shape[0] != self.A.shape[1] : - ERROR( 'x0 dimension does not match the number of columns of the dictionary' ) - if regularisation is None : - regularisation = commit.solvers.init_regularisation(self) - - self.CONFIG['optimization'] = {} - self.CONFIG['optimization']['tol_fun'] = tol_fun - self.CONFIG['optimization']['tol_x'] = tol_x - self.CONFIG['optimization']['max_iter'] = max_iter - self.CONFIG['optimization']['verbose'] = verbose - self.CONFIG['optimization']['regularisation'] = regularisation - - # run solver - t = time.time() - LOG( '\n-> Fit model:' ) - - self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) - - self.CONFIG['optimization']['fit_details'] = opt_details - self.CONFIG['optimization']['fit_time'] = time.time()-t - - LOG( '\n [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) - - - def get_coeffs( self ): - """ - Returns the coefficients, corresponding to the original optimisation problem, - i.e. the input tractogram to trk2dictionary, divided in three classes (ic, ec, iso). - """ - if self.x is None : - ERROR( 'Model not fitted to the data; call "fit()" first' ) - - nF = self.DICTIONARY['IC']['nF'] - nE = self.DICTIONARY['EC']['nE'] - nV = self.DICTIONARY['nV'] - - if self.get_config('doNormalizeKernels') : - # renormalize the coefficients - norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) - norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) - norm3 = np.repeat(self.KERNELS['iso_norm'],nV) - norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) - x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) - else : - x = self.x - - offset1 = nF * self.KERNELS['wmr'].shape[0] - offset2 = offset1 + nE * self.KERNELS['wmh'].shape[0] - kept = np.tile( self.DICTIONARY['TRK']['kept'], self.KERNELS['wmr'].shape[0] ) - xic = np.zeros( kept.size ) - xic[kept==1] = x[:offset1] - xec = x[offset1:offset2] - xiso = x[offset2:] - - return xic, xec, xiso - - - def save_results( self, path_suffix=None, stat_coeffs='sum', save_est_dwi=False, save_coeff=None, save_opt_details=None ) : - """Save the output (coefficients, errors, maps etc). 
- - Parameters - ---------- - path_suffix : string - Text to be appended to "Results" to create the output path (default : None) - stat_coeffs : string - Stat to be used if more coefficients are estimated for each streamline. - Options: 'sum', 'mean', 'median', 'min', 'max', 'all' (default : 'sum') - save_est_dwi : boolean - Save the estimated DW-MRI signal (default : False) - save_opt_details : boolean - DEPRECATED. The details of the optimization and the coefficients are always saved. - save_coeff : boolean - DEPRECATED. The estimated weights for the streamlines are always saved. - """ - RESULTS_path = 'Results_' + self.model.id - if path_suffix : - self.set_config('path_suffix', path_suffix) - RESULTS_path = RESULTS_path + path_suffix - - LOG( '\n-> Saving results to "%s/*":' % RESULTS_path ) - tic = time.time() - - if self.x is None : - ERROR( 'Model not fitted to the data; call "fit()" first' ) - - if save_coeff is not None : - WARNING('"save_coeff" parameter is deprecated') - - if save_opt_details is not None : - WARNING('"save_opt_details" parameter is deprecated') - - nF = self.DICTIONARY['IC']['nF'] - nE = self.DICTIONARY['EC']['nE'] - nV = self.DICTIONARY['nV'] - norm_fib = np.ones( nF ) - # x is the x of the original problem - # self.x is the x preconditioned - if self.get_config('doNormalizeKernels') : - # renormalize the coefficients - norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) - norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) - norm3 = np.repeat(self.KERNELS['iso_norm'],nV) - norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) - x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) - else : - x = self.x - - # create folder or delete existing files (if any) - RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) - if not exists( RESULTS_path ) : - makedirs( RESULTS_path ) - else : - for f in glob.glob( pjoin(RESULTS_path,'*') ) : - remove( f ) - self.set_config('RESULTS_path', RESULTS_path) - - # Map of voxelwise errors - print( '\t* Fitting errors:' ) - - niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() - niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) - niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() - niiMAP_hdr['descrip'] = 'Created with COMMIT %s'%self.get_config('version') - - y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) - y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) - - print( '\t\t- RMSE... ', end='' ) - sys.stdout.flush() - tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = tmp.max() - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) - print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - print( '\t\t- NRMSE... 
', end='' ) - sys.stdout.flush() - tmp = np.sum(y_mea**2,axis=1) - idx = np.where( tmp < 1E-12 ) - tmp[ idx ] = 1 - tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) - tmp[ idx ] = 0 - niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp - niiMAP_hdr['cal_min'] = 0 - niiMAP_hdr['cal_max'] = 1 - nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) - print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) - - # Map of compartment contributions - print( '\t* Voxelwise contributions:' ) - - print( '\t\t- Intra-axonal... ', end='' ) - sys.stdout.flush() - niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmr']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, - weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] - ).astype(np.float32) - niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- Extra-axonal... ', end='' ) - sys.stdout.flush() - niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['wmh']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] - tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) - xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) - niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( '[ OK ]' ) - - print( '\t\t- Isotropic... ', end='' ) - sys.stdout.flush() - niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) - if len(self.KERNELS['iso']) > 0 : - offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] - xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) - niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv - print( ' [ OK ]' ) - - if self.get_config('doNormalizeMaps') : - niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + 1e-16), affine, header=niiMAP_hdr ) - niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) - niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) - else: - niiIC = nibabel.Nifti1Image( niiIC_img, affine, header=niiMAP_hdr ) - niiEC = nibabel.Nifti1Image( niiEC_img, affine, header=niiMAP_hdr ) - niiISO = nibabel.Nifti1Image( niiISO_img, affine, header=niiMAP_hdr ) - - nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) - nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) - nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) - - # Configuration and results - print( '\t* Configuration and results:' ) - - print( '\t\t- streamline_weights.txt... 
', end='' ) - sys.stdout.flush() - xic, _, _ = self.get_coeffs() - if stat_coeffs != 'all' and xic.size > 0 : - xic = np.reshape( xic, (-1,self.DICTIONARY['TRK']['kept'].size) ) - if stat_coeffs == 'sum' : - xic = np.sum( xic, axis=0 ) - elif stat_coeffs == 'mean' : - xic = np.mean( xic, axis=0 ) - elif stat_coeffs == 'median' : - xic = np.median( xic, axis=0 ) - elif stat_coeffs == 'min' : - xic = np.min( xic, axis=0 ) - elif stat_coeffs == 'max' : - xic = np.max( xic, axis=0 ) - else : - ERROR( 'Stat not allowed. Possible values: sum, mean, median, min, max, all.', prefix='\n' ) - np.savetxt( pjoin(RESULTS_path,'streamline_weights.txt'), xic, fmt='%.5e' ) - self.set_config('stat_coeffs', stat_coeffs) - print( '[ OK ]' ) - - # Save to a pickle file the following items: - # item 0: dictionary with all the configuration details - # item 1: np.array obtained through the optimisation process with the normalised kernels - # item 2: np.array renormalisation of coeffs in item 1 - print( '\t\t- results.pickle... ', end='' ) - sys.stdout.flush() - with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : - pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) - print( ' [ OK ]' ) - - if save_est_dwi : - print( '\t\t- Estimated signal... ', end='' ) - sys.stdout.flush() - self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_est - nibabel.save( nibabel.Nifti1Image( self.niiDWI_img , affine ), pjoin(RESULTS_path,'fit_signal_estimated.nii.gz') ) - self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_mea - print( '[ OK ]' ) - - LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, nonecheck=False, cdivision=True, initializedcheck=False, binding=False +from __future__ import print_function +cimport cython +import numpy as np +cimport numpy as np + +import time +import glob +import sys +from os import makedirs, remove, getcwd, listdir +from os.path import exists, join as pjoin, isfile, isdir +import nibabel +import pickle +import commit.models +import commit.solvers +import amico.scheme +import amico.lut +import pyximport +from pkg_resources import get_distribution + +from amico.util import LOG, NOTE, WARNING, ERROR + + +def setup( lmax=12, ndirs=32761 ) : + """General setup/initialization of the COMMIT framework. + + Parameters + ---------- + lmax : int + Maximum SH order to use for the rotation phase (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + + if not amico.lut.is_valid(ndirs): + ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + amico.lut.precompute_rotation_matrices( lmax, ndirs ) + + +def load_dictionary_info( filename ): + """Function to load dictionary info file + + Parameters + ---------- + filename : string + This value is always COMMIT_PATH + dictionary_info.pickle + """ + if not isfile( filename ): + ERROR( 'Dictionary is outdated or not found. 
Execute "trk2dictionary" script first' ) + with open( filename, 'rb' ) as dictionary_info_file: + if sys.version_info.major == 3: + aux = pickle.load( dictionary_info_file, fix_imports=True, encoding='bytes' ) + # Pickle files written by Python 2 are loaded with byte + # keys, whereas those written by Python 3 are loaded with + # str keys, even when both are written using protocol=2 + result_aux = {(k.decode() if hasattr(k,"decode") else k): v for k, v in aux.items()} + return result_aux + else: + return pickle.load( dictionary_info_file ) + + +cdef class Evaluation : + """Class to hold all the information (data and parameters) when performing an + evaluation with the COMMIT framework. + """ + cdef public niiDWI + cdef public niiDWI_img + cdef public scheme + cdef public model + cdef public KERNELS + cdef public DICTIONARY + cdef public THREADS + cdef public A + cdef public x + cdef public CONFIG + + def __init__( self, study_path, subject ) : + """Setup the data structures with default values. + + Parameters + ---------- + study_path : string + The path to the folder containing all the subjects from one study + subject : string + The path (relative to previous folder) to the subject folder + """ + self.niiDWI = None # set by "load_data" method + self.scheme = None # set by "load_data" method + self.model = None # set by "set_model" method + self.KERNELS = None # set by "load_kernels" method + self.DICTIONARY = None # set by "load_dictionary" method + self.THREADS = None # set by "set_threads" method + self.A = None # set by "build_operator" method + self.x = None # set by "fit" method + + # store all the parameters of an evaluation with COMMIT + self.CONFIG = {} + self.set_config('version', get_distribution('dmri-commit').version) + self.set_config('study_path', study_path) + self.set_config('subject', subject) + self.set_config('DATA_path', pjoin( study_path, subject )) + + self.set_config('doNormalizeSignal', True) + self.set_config('doMergeB0', False) + self.set_config('doNormalizeKernels', True) + self.set_config('doDemean', False) + self.set_config('doNormalizeMaps', False) + + + def set_config( self, key, value ) : + self.CONFIG[ key ] = value + + + def get_config( self, key ) : + return self.CONFIG.get( key ) + + + def load_data( self, dwi_filename='DWI.nii', scheme_filename='DWI.scheme', b0_thr=0, b0_min_signal=0 ) : + """Load the diffusion signal and its corresponding acquisition scheme. 
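+
+        A minimal usage sketch (the file names below are only the defaults and may
+        differ in your dataset; "mit" denotes this Evaluation instance):
+
+            mit.load_data( dwi_filename='DWI.nii', scheme_filename='DWI.scheme', b0_thr=0 )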
+ + Parameters + ---------- + dwi_filename : string + The file name of the DWI data, relative to the subject folder (default : 'DWI.nii') + scheme_filename : string + The file name of the corresponding acquisition scheme (default : 'DWI.scheme') + b0_thr : float + The threshold below which a b-value is considered a b0 (default : 0) + b0_min_signal : float + Crop to zero the signal in voxels where the b0 <= b0_min_signal * mean(b0[b0>0]) (default : 0) + """ + + # Loading data and acquisition scheme + tic = time.time() + LOG( '\n-> Loading data:' ) + + print( '\t* DWI signal:' ) + self.set_config('dwi_filename', dwi_filename) + self.niiDWI = nibabel.load( pjoin( self.get_config('DATA_path'), dwi_filename) ) + self.niiDWI_img = self.niiDWI.get_data().astype(np.float32) + if self.niiDWI_img.ndim ==3 : + self.niiDWI_img = np.expand_dims( self.niiDWI_img, axis=3 ) + hdr = self.niiDWI.header if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_header() + self.set_config('dim', self.niiDWI_img.shape[0:3]) + self.set_config('pixdim', tuple( hdr.get_zooms()[:3] )) + print( '\t\t- dim : %d x %d x %d x %d' % self.niiDWI_img.shape ) + print( '\t\t- pixdim : %.3f x %.3f x %.3f' % self.get_config('pixdim') ) + print( '\t\t- values : min=%.2f, max=%.2f, mean=%.2f' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) + + print( '\t* Acquisition scheme:' ) + self.set_config('scheme_filename', scheme_filename) + self.set_config('b0_thr', b0_thr) + self.scheme = amico.scheme.Scheme( pjoin( self.get_config('DATA_path'), scheme_filename), b0_thr ) + print( '\t\t- %d samples, %d shells' % ( self.scheme.nS, len(self.scheme.shells) ) ) + print( '\t\t- %d @ b=0' % ( self.scheme.b0_count ), end='' ) + for i in xrange(len(self.scheme.shells)) : + print( ', %d @ b=%.1f' % ( len(self.scheme.shells[i]['idx']), self.scheme.shells[i]['b'] ), end='' ) + print() + + if self.scheme.nS != self.niiDWI_img.shape[3] : + ERROR( 'Scheme does not match with DWI data' ) + + if self.scheme.dwi_count == 0 : + ERROR( 'There are no DWI volumes in the data' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + # Preprocessing + tic = time.time() + LOG( '\n-> Preprocessing:' ) + + if self.get_config('doNormalizeSignal') : + if self.scheme.b0_count > 0 : + print( '\t* Normalizing to b0... ', end='' ) + sys.stdout.flush() + b0 = np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ) + idx = b0 <= b0_min_signal * b0[b0>0].mean() + b0[ idx ] = 1 + b0 = 1.0 / b0 + b0[ idx ] = 0 + for i in xrange(self.scheme.nS) : + self.niiDWI_img[:,:,:,i] *= b0 + print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) + del idx, b0 + else : + WARNING( 'There are no b0 volumes for normalization' ) + + if self.scheme.b0_count > 1 : + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)... ', end='' ) + mean = np.expand_dims( np.mean( self.niiDWI_img[:,:,:,self.scheme.b0_idx], axis=3 ), axis=3 ) + self.niiDWI_img = np.concatenate( (mean, self.niiDWI_img[:,:,:,self.scheme.dwi_idx]), axis=3 ) + del mean + else : + print( '\t* Keeping all b0 volume(s)... ', end='' ) + print( '[ %d x %d x %d x %d ]' % self.niiDWI_img.shape ) + + if self.get_config('doDemean') : + print( '\t* Demeaning signal... 
', end='' ) + sys.stdout.flush() + mean = np.repeat( np.expand_dims(np.mean(self.niiDWI_img,axis=3),axis=3), self.niiDWI_img.shape[3], axis=3 ) + self.niiDWI_img = self.niiDWI_img - mean + print( '[ min=%.2f, max=%.2f, mean=%.2f ]' % ( self.niiDWI_img.min(), self.niiDWI_img.max(), self.niiDWI_img.mean() ) ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_model( self, model_name ) : + """Set the model to use to describe the signal contributions in each voxel. + + Parameters + ---------- + model_name : string + The name of the model (must match a class name in "commit.models" module) + """ + # Call the specific model constructor + if hasattr(commit.models, model_name ) : + self.model = getattr(commit.models,model_name)() + else : + ERROR( 'Model "%s" not recognized' % model_name ) + + self.set_config('ATOMS_path', pjoin( self.get_config('study_path'), 'kernels', self.model.id )) + + + def generate_kernels( self, regenerate=False, lmax=12, ndirs=32761 ) : + """Generate the high-resolution response functions for each compartment. + Dispatch to the proper function, depending on the model. + + Parameters + ---------- + regenerate : boolean + Regenerate kernels if they already exist (default : False) + lmax : int + Maximum SH order to use for the rotation procedure (default : 12) + ndirs : int + Number of directions on the half of the sphere representing the possible orientations of the response functions (default : 32761) + """ + if not amico.lut.is_valid(ndirs): + ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [1, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + if self.scheme is None : + ERROR( 'Scheme not loaded; call "load_data()" first' ) + if self.model is None : + ERROR( 'Model not set; call "set_model()" method first' ) + + # store some values for later use + self.set_config('lmax', lmax) + self.set_config('ndirs', ndirs) + self.set_config('model', self.model.get_params()) + self.model.scheme = self.scheme + + LOG( '\n-> Simulating with "%s" model:' % self.model.name ) + + # check if kernels were already generated + tmp = glob.glob( pjoin(self.get_config('ATOMS_path'),'A_*.npy') ) + if len(tmp)>0 and not regenerate : + LOG( ' [ Kernels already computed. Use option "regenerate=True" to force regeneration ]' ) + return + + # create folder or delete existing files (if any) + if not exists( self.get_config('ATOMS_path') ) : + makedirs( self.get_config('ATOMS_path') ) + else : + for f in glob.glob( pjoin(self.get_config('ATOMS_path'),'*') ) : + remove( f ) + + # auxiliary data structures + aux = amico.lut.load_precomputed_rotation_matrices( lmax, ndirs ) + idx_IN, idx_OUT = amico.lut.aux_structures_generate( self.scheme, lmax ) + + # Dispatch to the right handler for each model + tic = time.time() + self.model.generate( self.get_config('ATOMS_path'), aux, idx_IN, idx_OUT, ndirs ) + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def load_kernels( self ) : + """Load rotated kernels and project to the specific gradient scheme of this subject. + Dispatch to the proper function, depending on the model. 
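+
+        Examples
+        --------
+        A typical sequence (illustrative only; "mit" denotes this Evaluation instance)
+        is to generate the kernels once and then resample them for this subject:
+
+            mit.generate_kernels( regenerate=False )
+            mit.load_kernels()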
+ """ + if self.model is None : + ERROR( 'Model not set; call "set_model()" method first' ) + if self.scheme is None : + ERROR( 'Scheme not loaded; call "load_data()" first' ) + + tic = time.time() + LOG( '\n-> Resampling LUT for subject "%s":' % self.get_config('subject') ) + + # auxiliary data structures + idx_OUT, Ylm_OUT = amico.lut.aux_structures_resample( self.scheme, self.get_config('lmax') ) + + # Dispatch to the right handler for each model + if self.get_config('doMergeB0') : + print( '\t* Merging multiple b0 volume(s)...' ) + else : + print( '\t* Keeping all b0 volume(s)...' ) + self.KERNELS = self.model.resample( self.get_config('ATOMS_path'), idx_OUT, Ylm_OUT, self.get_config('doMergeB0'), self.get_config('ndirs') ) + nIC = self.KERNELS['wmr'].shape[0] + nEC = self.KERNELS['wmh'].shape[0] + nISO = self.KERNELS['iso'].shape[0] + print( '\t [ OK ]' ) + + # ensure contiguous arrays for C part + self.KERNELS['wmr'] = np.ascontiguousarray( self.KERNELS['wmr'] ) + self.KERNELS['wmh'] = np.ascontiguousarray( self.KERNELS['wmh'] ) + self.KERNELS['iso'] = np.ascontiguousarray( self.KERNELS['iso'] ) + + # De-mean kernels + if self.get_config('doDemean') : + print( '\t* Demeaning signal...', end='' ) + for j in xrange(self.get_config('ndirs')) : + for i in xrange(nIC) : + self.KERNELS['wmr'][i,j,:] -= self.KERNELS['wmr'][i,j,:].mean() + for i in xrange(nEC) : + self.KERNELS['wmh'][i,j,:] -= self.KERNELS['wmh'][i,j,:].mean() + for i in xrange(nISO) : + self.KERNELS['iso'][i] -= self.KERNELS['iso'][i].mean() + print( '[ OK ]' ) + + # Normalize atoms + if self.get_config('doNormalizeKernels') : + print( '\t* Normalizing... ', end='' ) + + self.KERNELS['wmr_norm'] = np.zeros( nIC ) + for i in xrange(nIC) : + self.KERNELS['wmr_norm'][i] = np.linalg.norm( self.KERNELS['wmr'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmr'][i,j,:] /= self.KERNELS['wmr_norm'][i] + + self.KERNELS['wmh_norm'] = np.zeros( nEC ) + for i in xrange(nEC) : + self.KERNELS['wmh_norm'][i] = np.linalg.norm( self.KERNELS['wmh'][i,0,:] ) + for j in xrange(self.get_config('ndirs')) : + self.KERNELS['wmh'][i,j,:] /= self.KERNELS['wmh_norm'][i] + + self.KERNELS['iso_norm'] = np.zeros( nISO ) + for i in xrange(nISO) : + self.KERNELS['iso_norm'][i] = np.linalg.norm( self.KERNELS['iso'][i,:] ) + self.KERNELS['iso'][i,:] /= self.KERNELS['iso_norm'][i] + + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + cpdef load_dictionary( self, path, use_all_voxels_in_mask=False ) : + """Load the sparse structure previously created with "trk2dictionary" script. + + Parameters + ---------- + path : string + Folder containing the output of the trk2dictionary script (relative to subject path) + use_all_voxels_in_mask : boolean + If False (default) the optimization will be conducted only on the voxels actually + traversed by tracts. If True, then all voxels present in the mask specified in + trk2dictionary.run(), i.e. "filename_mask" parameter, will be used instead. + NB: if no mask was specified in trk2dictionary, this parameter is irrelevant. 
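+
+        Examples
+        --------
+        An illustrative call, assuming trk2dictionary wrote its output to a subfolder
+        of the subject folder (the folder name below is only an example):
+
+            mit.load_dictionary( 'CommitOutput' )
+            # or, to optimize over all voxels of the mask given to trk2dictionary.run():
+            mit.load_dictionary( 'CommitOutput', use_all_voxels_in_mask=True )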
+ """ + if self.niiDWI is None : + ERROR( 'Data not loaded; call "load_data()" first' ) + + tic = time.time() + LOG( '\n-> Loading the dictionary:' ) + self.DICTIONARY = {} + self.set_config('TRACKING_path', pjoin(self.get_config('DATA_path'),path)) + + # check that ndirs of dictionary matches with that of the kernels + dictionary_info = load_dictionary_info( pjoin(self.get_config('TRACKING_path'), "dictionary_info.pickle") ) + if dictionary_info['ndirs'] != self.get_config('ndirs'): + ERROR( '"ndirs" of the dictionary (%d) does not match with the kernels (%d)' % (dictionary_info['ndirs'], self.get_config('ndirs')) ) + self.DICTIONARY['ndirs'] = dictionary_info['ndirs'] + + # load mask + self.set_config('dictionary_mask', 'mask' if use_all_voxels_in_mask else 'tdi' ) + mask_filename = pjoin(self.get_config('TRACKING_path'),'dictionary_%s.nii'%self.get_config('dictionary_mask')) + if not exists( mask_filename ) : + mask_filename += '.gz' + if not exists( mask_filename ) : + ERROR( 'Dictionary not found. Execute "trk2dictionary" script first' ); + niiMASK = nibabel.load( mask_filename ) + niiMASK_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() + if ( self.get_config('dim')[0]!=niiMASK.shape[0] or + self.get_config('dim')[1]!=niiMASK.shape[1] or + self.get_config('dim')[2]!=niiMASK.shape[2] or + abs(self.get_config('pixdim')[0]-niiMASK_hdr['pixdim'][1])>1e-3 or + abs(self.get_config('pixdim')[1]-niiMASK_hdr['pixdim'][2])>1e-3 or + abs(self.get_config('pixdim')[2]-niiMASK_hdr['pixdim'][3])>1e-3 ) : + WARNING( 'Dictionary does not have the same geometry as the dataset' ) + self.DICTIONARY['MASK'] = (niiMASK.get_data() > 0).astype(np.uint8) + + # segments from the tracts + # ------------------------ + print( '\t* Segments from the tracts... 
', end='' )
+        sys.stdout.flush()
+
+        self.DICTIONARY['TRK'] = {}
+        self.DICTIONARY['TRK']['kept'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_kept.dict'), dtype=np.uint8 )
+        self.DICTIONARY['TRK']['norm'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_norm.dict'), dtype=np.float32 )
+        self.DICTIONARY['TRK']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_TRK_len.dict'), dtype=np.float32 )
+
+
+        self.DICTIONARY['IC'] = {}
+        self.DICTIONARY['IC']['fiber'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_f.dict'), dtype=np.uint32 )
+        self.DICTIONARY['IC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_v.dict'), dtype=np.uint32 )
+        self.DICTIONARY['IC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_o.dict'), dtype=np.uint16 )
+        self.DICTIONARY['IC']['len'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_IC_len.dict'), dtype=np.float32 )
+        self.DICTIONARY['IC']['n'] = self.DICTIONARY['IC']['fiber'].size
+        self.DICTIONARY['IC']['nF'] = self.DICTIONARY['TRK']['norm'].size
+
+        # reorder the segments first by the "v" field and then by the "o" field
+        idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['v'])] )
+        self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ]
+        self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ]
+        self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ]
+        self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ]
+        del idx
+
+        # divide the length of each segment by the fiber length so that all the columns of the linear operator will have the same length
+        # NB: it works in conjunction with the normalization of the kernels
+        cdef :
+            np.float32_t [:] sl = self.DICTIONARY['IC']['len']
+            np.float32_t [:] tl = self.DICTIONARY['TRK']['norm']
+            np.uint32_t [:] f = self.DICTIONARY['IC']['fiber']
+            int s
+        if self.get_config('doNormalizeKernels') :
+            for s in xrange(self.DICTIONARY['IC']['n']) :
+                sl[s] /= tl[ f[s] ]
+
+        print( '[ %d fibers and %d segments ]' % ( self.DICTIONARY['IC']['nF'], self.DICTIONARY['IC']['n'] ) )
+
+        # segments from the peaks
+        # -----------------------
+        print( '\t* Segments from the peaks... ', end='' )
+        sys.stdout.flush()
+
+        self.DICTIONARY['EC'] = {}
+        self.DICTIONARY['EC']['v'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_v.dict'), dtype=np.uint32 )
+        self.DICTIONARY['EC']['o'] = np.fromfile( pjoin(self.get_config('TRACKING_path'),'dictionary_EC_o.dict'), dtype=np.uint16 )
+        self.DICTIONARY['EC']['nE'] = self.DICTIONARY['EC']['v'].size
+
+        # reorder the segments first by the "v" field and then by the "o" field
+        idx = np.lexsort( [np.array(self.DICTIONARY['EC']['o']), np.array(self.DICTIONARY['EC']['v'])] )
+        self.DICTIONARY['EC']['v'] = self.DICTIONARY['EC']['v'][ idx ]
+        self.DICTIONARY['EC']['o'] = self.DICTIONARY['EC']['o'][ idx ]
+        del idx
+
+        print( '[ %d segments ]' % self.DICTIONARY['EC']['nE'] )
+
+        # isotropic compartments
+        # ----------------------
+        print( '\t* Isotropic contributions... 
', end='' ) + sys.stdout.flush() + + self.DICTIONARY['ISO'] = {} + + self.DICTIONARY['nV'] = self.DICTIONARY['MASK'].sum() + + vx, vy, vz = ( self.DICTIONARY['MASK'] > 0 ).nonzero() # [TODO] find a way to avoid using int64 (not necessary and waste of memory) + vx = vx.astype(np.int32) + vy = vy.astype(np.int32) + vz = vz.astype(np.int32) + self.DICTIONARY['ISO']['v'] = vx + self.get_config('dim')[0] * ( vy + self.get_config('dim')[1] * vz ) + del vx, vy, vz + + # reorder the segments based on the "v" field + idx = np.argsort( self.DICTIONARY['ISO']['v'], kind='mergesort' ) + self.DICTIONARY['ISO']['v'] = self.DICTIONARY['ISO']['v'][ idx ] + del idx + + print( '[ %d voxels ]' % self.DICTIONARY['nV'] ) + + # post-processing + # --------------- + print( '\t* Post-processing... ', end='' ) + sys.stdout.flush() + + # get the indices to extract the VOI as in MATLAB (in place of DICTIONARY.MASKidx) + idx = self.DICTIONARY['MASK'].ravel(order='F').nonzero()[0] + self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] = np.unravel_index( idx, self.DICTIONARY['MASK'].shape, order='F' ) + + lut = np.zeros( self.get_config('dim'), dtype=np.uint32 ).ravel() + for i in xrange(idx.size) : + lut[ idx[i] ] = i + self.DICTIONARY['IC'][ 'v'] = lut[ self.DICTIONARY['IC'][ 'v'] ] + self.DICTIONARY['EC'][ 'v'] = lut[ self.DICTIONARY['EC'][ 'v'] ] + self.DICTIONARY['ISO']['v'] = lut[ self.DICTIONARY['ISO']['v'] ] + + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def set_threads( self, n = None, nthreads = None, gpu_id = 0 ) : + """Set the number of threads to use for the matrix-vector operations with A and A'. + + Parameters + ---------- + n : integer + Same as nthreads. This remains just for compatibility with previous versions + + nthreads : integer + Number of threads to use (nthreads = None ---> all the CPU threads available in the system + nthreads = 0 ---> enable CUDA GPU acceleration) + gpu_id : integer + GPU ID of the Nvidia GPU where COMMIT will be executed, default=0 and it is only required if nthreads=0 + (To show a list of Nvidia GPUs and their IDs, open a system shell and run the command 'nvidia-smi') + """ + if nthreads is None : + if n != None : + WARNING( '"n" parameter is deprecated, use "nthreads" instead' ) + nthreads = n + else: + # Set to the number of CPUs in the system + try : + import multiprocessing + nthreads = multiprocessing.cpu_count() + except : + nthreads = 1 + + if nthreads < 0 or nthreads > 255 : + ERROR( 'Number of threads must be between 0 and 255' ) + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.KERNELS is None : + ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) + + self.THREADS = {} + self.THREADS['n'] = nthreads + if nthreads == 0: + self.THREADS['gpu_id'] = gpu_id + LOG( '\n-> Checking CUDA GPU:' ) + + from commit.cudaoperator.operator import check_compatibility + #cdef unsigned long long required_mem = 28*self.n + 6*self.nzeppelins + 8.0*(size_t)nfibers + 16.0*(size_t)nvoxels + 4.0*((size_t)size_lutic + (size_t)size_lutec + (size_t)size_lutiso + (size_t)this->nrows + (size_t)this->ncols) + error_id = check_compatibility(gpu_id) + if error_id == 1: + ERROR( 'The selected GPU is not detected' ) + elif error_id == 2: + ERROR( 'Impossible to set GPU with ID=%d' % gpu_id ) + elif error_id == 3: + ERROR( 'Impossible to get properties from GPU with ID=%d' % gpu_id ) + elif error_id == 4: + ERROR( 'Compute 
capability must be at least 5.0' ) + + if gpu_id == 0: + LOG( ' [ Default GPU selected. Use option "gpu_id" in "set_threads()" to change selection ]' ) + + cdef : + long [:] C + long t, tot, i1, i2, N, c + int i + + tic = time.time() + + if nthreads > 0: + LOG( '\n-> Distributing workload to different threads:' ) + print( '\t* number of threads : %d' % nthreads ) + + # Distribute load for the computation of A*x product + print( '\t* A operator... ', end='' ) + sys.stdout.flush() + + self.THREADS['IC'] = None + self.THREADS['EC'] = None + self.THREADS['ISO'] = None + self.THREADS['ICt'] = None + self.THREADS['ECt'] = None + self.THREADS['ISOt'] = None + + if self.DICTIONARY['IC']['n'] > 0 : + self.THREADS['IC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + if nthreads > 1 : + N = np.floor( self.DICTIONARY['IC']['n']/nthreads ) + t = 1 + tot = 0 + C = np.bincount( self.DICTIONARY['IC']['v'] ) + for c in C : + tot += c + if tot >= N : + self.THREADS['IC'][t] = self.THREADS['IC'][t-1] + tot + t += 1 + tot = 0 + self.THREADS['IC'][nthreads] = self.DICTIONARY['IC']['n'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['IC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the IC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['EC']['nE'] > 0 : + self.THREADS['EC'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['EC'][i] = np.searchsorted( self.DICTIONARY['EC']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['EC'][nthreads] = self.DICTIONARY['EC']['nE'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['EC'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' ) + + if self.DICTIONARY['nV'] > 0 : + self.THREADS['ISO'] = np.zeros( nthreads+1, dtype=np.uint32 ) + for i in xrange(nthreads) : + self.THREADS['ISO'][i] = np.searchsorted( self.DICTIONARY['ISO']['v'], self.DICTIONARY['IC']['v'][ self.THREADS['IC'][i] ] ) + self.THREADS['ISO'][nthreads] = self.DICTIONARY['nV'] + + # check if some threads are not assigned any segment + if np.count_nonzero( np.diff( self.THREADS['ISO'].astype(np.int32) ) <= 0 ) : + self.THREADS = None + ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' ) + + print( '[ OK ]' ) + + # Distribute load for the computation of At*y product + print( '\t* A\' operator... 
', end="" )
+            sys.stdout.flush()
+
+            if self.DICTIONARY['IC']['n'] > 0 :
+                self.THREADS['ICt'] = np.full( self.DICTIONARY['IC']['n'], nthreads-1, dtype=np.uint8 )
+                if nthreads > 1 :
+                    idx = np.argsort( self.DICTIONARY['IC']['fiber'], kind='mergesort' )
+                    C = np.bincount( self.DICTIONARY['IC']['fiber'] )
+                    t = tot = i1 = i2 = 0
+                    N = np.floor(self.DICTIONARY['IC']['n']/nthreads)
+                    for c in C :
+                        i2 += c
+                        tot += c
+                        if tot >= N :
+                            self.THREADS['ICt'][ i1:i2 ] = t
+                            t += 1
+                            if t==nthreads-1 :
+                                break
+                            i1 = i2
+                            tot = c
+                    self.THREADS['ICt'][idx] = self.THREADS['ICt'].copy()
+
+            if self.DICTIONARY['EC']['nE'] > 0 :
+                self.THREADS['ECt'] = np.zeros( nthreads+1, dtype=np.uint32 )
+                N = np.floor( self.DICTIONARY['EC']['nE']/nthreads )
+                for i in xrange(1,nthreads) :
+                    self.THREADS['ECt'][i] = self.THREADS['ECt'][i-1] + N
+                self.THREADS['ECt'][nthreads] = self.DICTIONARY['EC']['nE']
+
+                # check if some threads are not assigned any segment
+                if np.count_nonzero( np.diff( self.THREADS['ECt'].astype(np.int32) ) <= 0 ) :
+                    self.THREADS = None
+                    ERROR( 'Too many threads for the EC compartments to evaluate; try decreasing the number.' )
+
+            if self.DICTIONARY['nV'] > 0 :
+                self.THREADS['ISOt'] = np.zeros( nthreads+1, dtype=np.uint32 )
+                N = np.floor( self.DICTIONARY['nV']/nthreads )
+                for i in xrange(1,nthreads) :
+                    self.THREADS['ISOt'][i] = self.THREADS['ISOt'][i-1] + N
+                self.THREADS['ISOt'][nthreads] = self.DICTIONARY['nV']
+
+                # check if some threads are not assigned any segment
+                if np.count_nonzero( np.diff( self.THREADS['ISOt'].astype(np.int32) ) <= 0 ) :
+                    self.THREADS = None
+                    ERROR( 'Too many threads for the ISO compartments to evaluate; try decreasing the number.' )
+
+            print( '[ OK ]' )
+
+        LOG( '   [ %.1f seconds ]' % ( time.time() - tic ) )
+
+
+    def build_operator( self, build_dir=None ) :
+        """Compile/build the operator for computing the matrix-vector multiplications by A and A'
+        using the information from self.DICTIONARY, self.KERNELS and self.THREADS.
+        NB: this function needs to be called again to update the pointers to the data structures
+        whenever the data in self.DICTIONARY, self.KERNELS or self.THREADS is changed.
+
+        Parameters
+        ----------
+        build_dir : string
+            The folder in which to store the compiled files.
+            If None (default), they will end up in the .pyxbld directory in the user's home directory.
+            If using this option, it is recommended to use a temporary directory, quit your Python
+            console between each build, and delete the content of the temporary directory.
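+
+        Examples
+        --------
+        Typical calls ("mit" denotes this Evaluation instance; the build folder below
+        is only an example of a temporary directory):
+
+            mit.build_operator()                                  # compile into the default .pyxbld folder
+            mit.build_operator( build_dir='/tmp/COMMIT_build' )   # compile into a custom temporary folder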
+ """ + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.KERNELS is None : + ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) + if self.THREADS is None : + ERROR( 'Threads not set; call "set_threads()" first' ) + + if self.DICTIONARY['IC']['nF'] <= 0 : + ERROR( 'No streamline found in the dictionary; check your data' ) + if self.DICTIONARY['EC']['nE'] <= 0 and self.KERNELS['wmh'].shape[0] > 0 : + ERROR( 'The selected model has EC compartments, but no peaks have been provided; check your data' ) + + tic = time.time() + LOG( '\n-> Building linear operator A:' ) + + if self.THREADS['n'] > 0: + # need to pass these parameters at runtime for compiling the C code + from commit.operator import config + + compilation_is_needed = False + + if config.nTHREADS is None or config.nTHREADS != self.THREADS['n']: + compilation_is_needed = True + if config.nIC is None or config.nIC != self.KERNELS['wmr'].shape[0]: + compilation_is_needed = True + if config.model is None or config.model != self.model.id: + compilation_is_needed = True + if config.nEC is None or config.nEC != self.KERNELS['wmh'].shape[0]: + compilation_is_needed = True + if config.nISO is None or config.nISO != self.KERNELS['iso'].shape[0]: + compilation_is_needed = True + if config.build_dir != build_dir: + compilation_is_needed = True + + if compilation_is_needed or not 'commit.operator.operator' in sys.modules : + + if build_dir is not None: + if isdir(build_dir) and not len(listdir(build_dir)) == 0: + ERROR( '\nbuild_dir is not empty, unsafe build option.' ) + elif config.nTHREADS is not None: + ERROR( '\nThe parameter build_dir has changed, unsafe build option.' ) + else: + WARNING( '\nUsing build_dir, always quit your python console between COMMIT Evaluation.' ) + + config.nTHREADS = self.THREADS['n'] + config.model = self.model.id + config.nIC = self.KERNELS['wmr'].shape[0] + config.nEC = self.KERNELS['wmh'].shape[0] + config.nISO = self.KERNELS['iso'].shape[0] + config.build_dir = build_dir + + pyximport.install( reload_support=True, language_level=3, build_dir=build_dir, build_in_temp=True, inplace=False ) + + if not 'commit.operator.operator' in sys.modules : + import commit.operator.operator + else : + reload( sys.modules['commit.operator.operator'] ) + + self.A = sys.modules['commit.operator.operator'].LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + else: + import commit.cudaoperator.operator + self.A = commit.cudaoperator.operator.CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS, fcall=1 ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) + + + def get_y( self ): + """ + Returns a numpy array that corresponds to the 'y' vector of the optimisation problem. + NB: this can be run only after having loaded the dictionary and the data. + """ + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.niiDWI is None : + ERROR( 'Data not loaded; call "load_data()" first' ) + return self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float64) + + + def fit( self, tol_fun=1e-3, tol_x=1e-6, max_iter=100, verbose=1, x0=None, regularisation=None ) : + """Fit the model to the data. 
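+
+        For example ("mit" denotes this Evaluation instance; the keyword values shown
+        are simply the defaults listed in the Parameters section below):
+
+            mit.fit( tol_fun=1e-3, max_iter=100, verbose=1 )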
+ + Parameters + ---------- + tol_fun : float + Tolerance on the objective function (default : 1e-3) + max_iter : integer + Maximum number of iterations (default : 100) + verbose : integer + Level of verbosity: 0=no print, 1=print progress (default : 1) + x0 : np.array + Initial guess for the solution of the problem (default : None) + regularisation : commit.solvers.init_regularisation object + Python dictionary that describes the wanted regularisation term. + Check the documentation of commit.solvers.init_regularisation to see + how to properly define the wanted mathematical formulation + ( default : None ) + """ + if self.niiDWI is None : + ERROR( 'Data not loaded; call "load_data()" first' ) + if self.DICTIONARY is None : + ERROR( 'Dictionary not loaded; call "load_dictionary()" first' ) + if self.KERNELS is None : + ERROR( 'Response functions not generated; call "generate_kernels()" and "load_kernels()" first' ) + if self.THREADS is None : + ERROR( 'Threads not set; call "set_threads()" first' ) + if self.A is None : + ERROR( 'Operator not built; call "build_operator()" first' ) + + if x0 is not None : + if x0.shape[0] != self.A.shape[1] : + ERROR( 'x0 dimension does not match the number of columns of the dictionary' ) + if regularisation is None : + regularisation = commit.solvers.init_regularisation(self) + + self.CONFIG['optimization'] = {} + self.CONFIG['optimization']['tol_fun'] = tol_fun + self.CONFIG['optimization']['tol_x'] = tol_x + self.CONFIG['optimization']['max_iter'] = max_iter + self.CONFIG['optimization']['verbose'] = verbose + self.CONFIG['optimization']['regularisation'] = regularisation + + # run solver + t = time.time() + LOG( '\n-> Fit model:' ) + + self.x, opt_details = commit.solvers.solve(self.get_y(), self.A, self.A.T, tol_fun = tol_fun, tol_x = tol_x, max_iter = max_iter, verbose = verbose, x0 = x0, regularisation = regularisation) + + self.CONFIG['optimization']['fit_details'] = opt_details + self.CONFIG['optimization']['fit_time'] = time.time()-t + + LOG( '\n [ %s ]' % ( time.strftime("%Hh %Mm %Ss", time.gmtime(self.CONFIG['optimization']['fit_time']) ) ) ) + + + def get_coeffs( self ): + """ + Returns the coefficients, corresponding to the original optimisation problem, + i.e. the input tractogram to trk2dictionary, divided in three classes (ic, ec, iso). + """ + if self.x is None : + ERROR( 'Model not fitted to the data; call "fit()" first' ) + + nF = self.DICTIONARY['IC']['nF'] + nE = self.DICTIONARY['EC']['nE'] + nV = self.DICTIONARY['nV'] + + if self.get_config('doNormalizeKernels') : + # renormalize the coefficients + norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) + norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) + norm3 = np.repeat(self.KERNELS['iso_norm'],nV) + norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) + x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) + else : + x = self.x + + offset1 = nF * self.KERNELS['wmr'].shape[0] + offset2 = offset1 + nE * self.KERNELS['wmh'].shape[0] + kept = np.tile( self.DICTIONARY['TRK']['kept'], self.KERNELS['wmr'].shape[0] ) + xic = np.zeros( kept.size ) + xic[kept==1] = x[:offset1] + xec = x[offset1:offset2] + xiso = x[offset2:] + + return xic, xec, xiso + + + def save_results( self, path_suffix=None, stat_coeffs='sum', save_est_dwi=False, save_coeff=None, save_opt_details=None ) : + """Save the output (coefficients, errors, maps etc). 
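+
+        For example, to append a suffix to the output folder name and also save the
+        estimated DW-MRI signal ("mit" denotes this Evaluation instance; the suffix
+        below is arbitrary):
+
+            mit.save_results( path_suffix='_COMMIT', save_est_dwi=True )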
+ + Parameters + ---------- + path_suffix : string + Text to be appended to "Results" to create the output path (default : None) + stat_coeffs : string + Stat to be used if more coefficients are estimated for each streamline. + Options: 'sum', 'mean', 'median', 'min', 'max', 'all' (default : 'sum') + save_est_dwi : boolean + Save the estimated DW-MRI signal (default : False) + save_opt_details : boolean + DEPRECATED. The details of the optimization and the coefficients are always saved. + save_coeff : boolean + DEPRECATED. The estimated weights for the streamlines are always saved. + """ + RESULTS_path = 'Results_' + self.model.id + if path_suffix : + self.set_config('path_suffix', path_suffix) + RESULTS_path = RESULTS_path + path_suffix + + LOG( '\n-> Saving results to "%s/*":' % RESULTS_path ) + tic = time.time() + + if self.x is None : + ERROR( 'Model not fitted to the data; call "fit()" first' ) + + if save_coeff is not None : + WARNING('"save_coeff" parameter is deprecated') + + if save_opt_details is not None : + WARNING('"save_opt_details" parameter is deprecated') + + nF = self.DICTIONARY['IC']['nF'] + nE = self.DICTIONARY['EC']['nE'] + nV = self.DICTIONARY['nV'] + norm_fib = np.ones( nF ) + # x is the x of the original problem + # self.x is the x preconditioned + if self.get_config('doNormalizeKernels') : + # renormalize the coefficients + norm1 = np.repeat(self.KERNELS['wmr_norm'],nF) + norm2 = np.repeat(self.KERNELS['wmh_norm'],nE) + norm3 = np.repeat(self.KERNELS['iso_norm'],nV) + norm_fib = np.kron(np.ones(self.KERNELS['wmr'].shape[0]), self.DICTIONARY['TRK']['norm']) + x = self.x / np.hstack( (norm1*norm_fib,norm2,norm3) ) + else : + x = self.x + + # create folder or delete existing files (if any) + RESULTS_path = pjoin( self.get_config('TRACKING_path'), RESULTS_path ) + if not exists( RESULTS_path ) : + makedirs( RESULTS_path ) + else : + for f in glob.glob( pjoin(RESULTS_path,'*') ) : + remove( f ) + self.set_config('RESULTS_path', RESULTS_path) + + # Map of voxelwise errors + print( '\t* Fitting errors:' ) + + niiMAP_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + affine = self.niiDWI.affine if nibabel.__version__ >= '2.0.0' else self.niiDWI.get_affine() + niiMAP = nibabel.Nifti1Image( niiMAP_img, affine ) + niiMAP_hdr = niiMAP.header if nibabel.__version__ >= '2.0.0' else niiMAP.get_header() + niiMAP_hdr['descrip'] = 'Created with COMMIT %s'%self.get_config('version') + + y_mea = np.reshape( self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ].flatten().astype(np.float32), (nV,-1) ) + y_est = np.reshape( self.A.dot(self.x), (nV,-1) ).astype(np.float32) + + print( '\t\t- RMSE... ', end='' ) + sys.stdout.flush() + tmp = np.sqrt( np.mean((y_mea-y_est)**2,axis=1) ) + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = tmp.max() + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_RMSE.nii.gz') ) + print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + print( '\t\t- NRMSE... 
', end='' ) + sys.stdout.flush() + tmp = np.sum(y_mea**2,axis=1) + idx = np.where( tmp < 1E-12 ) + tmp[ idx ] = 1 + tmp = np.sqrt( np.sum((y_mea-y_est)**2,axis=1) / tmp ) + tmp[ idx ] = 0 + niiMAP_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = tmp + niiMAP_hdr['cal_min'] = 0 + niiMAP_hdr['cal_max'] = 1 + nibabel.save( niiMAP, pjoin(RESULTS_path,'fit_NRMSE.nii.gz') ) + print( '[ %.3f +/- %.3f ]' % ( tmp.mean(), tmp.std() ) ) + + # Map of compartment contributions + print( '\t* Voxelwise contributions:' ) + + print( '\t\t- Intra-axonal... ', end='' ) + sys.stdout.flush() + niiIC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmr']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = ( x[:offset].reshape( (-1,nF) ) * norm_fib.reshape( (-1,nF) ) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['IC']['v'], minlength=nV, + weights=tmp[ self.DICTIONARY['IC']['fiber'] ] * self.DICTIONARY['IC']['len'] + ).astype(np.float32) + niiIC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- Extra-axonal... ', end='' ) + sys.stdout.flush() + niiEC_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['wmh']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + tmp = x[offset:offset+nE*len(self.KERNELS['wmh'])].reshape( (-1,nE) ).sum( axis=0 ) + xv = np.bincount( self.DICTIONARY['EC']['v'], weights=tmp, minlength=nV ).astype(np.float32) + niiEC_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( '[ OK ]' ) + + print( '\t\t- Isotropic... ', end='' ) + sys.stdout.flush() + niiISO_img = np.zeros( self.get_config('dim'), dtype=np.float32 ) + if len(self.KERNELS['iso']) > 0 : + offset = nF * self.KERNELS['wmr'].shape[0] + nE * self.KERNELS['wmh'].shape[0] + xv = x[offset:].reshape( (-1,nV) ).sum( axis=0 ) + niiISO_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'] ] = xv + print( ' [ OK ]' ) + + if self.get_config('doNormalizeMaps') : + niiIC = nibabel.Nifti1Image( niiIC_img / ( niiIC_img + niiEC_img + niiISO_img + 1e-16), affine, header=niiMAP_hdr ) + niiEC = nibabel.Nifti1Image( niiEC_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) + niiISO = nibabel.Nifti1Image( niiISO_img / ( niiIC_img + niiEC_img + niiISO_img + 1E-16), affine, header=niiMAP_hdr ) + else: + niiIC = nibabel.Nifti1Image( niiIC_img, affine, header=niiMAP_hdr ) + niiEC = nibabel.Nifti1Image( niiEC_img, affine, header=niiMAP_hdr ) + niiISO = nibabel.Nifti1Image( niiISO_img, affine, header=niiMAP_hdr ) + + nibabel.save( niiIC , pjoin(RESULTS_path,'compartment_IC.nii.gz') ) + nibabel.save( niiEC , pjoin(RESULTS_path,'compartment_EC.nii.gz') ) + nibabel.save( niiISO , pjoin(RESULTS_path,'compartment_ISO.nii.gz') ) + + # Configuration and results + print( '\t* Configuration and results:' ) + + print( '\t\t- streamline_weights.txt... 
', end='' ) + sys.stdout.flush() + xic, _, _ = self.get_coeffs() + if stat_coeffs != 'all' and xic.size > 0 : + xic = np.reshape( xic, (-1,self.DICTIONARY['TRK']['kept'].size) ) + if stat_coeffs == 'sum' : + xic = np.sum( xic, axis=0 ) + elif stat_coeffs == 'mean' : + xic = np.mean( xic, axis=0 ) + elif stat_coeffs == 'median' : + xic = np.median( xic, axis=0 ) + elif stat_coeffs == 'min' : + xic = np.min( xic, axis=0 ) + elif stat_coeffs == 'max' : + xic = np.max( xic, axis=0 ) + else : + ERROR( 'Stat not allowed. Possible values: sum, mean, median, min, max, all.', prefix='\n' ) + np.savetxt( pjoin(RESULTS_path,'streamline_weights.txt'), xic, fmt='%.5e' ) + self.set_config('stat_coeffs', stat_coeffs) + print( '[ OK ]' ) + + # Save to a pickle file the following items: + # item 0: dictionary with all the configuration details + # item 1: np.array obtained through the optimisation process with the normalised kernels + # item 2: np.array renormalisation of coeffs in item 1 + print( '\t\t- results.pickle... ', end='' ) + sys.stdout.flush() + with open( pjoin(RESULTS_path,'results.pickle'), 'wb+' ) as fid : + pickle.dump( [self.CONFIG, self.x, x], fid, protocol=2 ) + print( ' [ OK ]' ) + + if save_est_dwi : + print( '\t\t- Estimated signal... ', end='' ) + sys.stdout.flush() + self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_est + nibabel.save( nibabel.Nifti1Image( self.niiDWI_img , affine ), pjoin(RESULTS_path,'fit_signal_estimated.nii.gz') ) + self.niiDWI_img[ self.DICTIONARY['MASK_ix'], self.DICTIONARY['MASK_iy'], self.DICTIONARY['MASK_iz'], : ] = y_mea + print( '[ OK ]' ) + + LOG( ' [ %.1f seconds ]' % ( time.time() - tic ) ) diff --git a/commit/cudaoperator/operator.pyx b/commit/cudaoperator/operator.pyx index a6278830..027bf484 100644 --- a/commit/cudaoperator/operator.pyx +++ b/commit/cudaoperator/operator.pyx @@ -1,224 +1,224 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -cimport numpy as np -from amico.util import ERROR, LOG - -cdef extern from "operator_withCUDA.cuh": - int checkCompatibility(int) - -def check_compatibility(gpu_id): - return checkCompatibility(gpu_id) - -def check_cuda(error_id): - if error_id == -1: - ERROR( 'Impossible to allocate auxiliar memory in CPU' ) - elif error_id == 1: - ERROR( 'Impossible to allocate memory in GPU' ) - elif error_id == 2: - ERROR( 'Impossible to transfer memory to GPU' ) - elif error_id == 3: - ERROR( 'Impossible to bind textures' ) - elif error_id == 4: - ERROR( 'Impossible to transfer constant values to GPU' ) - elif error_id == 5: - ERROR( 'There was a problem deleting GPU memory' ) - elif error_id == 6: - ERROR( 'There was a problem unbinding texture memory' ) - elif error_id == 7: - ERROR( 'There was a problem resetting GPU' ) - elif error_id == 0: - print( '[ OK ]' ) - -cdef extern from "operator_withCUDA.cuh": - cdef cppclass C_CudaLinearOperator "CudaLinearOperator": - C_CudaLinearOperator(int, int, int, int, int, int, int, int, int) - - int setDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, np.uint32_t*, np.uint16_t*) - int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) - int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) - int setVectors() - int setGlobals() - int destroy() - - void dot(np.float64_t*, np.float64_t*) - void Tdot(np.float64_t*, np.float64_t*) - -cdef class CudaLinearOperator : - """This class is a wrapper to the 
CUDA C++ code for performing marix-vector multiplications - with the COMMIT linear operator A in a CUDA GPU. The multiplications are done using CUDA C++ code - that uses information from the DICTIONARY and KERNELS data structures. - """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs, gpu_id - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - # pointer to this operator in GPU memory - cdef C_CudaLinearOperator* thisptr - - # these should be always None, they remain for compatibility - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - - - def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - self.gpu_id = THREADS['gpu_id'] # id of the CUDA GPU - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = &wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - # create the operator in GPU memory - self.thisptr = new C_CudaLinearOperator(self.n, self.nV, self.nF, self.nE, self.ndirs, self.nS, self.nR, self.nT, self.nI) - - # build operator in GPU only one time - if fcall == 1: - print( '\t* global values... ', end='' ) - check_cuda( self.thisptr.setGlobals() ) - - print( '\t* lookup tables... ', end='' ) - check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) - - print( '\t* x&y vectors... ', end='' ) - check_cuda( self.thisptr.setVectors() ) - - print( '\t* A operator... 
', end='' ) - check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) - - idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) - - self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] - self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] - self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] - self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] - - ICf = self.DICTIONARY['IC']['fiber'] - ICl = self.DICTIONARY['IC']['len'] - ICv = self.DICTIONARY['IC']['v'] - ICo = self.DICTIONARY['IC']['o'] - - self.ICf = &ICf[0] - self.ICl = &ICl[0] - self.ICv = &ICv[0] - self.ICo = &ICo[0] - - print( '\t* A\' operator... ', end='' ) - check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) - - def __del__( self ): - self.thisptr.destroy() - - @property - def T( self ) : - """Transpose of the explicit matrix.""" - C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. - - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - ERROR( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - self.thisptr.dot(&v_in[0], &v_out[0]) - else : - # INVERSE PRODUCT A'*y - self.thisptr.Tdot(&v_in[0], &v_out[0]) - - return v_out - - +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +cimport numpy as np +from amico.util import ERROR, LOG + +cdef extern from "operator_withCUDA.cuh": + int checkCompatibility(int) + +def check_compatibility(gpu_id): + return checkCompatibility(gpu_id) + +def check_cuda(error_id): + if error_id == -1: + ERROR( 'Impossible to allocate auxiliar memory in CPU' ) + elif error_id == 1: + ERROR( 'Impossible to allocate memory in GPU' ) + elif error_id == 2: + ERROR( 'Impossible to transfer memory to GPU' ) + elif error_id == 3: + ERROR( 'Impossible to bind textures' ) + elif error_id == 4: + ERROR( 'Impossible to transfer constant values to GPU' ) + elif error_id == 5: + ERROR( 'There was a problem deleting GPU memory' ) + elif error_id == 6: + ERROR( 'There was a problem unbinding texture memory' ) + elif error_id == 7: + ERROR( 'There was a problem resetting GPU' ) + elif error_id == 0: + print( '[ OK ]' ) + +cdef extern from "operator_withCUDA.cuh": + cdef cppclass C_CudaLinearOperator "CudaLinearOperator": + C_CudaLinearOperator(int, int, int, int, int, int, int, int, int) + + int setDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*, np.uint32_t*, np.uint16_t*) + int setTransposeDictionary(np.uint32_t*, np.uint32_t*, np.uint16_t*, np.float32_t*) + int setKernels(np.float32_t*, np.float32_t*, np.float32_t*) + int setVectors() + int setGlobals() + int destroy() + + 
void dot(np.float64_t*, np.float64_t*) + void Tdot(np.float64_t*, np.float64_t*) + +cdef class CudaLinearOperator : + """This class is a wrapper to the CUDA C++ code for performing marix-vector multiplications + with the COMMIT linear operator A in a CUDA GPU. The multiplications are done using CUDA C++ code + that uses information from the DICTIONARY and KERNELS data structures. + """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs, gpu_id + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + # pointer to this operator in GPU memory + cdef C_CudaLinearOperator* thisptr + + # these should be always None, they remain for compatibility + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS, fcall = 0 ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + self.gpu_id = THREADS['gpu_id'] # id of the CUDA GPU + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # create the operator in GPU memory + self.thisptr = new C_CudaLinearOperator(self.n, self.nV, self.nF, self.nE, self.ndirs, self.nS, self.nR, self.nT, self.nI) + + # build operator in GPU only one time + if fcall == 1: + print( '\t* global values... ', end='' ) + check_cuda( self.thisptr.setGlobals() ) + + print( '\t* lookup tables... ', end='' ) + check_cuda( self.thisptr.setKernels(&wmrSFP[0,0,0], &wmhSFP[0,0,0], &isoSFP[0,0]) ) + + print( '\t* x&y vectors... 
', end='' ) + check_cuda( self.thisptr.setVectors() ) + + print( '\t* A operator... ', end='' ) + check_cuda( self.thisptr.setDictionary(&ICv[0],&ICf[0],&ICo[0],&ICl[0], &ECv[0],&ECo[0]) ) + + idx = np.lexsort( [np.array(self.DICTIONARY['IC']['o']), np.array(self.DICTIONARY['IC']['fiber'])] ) + + self.DICTIONARY['IC']['v'] = self.DICTIONARY['IC']['v'][ idx ] + self.DICTIONARY['IC']['o'] = self.DICTIONARY['IC']['o'][ idx ] + self.DICTIONARY['IC']['fiber'] = self.DICTIONARY['IC']['fiber'][ idx ] + self.DICTIONARY['IC']['len'] = self.DICTIONARY['IC']['len'][ idx ] + + ICf = self.DICTIONARY['IC']['fiber'] + ICl = self.DICTIONARY['IC']['len'] + ICv = self.DICTIONARY['IC']['v'] + ICo = self.DICTIONARY['IC']['o'] + + self.ICf = &ICf[0] + self.ICl = &ICl[0] + self.ICv = &ICv[0] + self.ICo = &ICo[0] + + print( '\t* A\' operator... ', end='' ) + check_cuda( self.thisptr.setTransposeDictionary(&self.ICv[0], &self.ICf[0], &self.ICo[0], &self.ICl[0]) ) + + def __del__( self ): + self.thisptr.destroy() + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = CudaLinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): + """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + ERROR( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + self.thisptr.dot(&v_in[0], &v_out[0]) + else : + # INVERSE PRODUCT A'*y + self.thisptr.Tdot(&v_in[0], &v_out[0]) + + return v_out + + diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index 4dff1ca3..ea014db5 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -1,694 +1,694 @@ -#include "operator_withCUDA.cuh" - -// ==================================================== -// Textures for LUT in the GPU -// ==================================================== -texture tex_lutIC; -texture tex_lutEC; -texture tex_lutISO; - - -int checkCompatibility(int gpuID) { - int gpuCount; - cudaError_t cudaStatus; - - cudaStatus = cudaGetDeviceCount(&gpuCount); - - if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaSetDevice(gpuID); - - if (cudaStatus != cudaSuccess) return 2; - - cudaDeviceProp gpuProperties; - cudaStatus = cudaGetDeviceProperties(&gpuProperties, gpuID); - - if (cudaStatus != cudaSuccess) return 3; - - printf("\t* selected GPU... [ %s ]\n", gpuProperties.name); - printf("\t* total memory... [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); - printf("\t* compute capability... 
[ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); - - if(gpuProperties.major < 5) return 4; - - return 0; -} - -void cudaCheckLastError() -{ - cudaError_t err = cudaGetLastError(); - - if(err != cudaSuccess){ - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - exit(-1); - } -} - -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ - - // fill arrays with zeros - memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); - - // count compartments per block - for(int i = 0; i < NUM_COMPARTMENTS; i++) - compartmentsPerBlock[data[i]]++; - - // calculate offset per block - offsetPerBlock[0] = 0; - for(int i = 1; i < NUM_BLOCKS; i++) - offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; -} - -int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC){ - - cudaError_t cudaStatus; - - uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); - - if (segmentsPerBlock == NULL || offsetPerBlock == NULL) return -1; - - preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - if (npeaks > 0){ - preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); - - cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - } - - free(segmentsPerBlock); - free(offsetPerBlock); - - // alloc IC part of the dictionary in GPU - cudaStatus = cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - - // transfer IC part of the dictionary to GPU - cudaStatus = cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) 
return 2; - cudaStatus = cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - if (npeaks > 0){ - // alloc EC part of the dictionary in GPU - cudaStatus = cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)); - if (cudaStatus != cudaSuccess) return 1; - - // transfer EC part of the dictionary to GPU - cudaStatus = cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - } - - return 0; -} - -int CudaLinearOperator::setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC){ - - cudaError_t cudaStatus; - - uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); - if(fibersPerBlock == NULL || offsetPerBlock == NULL) return -1; - - preprocessDataForGPU(TfiberIC, nsegments, fibersPerBlock, offsetPerBlock, nfibers); - - cudaStatus = cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - free(fibersPerBlock); - free(offsetPerBlock); - - cudaStatus = cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ; - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ; - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ; - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - - cudaStatus = cudaMemcpy(gpu_TvoxelIC, TvoxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_TfiberIC, TfiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_TorienIC, TorienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - cudaStatus = cudaMemcpy(gpu_TlengthIC, TlengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - return 0; -} - -int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO){ - - cudaError_t cudaStatus; - - if (ndiameters > 0){ - cudaStatus = cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - 
- tex_lutIC.addressMode[0] = cudaAddressModeBorder; - tex_lutIC.addressMode[1] = cudaAddressModeBorder; - tex_lutIC.filterMode = cudaFilterModePoint; - tex_lutIC.normalized = false; - - cudaStatus = cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 3; - } - - if (nzeppelins > 0){ - cudaStatus = cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - tex_lutEC.addressMode[0] = cudaAddressModeBorder; - tex_lutEC.addressMode[1] = cudaAddressModeBorder; - tex_lutEC.filterMode = cudaFilterModePoint; - tex_lutEC.normalized = false; - - cudaStatus = cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 3; - } - - if (nballs > 0){ - cudaStatus = cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) return 2; - - tex_lutISO.addressMode[0] = cudaAddressModeBorder; - tex_lutISO.addressMode[1] = cudaAddressModeBorder; - tex_lutISO.filterMode = cudaFilterModePoint; - tex_lutISO.normalized = false; - - cudaStatus = cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso*sizeof(float32_t)); - if (cudaStatus != cudaSuccess) return 3; - } - - return 0; -} - -int CudaLinearOperator::setVectors(){ - - cudaError_t cudaStatus; - - cudaStatus = cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)); - if (cudaStatus != cudaSuccess) return 1; - cudaStatus = cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)); - if (cudaStatus != cudaSuccess) return 1; - - return 0; -} - -int CudaLinearOperator::setGlobals(){ - - cudaError_t cudaStatus; - - cudaStatus = cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - cudaStatus = cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)); - if (cudaStatus != cudaSuccess) return -1; - - return 0; -} - -CudaLinearOperator::CudaLinearOperator(int nsegments, int nvoxels, int nfibers, 
int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs){ - - this->nsegments = nsegments; - this->nvoxels = nvoxels; - this->nfibers = nfibers; - this->npeaks = npeaks; - this->norientations = norientations; - this->nsamples = nsamples; - this->ndiameters = ndiameters; - this->nzeppelins = nzeppelins; - this->nballs = nballs; - this->size_lutic = ndiameters*norientations*nsamples; - this->size_lutec = nzeppelins*norientations*nsamples; - this->size_lutiso = nballs*nsamples; - this->nrows = nvoxels*nsamples; - this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs; -} - -CudaLinearOperator::~CudaLinearOperator() {} - -int CudaLinearOperator::destroy(){ - cudaError_t cudaStatus; - - cudaStatus = cudaFree(gpu_voxelIC); - cudaStatus = cudaFree(gpu_fiberIC); - cudaStatus = cudaFree(gpu_orienIC); - cudaStatus = cudaFree(gpu_lengthIC); - cudaStatus = cudaFree(gpu_voxelEC); - cudaStatus = cudaFree(gpu_orienEC); - cudaStatus = cudaFree(gpu_segmentsPerBlockIC); - cudaStatus = cudaFree(gpu_offsetPerBlockIC); - cudaStatus = cudaFree(gpu_segmentsPerBlockEC); - cudaStatus = cudaFree(gpu_offsetPerBlockEC); - - cudaStatus = cudaFree(gpu_TvoxelIC); - cudaStatus = cudaFree(gpu_TfiberIC); - cudaStatus = cudaFree(gpu_TorienIC); - cudaStatus = cudaFree(gpu_TlengthIC); - cudaStatus = cudaFree(gpu_TfibersPerBlockIC); - cudaStatus = cudaFree(gpu_ToffsetPerBlockIC); - - cudaStatus = cudaFree(gpu_x); - cudaStatus = cudaFree(gpu_y); - - cudaStatus = cudaFree(gpu_lutIC); - cudaStatus = cudaFree(gpu_lutEC); - cudaStatus = cudaFree(gpu_lutISO); - cudaStatus = cudaUnbindTexture(tex_lutIC); - cudaStatus = cudaUnbindTexture(tex_lutEC); - cudaStatus = cudaUnbindTexture(tex_lutISO); - - cudaStatus = cudaDeviceReset(); - - return 0; -} - -void cudaCheckKernel(){ - cudaError_t cudaStatus; - - cudaStatus = cudaGetLastError(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus)); - else - printf("\t* kernel launch... [ OK ]\n"); - - cudaStatus = cudaDeviceSynchronize(); - if(cudaStatus != cudaSuccess) - fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus); - else - printf("\t* cudaDeviceSynchronize() after launching kernel... 
[ OK ]\n"); -} - -void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ - - // Copy vector x to the GPU - cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); - //cudaCheckLastError(); - - // Multiply IC part in the GPU - //multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply EC part in the GPU - multiply_Ax_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply ISO part in the GPU - multiply_Ax_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Copy back result to CPU - cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); - //cudaCheckLastError(); -} - -void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ - - // Copy vector y to the GPU - cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); - //cudaCheckLastError(); - - // Multiply IC part in the GPU - multiply_Aty_ICpart<<>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply EC part in the GPU - multiply_Aty_ECpart<<>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Multiply ISO part in the GPU - multiply_Aty_ISOpart<<>>(gpu_lutISO, gpu_x, gpu_y); - //cudaCheckLastError(); - - // Copy back result to CPU - cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); - //cudaCheckLastError(); -} - -// ------------------------------------------------------- KERNELS ------------------------------------------------------- // -/*__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[1024]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - uint32_t gid = threadIdx.x / 512; - uint32_t sid = threadIdx.x - 512*gid; - - shmem[tid] = 0.0; - - if(sid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid; - uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid; - - uint32_t* voxel = voxelIDs + offset; - uint32_t* fiber = fiberIDs + offset; - uint16_t* orien = orienIDs + offset; - float32_t* length = lengths + offset; - - float64_t sum = 0.0; - - for(int i = 0; i < nsegments; i++){ - int offset_lut = (*orien)*NUM_SAMPLES + sid; - - float64_t aux = 0.0; - for(int j = 0; j < NUM_DIAMETERS; j++){ - aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS]; - //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS]; - } - - sum += aux * (*length); - - fiber++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < NUM_SAMPLES) - y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; -}//*/ - -__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerVoxel, - uint32_t* offsetPerVoxel, - float32_t* lut, 
- float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - uint32_t vid = bid*256 + tid; - - if (vid >= NUM_VOXELS) return; - - uint32_t offset = offsetPerVoxel[ vid ]; - uint32_t nsegments = segmentsPerVoxel[ vid ]; - - uint32_t* voxel = voxelIDs + offset; - uint32_t* fiber = fiberIDs + offset; - uint16_t* orien = orienIDs + offset; - float32_t* length = lengths + offset; - - for(int i=0; i= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = segmentsPerBlock[bid]; - - uint32_t* voxel = voxelIDs + offset; - uint16_t* orien = orienIDs + offset; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; - - float64_t sum = 0.0; - for(int i = 0; i < nsegments; i++){ - uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; - - for(int j = 0; j < NUM_ZEPPELINS; j++) - sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; - //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; - - orien++; - } - - y[(*voxel)*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - if(tid >= NUM_SAMPLES) return; - - uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - float64_t sum = 0.0; - for(int j = 0; j < NUM_BALLS; j++) - sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; - //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; - - - y[bid*NUM_SAMPLES + tid] += sum; -} - -__global__ void multiply_Aty_ICpart( - uint32_t* voxelICt, - uint32_t* fiberICt, - uint16_t* orienICt, - float32_t* lengthICt, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t nsegments = offset + compartmentsPerBlock[bid]; - - uint32_t* voxel = voxelICt + offset; - uint32_t* fiber = fiberICt + offset; - uint16_t* orien = orienICt + offset; - float32_t* length = lengthICt + offset; - - for(int j = 0; j < NUM_DIAMETERS; j++){ - int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - float64_t sum = 0.0; - voxel = voxelICt + offset; - orien = orienICt + offset; - length = lengthICt + offset; - for(int i = offset; i < nsegments; i++){ - sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; - //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; - - voxel++; - orien++; - length++; - } - - shmem[tid] = sum; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - - if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - - __syncthreads(); - } -} - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* 
segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - __shared__ float64_t shmem[512]; - - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - uint32_t offset = offsetPerBlock[bid]; - uint32_t ncompartments = segmentsPerBlock[bid] + offset; - - uint32_t* voxel = voxelEC + offset; - uint16_t* orien = orienEC + offset; - - for(int j = 0; j < NUM_ZEPPELINS; j++){ - uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; - - voxel = voxelEC + offset; - orien = orienEC + offset; - for(int i = offset; i < ncompartments; i++){ - shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); - - if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; - - voxel++; - orien++; - __syncthreads(); - } - } -} - -__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ - __shared__ double shmem[512]; - - uint bid = blockIdx.x; - uint tid = threadIdx.x; - uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; - - shmem[tid] = 0.0; - - if(tid >= NUM_SAMPLES) return; - - for(int j = 0; j < NUM_BALLS; j++){ - shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; - //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; - __syncthreads(); - - if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); - if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); - if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); - if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); - if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); - if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); - if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); - - if(tid == 0) - x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; - } -} - +#include "operator_withCUDA.cuh" + +// ==================================================== +// Textures for LUT in the GPU +// ==================================================== +texture tex_lutIC; +texture tex_lutEC; +texture tex_lutISO; + + +int checkCompatibility(int gpuID) { + int gpuCount; + cudaError_t cudaStatus; + + cudaStatus = cudaGetDeviceCount(&gpuCount); + + if (gpuCount <= 0 || gpuID >= gpuCount || cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaSetDevice(gpuID); + + if (cudaStatus != cudaSuccess) return 2; + + cudaDeviceProp gpuProperties; + cudaStatus = cudaGetDeviceProperties(&gpuProperties, gpuID); + + if (cudaStatus != cudaSuccess) return 3; + + printf("\t* selected GPU... [ %s ]\n", gpuProperties.name); + printf("\t* total memory... [ %.2fGB ]\n", gpuProperties.totalGlobalMem*1e-9); + printf("\t* compute capability... 
[ %d.%d ]\n", gpuProperties.major, gpuProperties.minor); + + if(gpuProperties.major < 5) return 4; + + return 0; +} + +void cudaCheckLastError() +{ + cudaError_t err = cudaGetLastError(); + + if(err != cudaSuccess){ + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + exit(-1); + } +} + +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS){ + + // fill arrays with zeros + memset(compartmentsPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + memset(offsetPerBlock, 0, NUM_BLOCKS * sizeof(uint32_t)); + + // count compartments per block + for(int i = 0; i < NUM_COMPARTMENTS; i++) + compartmentsPerBlock[data[i]]++; + + // calculate offset per block + offsetPerBlock[0] = 0; + for(int i = 1; i < NUM_BLOCKS; i++) + offsetPerBlock[i] = offsetPerBlock[i-1] + compartmentsPerBlock[i-1]; +} + +int CudaLinearOperator::setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC){ + + cudaError_t cudaStatus; + + uint32_t* segmentsPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nvoxels*sizeof(uint32_t)); + + if (segmentsPerBlock == NULL || offsetPerBlock == NULL) return -1; + + preprocessDataForGPU(voxelIC, nsegments, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockIC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockIC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_segmentsPerBlockIC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_offsetPerBlockIC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + if (npeaks > 0){ + preprocessDataForGPU(voxelEC, npeaks, segmentsPerBlock, offsetPerBlock, nvoxels); + + cudaStatus = cudaMalloc((void**)&gpu_segmentsPerBlockEC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_offsetPerBlockEC, nvoxels*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_segmentsPerBlockEC, segmentsPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_offsetPerBlockEC, offsetPerBlock, nvoxels*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + } + + free(segmentsPerBlock); + free(offsetPerBlock); + + // alloc IC part of the dictionary in GPU + cudaStatus = cudaMalloc((void**)&gpu_voxelIC, nsegments*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_fiberIC, nsegments*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_orienIC, nsegments*sizeof(uint16_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_lengthIC, nsegments*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + + // transfer IC part of the dictionary to GPU + cudaStatus = cudaMemcpy(gpu_voxelIC, voxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_fiberIC, fiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) 
return 2; + cudaStatus = cudaMemcpy(gpu_orienIC, orienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_lengthIC, lengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + if (npeaks > 0){ + // alloc EC part of the dictionary in GPU + cudaStatus = cudaMalloc((void**)&gpu_voxelEC, npeaks*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_orienEC, npeaks*sizeof(uint16_t)); + if (cudaStatus != cudaSuccess) return 1; + + // transfer EC part of the dictionary to GPU + cudaStatus = cudaMemcpy(gpu_voxelEC, voxelEC, npeaks*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_orienEC, orienEC, npeaks*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + } + + return 0; +} + +int CudaLinearOperator::setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC){ + + cudaError_t cudaStatus; + + uint32_t* fibersPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + uint32_t* offsetPerBlock = (uint32_t*) malloc(nfibers*sizeof(uint32_t)); + if(fibersPerBlock == NULL || offsetPerBlock == NULL) return -1; + + preprocessDataForGPU(TfiberIC, nsegments, fibersPerBlock, offsetPerBlock, nfibers); + + cudaStatus = cudaMalloc((void**)&gpu_TfibersPerBlockIC, nfibers*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_ToffsetPerBlockIC, nfibers*sizeof(uint32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_TfibersPerBlockIC, fibersPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_ToffsetPerBlockIC, offsetPerBlock, nfibers*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + free(fibersPerBlock); + free(offsetPerBlock); + + cudaStatus = cudaMalloc((void**)&gpu_TvoxelIC, nsegments*sizeof(uint32_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TfiberIC, nsegments*sizeof(uint32_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TorienIC, nsegments*sizeof(uint16_t)) ; + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_TlengthIC, nsegments*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + + cudaStatus = cudaMemcpy(gpu_TvoxelIC, TvoxelIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TfiberIC, TfiberIC, nsegments*sizeof(uint32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TorienIC, TorienIC, nsegments*sizeof(uint16_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + cudaStatus = cudaMemcpy(gpu_TlengthIC, TlengthIC, nsegments*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + return 0; +} + +int CudaLinearOperator::setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO){ + + cudaError_t cudaStatus; + + if (ndiameters > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutIC, size_lutic*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutIC, lutIC, size_lutic*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + 
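+        // The IC lookup table is also exposed through the 1D texture reference
+        // tex_lutIC (point sampling, border addressing, unnormalised coordinates);
+        // the commented-out tex1Dfetch() calls in the kernels can read the LUT
+        // through this texture instead of plain global memory.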
+ tex_lutIC.addressMode[0] = cudaAddressModeBorder; + tex_lutIC.addressMode[1] = cudaAddressModeBorder; + tex_lutIC.filterMode = cudaFilterModePoint; + tex_lutIC.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutIC, gpu_lutIC, size_lutic*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + if (nzeppelins > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutEC, size_lutec*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutEC, lutEC, size_lutec*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutEC.addressMode[0] = cudaAddressModeBorder; + tex_lutEC.addressMode[1] = cudaAddressModeBorder; + tex_lutEC.filterMode = cudaFilterModePoint; + tex_lutEC.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutEC, gpu_lutEC, size_lutec*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + if (nballs > 0){ + cudaStatus = cudaMalloc((void**)&gpu_lutISO, size_lutiso*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMemcpy(gpu_lutISO, lutISO, size_lutiso*sizeof(float32_t), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) return 2; + + tex_lutISO.addressMode[0] = cudaAddressModeBorder; + tex_lutISO.addressMode[1] = cudaAddressModeBorder; + tex_lutISO.filterMode = cudaFilterModePoint; + tex_lutISO.normalized = false; + + cudaStatus = cudaBindTexture(NULL, tex_lutISO, gpu_lutISO, size_lutiso*sizeof(float32_t)); + if (cudaStatus != cudaSuccess) return 3; + } + + return 0; +} + +int CudaLinearOperator::setVectors(){ + + cudaError_t cudaStatus; + + cudaStatus = cudaMalloc((void**)&gpu_x, ncols*sizeof(float64_t)); + if (cudaStatus != cudaSuccess) return 1; + cudaStatus = cudaMalloc((void**)&gpu_y, nrows*sizeof(float64_t)); + if (cudaStatus != cudaSuccess) return 1; + + return 0; +} + +int CudaLinearOperator::setGlobals(){ + + cudaError_t cudaStatus; + + cudaStatus = cudaMemcpyToSymbol(NUM_VOXELS, &nvoxels, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_FIBERS, &nfibers, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_PEAKS, &npeaks, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ORIENTATIONS, &norientations, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_SAMPLES, &nsamples, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_DIAMETERS, &ndiameters, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ZEPPELINS, &nzeppelins, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_BALLS, &nballs, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_ROWS, &nrows, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(NUM_COLS, &ncols, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTIC, &size_lutic, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTEC, &size_lutec, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + cudaStatus = cudaMemcpyToSymbol(SIZE_LUTISO, &size_lutiso, sizeof(int)); + if (cudaStatus != cudaSuccess) return -1; + + return 0; +} + +CudaLinearOperator::CudaLinearOperator(int nsegments, int nvoxels, int nfibers, 
+                                       int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs){
+
+    this->nsegments = nsegments;
+    this->nvoxels = nvoxels;
+    this->nfibers = nfibers;
+    this->npeaks = npeaks;
+    this->norientations = norientations;
+    this->nsamples = nsamples;
+    this->ndiameters = ndiameters;
+    this->nzeppelins = nzeppelins;
+    this->nballs = nballs;
+    this->size_lutic = ndiameters*norientations*nsamples;
+    this->size_lutec = nzeppelins*norientations*nsamples;
+    this->size_lutiso = nballs*nsamples;
+    this->nrows = nvoxels*nsamples;
+    this->ncols = nfibers*ndiameters + npeaks*nzeppelins + nvoxels*nballs;
+}
+
+CudaLinearOperator::~CudaLinearOperator() {}
+
+int CudaLinearOperator::destroy(){
+    cudaError_t cudaStatus;
+
+    cudaStatus = cudaFree(gpu_voxelIC);
+    cudaStatus = cudaFree(gpu_fiberIC);
+    cudaStatus = cudaFree(gpu_orienIC);
+    cudaStatus = cudaFree(gpu_lengthIC);
+    cudaStatus = cudaFree(gpu_voxelEC);
+    cudaStatus = cudaFree(gpu_orienEC);
+    cudaStatus = cudaFree(gpu_segmentsPerBlockIC);
+    cudaStatus = cudaFree(gpu_offsetPerBlockIC);
+    cudaStatus = cudaFree(gpu_segmentsPerBlockEC);
+    cudaStatus = cudaFree(gpu_offsetPerBlockEC);
+
+    cudaStatus = cudaFree(gpu_TvoxelIC);
+    cudaStatus = cudaFree(gpu_TfiberIC);
+    cudaStatus = cudaFree(gpu_TorienIC);
+    cudaStatus = cudaFree(gpu_TlengthIC);
+    cudaStatus = cudaFree(gpu_TfibersPerBlockIC);
+    cudaStatus = cudaFree(gpu_ToffsetPerBlockIC);
+
+    cudaStatus = cudaFree(gpu_x);
+    cudaStatus = cudaFree(gpu_y);
+
+    cudaStatus = cudaFree(gpu_lutIC);
+    cudaStatus = cudaFree(gpu_lutEC);
+    cudaStatus = cudaFree(gpu_lutISO);
+    cudaStatus = cudaUnbindTexture(tex_lutIC);
+    cudaStatus = cudaUnbindTexture(tex_lutEC);
+    cudaStatus = cudaUnbindTexture(tex_lutISO);
+
+    cudaStatus = cudaDeviceReset();
+
+    return 0;
+}
+
+void cudaCheckKernel(){
+    cudaError_t cudaStatus;
+
+    cudaStatus = cudaGetLastError();
+    if(cudaStatus != cudaSuccess)
+        fprintf(stderr, "\t* kernel launch... [ ERROR ]: %s\n\n", cudaGetErrorString(cudaStatus));
+    else
+        printf("\t* kernel launch... [ OK ]\n");
+
+    cudaStatus = cudaDeviceSynchronize();
+    if(cudaStatus != cudaSuccess)
+        fprintf(stderr, "\t* cudaDeviceSynchronize() after launching kernel... [ ERROR ]: %d\n", cudaStatus);
+    else
+        printf("\t* cudaDeviceSynchronize() after launching kernel... [ OK ]\n");
+}
+
+void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){
+
+    // Copy vector x to the GPU
+    cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice);
+    //cudaCheckLastError();
+
+    // NOTE: the <<<grid, block>>> launch configurations below are assumptions,
+    //       reconstructed from how each kernel indexes blocks and threads:
+    //       multiply_Ax_ICpart maps one thread per voxel (256-thread blocks),
+    //       the EC and ISO kernels map one block per voxel with one thread per sample.
+
+    // Multiply IC part in the GPU
+    //multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
+    multiply_Ax_ICpart<<<(nvoxels + 255)/256, 256>>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
+    //cudaCheckLastError();
+
+    // Multiply EC part in the GPU
+    multiply_Ax_ECpart<<<nvoxels, 512>>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y);
+    //cudaCheckLastError();
+
+    // Multiply ISO part in the GPU
+    multiply_Ax_ISOpart<<<nvoxels, 512>>>(gpu_lutISO, gpu_x, gpu_y);
+    //cudaCheckLastError();
+
+    // Copy back result to CPU
+    cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost);
+    //cudaCheckLastError();
+}
+
+void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){
+
+    // Copy vector y to the GPU
+    cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice);
+    //cudaCheckLastError();
+
+    // NOTE: as in dot(), the launch configurations below are assumptions:
+    //       multiply_Aty_ICpart uses one block per fiber, the EC and ISO kernels
+    //       one block per voxel (512 threads per block).
+
+    // Multiply IC part in the GPU
+    multiply_Aty_ICpart<<<nfibers, 512>>>(gpu_TvoxelIC, gpu_TfiberIC, gpu_TorienIC, gpu_TlengthIC, gpu_TfibersPerBlockIC, gpu_ToffsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y);
+    //cudaCheckLastError();
+
+    // Multiply EC part in the GPU
+    multiply_Aty_ECpart<<<nvoxels, 512>>>(gpu_voxelEC, gpu_orienEC, gpu_segmentsPerBlockEC, gpu_offsetPerBlockEC, gpu_lutEC, gpu_x, gpu_y);
+    //cudaCheckLastError();
+
+    // Multiply ISO part in the GPU
+    multiply_Aty_ISOpart<<<nvoxels, 512>>>(gpu_lutISO, gpu_x, gpu_y);
+    //cudaCheckLastError();
+
+    // Copy back result to CPU
+    cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost);
+    //cudaCheckLastError();
+}
+
+// ------------------------------------------------------- KERNELS ------------------------------------------------------- //
+/*__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs,
+                                     uint32_t* fiberIDs,
+                                     uint16_t* orienIDs,
+                                     float32_t* lengths,
+                                     uint32_t* segmentsPerBlock,
+                                     uint32_t* offsetPerBlock,
+                                     float32_t* lut,
+                                     float64_t* x,
+                                     float64_t* y)
+{
+    __shared__ float64_t shmem[1024];
+
+    uint32_t bid = blockIdx.x;
+    uint32_t tid = threadIdx.x;
+    uint32_t gid = threadIdx.x / 512;
+    uint32_t sid = threadIdx.x - 512*gid;
+
+    shmem[tid] = 0.0;
+
+    if(sid >= NUM_SAMPLES) return;
+
+    uint32_t offset = offsetPerBlock[bid] + (segmentsPerBlock[bid]/2)*gid;
+    uint32_t nsegments = segmentsPerBlock[bid]/2 + (segmentsPerBlock[bid]%2)*gid;
+
+    uint32_t* voxel = voxelIDs + offset;
+    uint32_t* fiber = fiberIDs + offset;
+    uint16_t* orien = orienIDs + offset;
+    float32_t* length = lengths + offset;
+
+    float64_t sum = 0.0;
+
+    for(int i = 0; i < nsegments; i++){
+        int offset_lut = (*orien)*NUM_SAMPLES + sid;
+
+        float64_t aux = 0.0;
+        for(int j = 0; j < NUM_DIAMETERS; j++){
+            aux += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[(*fiber) + j*NUM_FIBERS];
+            //aux += tex1Dfetch(tex_lutIC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[(*fiber) + j*NUM_FIBERS];
+        }
+
+        sum += aux * (*length);
+
+        fiber++;
+        orien++;
+        length++;
+    }
+
+    shmem[tid] = sum;
+    __syncthreads();
+
+    if(tid < NUM_SAMPLES)
+        y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512];
+}//*/
+
+__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs,
+                                   uint32_t* fiberIDs,
+                                   uint16_t* orienIDs,
+                                   float32_t* lengths,
+                                   uint32_t* segmentsPerVoxel,
+                                   uint32_t* offsetPerVoxel,
+                                   float32_t* lut,
+ float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + uint32_t vid = bid*256 + tid; + + if (vid >= NUM_VOXELS) return; + + uint32_t offset = offsetPerVoxel[ vid ]; + uint32_t nsegments = segmentsPerVoxel[ vid ]; + + uint32_t* voxel = voxelIDs + offset; + uint32_t* fiber = fiberIDs + offset; + uint16_t* orien = orienIDs + offset; + float32_t* length = lengths + offset; + + for(int i=0; i= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = segmentsPerBlock[bid]; + + uint32_t* voxel = voxelIDs + offset; + uint16_t* orien = orienIDs + offset; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + offset; + + float64_t sum = 0.0; + for(int i = 0; i < nsegments; i++){ + uint32_t offset_lut = (*orien)*NUM_SAMPLES + tid; + + for(int j = 0; j < NUM_ZEPPELINS; j++) + sum += (double)(lut[offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES])*x[target + j*NUM_PEAKS + i]; + //sum += tex1Dfetch(tex_lutEC, offset_lut + j*NUM_ORIENTATIONS*NUM_SAMPLES) * x[target + j*NUM_PEAKS + i]; + + orien++; + } + + y[(*voxel)*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y) +{ + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + if(tid >= NUM_SAMPLES) return; + + uint32_t target = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + float64_t sum = 0.0; + for(int j = 0; j < NUM_BALLS; j++) + sum += (double)(lut[j*NUM_SAMPLES + tid])*x[target + j*NUM_VOXELS]; + //sum += (double)(tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid))*x[target + j*NUM_VOXELS]; + + + y[bid*NUM_SAMPLES + tid] += sum; +} + +__global__ void multiply_Aty_ICpart( + uint32_t* voxelICt, + uint32_t* fiberICt, + uint16_t* orienICt, + float32_t* lengthICt, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t nsegments = offset + compartmentsPerBlock[bid]; + + uint32_t* voxel = voxelICt + offset; + uint32_t* fiber = fiberICt + offset; + uint16_t* orien = orienICt + offset; + float32_t* length = lengthICt + offset; + + for(int j = 0; j < NUM_DIAMETERS; j++){ + int offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + float64_t sum = 0.0; + voxel = voxelICt + offset; + orien = orienICt + offset; + length = lengthICt + offset; + for(int i = offset; i < nsegments; i++){ + sum += ((float64_t)(*length)) *( (float64_t) lut[offset_lut + (*orien)*NUM_SAMPLES] )* y[(*voxel)*NUM_SAMPLES + tid]; + //sum += ((float64_t)(*length)) *( (float64_t) tex1Dfetch(tex_lutIC, offset_lut + (*orien)*NUM_SAMPLES) )* y[(*voxel)*NUM_SAMPLES + tid]; + + voxel++; + orien++; + length++; + } + + shmem[tid] = sum; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) x[j*NUM_FIBERS + (*fiber)] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + + __syncthreads(); + } +} + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* 
segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y) +{ + __shared__ float64_t shmem[512]; + + uint32_t bid = blockIdx.x; + uint32_t tid = threadIdx.x; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + uint32_t offset = offsetPerBlock[bid]; + uint32_t ncompartments = segmentsPerBlock[bid] + offset; + + uint32_t* voxel = voxelEC + offset; + uint16_t* orien = orienEC + offset; + + for(int j = 0; j < NUM_ZEPPELINS; j++){ + uint32_t offset_lut = j*NUM_ORIENTATIONS*NUM_SAMPLES + tid; + + voxel = voxelEC + offset; + orien = orienEC + offset; + for(int i = offset; i < ncompartments; i++){ + shmem[tid] =( (float64_t)(lut[(*orien)*NUM_SAMPLES + offset_lut] ))* y[(*voxel)*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t)tex1Dfetch(tex_lutEC, (*orien)*NUM_SAMPLES + offset_lut) )* y[(*voxel)*NUM_SAMPLES + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + if(tid < 2) shmem[tid] += shmem[tid + 2]; __syncthreads(); + + if(tid == 0) x[NUM_FIBERS*NUM_DIAMETERS + j*NUM_PEAKS + i] = shmem[0] + shmem[1]; + + voxel++; + orien++; + __syncthreads(); + } + } +} + +__global__ void multiply_Aty_ISOpart(float* lut, double* x, double* y){ + __shared__ double shmem[512]; + + uint bid = blockIdx.x; + uint tid = threadIdx.x; + uint offset = NUM_FIBERS*NUM_DIAMETERS + NUM_PEAKS*NUM_ZEPPELINS + bid; + + shmem[tid] = 0.0; + + if(tid >= NUM_SAMPLES) return; + + for(int j = 0; j < NUM_BALLS; j++){ + shmem[tid] =( (float64_t) lut[j*NUM_SAMPLES + tid] )* y[bid*NUM_SAMPLES + tid]; + //shmem[tid] =( (float64_t) tex1Dfetch(tex_lutISO, j*NUM_SAMPLES + tid) )* y[bid*NUM_SAMPLES + tid]; + __syncthreads(); + + if(tid < 256) shmem[tid] += shmem[tid + 256]; __syncthreads(); + if(tid < 128) shmem[tid] += shmem[tid + 128]; __syncthreads(); + if(tid < 64) shmem[tid] += shmem[tid + 64]; __syncthreads(); + if(tid < 32) shmem[tid] += shmem[tid + 32]; __syncthreads(); + if(tid < 16) shmem[tid] += shmem[tid + 16]; __syncthreads(); + if(tid < 8) shmem[tid] += shmem[tid + 8]; __syncthreads(); + if(tid < 4) shmem[tid] += shmem[tid + 4]; __syncthreads(); + + if(tid == 0) + x[offset + j*NUM_VOXELS] = shmem[0] + shmem[1] + shmem[2] + shmem[3]; + } +} + diff --git a/commit/cudaoperator/operator_withCUDA.cuh b/commit/cudaoperator/operator_withCUDA.cuh index c6fb879a..6b3d09bc 100644 --- a/commit/cudaoperator/operator_withCUDA.cuh +++ b/commit/cudaoperator/operator_withCUDA.cuh @@ -1,176 +1,176 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -typedef unsigned int uint32_t; -typedef unsigned short int uint16_t; -typedef float float32_t; -typedef double float64_t; - -// ==================================================== -// Util functions to check CUDA GPU compatibility -// ==================================================== -int checkCompatibility(int gpu_id); -void cudaCheckLastError(); - -// ==================================================== -// Function to preprocess data for GPU -// ==================================================== -void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* 
compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); - -// ==================================================== -// CUDA Kernels for Ax operation -// ==================================================== -__global__ void multiply_Ax_ICpart( - uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ECpart( - uint32_t* voxelIDs, - uint16_t* orienIDs, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Ax_ISOpart( - float32_t* lut, - float64_t* x, - float64_t* y); - -// ==================================================== -// CUDA Kernels for A'y operation -// ==================================================== -__global__ void multiply_Aty_ICpart( - uint32_t* TvoxelIC, - uint32_t* TfiberIC, - uint16_t* TorienIC, - float32_t* TlengthIC, - uint32_t* compartmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ECpart( - uint32_t* voxelEC, - uint16_t* orienEC, - uint32_t* segmentsPerBlock, - uint32_t* offsetPerBlock, - float32_t* lut, - float64_t* x, - float64_t* y); - -__global__ void multiply_Aty_ISOpart( - float* lut, - double* x, - double* y); - -// ==================================================== -// Constant global values in the GPU -// ==================================================== -__constant__ int NUM_VOXELS; -__constant__ int NUM_FIBERS; -__constant__ int NUM_PEAKS; -__constant__ int NUM_ORIENTATIONS; -__constant__ int NUM_SAMPLES; -__constant__ int NUM_DIAMETERS; -__constant__ int NUM_ZEPPELINS; -__constant__ int NUM_BALLS; -__constant__ int NUM_ROWS; -__constant__ int NUM_COLS; -__constant__ int SIZE_LUTIC; -__constant__ int SIZE_LUTEC; -__constant__ int SIZE_LUTISO; - -// ==================================================== -// Pointers to A (IC part) in the GPU -// ==================================================== -static uint32_t* gpu_voxelIC; -static uint32_t* gpu_fiberIC; -static uint16_t* gpu_orienIC; -static float32_t* gpu_lengthIC; -static uint32_t* gpu_segmentsPerBlockIC; -static uint32_t* gpu_offsetPerBlockIC; - -// ==================================================== -// Pointers to A' (IC part) in the GPU -// ==================================================== -static uint32_t* gpu_TvoxelIC; -static uint32_t* gpu_TfiberIC; -static uint16_t* gpu_TorienIC; -static float32_t* gpu_TlengthIC; -static uint32_t* gpu_TfibersPerBlockIC; -static uint32_t* gpu_ToffsetPerBlockIC; - -// ==================================================== -// Pointers to A (EC part) in the GPU -// ==================================================== -static uint32_t* gpu_voxelEC; -static uint16_t* gpu_orienEC; -static uint32_t* gpu_segmentsPerBlockEC; -static uint32_t* gpu_offsetPerBlockEC; - -// ==================================================== -// Pointers to LUT in the GPU -// ==================================================== -static float32_t* gpu_lutIC; -static float32_t* gpu_lutEC; -static float32_t* gpu_lutISO; - -// ==================================================== -// Pointers to x and y in the GPU -// ==================================================== -static float64_t* gpu_x; -static float64_t* gpu_y; - -// ============================================================================ -// This class creates an instance of the 
LinearOperator in GPU memory -// ============================================================================ -class CudaLinearOperator { - - // constant values in CPU - int nsegments; - int nvoxels; - int nfibers; - int npeaks; - int norientations; - int nsamples; - int ndiameters; - int nzeppelins; - int nballs; - int size_lutic; - int size_lutec; - int size_lutiso; - int nrows; - int ncols; - - public: - CudaLinearOperator(int nsegments, int nvoxels, int nfibers, int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs); - ~CudaLinearOperator(); - - int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); - int setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC); - int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); - int setVectors(); - int setGlobals(); - int destroy(); - - void dot(float64_t* v_in, float64_t* v_out); - void Tdot(float64_t* v_in, float64_t* v_out); +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef unsigned int uint32_t; +typedef unsigned short int uint16_t; +typedef float float32_t; +typedef double float64_t; + +// ==================================================== +// Util functions to check CUDA GPU compatibility +// ==================================================== +int checkCompatibility(int gpu_id); +void cudaCheckLastError(); + +// ==================================================== +// Function to preprocess data for GPU +// ==================================================== +void preprocessDataForGPU(uint32_t* data, int NUM_COMPARTMENTS, uint32_t* compartmentsPerBlock, uint32_t* offsetPerBlock, int NUM_BLOCKS); + +// ==================================================== +// CUDA Kernels for Ax operation +// ==================================================== +__global__ void multiply_Ax_ICpart( + uint32_t* voxelIDs, + uint32_t* fiberIDs, + uint16_t* orienIDs, + float32_t* lengths, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ECpart( + uint32_t* voxelIDs, + uint16_t* orienIDs, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Ax_ISOpart( + float32_t* lut, + float64_t* x, + float64_t* y); + +// ==================================================== +// CUDA Kernels for A'y operation +// ==================================================== +__global__ void multiply_Aty_ICpart( + uint32_t* TvoxelIC, + uint32_t* TfiberIC, + uint16_t* TorienIC, + float32_t* TlengthIC, + uint32_t* compartmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ECpart( + uint32_t* voxelEC, + uint16_t* orienEC, + uint32_t* segmentsPerBlock, + uint32_t* offsetPerBlock, + float32_t* lut, + float64_t* x, + float64_t* y); + +__global__ void multiply_Aty_ISOpart( + float* lut, + double* x, + double* y); + +// ==================================================== +// Constant global values in the GPU +// ==================================================== +__constant__ int NUM_VOXELS; +__constant__ int NUM_FIBERS; +__constant__ int NUM_PEAKS; +__constant__ int NUM_ORIENTATIONS; +__constant__ int NUM_SAMPLES; +__constant__ int NUM_DIAMETERS; +__constant__ int NUM_ZEPPELINS; +__constant__ int 
NUM_BALLS; +__constant__ int NUM_ROWS; +__constant__ int NUM_COLS; +__constant__ int SIZE_LUTIC; +__constant__ int SIZE_LUTEC; +__constant__ int SIZE_LUTISO; + +// ==================================================== +// Pointers to A (IC part) in the GPU +// ==================================================== +static uint32_t* gpu_voxelIC; +static uint32_t* gpu_fiberIC; +static uint16_t* gpu_orienIC; +static float32_t* gpu_lengthIC; +static uint32_t* gpu_segmentsPerBlockIC; +static uint32_t* gpu_offsetPerBlockIC; + +// ==================================================== +// Pointers to A' (IC part) in the GPU +// ==================================================== +static uint32_t* gpu_TvoxelIC; +static uint32_t* gpu_TfiberIC; +static uint16_t* gpu_TorienIC; +static float32_t* gpu_TlengthIC; +static uint32_t* gpu_TfibersPerBlockIC; +static uint32_t* gpu_ToffsetPerBlockIC; + +// ==================================================== +// Pointers to A (EC part) in the GPU +// ==================================================== +static uint32_t* gpu_voxelEC; +static uint16_t* gpu_orienEC; +static uint32_t* gpu_segmentsPerBlockEC; +static uint32_t* gpu_offsetPerBlockEC; + +// ==================================================== +// Pointers to LUT in the GPU +// ==================================================== +static float32_t* gpu_lutIC; +static float32_t* gpu_lutEC; +static float32_t* gpu_lutISO; + +// ==================================================== +// Pointers to x and y in the GPU +// ==================================================== +static float64_t* gpu_x; +static float64_t* gpu_y; + +// ============================================================================ +// This class creates an instance of the LinearOperator in GPU memory +// ============================================================================ +class CudaLinearOperator { + + // constant values in CPU + int nsegments; + int nvoxels; + int nfibers; + int npeaks; + int norientations; + int nsamples; + int ndiameters; + int nzeppelins; + int nballs; + int size_lutic; + int size_lutec; + int size_lutiso; + int nrows; + int ncols; + + public: + CudaLinearOperator(int nsegments, int nvoxels, int nfibers, int npeaks, int norientations, int nsamples, int ndiameters, int nzeppelins, int nballs); + ~CudaLinearOperator(); + + int setDictionary(uint32_t* voxelIC, uint32_t* fiberIC, uint16_t* orienIC, float32_t* lengthIC, uint32_t* voxelEC, uint16_t* orienEC); + int setTransposeDictionary(uint32_t* TvoxelIC, uint32_t* TfiberIC, uint16_t* TorienIC, float32_t* TlengthIC); + int setKernels(float32_t* lutIC, float32_t* lutEC, float32_t* lutISO); + int setVectors(); + int setGlobals(); + int destroy(); + + void dot(float64_t* v_in, float64_t* v_out); + void Tdot(float64_t* v_in, float64_t* v_out); }; \ No newline at end of file diff --git a/commit/operator/config.py b/commit/operator/config.py index e4c6bf58..8cbac4ed 100755 --- a/commit/operator/config.py +++ b/commit/operator/config.py @@ -1,6 +1,6 @@ -nTHREADS = None -model = None -nIC = None -nEC = None -nISO = None -build_dir = None +nTHREADS = None +model = None +nIC = None +nEC = None +nISO = None +build_dir = None diff --git a/commit/operator/operator.pyx b/commit/operator/operator.pyx index 4fc3a835..a4187f95 100755 --- a/commit/operator/operator.pyx +++ b/commit/operator/operator.pyx @@ -1,192 +1,192 @@ -#!python -#cython: language_level=3, boundscheck=False, wraparound=False, profile=False - -import cython -import numpy as np -from amico.util import ERROR 
-cimport numpy as np - -# Interfaces to actual C code performing the multiplications -cdef extern void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads -) nogil - -cdef extern void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_v_in, double *_v_out, - unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, - unsigned int *_ECv, unsigned short *_ECo, - unsigned int *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT -) nogil - - - -cdef class LinearOperator : - """This class is a wrapper to the C code for performing marix-vector multiplications - with the COMMIT linear operator A. The multiplications are done using C code - that uses information from the DICTIONARY, KERNELS and THREADS data structures. - """ - cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs - cdef public int adjoint, n1, n2 - - cdef DICTIONARY - cdef KERNELS - cdef THREADS - - cdef unsigned int* ICf - cdef float* ICl - cdef unsigned int* ICv - cdef unsigned short* ICo - cdef unsigned int* ECv - cdef unsigned short* ECo - cdef unsigned int* ISOv - - cdef float* LUT_IC - cdef float* LUT_EC - cdef float* LUT_ISO - - cdef unsigned int* ICthreads - cdef unsigned int* ECthreads - cdef unsigned int* ISOthreads - - cdef unsigned char* ICthreadsT - cdef unsigned int* ECthreadsT - cdef unsigned int* ISOthreadsT - - - def __init__( self, DICTIONARY, KERNELS, THREADS ) : - """Set the pointers to the data structures used by the C code.""" - self.DICTIONARY = DICTIONARY - self.KERNELS = KERNELS - self.THREADS = THREADS - - self.nF = DICTIONARY['IC']['nF'] # number of FIBERS - self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII - self.nE = DICTIONARY['EC']['nE'] # number of EC segments - self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values - self.nV = DICTIONARY['nV'] # number of VOXELS - self.nI = KERNELS['iso'].shape[0] # number of ISO contributions - self.n = DICTIONARY['IC']['n'] # numbner of IC segments - self.ndirs = KERNELS['wmr'].shape[1] # number of directions - - if KERNELS['wmr'].size > 0 : - self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES - elif KERNELS['wmh'].size > 0 : - self.nS = KERNELS['wmh'].shape[2] - else : - self.nS = KERNELS['wmr'].shape[1] - - self.adjoint = 0 # direct of inverse product - - self.n1 = self.nV*self.nS - self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV - - # get C pointers to arrays in DICTIONARY - cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] - self.ICf = &ICf[0] - cdef float [::1] ICl = DICTIONARY['IC']['len'] - self.ICl = &ICl[0] - cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] - self.ICv = &ICv[0] - cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] - self.ICo = &ICo[0] - cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] - self.ECv = &ECv[0] - cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] - self.ECo = &ECo[0] - cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] - self.ISOv = &ISOv[0] - - # get C pointers to arrays in KERNELS - cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] - self.LUT_IC = &wmrSFP[0,0,0] - cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] - self.LUT_EC = 
&wmhSFP[0,0,0] - cdef float [:, ::1] isoSFP = KERNELS['iso'] - self.LUT_ISO = &isoSFP[0,0] - - # get C pointers to arrays in THREADS - cdef unsigned int [::1] ICthreads = THREADS['IC'] - self.ICthreads = &ICthreads[0] - cdef unsigned int [::1] ECthreads = THREADS['EC'] - self.ECthreads = &ECthreads[0] - cdef unsigned int [::1] ISOthreads = THREADS['ISO'] - self.ISOthreads = &ISOthreads[0] - - cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] - self.ICthreadsT = &ICthreadsT[0] - cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] - self.ECthreadsT = &ECthreadsT[0] - cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] - self.ISOthreadsT = &ISOthreadsT[0] - - - @property - def T( self ) : - """Transpose of the explicit matrix.""" - C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) - C.adjoint = 1 - C.adjoint - return C - - - @property - def shape( self ) : - """Size of the explicit matrix.""" - if not self.adjoint : - return ( self.n1, self.n2 ) - else : - return ( self.n2, self.n1 ) - - - def dot( self, double [::1] v_in ): - """Wrapper to C code for efficiently performing the matrix-vector multiplications. - - Parameters - ---------- - v_in : 1D numpy.array of double - Input vector for the matrix-vector multiplication - - Returns - ------- - v_out : 1D numpy.array of double - Results of the multiplication - """ - - # Permit only matrix-vector multiplications - if v_in.size != self.shape[1] : - ERROR( "A.dot(): dimensions do not match" ) - - # Create output array - cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) - - # Call the cython function to read the memory pointers - if not self.adjoint : - # DIRECT PRODUCT A*x - with nogil : - COMMIT_A( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreads, self.ECthreads, self.ISOthreads - ) - else : - # INVERSE PRODUCT A'*y - with nogil : - COMMIT_At( - self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, - &v_in[0], &v_out[0], - self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, - self.LUT_IC, self.LUT_EC, self.LUT_ISO, - self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT - ) - - return v_out +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False + +import cython +import numpy as np +from amico.util import ERROR +cimport numpy as np + +# Interfaces to actual C code performing the multiplications +cdef extern void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned int* _ICthreads, unsigned int* _ECthreads, unsigned int* _ISOthreads +) nogil + +cdef extern void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_v_in, double *_v_out, + unsigned int *_ICf, unsigned int *_ICv, unsigned short *_ICo, float *_ICl, + unsigned int *_ECv, unsigned short *_ECo, + unsigned int *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + unsigned char *_ICthreadsT, unsigned int *_ECthreadsT, unsigned int *_ISOthreadsT +) nogil + + + +cdef class LinearOperator : + """This class is a wrapper to the C code for performing marix-vector multiplications + with the COMMIT linear operator A. 
The multiplications are done using C code + that uses information from the DICTIONARY, KERNELS and THREADS data structures. + """ + cdef int nS, nF, nR, nE, nT, nV, nI, n, ndirs + cdef public int adjoint, n1, n2 + + cdef DICTIONARY + cdef KERNELS + cdef THREADS + + cdef unsigned int* ICf + cdef float* ICl + cdef unsigned int* ICv + cdef unsigned short* ICo + cdef unsigned int* ECv + cdef unsigned short* ECo + cdef unsigned int* ISOv + + cdef float* LUT_IC + cdef float* LUT_EC + cdef float* LUT_ISO + + cdef unsigned int* ICthreads + cdef unsigned int* ECthreads + cdef unsigned int* ISOthreads + + cdef unsigned char* ICthreadsT + cdef unsigned int* ECthreadsT + cdef unsigned int* ISOthreadsT + + + def __init__( self, DICTIONARY, KERNELS, THREADS ) : + """Set the pointers to the data structures used by the C code.""" + self.DICTIONARY = DICTIONARY + self.KERNELS = KERNELS + self.THREADS = THREADS + + self.nF = DICTIONARY['IC']['nF'] # number of FIBERS + self.nR = KERNELS['wmr'].shape[0] # number of FIBER RADII + self.nE = DICTIONARY['EC']['nE'] # number of EC segments + self.nT = KERNELS['wmh'].shape[0] # number of EC TORTUOSITY values + self.nV = DICTIONARY['nV'] # number of VOXELS + self.nI = KERNELS['iso'].shape[0] # number of ISO contributions + self.n = DICTIONARY['IC']['n'] # numbner of IC segments + self.ndirs = KERNELS['wmr'].shape[1] # number of directions + + if KERNELS['wmr'].size > 0 : + self.nS = KERNELS['wmr'].shape[2] # number of SAMPLES + elif KERNELS['wmh'].size > 0 : + self.nS = KERNELS['wmh'].shape[2] + else : + self.nS = KERNELS['wmr'].shape[1] + + self.adjoint = 0 # direct of inverse product + + self.n1 = self.nV*self.nS + self.n2 = self.nR*self.nF + self.nT*self.nE + self.nI*self.nV + + # get C pointers to arrays in DICTIONARY + cdef unsigned int [::1] ICf = DICTIONARY['IC']['fiber'] + self.ICf = &ICf[0] + cdef float [::1] ICl = DICTIONARY['IC']['len'] + self.ICl = &ICl[0] + cdef unsigned int [::1] ICv = DICTIONARY['IC']['v'] + self.ICv = &ICv[0] + cdef unsigned short [::1] ICo = DICTIONARY['IC']['o'] + self.ICo = &ICo[0] + cdef unsigned int [::1] ECv = DICTIONARY['EC']['v'] + self.ECv = &ECv[0] + cdef unsigned short [::1] ECo = DICTIONARY['EC']['o'] + self.ECo = &ECo[0] + cdef unsigned int [::1] ISOv = DICTIONARY['ISO']['v'] + self.ISOv = &ISOv[0] + + # get C pointers to arrays in KERNELS + cdef float [:, :, ::1] wmrSFP = KERNELS['wmr'] + self.LUT_IC = &wmrSFP[0,0,0] + cdef float [:, :, ::1] wmhSFP = KERNELS['wmh'] + self.LUT_EC = &wmhSFP[0,0,0] + cdef float [:, ::1] isoSFP = KERNELS['iso'] + self.LUT_ISO = &isoSFP[0,0] + + # get C pointers to arrays in THREADS + cdef unsigned int [::1] ICthreads = THREADS['IC'] + self.ICthreads = &ICthreads[0] + cdef unsigned int [::1] ECthreads = THREADS['EC'] + self.ECthreads = &ECthreads[0] + cdef unsigned int [::1] ISOthreads = THREADS['ISO'] + self.ISOthreads = &ISOthreads[0] + + cdef unsigned char [::1] ICthreadsT = THREADS['ICt'] + self.ICthreadsT = &ICthreadsT[0] + cdef unsigned int [::1] ECthreadsT = THREADS['ECt'] + self.ECthreadsT = &ECthreadsT[0] + cdef unsigned int [::1] ISOthreadsT = THREADS['ISOt'] + self.ISOthreadsT = &ISOthreadsT[0] + + + @property + def T( self ) : + """Transpose of the explicit matrix.""" + C = LinearOperator( self.DICTIONARY, self.KERNELS, self.THREADS ) + C.adjoint = 1 - C.adjoint + return C + + + @property + def shape( self ) : + """Size of the explicit matrix.""" + if not self.adjoint : + return ( self.n1, self.n2 ) + else : + return ( self.n2, self.n1 ) + + + def dot( self, double [::1] v_in ): 
+ """Wrapper to C code for efficiently performing the matrix-vector multiplications. + + Parameters + ---------- + v_in : 1D numpy.array of double + Input vector for the matrix-vector multiplication + + Returns + ------- + v_out : 1D numpy.array of double + Results of the multiplication + """ + + # Permit only matrix-vector multiplications + if v_in.size != self.shape[1] : + ERROR( "A.dot(): dimensions do not match" ) + + # Create output array + cdef double [::1] v_out = np.zeros( self.shape[0], dtype=np.float64 ) + + # Call the cython function to read the memory pointers + if not self.adjoint : + # DIRECT PRODUCT A*x + with nogil : + COMMIT_A( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreads, self.ECthreads, self.ISOthreads + ) + else : + # INVERSE PRODUCT A'*y + with nogil : + COMMIT_At( + self.nF, self.n, self.nE, self.nV, self.nS, self.ndirs, + &v_in[0], &v_out[0], + self.ICf, self.ICv, self.ICo, self.ICl, self.ECv, self.ECo, self.ISOv, + self.LUT_IC, self.LUT_EC, self.LUT_ISO, + self.ICthreadsT, self.ECthreadsT, self.ISOthreadsT + ) + + return v_out diff --git a/commit/operator/operator.pyxbld b/commit/operator/operator.pyxbld index 1f79d974..f3967a15 100755 --- a/commit/operator/operator.pyxbld +++ b/commit/operator/operator.pyxbld @@ -1,39 +1,39 @@ -import numpy -from os import utime -from os.path import dirname, join -from setuptools import Extension - -# pass parameters to the compiler at runtime -# [TODO] find a way to avoid using this fake module -from commit.operator import config - - -def make_ext(modname, pyxfilename): - - if (config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255): - raise RuntimeError('config.nTHREADS must be between 1 and 255') - if (config.nIC is None or config.nIC < 0 or config.nIC > 20): - raise RuntimeError('config.nIC must be in the range [0..20]') - if (config.nEC is None or config.nEC < 0 or config.nEC > 20): - raise RuntimeError('config.nEC must be in the range [0..20]') - if (config.nISO is None or config.nISO < 0 or config.nISO > 20): - raise RuntimeError('config.nISO must be in the range [0..20]') - - # Force recompilation - if config.model == "VolumeFractions": - filename = "operator_noLUT.c" - else: - filename = "operator_withLUT.c" - path = dirname(pyxfilename) - - if config.build_dir is None: - utime( join(path,filename), None) - - return Extension(name=modname, - sources=[pyxfilename, join(path, filename)], - include_dirs=[numpy.get_include()], - define_macros=[('nTHREADS', config.nTHREADS), - ('nIC', config.nIC), - ('nEC', config.nEC), - ('nISO', config.nISO)], - extra_compile_args=['-w', '-O3', '-Ofast']) +import numpy +from os import utime +from os.path import dirname, join +from setuptools import Extension + +# pass parameters to the compiler at runtime +# [TODO] find a way to avoid using this fake module +from commit.operator import config + + +def make_ext(modname, pyxfilename): + + if (config.nTHREADS is None or config.nTHREADS < 1 or config.nTHREADS > 255): + raise RuntimeError('config.nTHREADS must be between 1 and 255') + if (config.nIC is None or config.nIC < 0 or config.nIC > 20): + raise RuntimeError('config.nIC must be in the range [0..20]') + if (config.nEC is None or config.nEC < 0 or config.nEC > 20): + raise RuntimeError('config.nEC must be in the range [0..20]') + if (config.nISO is None or config.nISO < 0 or config.nISO > 20): + raise 
RuntimeError('config.nISO must be in the range [0..20]') + + # Force recompilation + if config.model == "VolumeFractions": + filename = "operator_noLUT.c" + else: + filename = "operator_withLUT.c" + path = dirname(pyxfilename) + + if config.build_dir is None: + utime( join(path,filename), None) + + return Extension(name=modname, + sources=[pyxfilename, join(path, filename)], + include_dirs=[numpy.get_include()], + define_macros=[('nTHREADS', config.nTHREADS), + ('nIC', config.nIC), + ('nEC', config.nEC), + ('nISO', config.nISO)], + extra_compile_args=['-w', '-O3', '-Ofast']) diff --git a/commit/operator/operator_noLUT.c b/commit/operator/operator_noLUT.c index 1bdfd5f9..061ca1d1 100644 --- a/commit/operator/operator_noLUT.c +++ b/commit/operator/operator_noLUT.c @@ -1,187 +1,187 @@ -#include -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<1 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n; -double *x, *Y; -uint32_t *ICthreads, *ISOthreads; -uint8_t *ICthreadsT; -uint32_t *ISOthreadsT; -uint32_t *ICf, *ICv, *ISOv; -float *ICl; - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - double x0; - double *xPtr; - uint32_t *t_v, *t_vEnd, *t_f; - float *t_l; - - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = x[*t_f]; - if ( x0 != 0 ) - Y[*t_v] += (double)(*t_l) * x0; - t_f++; - t_v++; - t_l++; - } - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - xPtr = x + nF + ISOthreads[id]; - - while( t_v != t_vEnd ) - { - x0 = *xPtr++; - if ( x0 != 0 ) - Y[*t_v] += x0; - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreads = _ICthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - xPtr = x + nF + ISOthreadsT[id]; - - while( t_v != t_vEnd ) - (*xPtr++) += Y[*t_v++]; -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = 
_ICv; - ICl = _ICl; - ISOv = _ISOv; - - ICthreadsT = _ICthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<1 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n; +double *x, *Y; +uint32_t *ICthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ISOthreadsT; +uint32_t *ICf, *ICv, *ISOv; +float *ICl; + + +// ==================================================== +// Compute a sub-block of the A*x MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + double x0; + double *xPtr; + uint32_t *t_v, *t_vEnd, *t_f; + float *t_l; + + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = x[*t_f]; + if ( x0 != 0 ) + Y[*t_v] += (double)(*t_l) * x0; + t_f++; + t_v++; + t_l++; + } + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + xPtr = x + nF + ISOthreads[id]; + + while( t_v != t_vEnd ) + { + x0 = *xPtr++; + if ( x0 != 0 ) + Y[*t_v] += x0; + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreads = _ICthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + xPtr = x + nF + ISOthreadsT[id]; + + while( t_v != t_vEnd ) + (*xPtr++) += Y[*t_v++]; +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICl = _ICl; + ISOv = _ISOv; + + ICthreadsT = _ICthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t -#include // uint32_t etc - -// number of THREADS -#ifdef nTHREADS - #if (nTHREADS<1 || nTHREADS>255) - #error "nTHREADS" must be in the range 0..255 - #endif -#else - #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" -#endif - - -/* global variables */ -int nF, n, nE, nV, nS, ndirs; -double *x, *Y; -uint32_t *ICthreads, *ECthreads, *ISOthreads; -uint8_t 
*ICthreadsT; -uint32_t *ECthreadsT, *ISOthreadsT; -uint32_t *ICf, *ICv, *ECv, *ISOv; -uint16_t *ICo, *ECo; -float *ICl; -float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; -float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; -float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; - - - -// ==================================================== -// Compute a sub-block of the A*x MAtRIX-VECTOR product -// ==================================================== -void* COMMIT_A__block( void *ptr ) -{ - int id = (long)ptr; - int offset; - double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; - double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; - double *Yptr, *YptrEnd; - float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; - uint32_t *t_v, *t_vEnd, *t_f; - uint16_t *t_o; - float *t_l; - -#if nIC>=1 - // intra-cellular compartments - t_v = ICv + ICthreads[id]; - t_vEnd = ICv + ICthreads[id+1]; - t_o = ICo + ICthreads[id]; - t_l = ICl + ICthreads[id]; - t_f = ICf + ICthreads[id]; - - while( t_v != t_vEnd ) - { - x_Ptr0 = x + *t_f; - x0 = *x_Ptr0; - #if nIC>=2 - x_Ptr1 = x_Ptr0 + nF; - x1 = *x_Ptr1; - #endif - #if nIC>=3 - x_Ptr2 = x_Ptr1 + nF; - x2 = *x_Ptr2; - #endif - #if nIC>=4 - x_Ptr3 = x_Ptr2 + nF; - x3 = *x_Ptr3; - #endif - #if nIC>=5 - x_Ptr4 = x_Ptr3 + nF; - x4 = *x_Ptr4; - #endif - #if nIC>=6 - x_Ptr5 = x_Ptr4 + nF; - x5 = *x_Ptr5; - #endif - #if nIC>=7 - x_Ptr6 = x_Ptr5 + nF; - x6 = *x_Ptr6; - #endif - #if nIC>=8 - x_Ptr7 = x_Ptr6 + nF; - x7 = *x_Ptr7; - #endif - #if nIC>=9 - x_Ptr8 = x_Ptr7 + nF; - x8 = *x_Ptr8; - #endif - #if nIC>=10 - x_Ptr9 = x_Ptr8 + nF; - x9 = *x_Ptr9; - #endif - #if nIC>=11 - x_Ptr10 = x_Ptr9 + nF; - x10 = *x_Ptr10; - #endif - #if nIC>=12 - x_Ptr11 = x_Ptr10 + nF; - x11 = *x_Ptr11; - #endif - #if nIC>=13 - x_Ptr12 = x_Ptr11 + nF; - x12 = *x_Ptr12; - #endif - #if nIC>=14 - x_Ptr13 = x_Ptr12 + nF; - x13 = *x_Ptr13; - #endif - #if nIC>=15 - x_Ptr14 = x_Ptr13 + nF; - x14 = *x_Ptr14; - #endif - #if nIC>=16 - x_Ptr15 = x_Ptr14 + nF; - x15 = *x_Ptr15; - #endif - #if nIC>=17 - x_Ptr16 = x_Ptr15 + nF; - x16 = *x_Ptr16; - #endif - #if nIC>=18 - x_Ptr17 = x_Ptr16 + nF; - x17 = *x_Ptr17; - #endif - #if nIC>=19 - x_Ptr18 = x_Ptr17 + nF; - x18 = *x_Ptr18; - #endif - #if nIC>=20 - x_Ptr19 = x_Ptr18 + nF; - x19 = *x_Ptr19; - #endif - - if ( x0 != 0 - #if nIC>=2 - || x1 != 0 - #endif - #if nIC>=3 - || x2 != 0 - #endif - #if nIC>=4 - || x3 != 0 - #endif - #if nIC>=5 - || x4 != 0 - #endif - #if nIC>=6 - || x5 != 0 - #endif - #if nIC>=7 - || x6 != 0 - #endif - #if nIC>=8 - || x7 != 0 - #endif - #if nIC>=9 - || x8 != 0 - #endif - #if nIC>=10 - || x9 != 0 - #endif - #if nIC>=11 - || x10 != 0 - #endif - #if nIC>=12 - || x11 != 0 - #endif - #if nIC>=13 - || x12 
!= 0 - #endif - #if nIC>=14 - || x13 != 0 - #endif - #if nIC>=15 - || x14 != 0 - #endif - #if nIC>=16 - || x15 != 0 - #endif - #if nIC>=17 - || x16 != 0 - #endif - #if nIC>=18 - || x17 != 0 - #endif - #if nIC>=19 - || x18 != 0 - #endif - #if nIC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - w = (double)(*t_l); - offset = nS * (*t_o); - SFP0ptr = wmrSFP0 + offset; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - #endif - #if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += w * ( - x0 * (*SFP0ptr++) - #if nIC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nIC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nIC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nIC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nIC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nIC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nIC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nIC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nIC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nIC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nIC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nIC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nIC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nIC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nIC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nIC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nIC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nIC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nIC>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - - t_f++; - t_v++; - t_o++; - t_l++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreads[id]; - t_vEnd = ECv + ECthreads[id+1]; - t_o = ECo + ECthreads[id]; - - x_Ptr0 = x + nIC*nF + ECthreads[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; 
- #if nEC>=2 - x1 = *x_Ptr1++; - #endif - #if nEC>=3 - x2 = *x_Ptr2++; - #endif - #if nEC>=4 - x3 = *x_Ptr3++; - #endif - #if nEC>=5 - x4 = *x_Ptr4++; - #endif - #if nEC>=6 - x5 = *x_Ptr5++; - #endif - #if nEC>=7 - x6 = *x_Ptr6++; - #endif - #if nEC>=8 - x7 = *x_Ptr7++; - #endif - #if nEC>=9 - x8 = *x_Ptr8++; - #endif - #if nEC>=10 - x9 = *x_Ptr9++; - #endif - #if nEC>=11 - x10 = *x_Ptr10++; - #endif - #if nEC>=12 - x11 = *x_Ptr11++; - #endif - #if nEC>=13 - x12 = *x_Ptr12++; - #endif - #if nEC>=14 - x13 = *x_Ptr13++; - #endif - #if nEC>=15 - x14 = *x_Ptr14++; - #endif - #if nEC>=16 - x15 = *x_Ptr15++; - #endif - #if nEC>=17 - x16 = *x_Ptr16++; - #endif - #if nEC>=18 - x17 = *x_Ptr17++; - #endif - #if nEC>=19 - x18 = *x_Ptr18++; - #endif - #if nEC>=20 - x19 = *x_Ptr19++; - #endif - if ( - x0 != 0 - #if nEC>=2 - || x1 != 0 - #endif - #if nEC>=3 - || x2 != 0 - #endif - #if nEC>=4 - || x3 != 0 - #endif - #if nEC>=5 - || x4 != 0 - #endif - #if nEC>=6 - || x5 != 0 - #endif - #if nEC>=7 - || x6 != 0 - #endif - #if nEC>=8 - || x7 != 0 - #endif - #if nEC>=9 - || x8 != 0 - #endif - #if nEC>=10 - || x9 != 0 - #endif - #if nEC>=11 - || x10 != 0 - #endif - #if nEC>=12 - || x11 != 0 - #endif - #if nEC>=13 - || x12 != 0 - #endif - #if nEC>=14 - || x13 != 0 - #endif - #if nEC>=15 - || x14 != 0 - #endif - #if nEC>=16 - || x15 != 0 - #endif - #if nEC>=17 - || x16 != 0 - #endif - #if nEC>=18 - || x17 != 0 - #endif - #if nEC>=19 - || x18 != 0 - #endif - #if nEC>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - SFP0ptr = wmhSFP0 + offset; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nEC>=2 - + x1 * (*SFP1ptr++) - #endif - #if nEC>=3 - + x2 * (*SFP2ptr++) - #endif - #if nEC>=4 - + x3 * (*SFP3ptr++) - #endif - #if nEC>=5 - + x4 * (*SFP4ptr++) - #endif - #if nEC>=6 - + x5 * (*SFP5ptr++) - #endif - #if nEC>=7 - + x6 * (*SFP6ptr++) - #endif - #if nEC>=8 - + x7 * (*SFP7ptr++) - #endif - #if nEC>=9 - + x8 * (*SFP8ptr++) - #endif - #if nEC>=10 - + x9 * (*SFP9ptr++) - #endif - #if nEC>=11 - + x10 * (*SFP10ptr++) - #endif - #if nEC>=12 - + x11 * (*SFP11ptr++) - #endif - #if nEC>=13 - + x12 * (*SFP12ptr++) - #endif - #if nEC>=14 - + x13 * (*SFP13ptr++) - #endif - #if nEC>=15 - + x14 * (*SFP14ptr++) - #endif - #if nEC>=16 - + x15 * (*SFP15ptr++) - #endif - #if nEC>=17 - + x16 * (*SFP16ptr++) - #endif - #if nEC>=18 - + x17 * (*SFP17ptr++) - #endif - #if nEC>=19 - + x18 * (*SFP18ptr++) - #endif - #if nEC>=20 - + x19 * (*SFP19ptr++) - 
#endif - - ); - } - t_v++; - t_o++; - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreads[id]; - t_vEnd = ISOv + ISOthreads[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - x0 = *x_Ptr0++; - #if nISO>=2 - x1 = *x_Ptr1++; - #endif - #if nISO>=3 - x2 = *x_Ptr2++; - #endif - #if nISO>=4 - x3 = *x_Ptr3++; - #endif - #if nISO>=5 - x4 = *x_Ptr4++; - #endif - #if nISO>=6 - x5 = *x_Ptr5++; - #endif - #if nISO>=7 - x6 = *x_Ptr6++; - #endif - #if nISO>=8 - x7 = *x_Ptr7++; - #endif - #if nISO>=9 - x8 = *x_Ptr8++; - #endif - #if nISO>=10 - x9 = *x_Ptr9++; - #endif - #if nISO>=11 - x10 = *x_Ptr10++; - #endif - #if nISO>=12 - x11 = *x_Ptr11++; - #endif - #if nISO>=13 - x12 = *x_Ptr12++; - #endif - #if nISO>=14 - x13 = *x_Ptr13++; - #endif - #if nISO>=15 - x14 = *x_Ptr14++; - #endif - #if nISO>=16 - x15 = *x_Ptr15++; - #endif - #if nISO>=17 - x16 = *x_Ptr16++; - #endif - #if nISO>=18 - x17 = *x_Ptr17++; - #endif - #if nISO>=19 - x18 = *x_Ptr18++; - #endif - #if nISO>=20 - x19 = *x_Ptr19++; - #endif - - if ( - x0 != 0 - #if nISO>=2 - || x1 != 0 - #endif - #if nISO>=3 - || x2 != 0 - #endif - #if nISO>=4 - || x3 != 0 - #endif - #if nISO>=5 - || x4 != 0 - #endif - #if nISO>=6 - || x5 != 0 - #endif - #if nISO>=7 - || x6 != 0 - #endif - #if nISO>=8 - || x7 != 0 - #endif - #if nISO>=9 - || x8 != 0 - #endif - #if nISO>=10 - || x9 != 0 - #endif - #if nISO>=11 - || x10 != 0 - #endif - #if nISO>=12 - || x11 != 0 - #endif - #if nISO>=13 - || x12 != 0 - #endif - #if nISO>=14 - || x13 != 0 - #endif - #if nISO>=15 - || x14 != 0 - #endif - #if nISO>=16 - || x15 != 0 - #endif - #if nISO>=17 - || x16 != 0 - #endif - #if nISO>=18 - || x17 != 0 - #endif - #if nISO>=19 - || x18 != 0 - #endif - #if nISO>=20 - || x19 != 0 - #endif - ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = 
isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - while( Yptr != YptrEnd ) - (*Yptr++) += ( - x0 * (*SFP0ptr++) - #if nISO>=2 - + x1 * (*SFP1ptr++) - #endif - #if nISO>=3 - + x2 * (*SFP2ptr++) - #endif - #if nISO>=4 - + x3 * (*SFP3ptr++) - #endif - #if nISO>=5 - + x4 * (*SFP4ptr++) - #endif - #if nISO>=6 - + x5 * (*SFP5ptr++) - #endif - #if nISO>=7 - + x6 * (*SFP6ptr++) - #endif - #if nISO>=8 - + x7 * (*SFP7ptr++) - #endif - #if nISO>=9 - + x8 * (*SFP8ptr++) - #endif - #if nISO>=10 - + x9 * (*SFP9ptr++) - #endif - #if nISO>=11 - + x10 * (*SFP10ptr++) - #endif - #if nISO>=12 - + x11 * (*SFP11ptr++) - #endif - #if nISO>=13 - + x12 * (*SFP12ptr++) - #endif - #if nISO>=14 - + x13 * (*SFP13ptr++) - #endif - #if nISO>=15 - + x14 * (*SFP14ptr++) - #endif - #if nISO>=16 - + x15 * (*SFP15ptr++) - #endif - #if nISO>=17 - + x16 * (*SFP16ptr++) - #endif - #if nISO>=18 - + x17 * (*SFP17ptr++) - #endif - #if nISO>=19 - + x18 * (*SFP18ptr++) - #endif - #if nISO>=20 - + x19 * (*SFP19ptr++) - #endif - ); - } - t_v++; - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_A( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vIN; - Y = _vOUT; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + _ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + 
_ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreads = _ICthreads; - ECthreads = _ECthreads; - ISOthreads = _ISOthreads; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t=1 - // intra-cellular compartments - t_v = ICv; - t_vEnd = ICv + n; - t_o = ICo; - t_l = ICl; - t_f = ICf; - t_t = ICthreadsT; - - while( t_v != t_vEnd ) - { - // in this case, I need to walk throug because the segments are ordered in "voxel order" - if ( *t_t == id ) - { - Yptr = Y + nS * (*t_v); - YptrEnd = Yptr + nS; - offset = nS * (*t_o); - - Y_tmp = *Yptr; - SFP0ptr = wmrSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - SFP1ptr = wmrSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - SFP2ptr = wmrSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - SFP3ptr = wmrSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - SFP4ptr = wmrSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - SFP5ptr = wmrSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - SFP6ptr = wmrSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - SFP7ptr = wmrSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - SFP8ptr = wmrSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - SFP9ptr = wmrSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - SFP10ptr = wmrSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - SFP11ptr = wmrSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - SFP12ptr = wmrSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - SFP13ptr = wmrSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - SFP14ptr = wmrSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - SFP15ptr = wmrSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - SFP16ptr = wmrSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - SFP17ptr = wmrSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - 
#if nIC>=19 - SFP18ptr = wmrSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - SFP19ptr = wmrSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nIC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nIC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nIC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nIC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nIC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nIC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nIC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nIC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nIC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nIC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nIC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nIC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nIC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nIC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nIC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nIC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nIC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nIC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nIC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - w = (double)(*t_l); - x[*t_f] += w * x0; - #if nIC>=2 - x[*t_f+nF] += w * x1; - #endif - #if nIC>=3 - x[*t_f+2*nF] += w * x2; - #endif - #if nIC>=4 - x[*t_f+3*nF] += w * x3; - #endif - #if nIC>=5 - x[*t_f+4*nF] += w * x4; - #endif - #if nIC>=6 - x[*t_f+5*nF] += w * x5; - #endif - #if nIC>=7 - x[*t_f+6*nF] += w * x6; - #endif - #if nIC>=8 - x[*t_f+7*nF] += w * x7; - #endif - #if nIC>=9 - x[*t_f+8*nF] += w * x8; - #endif - #if nIC>=10 - x[*t_f+9*nF] += w * x9; - #endif - #if nIC>=11 - x[*t_f+10*nF] += w * x10; - #endif - #if nIC>=12 - x[*t_f+11*nF] += w * x11; - #endif - #if nIC>=13 - x[*t_f+12*nF] += w * x12; - #endif - #if nIC>=14 - x[*t_f+13*nF] += w * x13; - #endif - #if nIC>=15 - x[*t_f+14*nF] += w * x14; - #endif - #if nIC>=16 - x[*t_f+15*nF] += w * x15; - #endif - #if nIC>=17 - x[*t_f+16*nF] += w * x16; - #endif - #if nIC>=18 - x[*t_f+17*nF] += w * x17; - #endif - #if nIC>=19 - x[*t_f+18*nF] += w * x18; - #endif - #if nIC>=20 - x[*t_f+19*nF] += w * x19; - #endif - } - - t_f++; - t_v++; - t_o++; - t_l++; - t_t++; - } -#endif - -#if nEC>=1 - // extra-cellular compartments - t_v = ECv + ECthreadsT[id]; - t_vEnd = ECv + ECthreadsT[id+1]; - t_o = ECo + ECthreadsT[id]; - - x_Ptr0 = x + nIC*nF + ECthreadsT[id]; - #if nEC>=2 - x_Ptr1 = x_Ptr0 + nE; - #endif - #if nEC>=3 - x_Ptr2 = x_Ptr1 + nE; - #endif - #if nEC>=4 - x_Ptr3 = x_Ptr2 + nE; - #endif - #if nEC>=5 - x_Ptr4 = x_Ptr3 + nE; - #endif - #if nEC>=6 - x_Ptr5 = x_Ptr4 + nE; - #endif - #if nEC>=7 - x_Ptr6 = x_Ptr5 + nE; - #endif - #if nEC>=8 - x_Ptr7 = x_Ptr6 + nE; - #endif - #if nEC>=9 - x_Ptr8 = x_Ptr7 + nE; - #endif - #if nEC>=10 - x_Ptr9 = x_Ptr8 + nE; - #endif - #if nEC>=11 - x_Ptr10 = x_Ptr9 + nE; - #endif - #if nEC>=12 - x_Ptr11 = x_Ptr10 + nE; - #endif - #if nEC>=13 - x_Ptr12 = x_Ptr11 + nE; - #endif - #if nEC>=14 - x_Ptr13 = x_Ptr12 + nE; - #endif - #if nEC>=15 - x_Ptr14 = x_Ptr13 + nE; - #endif - #if nEC>=16 - x_Ptr15 = x_Ptr14 + nE; - #endif - #if nEC>=17 - x_Ptr16 = x_Ptr15 + nE; - #endif - #if nEC>=18 - x_Ptr17 = x_Ptr16 + nE; - #endif - #if nEC>=19 - x_Ptr18 = x_Ptr17 + nE; - #endif - #if nEC>=20 - x_Ptr19 = x_Ptr18 + nE; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - offset = nS * (*t_o++); - - Y_tmp = *Yptr; - SFP0ptr = 
wmhSFP0 + offset; - x0 = (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - SFP1ptr = wmhSFP1 + offset; - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - SFP2ptr = wmhSFP2 + offset; - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - SFP3ptr = wmhSFP3 + offset; - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - SFP4ptr = wmhSFP4 + offset; - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - SFP5ptr = wmhSFP5 + offset; - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - SFP6ptr = wmhSFP6 + offset; - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - SFP7ptr = wmhSFP7 + offset; - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - SFP8ptr = wmhSFP8 + offset; - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - SFP9ptr = wmhSFP9 + offset; - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - SFP10ptr = wmhSFP10 + offset; - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - SFP11ptr = wmhSFP11 + offset; - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - SFP12ptr = wmhSFP12 + offset; - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - SFP13ptr = wmhSFP13 + offset; - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - SFP14ptr = wmhSFP14 + offset; - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - SFP15ptr = wmhSFP15 + offset; - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - SFP16ptr = wmhSFP16 + offset; - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - SFP17ptr = wmhSFP17 + offset; - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - SFP18ptr = wmhSFP18 + offset; - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - SFP19ptr = wmhSFP19 + offset; - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nEC>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nEC>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nEC>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nEC>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nEC>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if nEC>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nEC>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nEC>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nEC>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nEC>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nEC>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nEC>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nEC>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nEC>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nEC>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nEC>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nEC>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nEC>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nEC>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - (*x_Ptr0++) += x0; - #if nEC>=2 - (*x_Ptr1++) += x1; - #endif - #if nEC>=3 - (*x_Ptr2++) += x2; - #endif - #if nEC>=4 - (*x_Ptr3++) += x3; - #endif - #if nEC>=5 - (*x_Ptr4++) += x4; - #endif - #if nEC>=6 - (*x_Ptr5++) += x5; - #endif - #if nEC>=7 - (*x_Ptr6++) += x6; - #endif - #if nEC>=8 - (*x_Ptr7++) += x7; - #endif - #if nEC>=9 - (*x_Ptr8++) += x8; - #endif - #if nEC>=10 - (*x_Ptr9++) += x9; - #endif - #if nEC>=11 - (*x_Ptr10++) += x10; - #endif - #if nEC>=12 - (*x_Ptr11++) += x11; - #endif - #if nEC>=13 - (*x_Ptr12++) += x12; - #endif - #if nEC>=14 - (*x_Ptr13++) += x13; - #endif - #if nEC>=15 - (*x_Ptr14++) += x14; - #endif - #if nEC>=16 - (*x_Ptr15++) += x15; - #endif - #if nEC>=17 - (*x_Ptr16++) += x16; - #endif - #if nEC>=18 - (*x_Ptr17++) += x17; - #endif - #if nEC>=19 - (*x_Ptr18++) += 
x18; - #endif - #if nEC>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - -#if nISO>=1 - // isotropic compartments - t_v = ISOv + ISOthreadsT[id]; - t_vEnd = ISOv + ISOthreadsT[id+1]; - - x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; - #if nISO>=2 - x_Ptr1 = x_Ptr0 + nV; - #endif - #if nISO>=3 - x_Ptr2 = x_Ptr1 + nV; - #endif - #if nISO>=4 - x_Ptr3 = x_Ptr2 + nV; - #endif - #if nISO>=5 - x_Ptr4 = x_Ptr3 + nV; - #endif - #if nISO>=6 - x_Ptr5 = x_Ptr4 + nV; - #endif - #if nISO>=7 - x_Ptr6 = x_Ptr5 + nV; - #endif - #if nISO>=8 - x_Ptr7 = x_Ptr6 + nV; - #endif - #if nISO>=9 - x_Ptr8 = x_Ptr7 + nV; - #endif - #if nISO>=10 - x_Ptr9 = x_Ptr8 + nV; - #endif - #if nISO>=11 - x_Ptr10 = x_Ptr9 + nV; - #endif - #if nISO>=12 - x_Ptr11 = x_Ptr10 + nV; - #endif - #if nISO>=13 - x_Ptr12 = x_Ptr11 + nV; - #endif - #if nISO>=14 - x_Ptr13 = x_Ptr12 + nV; - #endif - #if nISO>=15 - x_Ptr14 = x_Ptr13 + nV; - #endif - #if nISO>=16 - x_Ptr15 = x_Ptr14 + nV; - #endif - #if nISO>=17 - x_Ptr16 = x_Ptr15 + nV; - #endif - #if nISO>=18 - x_Ptr17 = x_Ptr16 + nV; - #endif - #if nISO>=19 - x_Ptr18 = x_Ptr17 + nV; - #endif - #if nISO>=20 - x_Ptr19 = x_Ptr18 + nV; - #endif - - while( t_v != t_vEnd ) - { - Yptr = Y + nS * (*t_v++); - YptrEnd = Yptr + nS; - - SFP0ptr = isoSFP0; - #if nISO>=2 - SFP1ptr = isoSFP1; - #endif - #if nISO>=3 - SFP2ptr = isoSFP2; - #endif - #if nISO>=4 - SFP3ptr = isoSFP3; - #endif - #if nISO>=5 - SFP4ptr = isoSFP4; - #endif - #if nISO>=6 - SFP5ptr = isoSFP5; - #endif - #if nISO>=7 - SFP6ptr = isoSFP6; - #endif - #if nISO>=8 - SFP7ptr = isoSFP7; - #endif - #if nISO>=9 - SFP8ptr = isoSFP8; - #endif - #if nISO>=10 - SFP9ptr = isoSFP9; - #endif - #if nISO>=11 - SFP10ptr = isoSFP10; - #endif - #if nISO>=12 - SFP11ptr = isoSFP11; - #endif - #if nISO>=13 - SFP12ptr = isoSFP12; - #endif - #if nISO>=14 - SFP13ptr = isoSFP13; - #endif - #if nISO>=15 - SFP14ptr = isoSFP14; - #endif - #if nISO>=16 - SFP15ptr = isoSFP15; - #endif - #if nISO>=17 - SFP16ptr = isoSFP16; - #endif - #if nISO>=18 - SFP17ptr = isoSFP17; - #endif - #if nISO>=19 - SFP18ptr = isoSFP18; - #endif - #if nISO>=20 - SFP19ptr = isoSFP19; - #endif - - Y_tmp = *Yptr; - x0 = (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 = (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 = (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 = (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 = (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 = (*SFP5ptr++) * Y_tmp; - #endif - #if nISO>=7 - x6 = (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 = (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 = (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 = (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 = (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 = (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 = (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 = (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 = (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 = (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 = (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 = (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 = (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 = (*SFP19ptr++) * Y_tmp; - #endif - - while( ++Yptr != YptrEnd ) - { - Y_tmp = *Yptr; - x0 += (*SFP0ptr++) * Y_tmp; - #if nISO>=2 - x1 += (*SFP1ptr++) * Y_tmp; - #endif - #if nISO>=3 - x2 += (*SFP2ptr++) * Y_tmp; - #endif - #if nISO>=4 - x3 += (*SFP3ptr++) * Y_tmp; - #endif - #if nISO>=5 - x4 += (*SFP4ptr++) * Y_tmp; - #endif - #if nISO>=6 - x5 += (*SFP5ptr++) * Y_tmp; - #endif - #if 
nISO>=7 - x6 += (*SFP6ptr++) * Y_tmp; - #endif - #if nISO>=8 - x7 += (*SFP7ptr++) * Y_tmp; - #endif - #if nISO>=9 - x8 += (*SFP8ptr++) * Y_tmp; - #endif - #if nISO>=10 - x9 += (*SFP9ptr++) * Y_tmp; - #endif - #if nISO>=11 - x10 += (*SFP10ptr++) * Y_tmp; - #endif - #if nISO>=12 - x11 += (*SFP11ptr++) * Y_tmp; - #endif - #if nISO>=13 - x12 += (*SFP12ptr++) * Y_tmp; - #endif - #if nISO>=14 - x13 += (*SFP13ptr++) * Y_tmp; - #endif - #if nISO>=15 - x14 += (*SFP14ptr++) * Y_tmp; - #endif - #if nISO>=16 - x15 += (*SFP15ptr++) * Y_tmp; - #endif - #if nISO>=17 - x16 += (*SFP16ptr++) * Y_tmp; - #endif - #if nISO>=18 - x17 += (*SFP17ptr++) * Y_tmp; - #endif - #if nISO>=19 - x18 += (*SFP18ptr++) * Y_tmp; - #endif - #if nISO>=20 - x19 += (*SFP19ptr++) * Y_tmp; - #endif - } - - (*x_Ptr0++) += x0; - #if nISO>=2 - (*x_Ptr1++) += x1; - #endif - #if nISO>=3 - (*x_Ptr2++) += x2; - #endif - #if nISO>=4 - (*x_Ptr3++) += x3; - #endif - #if nISO>=5 - (*x_Ptr4++) += x4; - #endif - #if nISO>=6 - (*x_Ptr5++) += x5; - #endif - #if nISO>=7 - (*x_Ptr6++) += x6; - #endif - #if nISO>=8 - (*x_Ptr7++) += x7; - #endif - #if nISO>=9 - (*x_Ptr8++) += x8; - #endif - #if nISO>=10 - (*x_Ptr9++) += x9; - #endif - #if nISO>=11 - (*x_Ptr10++) += x10; - #endif - #if nISO>=12 - (*x_Ptr11++) += x11; - #endif - #if nISO>=13 - (*x_Ptr12++) += x12; - #endif - #if nISO>=14 - (*x_Ptr13++) += x13; - #endif - #if nISO>=15 - (*x_Ptr14++) += x14; - #endif - #if nISO>=16 - (*x_Ptr15++) += x15; - #endif - #if nISO>=17 - (*x_Ptr16++) += x16; - #endif - #if nISO>=18 - (*x_Ptr17++) += x17; - #endif - #if nISO>=19 - (*x_Ptr18++) += x18; - #endif - #if nISO>=20 - (*x_Ptr19++) += x19; - #endif - } -#endif - - pthread_exit( 0 ); -} - - -// ========================= -// Function called by CYTHON -// ========================= -void COMMIT_At( - int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, - double *_vIN, double *_vOUT, - uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, - uint32_t *_ECv, uint16_t *_ECo, - uint32_t *_ISOv, - float *_wmrSFP, float *_wmhSFP, float *_isoSFP, - uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT -) -{ - nF = _nF; - n = _n; - nE = _nE; - nV = _nV; - nS = _nS; - ndirs = _ndirs; - - x = _vOUT; - Y = _vIN; - - ICf = _ICf; - ICv = _ICv; - ICo = _ICo; - ICl = _ICl; - ECv = _ECv; - ECo = _ECo; - ISOv = _ISOv; - - #if nIC>=1 - wmrSFP0 = _wmrSFP; - #if nIC>=2 - wmrSFP1 = wmrSFP0 + _ndirs*_nS; - #if nIC>=3 - wmrSFP2 = wmrSFP1 + _ndirs*_nS; - #if nIC>=4 - wmrSFP3 = wmrSFP2 + _ndirs*_nS; - #if nIC>=5 - wmrSFP4 = wmrSFP3 + _ndirs*_nS; - #if nIC>=6 - wmrSFP5 = wmrSFP4 + _ndirs*_nS; - #if nIC>=7 - wmrSFP6 = wmrSFP5 + _ndirs*_nS; - #if nIC>=8 - wmrSFP7 = wmrSFP6 + _ndirs*_nS; - #if nIC>=9 - wmrSFP8 = wmrSFP7 + _ndirs*_nS; - #if nIC>=10 - wmrSFP9 = wmrSFP8 + _ndirs*_nS; - #if nIC>=11 - wmrSFP10 = wmrSFP9 + _ndirs*_nS; - #if nIC>=12 - wmrSFP11 = wmrSFP10 + _ndirs*_nS; - #if nIC>=13 - wmrSFP12 = wmrSFP11 + _ndirs*_nS; - #if nIC>=14 - wmrSFP13 = wmrSFP12 + _ndirs*_nS; - #if nIC>=15 - wmrSFP14 = wmrSFP13 + _ndirs*_nS; - #if nIC>=16 - wmrSFP15 = wmrSFP14 + _ndirs*_nS; - #if nIC>=17 - wmrSFP16 = wmrSFP15 + _ndirs*_nS; - #if nIC>=18 - wmrSFP17 = wmrSFP16 + _ndirs*_nS; - #if nIC>=19 - wmrSFP18 = wmrSFP17 + _ndirs*_nS; - #if nIC>=20 - wmrSFP19 = wmrSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nEC>=1 - wmhSFP0 = _wmhSFP; - #if nEC>=2 - wmhSFP1 = wmhSFP0 + 
_ndirs*_nS; - #if nEC>=3 - wmhSFP2 = wmhSFP1 + _ndirs*_nS; - #if nEC>=4 - wmhSFP3 = wmhSFP2 + _ndirs*_nS; - #if nEC>=5 - wmhSFP4 = wmhSFP3 + _ndirs*_nS; - #if nEC>=6 - wmhSFP5 = wmhSFP4 + _ndirs*_nS; - #if nEC>=7 - wmhSFP6 = wmhSFP5 + _ndirs*_nS; - #if nEC>=8 - wmhSFP7 = wmhSFP6 + _ndirs*_nS; - #if nEC>=9 - wmhSFP8 = wmhSFP7 + _ndirs*_nS; - #if nEC>=10 - wmhSFP9 = wmhSFP8 + _ndirs*_nS; - #if nEC>=11 - wmhSFP10 = wmhSFP9 + _ndirs*_nS; - #if nEC>=12 - wmhSFP11 = wmhSFP10 + _ndirs*_nS; - #if nEC>=13 - wmhSFP12 = wmhSFP11 + _ndirs*_nS; - #if nEC>=14 - wmhSFP13 = wmhSFP12 + _ndirs*_nS; - #if nEC>=15 - wmhSFP14 = wmhSFP13 + _ndirs*_nS; - #if nEC>=16 - wmhSFP15 = wmhSFP14 + _ndirs*_nS; - #if nEC>=17 - wmhSFP16 = wmhSFP15 + _ndirs*_nS; - #if nEC>=18 - wmhSFP17 = wmhSFP16 + _ndirs*_nS; - #if nEC>=19 - wmhSFP18 = wmhSFP17 + _ndirs*_nS; - #if nEC>=20 - wmhSFP19 = wmhSFP18 + _ndirs*_nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #if nISO>=1 - isoSFP0 = _isoSFP; - #if nISO>=2 - isoSFP1 = isoSFP0 + _nS; - #if nISO>=3 - isoSFP2 = isoSFP1 + _nS; - #if nISO>=4 - isoSFP3 = isoSFP2 + _nS; - #if nISO>=5 - isoSFP4 = isoSFP3 + _nS; - #if nISO>=6 - isoSFP5 = isoSFP4 + _nS; - #if nISO>=7 - isoSFP6 = isoSFP5 + _nS; - #if nISO>=8 - isoSFP7 = isoSFP6 + _nS; - #if nISO>=9 - isoSFP8 = isoSFP7 + _nS; - #if nISO>=10 - isoSFP9 = isoSFP8 + _nS; - #if nISO>=11 - isoSFP10 = isoSFP9 + _nS; - #if nISO>=12 - isoSFP11 = isoSFP10 + _nS; - #if nISO>=13 - isoSFP12 = isoSFP11 + _nS; - #if nISO>=14 - isoSFP13 = isoSFP12 + _nS; - #if nISO>=15 - isoSFP14 = isoSFP13 + _nS; - #if nISO>=16 - isoSFP15 = isoSFP14 + _nS; - #if nISO>=17 - isoSFP16 = isoSFP15 + _nS; - #if nISO>=18 - isoSFP17 = isoSFP16 + _nS; - #if nISO>=19 - isoSFP18 = isoSFP17 + _nS; - #if nISO>=20 - isoSFP19 = isoSFP18 + _nS; - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - #endif - - ICthreadsT = _ICthreadsT; - ECthreadsT = _ECthreadsT; - ISOthreadsT = _ISOthreadsT; - - // Run SEPARATE THREADS to perform the multiplication - pthread_t threads[nTHREADS]; - int t; - for(t=0; t +#include // uint32_t etc + +// number of THREADS +#ifdef nTHREADS + #if (nTHREADS<1 || nTHREADS>255) + #error "nTHREADS" must be in the range 0..255 + #endif +#else + #error "nTHREADS" parameter must be passed to the compiler as "-DnTHREADS=" +#endif + + +/* global variables */ +int nF, n, nE, nV, nS, ndirs; +double *x, *Y; +uint32_t *ICthreads, *ECthreads, *ISOthreads; +uint8_t *ICthreadsT; +uint32_t *ECthreadsT, *ISOthreadsT; +uint32_t *ICf, *ICv, *ECv, *ISOv; +uint16_t *ICo, *ECo; +float *ICl; +float *wmrSFP0, *wmrSFP1, *wmrSFP2, *wmrSFP3, *wmrSFP4, *wmrSFP5, *wmrSFP6, *wmrSFP7, *wmrSFP8, *wmrSFP9, *wmrSFP10, *wmrSFP11, *wmrSFP12, *wmrSFP13, *wmrSFP14, *wmrSFP15, *wmrSFP16, *wmrSFP17, *wmrSFP18, *wmrSFP19; +float *wmhSFP0, *wmhSFP1, *wmhSFP2, *wmhSFP3, *wmhSFP4, *wmhSFP5, *wmhSFP6, *wmhSFP7, *wmhSFP8, *wmhSFP9, *wmhSFP10, *wmhSFP11, *wmhSFP12, *wmhSFP13, *wmhSFP14, *wmhSFP15, *wmhSFP16, *wmhSFP17, *wmhSFP18, *wmhSFP19; +float *isoSFP0, *isoSFP1, *isoSFP2, *isoSFP3, *isoSFP4, *isoSFP5, *isoSFP6, *isoSFP7, *isoSFP8, *isoSFP9, *isoSFP10, *isoSFP11, *isoSFP12, *isoSFP13, *isoSFP14, *isoSFP15, *isoSFP16, *isoSFP17, *isoSFP18, *isoSFP19; + + + +// ==================================================== +// Compute a sub-block of the A*x 
MAtRIX-VECTOR product +// ==================================================== +void* COMMIT_A__block( void *ptr ) +{ + int id = (long)ptr; + int offset; + double x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, w; + double *x_Ptr0, *x_Ptr1, *x_Ptr2, *x_Ptr3, *x_Ptr4, *x_Ptr5, *x_Ptr6, *x_Ptr7, *x_Ptr8, *x_Ptr9, *x_Ptr10, *x_Ptr11, *x_Ptr12, *x_Ptr13, *x_Ptr14, *x_Ptr15, *x_Ptr16, *x_Ptr17, *x_Ptr18, *x_Ptr19; + double *Yptr, *YptrEnd; + float *SFP0ptr, *SFP1ptr, *SFP2ptr, *SFP3ptr, *SFP4ptr, *SFP5ptr, *SFP6ptr, *SFP7ptr, *SFP8ptr, *SFP9ptr, *SFP10ptr, *SFP11ptr, *SFP12ptr, *SFP13ptr, *SFP14ptr, *SFP15ptr, *SFP16ptr, *SFP17ptr, *SFP18ptr, *SFP19ptr; + uint32_t *t_v, *t_vEnd, *t_f; + uint16_t *t_o; + float *t_l; + +#if nIC>=1 + // intra-cellular compartments + t_v = ICv + ICthreads[id]; + t_vEnd = ICv + ICthreads[id+1]; + t_o = ICo + ICthreads[id]; + t_l = ICl + ICthreads[id]; + t_f = ICf + ICthreads[id]; + + while( t_v != t_vEnd ) + { + x_Ptr0 = x + *t_f; + x0 = *x_Ptr0; + #if nIC>=2 + x_Ptr1 = x_Ptr0 + nF; + x1 = *x_Ptr1; + #endif + #if nIC>=3 + x_Ptr2 = x_Ptr1 + nF; + x2 = *x_Ptr2; + #endif + #if nIC>=4 + x_Ptr3 = x_Ptr2 + nF; + x3 = *x_Ptr3; + #endif + #if nIC>=5 + x_Ptr4 = x_Ptr3 + nF; + x4 = *x_Ptr4; + #endif + #if nIC>=6 + x_Ptr5 = x_Ptr4 + nF; + x5 = *x_Ptr5; + #endif + #if nIC>=7 + x_Ptr6 = x_Ptr5 + nF; + x6 = *x_Ptr6; + #endif + #if nIC>=8 + x_Ptr7 = x_Ptr6 + nF; + x7 = *x_Ptr7; + #endif + #if nIC>=9 + x_Ptr8 = x_Ptr7 + nF; + x8 = *x_Ptr8; + #endif + #if nIC>=10 + x_Ptr9 = x_Ptr8 + nF; + x9 = *x_Ptr9; + #endif + #if nIC>=11 + x_Ptr10 = x_Ptr9 + nF; + x10 = *x_Ptr10; + #endif + #if nIC>=12 + x_Ptr11 = x_Ptr10 + nF; + x11 = *x_Ptr11; + #endif + #if nIC>=13 + x_Ptr12 = x_Ptr11 + nF; + x12 = *x_Ptr12; + #endif + #if nIC>=14 + x_Ptr13 = x_Ptr12 + nF; + x13 = *x_Ptr13; + #endif + #if nIC>=15 + x_Ptr14 = x_Ptr13 + nF; + x14 = *x_Ptr14; + #endif + #if nIC>=16 + x_Ptr15 = x_Ptr14 + nF; + x15 = *x_Ptr15; + #endif + #if nIC>=17 + x_Ptr16 = x_Ptr15 + nF; + x16 = *x_Ptr16; + #endif + #if nIC>=18 + x_Ptr17 = x_Ptr16 + nF; + x17 = *x_Ptr17; + #endif + #if nIC>=19 + x_Ptr18 = x_Ptr17 + nF; + x18 = *x_Ptr18; + #endif + #if nIC>=20 + x_Ptr19 = x_Ptr18 + nF; + x19 = *x_Ptr19; + #endif + + if ( x0 != 0 + #if nIC>=2 + || x1 != 0 + #endif + #if nIC>=3 + || x2 != 0 + #endif + #if nIC>=4 + || x3 != 0 + #endif + #if nIC>=5 + || x4 != 0 + #endif + #if nIC>=6 + || x5 != 0 + #endif + #if nIC>=7 + || x6 != 0 + #endif + #if nIC>=8 + || x7 != 0 + #endif + #if nIC>=9 + || x8 != 0 + #endif + #if nIC>=10 + || x9 != 0 + #endif + #if nIC>=11 + || x10 != 0 + #endif + #if nIC>=12 + || x11 != 0 + #endif + #if nIC>=13 + || x12 != 0 + #endif + #if nIC>=14 + || x13 != 0 + #endif + #if nIC>=15 + || x14 != 0 + #endif + #if nIC>=16 + || x15 != 0 + #endif + #if nIC>=17 + || x16 != 0 + #endif + #if nIC>=18 + || x17 != 0 + #endif + #if nIC>=19 + || x18 != 0 + #endif + #if nIC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + w = (double)(*t_l); + offset = nS * (*t_o); + SFP0ptr = wmrSFP0 + offset; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + 
#endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += w * ( + x0 * (*SFP0ptr++) + #if nIC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nIC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nIC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nIC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nIC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nIC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nIC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nIC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nIC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nIC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nIC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nIC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nIC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nIC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nIC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nIC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nIC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nIC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nIC>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + + t_f++; + t_v++; + t_o++; + t_l++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreads[id]; + t_vEnd = ECv + ECthreads[id+1]; + t_o = ECo + ECthreads[id]; + + x_Ptr0 = x + nIC*nF + ECthreads[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nEC>=2 + x1 = *x_Ptr1++; + #endif + #if nEC>=3 + x2 = *x_Ptr2++; + #endif + #if nEC>=4 + x3 = *x_Ptr3++; + #endif + #if nEC>=5 + x4 = *x_Ptr4++; + #endif + #if nEC>=6 + x5 = *x_Ptr5++; + #endif + #if nEC>=7 + x6 = *x_Ptr6++; + #endif + #if nEC>=8 + x7 = *x_Ptr7++; + #endif + #if nEC>=9 + x8 = *x_Ptr8++; + #endif + #if nEC>=10 + x9 = *x_Ptr9++; + #endif + #if nEC>=11 + x10 = *x_Ptr10++; + #endif + #if nEC>=12 + x11 = *x_Ptr11++; + #endif + #if nEC>=13 + x12 = *x_Ptr12++; + #endif + #if nEC>=14 + x13 = *x_Ptr13++; + #endif + #if nEC>=15 + x14 = *x_Ptr14++; + #endif + #if nEC>=16 + x15 = *x_Ptr15++; + #endif + #if nEC>=17 + x16 = *x_Ptr16++; + #endif + #if nEC>=18 + x17 = *x_Ptr17++; + #endif + #if nEC>=19 + x18 = *x_Ptr18++; + #endif + #if nEC>=20 + x19 = *x_Ptr19++; + #endif + if ( + x0 != 0 + #if nEC>=2 + || x1 != 0 + #endif + #if nEC>=3 + || x2 != 0 
+ #endif + #if nEC>=4 + || x3 != 0 + #endif + #if nEC>=5 + || x4 != 0 + #endif + #if nEC>=6 + || x5 != 0 + #endif + #if nEC>=7 + || x6 != 0 + #endif + #if nEC>=8 + || x7 != 0 + #endif + #if nEC>=9 + || x8 != 0 + #endif + #if nEC>=10 + || x9 != 0 + #endif + #if nEC>=11 + || x10 != 0 + #endif + #if nEC>=12 + || x11 != 0 + #endif + #if nEC>=13 + || x12 != 0 + #endif + #if nEC>=14 + || x13 != 0 + #endif + #if nEC>=15 + || x14 != 0 + #endif + #if nEC>=16 + || x15 != 0 + #endif + #if nEC>=17 + || x16 != 0 + #endif + #if nEC>=18 + || x17 != 0 + #endif + #if nEC>=19 + || x18 != 0 + #endif + #if nEC>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + SFP0ptr = wmhSFP0 + offset; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + #endif + #if nEC>=12 + SFP11ptr = wmhSFP11 + offset; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nEC>=2 + + x1 * (*SFP1ptr++) + #endif + #if nEC>=3 + + x2 * (*SFP2ptr++) + #endif + #if nEC>=4 + + x3 * (*SFP3ptr++) + #endif + #if nEC>=5 + + x4 * (*SFP4ptr++) + #endif + #if nEC>=6 + + x5 * (*SFP5ptr++) + #endif + #if nEC>=7 + + x6 * (*SFP6ptr++) + #endif + #if nEC>=8 + + x7 * (*SFP7ptr++) + #endif + #if nEC>=9 + + x8 * (*SFP8ptr++) + #endif + #if nEC>=10 + + x9 * (*SFP9ptr++) + #endif + #if nEC>=11 + + x10 * (*SFP10ptr++) + #endif + #if nEC>=12 + + x11 * (*SFP11ptr++) + #endif + #if nEC>=13 + + x12 * (*SFP12ptr++) + #endif + #if nEC>=14 + + x13 * (*SFP13ptr++) + #endif + #if nEC>=15 + + x14 * (*SFP14ptr++) + #endif + #if nEC>=16 + + x15 * (*SFP15ptr++) + #endif + #if nEC>=17 + + x16 * (*SFP16ptr++) + #endif + #if nEC>=18 + + x17 * (*SFP17ptr++) + #endif + #if nEC>=19 + + x18 * (*SFP18ptr++) + #endif + #if nEC>=20 + + x19 * (*SFP19ptr++) + #endif + + ); + } + t_v++; + t_o++; + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreads[id]; + t_vEnd = ISOv + ISOthreads[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreads[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + x_Ptr14 = x_Ptr13 + nV; 
+ #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + x0 = *x_Ptr0++; + #if nISO>=2 + x1 = *x_Ptr1++; + #endif + #if nISO>=3 + x2 = *x_Ptr2++; + #endif + #if nISO>=4 + x3 = *x_Ptr3++; + #endif + #if nISO>=5 + x4 = *x_Ptr4++; + #endif + #if nISO>=6 + x5 = *x_Ptr5++; + #endif + #if nISO>=7 + x6 = *x_Ptr6++; + #endif + #if nISO>=8 + x7 = *x_Ptr7++; + #endif + #if nISO>=9 + x8 = *x_Ptr8++; + #endif + #if nISO>=10 + x9 = *x_Ptr9++; + #endif + #if nISO>=11 + x10 = *x_Ptr10++; + #endif + #if nISO>=12 + x11 = *x_Ptr11++; + #endif + #if nISO>=13 + x12 = *x_Ptr12++; + #endif + #if nISO>=14 + x13 = *x_Ptr13++; + #endif + #if nISO>=15 + x14 = *x_Ptr14++; + #endif + #if nISO>=16 + x15 = *x_Ptr15++; + #endif + #if nISO>=17 + x16 = *x_Ptr16++; + #endif + #if nISO>=18 + x17 = *x_Ptr17++; + #endif + #if nISO>=19 + x18 = *x_Ptr18++; + #endif + #if nISO>=20 + x19 = *x_Ptr19++; + #endif + + if ( + x0 != 0 + #if nISO>=2 + || x1 != 0 + #endif + #if nISO>=3 + || x2 != 0 + #endif + #if nISO>=4 + || x3 != 0 + #endif + #if nISO>=5 + || x4 != 0 + #endif + #if nISO>=6 + || x5 != 0 + #endif + #if nISO>=7 + || x6 != 0 + #endif + #if nISO>=8 + || x7 != 0 + #endif + #if nISO>=9 + || x8 != 0 + #endif + #if nISO>=10 + || x9 != 0 + #endif + #if nISO>=11 + || x10 != 0 + #endif + #if nISO>=12 + || x11 != 0 + #endif + #if nISO>=13 + || x12 != 0 + #endif + #if nISO>=14 + || x13 != 0 + #endif + #if nISO>=15 + || x14 != 0 + #endif + #if nISO>=16 + || x15 != 0 + #endif + #if nISO>=17 + || x16 != 0 + #endif + #if nISO>=18 + || x17 != 0 + #endif + #if nISO>=19 + || x18 != 0 + #endif + #if nISO>=20 + || x19 != 0 + #endif + ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + while( Yptr != YptrEnd ) + (*Yptr++) += ( + x0 * (*SFP0ptr++) + #if nISO>=2 + + x1 * (*SFP1ptr++) + #endif + #if nISO>=3 + + x2 * (*SFP2ptr++) + #endif + #if nISO>=4 + + x3 * (*SFP3ptr++) + #endif + #if nISO>=5 + + x4 * (*SFP4ptr++) + #endif + #if nISO>=6 + + x5 * (*SFP5ptr++) + #endif + #if nISO>=7 + + x6 * (*SFP6ptr++) + #endif + #if nISO>=8 + + x7 * (*SFP7ptr++) + #endif + #if nISO>=9 + + x8 * (*SFP8ptr++) + #endif + #if nISO>=10 + + x9 * (*SFP9ptr++) + #endif + #if nISO>=11 + + x10 * (*SFP10ptr++) + #endif + #if nISO>=12 + + x11 * (*SFP11ptr++) + #endif + #if nISO>=13 + + x12 * (*SFP12ptr++) + #endif + #if nISO>=14 + + x13 * (*SFP13ptr++) + #endif + #if nISO>=15 + + x14 * (*SFP14ptr++) + #endif + #if nISO>=16 + + x15 * (*SFP15ptr++) + #endif + #if 
nISO>=17 + + x16 * (*SFP16ptr++) + #endif + #if nISO>=18 + + x17 * (*SFP17ptr++) + #endif + #if nISO>=19 + + x18 * (*SFP18ptr++) + #endif + #if nISO>=20 + + x19 * (*SFP19ptr++) + #endif + ); + } + t_v++; + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_A( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint32_t* _ICthreads, uint32_t* _ECthreads, uint32_t* _ISOthreads +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vIN; + Y = _vOUT; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = 
isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreads = _ICthreads; + ECthreads = _ECthreads; + ISOthreads = _ISOthreads; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t=1 + // intra-cellular compartments + t_v = ICv; + t_vEnd = ICv + n; + t_o = ICo; + t_l = ICl; + t_f = ICf; + t_t = ICthreadsT; + + while( t_v != t_vEnd ) + { + // in this case, I need to walk throug because the segments are ordered in "voxel order" + if ( *t_t == id ) + { + Yptr = Y + nS * (*t_v); + YptrEnd = Yptr + nS; + offset = nS * (*t_o); + + Y_tmp = *Yptr; + SFP0ptr = wmrSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + SFP1ptr = wmrSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + SFP2ptr = wmrSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + SFP3ptr = wmrSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + SFP4ptr = wmrSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + SFP5ptr = wmrSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + SFP6ptr = wmrSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + SFP7ptr = wmrSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + SFP8ptr = wmrSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + SFP9ptr = wmrSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + SFP10ptr = wmrSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + SFP11ptr = wmrSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + SFP12ptr = wmrSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nIC>=14 + SFP13ptr = wmrSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + SFP14ptr = wmrSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + SFP15ptr = wmrSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + SFP16ptr = wmrSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + SFP17ptr = wmrSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + SFP18ptr = wmrSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + SFP19ptr = wmrSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nIC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nIC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nIC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nIC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nIC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nIC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nIC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nIC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nIC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nIC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nIC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nIC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + 
#endif + #if nIC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nIC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nIC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nIC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nIC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nIC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nIC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + w = (double)(*t_l); + x[*t_f] += w * x0; + #if nIC>=2 + x[*t_f+nF] += w * x1; + #endif + #if nIC>=3 + x[*t_f+2*nF] += w * x2; + #endif + #if nIC>=4 + x[*t_f+3*nF] += w * x3; + #endif + #if nIC>=5 + x[*t_f+4*nF] += w * x4; + #endif + #if nIC>=6 + x[*t_f+5*nF] += w * x5; + #endif + #if nIC>=7 + x[*t_f+6*nF] += w * x6; + #endif + #if nIC>=8 + x[*t_f+7*nF] += w * x7; + #endif + #if nIC>=9 + x[*t_f+8*nF] += w * x8; + #endif + #if nIC>=10 + x[*t_f+9*nF] += w * x9; + #endif + #if nIC>=11 + x[*t_f+10*nF] += w * x10; + #endif + #if nIC>=12 + x[*t_f+11*nF] += w * x11; + #endif + #if nIC>=13 + x[*t_f+12*nF] += w * x12; + #endif + #if nIC>=14 + x[*t_f+13*nF] += w * x13; + #endif + #if nIC>=15 + x[*t_f+14*nF] += w * x14; + #endif + #if nIC>=16 + x[*t_f+15*nF] += w * x15; + #endif + #if nIC>=17 + x[*t_f+16*nF] += w * x16; + #endif + #if nIC>=18 + x[*t_f+17*nF] += w * x17; + #endif + #if nIC>=19 + x[*t_f+18*nF] += w * x18; + #endif + #if nIC>=20 + x[*t_f+19*nF] += w * x19; + #endif + } + + t_f++; + t_v++; + t_o++; + t_l++; + t_t++; + } +#endif + +#if nEC>=1 + // extra-cellular compartments + t_v = ECv + ECthreadsT[id]; + t_vEnd = ECv + ECthreadsT[id+1]; + t_o = ECo + ECthreadsT[id]; + + x_Ptr0 = x + nIC*nF + ECthreadsT[id]; + #if nEC>=2 + x_Ptr1 = x_Ptr0 + nE; + #endif + #if nEC>=3 + x_Ptr2 = x_Ptr1 + nE; + #endif + #if nEC>=4 + x_Ptr3 = x_Ptr2 + nE; + #endif + #if nEC>=5 + x_Ptr4 = x_Ptr3 + nE; + #endif + #if nEC>=6 + x_Ptr5 = x_Ptr4 + nE; + #endif + #if nEC>=7 + x_Ptr6 = x_Ptr5 + nE; + #endif + #if nEC>=8 + x_Ptr7 = x_Ptr6 + nE; + #endif + #if nEC>=9 + x_Ptr8 = x_Ptr7 + nE; + #endif + #if nEC>=10 + x_Ptr9 = x_Ptr8 + nE; + #endif + #if nEC>=11 + x_Ptr10 = x_Ptr9 + nE; + #endif + #if nEC>=12 + x_Ptr11 = x_Ptr10 + nE; + #endif + #if nEC>=13 + x_Ptr12 = x_Ptr11 + nE; + #endif + #if nEC>=14 + x_Ptr13 = x_Ptr12 + nE; + #endif + #if nEC>=15 + x_Ptr14 = x_Ptr13 + nE; + #endif + #if nEC>=16 + x_Ptr15 = x_Ptr14 + nE; + #endif + #if nEC>=17 + x_Ptr16 = x_Ptr15 + nE; + #endif + #if nEC>=18 + x_Ptr17 = x_Ptr16 + nE; + #endif + #if nEC>=19 + x_Ptr18 = x_Ptr17 + nE; + #endif + #if nEC>=20 + x_Ptr19 = x_Ptr18 + nE; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + offset = nS * (*t_o++); + + Y_tmp = *Yptr; + SFP0ptr = wmhSFP0 + offset; + x0 = (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + SFP1ptr = wmhSFP1 + offset; + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + SFP2ptr = wmhSFP2 + offset; + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + SFP3ptr = wmhSFP3 + offset; + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + SFP4ptr = wmhSFP4 + offset; + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + SFP5ptr = wmhSFP5 + offset; + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + SFP6ptr = wmhSFP6 + offset; + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + SFP7ptr = wmhSFP7 + offset; + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + SFP8ptr = wmhSFP8 + offset; + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + SFP9ptr = wmhSFP9 + offset; + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + SFP10ptr = wmhSFP10 + offset; + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + 
SFP11ptr = wmhSFP11 + offset; + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + SFP12ptr = wmhSFP12 + offset; + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + SFP13ptr = wmhSFP13 + offset; + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + SFP14ptr = wmhSFP14 + offset; + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + SFP15ptr = wmhSFP15 + offset; + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + SFP16ptr = wmhSFP16 + offset; + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + SFP17ptr = wmhSFP17 + offset; + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + SFP18ptr = wmhSFP18 + offset; + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + SFP19ptr = wmhSFP19 + offset; + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nEC>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nEC>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nEC>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nEC>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nEC>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nEC>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nEC>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nEC>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nEC>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nEC>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nEC>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nEC>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nEC>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nEC>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nEC>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nEC>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nEC>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nEC>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nEC>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + (*x_Ptr0++) += x0; + #if nEC>=2 + (*x_Ptr1++) += x1; + #endif + #if nEC>=3 + (*x_Ptr2++) += x2; + #endif + #if nEC>=4 + (*x_Ptr3++) += x3; + #endif + #if nEC>=5 + (*x_Ptr4++) += x4; + #endif + #if nEC>=6 + (*x_Ptr5++) += x5; + #endif + #if nEC>=7 + (*x_Ptr6++) += x6; + #endif + #if nEC>=8 + (*x_Ptr7++) += x7; + #endif + #if nEC>=9 + (*x_Ptr8++) += x8; + #endif + #if nEC>=10 + (*x_Ptr9++) += x9; + #endif + #if nEC>=11 + (*x_Ptr10++) += x10; + #endif + #if nEC>=12 + (*x_Ptr11++) += x11; + #endif + #if nEC>=13 + (*x_Ptr12++) += x12; + #endif + #if nEC>=14 + (*x_Ptr13++) += x13; + #endif + #if nEC>=15 + (*x_Ptr14++) += x14; + #endif + #if nEC>=16 + (*x_Ptr15++) += x15; + #endif + #if nEC>=17 + (*x_Ptr16++) += x16; + #endif + #if nEC>=18 + (*x_Ptr17++) += x17; + #endif + #if nEC>=19 + (*x_Ptr18++) += x18; + #endif + #if nEC>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + +#if nISO>=1 + // isotropic compartments + t_v = ISOv + ISOthreadsT[id]; + t_vEnd = ISOv + ISOthreadsT[id+1]; + + x_Ptr0 = x + nIC*nF + nEC*nE + ISOthreadsT[id]; + #if nISO>=2 + x_Ptr1 = x_Ptr0 + nV; + #endif + #if nISO>=3 + x_Ptr2 = x_Ptr1 + nV; + #endif + #if nISO>=4 + x_Ptr3 = x_Ptr2 + nV; + #endif + #if nISO>=5 + x_Ptr4 = x_Ptr3 + nV; + #endif + #if nISO>=6 + x_Ptr5 = x_Ptr4 + nV; + #endif + #if nISO>=7 + x_Ptr6 = x_Ptr5 + nV; + #endif + #if nISO>=8 + x_Ptr7 = x_Ptr6 + nV; + #endif + #if nISO>=9 + x_Ptr8 = x_Ptr7 + nV; + #endif + #if nISO>=10 + x_Ptr9 = x_Ptr8 + nV; + #endif + #if nISO>=11 + x_Ptr10 = x_Ptr9 + nV; + #endif + #if nISO>=12 + x_Ptr11 = x_Ptr10 + nV; + #endif + #if nISO>=13 + x_Ptr12 = x_Ptr11 + nV; + #endif + #if nISO>=14 + x_Ptr13 = x_Ptr12 + nV; + #endif + #if nISO>=15 + 
x_Ptr14 = x_Ptr13 + nV; + #endif + #if nISO>=16 + x_Ptr15 = x_Ptr14 + nV; + #endif + #if nISO>=17 + x_Ptr16 = x_Ptr15 + nV; + #endif + #if nISO>=18 + x_Ptr17 = x_Ptr16 + nV; + #endif + #if nISO>=19 + x_Ptr18 = x_Ptr17 + nV; + #endif + #if nISO>=20 + x_Ptr19 = x_Ptr18 + nV; + #endif + + while( t_v != t_vEnd ) + { + Yptr = Y + nS * (*t_v++); + YptrEnd = Yptr + nS; + + SFP0ptr = isoSFP0; + #if nISO>=2 + SFP1ptr = isoSFP1; + #endif + #if nISO>=3 + SFP2ptr = isoSFP2; + #endif + #if nISO>=4 + SFP3ptr = isoSFP3; + #endif + #if nISO>=5 + SFP4ptr = isoSFP4; + #endif + #if nISO>=6 + SFP5ptr = isoSFP5; + #endif + #if nISO>=7 + SFP6ptr = isoSFP6; + #endif + #if nISO>=8 + SFP7ptr = isoSFP7; + #endif + #if nISO>=9 + SFP8ptr = isoSFP8; + #endif + #if nISO>=10 + SFP9ptr = isoSFP9; + #endif + #if nISO>=11 + SFP10ptr = isoSFP10; + #endif + #if nISO>=12 + SFP11ptr = isoSFP11; + #endif + #if nISO>=13 + SFP12ptr = isoSFP12; + #endif + #if nISO>=14 + SFP13ptr = isoSFP13; + #endif + #if nISO>=15 + SFP14ptr = isoSFP14; + #endif + #if nISO>=16 + SFP15ptr = isoSFP15; + #endif + #if nISO>=17 + SFP16ptr = isoSFP16; + #endif + #if nISO>=18 + SFP17ptr = isoSFP17; + #endif + #if nISO>=19 + SFP18ptr = isoSFP18; + #endif + #if nISO>=20 + SFP19ptr = isoSFP19; + #endif + + Y_tmp = *Yptr; + x0 = (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 = (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 = (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 = (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 = (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 = (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 = (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 = (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 = (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 = (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 = (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 = (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 = (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 = (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 = (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 = (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 = (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 = (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 = (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 = (*SFP19ptr++) * Y_tmp; + #endif + + while( ++Yptr != YptrEnd ) + { + Y_tmp = *Yptr; + x0 += (*SFP0ptr++) * Y_tmp; + #if nISO>=2 + x1 += (*SFP1ptr++) * Y_tmp; + #endif + #if nISO>=3 + x2 += (*SFP2ptr++) * Y_tmp; + #endif + #if nISO>=4 + x3 += (*SFP3ptr++) * Y_tmp; + #endif + #if nISO>=5 + x4 += (*SFP4ptr++) * Y_tmp; + #endif + #if nISO>=6 + x5 += (*SFP5ptr++) * Y_tmp; + #endif + #if nISO>=7 + x6 += (*SFP6ptr++) * Y_tmp; + #endif + #if nISO>=8 + x7 += (*SFP7ptr++) * Y_tmp; + #endif + #if nISO>=9 + x8 += (*SFP8ptr++) * Y_tmp; + #endif + #if nISO>=10 + x9 += (*SFP9ptr++) * Y_tmp; + #endif + #if nISO>=11 + x10 += (*SFP10ptr++) * Y_tmp; + #endif + #if nISO>=12 + x11 += (*SFP11ptr++) * Y_tmp; + #endif + #if nISO>=13 + x12 += (*SFP12ptr++) * Y_tmp; + #endif + #if nISO>=14 + x13 += (*SFP13ptr++) * Y_tmp; + #endif + #if nISO>=15 + x14 += (*SFP14ptr++) * Y_tmp; + #endif + #if nISO>=16 + x15 += (*SFP15ptr++) * Y_tmp; + #endif + #if nISO>=17 + x16 += (*SFP16ptr++) * Y_tmp; + #endif + #if nISO>=18 + x17 += (*SFP17ptr++) * Y_tmp; + #endif + #if nISO>=19 + x18 += (*SFP18ptr++) * Y_tmp; + #endif + #if nISO>=20 + x19 += (*SFP19ptr++) * Y_tmp; + #endif + } + + (*x_Ptr0++) += x0; + #if nISO>=2 + (*x_Ptr1++) += x1; + #endif + #if nISO>=3 + (*x_Ptr2++) += x2; + 
#endif + #if nISO>=4 + (*x_Ptr3++) += x3; + #endif + #if nISO>=5 + (*x_Ptr4++) += x4; + #endif + #if nISO>=6 + (*x_Ptr5++) += x5; + #endif + #if nISO>=7 + (*x_Ptr6++) += x6; + #endif + #if nISO>=8 + (*x_Ptr7++) += x7; + #endif + #if nISO>=9 + (*x_Ptr8++) += x8; + #endif + #if nISO>=10 + (*x_Ptr9++) += x9; + #endif + #if nISO>=11 + (*x_Ptr10++) += x10; + #endif + #if nISO>=12 + (*x_Ptr11++) += x11; + #endif + #if nISO>=13 + (*x_Ptr12++) += x12; + #endif + #if nISO>=14 + (*x_Ptr13++) += x13; + #endif + #if nISO>=15 + (*x_Ptr14++) += x14; + #endif + #if nISO>=16 + (*x_Ptr15++) += x15; + #endif + #if nISO>=17 + (*x_Ptr16++) += x16; + #endif + #if nISO>=18 + (*x_Ptr17++) += x17; + #endif + #if nISO>=19 + (*x_Ptr18++) += x18; + #endif + #if nISO>=20 + (*x_Ptr19++) += x19; + #endif + } +#endif + + pthread_exit( 0 ); +} + + +// ========================= +// Function called by CYTHON +// ========================= +void COMMIT_At( + int _nF, int _n, int _nE, int _nV, int _nS, int _ndirs, + double *_vIN, double *_vOUT, + uint32_t *_ICf, uint32_t *_ICv, uint16_t *_ICo, float *_ICl, + uint32_t *_ECv, uint16_t *_ECo, + uint32_t *_ISOv, + float *_wmrSFP, float *_wmhSFP, float *_isoSFP, + uint8_t* _ICthreadsT, uint32_t* _ECthreadsT, uint32_t* _ISOthreadsT +) +{ + nF = _nF; + n = _n; + nE = _nE; + nV = _nV; + nS = _nS; + ndirs = _ndirs; + + x = _vOUT; + Y = _vIN; + + ICf = _ICf; + ICv = _ICv; + ICo = _ICo; + ICl = _ICl; + ECv = _ECv; + ECo = _ECo; + ISOv = _ISOv; + + #if nIC>=1 + wmrSFP0 = _wmrSFP; + #if nIC>=2 + wmrSFP1 = wmrSFP0 + _ndirs*_nS; + #if nIC>=3 + wmrSFP2 = wmrSFP1 + _ndirs*_nS; + #if nIC>=4 + wmrSFP3 = wmrSFP2 + _ndirs*_nS; + #if nIC>=5 + wmrSFP4 = wmrSFP3 + _ndirs*_nS; + #if nIC>=6 + wmrSFP5 = wmrSFP4 + _ndirs*_nS; + #if nIC>=7 + wmrSFP6 = wmrSFP5 + _ndirs*_nS; + #if nIC>=8 + wmrSFP7 = wmrSFP6 + _ndirs*_nS; + #if nIC>=9 + wmrSFP8 = wmrSFP7 + _ndirs*_nS; + #if nIC>=10 + wmrSFP9 = wmrSFP8 + _ndirs*_nS; + #if nIC>=11 + wmrSFP10 = wmrSFP9 + _ndirs*_nS; + #if nIC>=12 + wmrSFP11 = wmrSFP10 + _ndirs*_nS; + #if nIC>=13 + wmrSFP12 = wmrSFP11 + _ndirs*_nS; + #if nIC>=14 + wmrSFP13 = wmrSFP12 + _ndirs*_nS; + #if nIC>=15 + wmrSFP14 = wmrSFP13 + _ndirs*_nS; + #if nIC>=16 + wmrSFP15 = wmrSFP14 + _ndirs*_nS; + #if nIC>=17 + wmrSFP16 = wmrSFP15 + _ndirs*_nS; + #if nIC>=18 + wmrSFP17 = wmrSFP16 + _ndirs*_nS; + #if nIC>=19 + wmrSFP18 = wmrSFP17 + _ndirs*_nS; + #if nIC>=20 + wmrSFP19 = wmrSFP18 + _ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nEC>=1 + wmhSFP0 = _wmhSFP; + #if nEC>=2 + wmhSFP1 = wmhSFP0 + _ndirs*_nS; + #if nEC>=3 + wmhSFP2 = wmhSFP1 + _ndirs*_nS; + #if nEC>=4 + wmhSFP3 = wmhSFP2 + _ndirs*_nS; + #if nEC>=5 + wmhSFP4 = wmhSFP3 + _ndirs*_nS; + #if nEC>=6 + wmhSFP5 = wmhSFP4 + _ndirs*_nS; + #if nEC>=7 + wmhSFP6 = wmhSFP5 + _ndirs*_nS; + #if nEC>=8 + wmhSFP7 = wmhSFP6 + _ndirs*_nS; + #if nEC>=9 + wmhSFP8 = wmhSFP7 + _ndirs*_nS; + #if nEC>=10 + wmhSFP9 = wmhSFP8 + _ndirs*_nS; + #if nEC>=11 + wmhSFP10 = wmhSFP9 + _ndirs*_nS; + #if nEC>=12 + wmhSFP11 = wmhSFP10 + _ndirs*_nS; + #if nEC>=13 + wmhSFP12 = wmhSFP11 + _ndirs*_nS; + #if nEC>=14 + wmhSFP13 = wmhSFP12 + _ndirs*_nS; + #if nEC>=15 + wmhSFP14 = wmhSFP13 + _ndirs*_nS; + #if nEC>=16 + wmhSFP15 = wmhSFP14 + _ndirs*_nS; + #if nEC>=17 + wmhSFP16 = wmhSFP15 + _ndirs*_nS; + #if nEC>=18 + wmhSFP17 = wmhSFP16 + _ndirs*_nS; + #if nEC>=19 + wmhSFP18 = wmhSFP17 + _ndirs*_nS; + #if nEC>=20 + wmhSFP19 = wmhSFP18 + 
_ndirs*_nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #if nISO>=1 + isoSFP0 = _isoSFP; + #if nISO>=2 + isoSFP1 = isoSFP0 + _nS; + #if nISO>=3 + isoSFP2 = isoSFP1 + _nS; + #if nISO>=4 + isoSFP3 = isoSFP2 + _nS; + #if nISO>=5 + isoSFP4 = isoSFP3 + _nS; + #if nISO>=6 + isoSFP5 = isoSFP4 + _nS; + #if nISO>=7 + isoSFP6 = isoSFP5 + _nS; + #if nISO>=8 + isoSFP7 = isoSFP6 + _nS; + #if nISO>=9 + isoSFP8 = isoSFP7 + _nS; + #if nISO>=10 + isoSFP9 = isoSFP8 + _nS; + #if nISO>=11 + isoSFP10 = isoSFP9 + _nS; + #if nISO>=12 + isoSFP11 = isoSFP10 + _nS; + #if nISO>=13 + isoSFP12 = isoSFP11 + _nS; + #if nISO>=14 + isoSFP13 = isoSFP12 + _nS; + #if nISO>=15 + isoSFP14 = isoSFP13 + _nS; + #if nISO>=16 + isoSFP15 = isoSFP14 + _nS; + #if nISO>=17 + isoSFP16 = isoSFP15 + _nS; + #if nISO>=18 + isoSFP17 = isoSFP16 + _nS; + #if nISO>=19 + isoSFP18 = isoSFP17 + _nS; + #if nISO>=20 + isoSFP19 = isoSFP18 + _nS; + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + ICthreadsT = _ICthreadsT; + ECthreadsT = _ECthreadsT; + ISOthreadsT = _ISOthreadsT; + + // Run SEPARATE THREADS to perform the multiplication + pthread_t threads[nTHREADS]; + int t; + for(t=0; t lam : - k = 1. - lam/xn - for i in xrange(compartment_start, compartment_start+compartment_size): - x[i] = x[i]*k - else : - for i in xrange(compartment_start, compartment_start+compartment_size): - x[i] = 0 - return np.asarray( x ) - - -cpdef omega_group_sparsity(double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n) : - """ - References: - [1] Jenatton et al. - `Proximal Methods for Hierarchical Sparse Coding` - """ - cdef: - int nG = group_size.size, N - int k, i, j = 0 - double omega = 0.0, gNorm, x_i - - if lam != 0: - if n == 2: - for k in xrange(nG): - N = group_size[k] - gNorm = 0.0 - for i in xrange(j,j+N) : - x_i = x[group_idx[i]] - gNorm += x_i*x_i - omega += group_weight[k] * sqrt( gNorm ) - j += N - elif n == np.inf: - for k in xrange(nG): - N = group_size[k] - gNorm = x[group_idx[j]] - for i in xrange(j+1,j+N) : - x_i = x[group_idx[i]] - if x_i > gNorm : - gNorm = x_i - omega += group_weight[k] * gNorm - j += N - return lam*omega - - -cpdef prox_group_sparsity( double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n ) : - """ - References: - [1] Jenatton et al. 
- `Proximal Methods for Hierarchical Sparse Coding` - """ - cdef: - int nG = group_size.size, N - int k, i, j = 0 - double wl, gNorm, x_i - - k = x.size - for i in xrange(k): - if x[i] <= 0.0: - x[i] = 0.0 - - if lam != 0: - if n == 2 : - for k in xrange(nG) : - N = group_size[k] - gNorm = 0.0 - for i in xrange(j,j+N) : - x_i = x[group_idx[i]] - gNorm += x_i*x_i - gNorm = sqrt( gNorm ) - - wl = group_weight[k] * lam - if gNorm <= wl : - for i in xrange(j,j+N) : - x[ group_idx[i] ] = 0.0 - else : - wl = (gNorm-wl)/gNorm - for i in xrange(j,j+N) : - x[ group_idx[i] ] *= wl - j += N - # elif n == np.inf : - # [TODO] TO be correctly implemented - # for k in range(nG) : - # idx = subtree[k] - # # xn = max( v[idx] ) - # r = weight[k] * lam - # for i in idx : - # if v[i] <= r: - # v[i] = 0.0 - # else : - # v[i] -= r +#!python +#cython: language_level=3, boundscheck=False, wraparound=False, profile=False +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +cimport cython +import numpy as np +cimport numpy as np +from libc.math cimport sqrt + + +cpdef non_negativity(double [::1] x, int compartment_start, int compartment_size): + """ + POCS for the first orthant (non-negativity) + """ + cdef: + int i + for i in xrange(compartment_start, compartment_start+compartment_size): + if x[i] <= 0.0 : + x[i] = 0.0 + return np.asarray( x ) + + +cpdef soft_thresholding(double [::1] x, double lam, int compartment_start, int compartment_size) : + """ + Proximal of L1 norm + """ + # NB: this preserves non-negativity + cdef: + int i + for i in xrange(compartment_start, compartment_start+compartment_size): + if x[i] <= lam: + x[i] = 0.0 + else: + x[i] = x[i] - lam + return np.asarray( x ) + + +cpdef projection_onto_l2_ball(double [::1] x, double lam, int compartment_start, int compartment_size) : + """ + Proximal of L2 norm + """ + # NB: this preserves non-negativity + cdef: + double xn = 0.0, k + int i + for i in xrange(compartment_start, compartment_start+compartment_size): + xn += x[i]*x[i] + xn = sqrt(xn) + if xn > lam : + k = 1. - lam/xn + for i in xrange(compartment_start, compartment_start+compartment_size): + x[i] = x[i]*k + else : + for i in xrange(compartment_start, compartment_start+compartment_size): + x[i] = 0 + return np.asarray( x ) + + +cpdef omega_group_sparsity(double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n) : + """ + References: + [1] Jenatton et al. - `Proximal Methods for Hierarchical Sparse Coding` + """ + cdef: + int nG = group_size.size, N + int k, i, j = 0 + double omega = 0.0, gNorm, x_i + + if lam != 0: + if n == 2: + for k in xrange(nG): + N = group_size[k] + gNorm = 0.0 + for i in xrange(j,j+N) : + x_i = x[group_idx[i]] + gNorm += x_i*x_i + omega += group_weight[k] * sqrt( gNorm ) + j += N + elif n == np.inf: + for k in xrange(nG): + N = group_size[k] + gNorm = x[group_idx[j]] + for i in xrange(j+1,j+N) : + x_i = x[group_idx[i]] + if x_i > gNorm : + gNorm = x_i + omega += group_weight[k] * gNorm + j += N + return lam*omega + + +cpdef prox_group_sparsity( double [::1] x, int [::1] group_idx, int [::1] group_size, double [::1] group_weight, double lam, double n ) : + """ + References: + [1] Jenatton et al. 
- `Proximal Methods for Hierarchical Sparse Coding` + """ + cdef: + int nG = group_size.size, N + int k, i, j = 0 + double wl, gNorm, x_i + + k = x.size + for i in xrange(k): + if x[i] <= 0.0: + x[i] = 0.0 + + if lam != 0: + if n == 2 : + for k in xrange(nG) : + N = group_size[k] + gNorm = 0.0 + for i in xrange(j,j+N) : + x_i = x[group_idx[i]] + gNorm += x_i*x_i + gNorm = sqrt( gNorm ) + + wl = group_weight[k] * lam + if gNorm <= wl : + for i in xrange(j,j+N) : + x[ group_idx[i] ] = 0.0 + else : + wl = (gNorm-wl)/gNorm + for i in xrange(j,j+N) : + x[ group_idx[i] ] *= wl + j += N + # elif n == np.inf : + # [TODO] TO be correctly implemented + # for k in range(nG) : + # idx = subtree[k] + # # xn = max( v[idx] ) + # r = weight[k] * lam + # for i in idx : + # if v[i] <= r: + # v[i] = 0.0 + # else : + # v[i] -= r return np.asarray( x ) \ No newline at end of file diff --git a/commit/solvers.py b/commit/solvers.py index 29bc8374..dc7767ce 100755 --- a/commit/solvers.py +++ b/commit/solvers.py @@ -1,403 +1,403 @@ -""" -Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona - -This structure is based on the previous work of Rafael Carrillo and was -supported by the LTS5 laboratory at EPFL, Lausanne. -""" -from __future__ import print_function -import numpy as np -from math import sqrt -import sys -import warnings -eps = np.finfo(float).eps - -from commit.proximals import (non_negativity, - omega_group_sparsity, - prox_group_sparsity, - soft_thresholding, - projection_onto_l2_ball) -group_sparsity = -1 -non_negative = 0 -norm1 = 1 -norm2 = 2 -norminf = np.inf -list_regnorms = [group_sparsity, non_negative, norm1, norm2] -list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 - - -def init_regularisation(commit_evaluation, - regnorms = (non_negative, non_negative, non_negative), - structureIC = None, weightsIC = None, group_norm = 2, - lambdas = (.0,.0,.0) ): - """ - Initialise the data structure that defines Omega in - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - - Input - ----- - commit_evaluation - commit.Evaluation object : - dictionary and model have to be loaded beforehand. - - - regnorms - tuple : - this sets the penalty term to be used for each compartment. - Default = (non_negative,non_negative,non_negative). - - regnorms[0] corresponds to the Intracellular compartment - regnorms[1] corresponds to the Extracellular compartment - regnorms[2] corresponds to the Isotropic compartment - - Each regnorms[k] must be one of commit.solvers. - {group_sparsity, non_negative, norm1, norm2}. - - commit.solvers.group_sparsity considers both the non-overlapping - and the hierarchical group sparsity (see [1]). This option is - allowed only in the IC compartment. The mathematical formulation - of this term is - $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| - - commit.solvers.non_negative puts a non negativity constraint on the - coefficients corresponding to the compartment. This is the - default option for each compartment - - commit.solvers.norm1 penalises with the 1-norm of the coefficients - corresponding to the compartment. - - commit.solvers.norm2 penalises with the 2-norm of the coefficients - corresponding to the compartment. - - - structureIC - np.array(list(list)) : - group structure for the IC compartment. - This field is necessary only if regterm[0]=commit.solver.group_sparsity. 
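For reference, the proximal operators defined in the new commit/proximals.pyx above have simple closed forms; the following standalone NumPy sketch (illustrative only, not part of the patch, and assuming non-negative inputs, lam >= 0 and the L2 group norm) mirrors soft_thresholding and prox_group_sparsity:

import numpy as np

def soft_thresholding_ref(x, lam, start, size):
    # Proximal of lam*||.||_1 on the block x[start:start+size], assuming x >= 0
    block = x[start:start+size]
    x[start:start+size] = np.maximum(block - lam, 0.0)
    return x

def prox_group_sparsity_ref(x, groups, weights, lam):
    # Block soft-thresholding: proximal of lam * sum_g w_g * ||x_g||_2
    x = np.maximum(x, 0.0)                      # non-negativity, as in the Cython code
    for g, w in zip(groups, weights):           # groups: list of index arrays
        gnorm = np.linalg.norm(x[g])
        thr = w * lam
        x[g] = 0.0 if gnorm <= thr else x[g] * (gnorm - thr) / gnorm
    return x

The Cython versions above perform the same updates, but in place on typed memoryviews for speed.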
- Example: - structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) - - that is equivalent to - [0,1,2,3,4,5] [6] - / \ - [0,2,5] [1,3,4] - which has two non overlapping groups, one of which is the union - of two other non-overlapping groups. - - - weightsIC - np.array(np.float64) : - this defines the weights associated to each group of structure IC. - - - group_norm - number : - norm type for the commit.solver.group_sparsity penalisation of the IC compartment. - Default: group_norm = commit.solver.norm2 - To be chosen among commit.solver.{norm2,norminf}. - - lambdas - tuple : - regularisation parameter for each compartment. - Default: lambdas = (0.0, 0.0, 0.0) - The lambdas correspond to the onse described in the mathematical - formulation of the regularisation term - $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ - - - References: - [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' - """ - regularisation = {} - - regularisation['startIC'] = 0 - regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) - regularisation['startEC'] = int( regularisation['sizeIC'] ) - regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) - regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) - regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) - - regularisation['normIC'] = regnorms[0] - regularisation['normEC'] = regnorms[1] - regularisation['normISO'] = regnorms[2] - - regularisation['lambdaIC'] = float( lambdas[0] ) - regularisation['lambdaEC'] = float( lambdas[1] ) - regularisation['lambdaISO'] = float( lambdas[2] ) - - # Solver-specific fields - regularisation['structureIC'] = structureIC - regularisation['weightsIC'] = weightsIC - regularisation['group_norm'] = group_norm - - return regularisation - - -def regularisation2omegaprox(regularisation): - lambdaIC = float(regularisation.get('lambdaIC')) - lambdaEC = float(regularisation.get('lambdaEC')) - lambdaISO = float(regularisation.get('lambdaISO')) - if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: - raise ValueError('Negative regularisation parameters are not allowed') - - normIC = regularisation.get('normIC') - normEC = regularisation.get('normEC') - normISO = regularisation.get('normISO') - if not normIC in list_regnorms: - raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normEC in list_regnorms: - raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - if not normISO in list_regnorms: - raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') - - ## NNLS case - if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - return omega, prox - - ## All other cases - # Intracellular Compartment - startIC = regularisation.get('startIC') - sizeIC = regularisation.get('sizeIC') - if lambdaIC == 0.0: - omegaIC = lambda x: 0.0 - proxIC = lambda x: x - elif normIC == norm2: - omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) - proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) - elif normIC == norm1: - omegaIC = lambda x: 
lambdaIC * sum( x[startIC:sizeIC] ) - proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) - elif normIC == non_negative: - omegaIC = lambda x: 0.0 - proxIC = lambda x: non_negativity(x, startIC, sizeIC) - elif normIC == group_sparsity: - structureIC = regularisation.get('structureIC') - groupWeightIC = regularisation.get('weightsIC') - if not len(structureIC) == len(groupWeightIC): - raise ValueError('Number of groups and weights do not coincide.') - group_norm = regularisation.get('group_norm') - if not group_norm in list_group_sparsity_norms: - raise ValueError('Wrong norm in the structured sparsity term. Choose between %s.' % str(list_group_sparsity_norms)) - - # convert to new data structure (needed for faster access) - N = np.sum([g.size for g in structureIC]) - groupIdxIC = np.zeros( (N,), dtype=np.int32 ) - groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) - pos = 0 - for i, g in enumerate(structureIC) : - groupSizeIC[i] = g.size - groupIdxIC[pos:(pos+g.size)] = g[:] - pos += g.size - - omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) - else: - raise ValueError('Type of regularisation for IC compartment not recognized.') - - - # Extracellular Compartment - startEC = regularisation.get('startEC') - sizeEC = regularisation.get('sizeEC') - if lambdaEC == 0.0: - omegaEC = lambda x: 0.0 - proxEC = lambda x: x - elif normEC == norm2: - omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) - proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) - elif normEC == norm1: - omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) - proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) - elif normEC == non_negative: - omegaEC = lambda x: 0.0 - proxEC = lambda x: non_negativity(x, startEC, sizeEC) - else: - raise ValueError('Type of regularisation for EC compartment not recognized.') - - # Isotropic Compartment - startISO = regularisation.get('startISO') - sizeISO = regularisation.get('sizeISO') - if lambdaISO == 0.0: - omegaISO = lambda x: 0.0 - proxISO = lambda x: x - elif normISO == norm2: - omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) - proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) - elif normISO == norm1: - omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) - proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) - elif normISO == non_negative: - omegaISO = lambda x: 0.0 - proxISO = lambda x: non_negativity(x, startISO, sizeISO) - else: - raise ValueError('Type of regularisation for ISO compartment not recognized.') - - omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) - prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced - - return omega, prox - - -def evaluate_model(y, A, x, regularisation = None): - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, len(x)) - else: - omega, _ = regularisation2omegaprox(regularisation) - - return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) - - -def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the Omega described by 'regularisation'. 
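The start/size pairs fed to these per-compartment proximals follow the fixed ordering of the unknowns, x = [IC | EC | ISO]; a minimal sketch of how the block boundaries come out of init_regularisation (the sizes below are made-up placeholders, not values from the patch):

# Hypothetical sizes: nF streamlines, nE extra-cellular segments, nV voxels,
# and one response function (kernel) per compartment
nF, nE, nV = 10000, 5000, 2000
n_wmr, n_wmh, n_iso = 1, 1, 1

sizeIC, sizeEC, sizeISO = nF * n_wmr, nE * n_wmh, nV * n_iso
startIC, startEC, startISO = 0, sizeIC, sizeIC + sizeEC
# e.g. the EC block of the solution vector is x[startEC : startEC + sizeEC]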
- - Check the documentation of commit.solvers.init_regularisation to see how to - solve a specific problem. - """ - if regularisation is None: - omega = lambda x: 0.0 - prox = lambda x: non_negativity(x, 0, x.size) - else: - omega, prox = regularisation2omegaprox(regularisation) - - if x0 is None: - x0 = np.zeros(A.shape[1]) - - return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) - - -def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : - """ - Solve the regularised least squares problem - - argmin_x 0.5*||Ax-y||_2^2 + Omega(x) - - with the FISTA algorithm described in [1]. - - The penalty term and its proximal operator must be defined in such a way - that they already contain the regularisation parameter. - - References: - [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding - Algorithm for Linear Inverse Problems` - """ - - # Initialization - res = -y.copy() - xhat = x0.copy() - x = np.zeros_like(xhat) - res += A.dot(xhat) - proximal( xhat ) - reg_term = omega( xhat ) - prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term - - told = 1 - beta = 0.9 - prev_x = xhat.copy() - grad = np.asarray(At.dot(res)) - qfval = prev_obj - - # Step size computation - L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 - mu = 1.9 / L - - # Main loop - if verbose >= 1 : - print() - print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) - print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) - iter = 1 - while True : - if verbose >= 1 : - print( "%4d |" % iter, end="" ) - sys.stdout.flush() - - # Smooth step - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Backtracking - while curr_obj > q : - # Smooth step - mu = beta*mu - x = xhat - mu*grad - - # Non-smooth step - proximal( x ) - reg_term_x = omega( x ) - - # Check stepsize - tmp = x-xhat - q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x - res = A.dot(x) - y - res_norm = np.linalg.norm(res) - curr_obj = 0.5 * res_norm**2 + reg_term_x - - # Global stopping criterion - abs_obj = abs(curr_obj - prev_obj) - rel_obj = abs_obj / curr_obj - abs_x = np.linalg.norm(x - prev_x) - rel_x = abs_x / ( np.linalg.norm(x) + eps ) - if verbose >= 1 : - print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) - - if abs_obj < eps : - criterion = "Absolute tolerance on the objective" - break - elif rel_obj < tol_fun : - criterion = "Relative tolerance on the objective" - break - elif abs_x < eps : - criterion = "Absolute tolerance on the unknown" - break - elif rel_x < tol_x : - criterion = "Relative tolerance on the unknown" - break - elif iter >= max_iter : - criterion = "Maximum number of iterations" - break - - # FISTA update - t = 0.5 * ( 1 + sqrt(1+4*told**2) ) - xhat = x + (told-1)/t * (x - prev_x) - - # Gradient computation - res = A.dot(xhat) - y - xarr = np.asarray(x) - - grad = np.asarray(At.dot(res)) - - # Update variables - iter += 1 - prev_obj = curr_obj - prev_x = x.copy() - told = t - qfval = 0.5 * np.linalg.norm(res)**2 - - - if verbose >= 1 : - print( "< Stopping criterion: %s >" % criterion ) - - 
opt_details = {} - opt_details['residual'] = 0.5*res_norm**2 - opt_details['regterm'] = reg_term_x - opt_details['cost_function'] = curr_obj - opt_details['abs_cost'] = abs_obj - opt_details['rel_cost'] = rel_obj - opt_details['abs_x'] = abs_x - opt_details['rel _x'] = rel_x - opt_details['iterations'] = iter - opt_details['stopping_criterion'] = criterion - - return x, opt_details +""" +Author: Matteo Frigo - lts5 @ EPFL and Dep. of CS @ Univ. of Verona + +This structure is based on the previous work of Rafael Carrillo and was +supported by the LTS5 laboratory at EPFL, Lausanne. +""" +from __future__ import print_function +import numpy as np +from math import sqrt +import sys +import warnings +eps = np.finfo(float).eps + +from commit.proximals import (non_negativity, + omega_group_sparsity, + prox_group_sparsity, + soft_thresholding, + projection_onto_l2_ball) +group_sparsity = -1 +non_negative = 0 +norm1 = 1 +norm2 = 2 +norminf = np.inf +list_regnorms = [group_sparsity, non_negative, norm1, norm2] +list_group_sparsity_norms = [norm2]#, norminf] # removed because of issue #54 + + +def init_regularisation(commit_evaluation, + regnorms = (non_negative, non_negative, non_negative), + structureIC = None, weightsIC = None, group_norm = 2, + lambdas = (.0,.0,.0) ): + """ + Initialise the data structure that defines Omega in + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + + Input + ----- + commit_evaluation - commit.Evaluation object : + dictionary and model have to be loaded beforehand. + + + regnorms - tuple : + this sets the penalty term to be used for each compartment. + Default = (non_negative,non_negative,non_negative). + + regnorms[0] corresponds to the Intracellular compartment + regnorms[1] corresponds to the Extracellular compartment + regnorms[2] corresponds to the Isotropic compartment + + Each regnorms[k] must be one of commit.solvers. + {group_sparsity, non_negative, norm1, norm2}. + + commit.solvers.group_sparsity considers both the non-overlapping + and the hierarchical group sparsity (see [1]). This option is + allowed only in the IC compartment. The mathematical formulation + of this term is + $\Omega(x) = \lambda \sum_{g\in G} w_g |x_g| + + commit.solvers.non_negative puts a non negativity constraint on the + coefficients corresponding to the compartment. This is the + default option for each compartment + + commit.solvers.norm1 penalises with the 1-norm of the coefficients + corresponding to the compartment. + + commit.solvers.norm2 penalises with the 2-norm of the coefficients + corresponding to the compartment. + + + structureIC - np.array(list(list)) : + group structure for the IC compartment. + This field is necessary only if regterm[0]=commit.solver.group_sparsity. + Example: + structureIC = np.array([[0,2,5],[1,3,4],[0,1,2,3,4,5],[6]]) + + that is equivalent to + [0,1,2,3,4,5] [6] + / \ + [0,2,5] [1,3,4] + which has two non overlapping groups, one of which is the union + of two other non-overlapping groups. + + + weightsIC - np.array(np.float64) : + this defines the weights associated to each group of structure IC. + + + group_norm - number : + norm type for the commit.solver.group_sparsity penalisation of the IC compartment. + Default: group_norm = commit.solver.norm2 + To be chosen among commit.solver.{norm2,norminf}. + + lambdas - tuple : + regularisation parameter for each compartment. 
+ Default: lambdas = (0.0, 0.0, 0.0) + The lambdas correspond to the onse described in the mathematical + formulation of the regularisation term + $\Omega(x) = lambdas[0]*regnorm[0](x) + lambdas[1]*regnorm[1](x) + lambdas[2]*regnorm[2](x)$ + + + References: + [1] Jenatton et al. - 'Proximal Methods for Hierarchical Sparse Coding' + """ + regularisation = {} + + regularisation['startIC'] = 0 + regularisation['sizeIC'] = int( commit_evaluation.DICTIONARY['IC']['nF'] * commit_evaluation.KERNELS['wmr'].shape[0]) + regularisation['startEC'] = int( regularisation['sizeIC'] ) + regularisation['sizeEC'] = int( commit_evaluation.DICTIONARY['EC']['nE'] * commit_evaluation.KERNELS['wmh'].shape[0]) + regularisation['startISO'] = int( regularisation['sizeIC'] + regularisation['sizeEC'] ) + regularisation['sizeISO'] = int( commit_evaluation.DICTIONARY['nV'] * commit_evaluation.KERNELS['iso'].shape[0]) + + regularisation['normIC'] = regnorms[0] + regularisation['normEC'] = regnorms[1] + regularisation['normISO'] = regnorms[2] + + regularisation['lambdaIC'] = float( lambdas[0] ) + regularisation['lambdaEC'] = float( lambdas[1] ) + regularisation['lambdaISO'] = float( lambdas[2] ) + + # Solver-specific fields + regularisation['structureIC'] = structureIC + regularisation['weightsIC'] = weightsIC + regularisation['group_norm'] = group_norm + + return regularisation + + +def regularisation2omegaprox(regularisation): + lambdaIC = float(regularisation.get('lambdaIC')) + lambdaEC = float(regularisation.get('lambdaEC')) + lambdaISO = float(regularisation.get('lambdaISO')) + if lambdaIC < 0.0 or lambdaEC < 0.0 or lambdaISO < 0.0: + raise ValueError('Negative regularisation parameters are not allowed') + + normIC = regularisation.get('normIC') + normEC = regularisation.get('normEC') + normISO = regularisation.get('normISO') + if not normIC in list_regnorms: + raise ValueError('normIC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normEC in list_regnorms: + raise ValueError('normEC must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + if not normISO in list_regnorms: + raise ValueError('normISO must be one of commit.solvers.{group_sparsity,non_negative,norm1,norm2}') + + ## NNLS case + if (lambdaIC == 0.0 and lambdaEC == 0.0 and lambdaISO == 0.0) or (normIC == non_negative and normEC == non_negative and normISO == non_negative): + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + return omega, prox + + ## All other cases + # Intracellular Compartment + startIC = regularisation.get('startIC') + sizeIC = regularisation.get('sizeIC') + if lambdaIC == 0.0: + omegaIC = lambda x: 0.0 + proxIC = lambda x: x + elif normIC == norm2: + omegaIC = lambda x: lambdaIC * np.linalg.norm(x[startIC:sizeIC]) + proxIC = lambda x: projection_onto_l2_ball(x, lambdaIC, startIC, sizeIC) + elif normIC == norm1: + omegaIC = lambda x: lambdaIC * sum( x[startIC:sizeIC] ) + proxIC = lambda x: soft_thresholding(x, lambdaIC, startIC, sizeIC) + elif normIC == non_negative: + omegaIC = lambda x: 0.0 + proxIC = lambda x: non_negativity(x, startIC, sizeIC) + elif normIC == group_sparsity: + structureIC = regularisation.get('structureIC') + groupWeightIC = regularisation.get('weightsIC') + if not len(structureIC) == len(groupWeightIC): + raise ValueError('Number of groups and weights do not coincide.') + group_norm = regularisation.get('group_norm') + if not group_norm in list_group_sparsity_norms: + raise ValueError('Wrong norm in the structured sparsity term. 
Choose between %s.' % str(list_group_sparsity_norms)) + + # convert to new data structure (needed for faster access) + N = np.sum([g.size for g in structureIC]) + groupIdxIC = np.zeros( (N,), dtype=np.int32 ) + groupSizeIC = np.zeros( (structureIC.size,), dtype=np.int32 ) + pos = 0 + for i, g in enumerate(structureIC) : + groupSizeIC[i] = g.size + groupIdxIC[pos:(pos+g.size)] = g[:] + pos += g.size + + omegaIC = lambda x: omega_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + proxIC = lambda x: prox_group_sparsity( x, groupIdxIC, groupSizeIC, groupWeightIC, lambdaIC, group_norm ) + else: + raise ValueError('Type of regularisation for IC compartment not recognized.') + + + # Extracellular Compartment + startEC = regularisation.get('startEC') + sizeEC = regularisation.get('sizeEC') + if lambdaEC == 0.0: + omegaEC = lambda x: 0.0 + proxEC = lambda x: x + elif normEC == norm2: + omegaEC = lambda x: lambdaEC * np.linalg.norm(x[startEC:(startEC+sizeEC)]) + proxEC = lambda x: projection_onto_l2_ball(x, lambdaEC, startEC, sizeEC) + elif normEC == norm1: + omegaEC = lambda x: lambdaEC * sum( x[startEC:(startEC+sizeEC)] ) + proxEC = lambda x: soft_thresholding(x, lambdaEC, startEC, sizeEC) + elif normEC == non_negative: + omegaEC = lambda x: 0.0 + proxEC = lambda x: non_negativity(x, startEC, sizeEC) + else: + raise ValueError('Type of regularisation for EC compartment not recognized.') + + # Isotropic Compartment + startISO = regularisation.get('startISO') + sizeISO = regularisation.get('sizeISO') + if lambdaISO == 0.0: + omegaISO = lambda x: 0.0 + proxISO = lambda x: x + elif normISO == norm2: + omegaISO = lambda x: lambdaISO * np.linalg.norm(x[startISO:(startISO+sizeISO)]) + proxISO = lambda x: projection_onto_l2_ball(x, lambdaISO, startISO, sizeISO) + elif normISO == norm1: + omegaISO = lambda x: lambdaISO * sum( x[startISO:(startISO+sizeISO)] ) + proxISO = lambda x: soft_thresholding(x, lambdaISO, startISO, sizeISO) + elif normISO == non_negative: + omegaISO = lambda x: 0.0 + proxISO = lambda x: non_negativity(x, startISO, sizeISO) + else: + raise ValueError('Type of regularisation for ISO compartment not recognized.') + + omega = lambda x: omegaIC(x) + omegaEC(x) + omegaISO(x) + prox = lambda x: non_negativity(proxIC(proxEC(proxISO(x))),0,x.size) # non negativity is redunduntly forced + + return omega, prox + + +def evaluate_model(y, A, x, regularisation = None): + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, len(x)) + else: + omega, _ = regularisation2omegaprox(regularisation) + + return 0.5*np.linalg.norm(A.dot(x)-y)**2 + omega(x) + + +def solve(y, A, At, tol_fun = 1e-4, tol_x = 1e-6, max_iter = 1000, verbose = 1, x0 = None, regularisation = None): + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the Omega described by 'regularisation'. + + Check the documentation of commit.solvers.init_regularisation to see how to + solve a specific problem. + """ + if regularisation is None: + omega = lambda x: 0.0 + prox = lambda x: non_negativity(x, 0, x.size) + else: + omega, prox = regularisation2omegaprox(regularisation) + + if x0 is None: + x0 = np.zeros(A.shape[1]) + + return fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, prox) + + +def fista( y, A, At, tol_fun, tol_x, max_iter, verbose, x0, omega, proximal) : + """ + Solve the regularised least squares problem + + argmin_x 0.5*||Ax-y||_2^2 + Omega(x) + + with the FISTA algorithm described in [1]. 
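As a usage illustration (not part of the patch), the group-sparsity case described in the init_regularisation docstring could be set up as follows; here commit_evaluation stands for an already-loaded commit.Evaluation object, y/A/At for the data vector and the forward/adjoint operators, and the group structure, weights and lambdas are made-up values:

import numpy as np
from commit import solvers

# commit_evaluation, y, A, At are assumed to be defined elsewhere (placeholders)

# Toy hierarchical structure taken from the docstring example
structureIC = np.array([np.array([0, 2, 5]), np.array([1, 3, 4]),
                        np.array([0, 1, 2, 3, 4, 5]), np.array([6])], dtype=object)
weightsIC = np.array([1.0, 1.0, 0.5, 1.0])

reg = solvers.init_regularisation(
    commit_evaluation,
    regnorms=(solvers.group_sparsity, solvers.non_negative, solvers.non_negative),
    structureIC=structureIC, weightsIC=weightsIC,
    group_norm=solvers.norm2, lambdas=(0.5, 0.0, 0.0))

x, details = solvers.solve(y, A, At, tol_fun=1e-4, max_iter=1000, regularisation=reg)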
+ + The penalty term and its proximal operator must be defined in such a way + that they already contain the regularisation parameter. + + References: + [1] Beck & Teboulle - `A Fast Iterative Shrinkage Thresholding + Algorithm for Linear Inverse Problems` + """ + + # Initialization + res = -y.copy() + xhat = x0.copy() + x = np.zeros_like(xhat) + res += A.dot(xhat) + proximal( xhat ) + reg_term = omega( xhat ) + prev_obj = 0.5 * np.linalg.norm(res)**2 + reg_term + + told = 1 + beta = 0.9 + prev_x = xhat.copy() + grad = np.asarray(At.dot(res)) + qfval = prev_obj + + # Step size computation + L = ( np.linalg.norm( A.dot(grad) ) / np.linalg.norm(grad) )**2 + mu = 1.9 / L + + # Main loop + if verbose >= 1 : + print() + print( " | 1/2||Ax-y||^2 Omega | Cost function Abs error Rel error | Abs x Rel x " ) + print( "------|--------------------------------|-----------------------------------------------|------------------------------" ) + iter = 1 + while True : + if verbose >= 1 : + print( "%4d |" % iter, end="" ) + sys.stdout.flush() + + # Smooth step + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Backtracking + while curr_obj > q : + # Smooth step + mu = beta*mu + x = xhat - mu*grad + + # Non-smooth step + proximal( x ) + reg_term_x = omega( x ) + + # Check stepsize + tmp = x-xhat + q = qfval + np.real( np.dot(tmp,grad) ) + 0.5/mu * np.linalg.norm(tmp)**2 + reg_term_x + res = A.dot(x) - y + res_norm = np.linalg.norm(res) + curr_obj = 0.5 * res_norm**2 + reg_term_x + + # Global stopping criterion + abs_obj = abs(curr_obj - prev_obj) + rel_obj = abs_obj / curr_obj + abs_x = np.linalg.norm(x - prev_x) + rel_x = abs_x / ( np.linalg.norm(x) + eps ) + if verbose >= 1 : + print( " %13.7e %13.7e | %13.7e %13.7e %13.7e | %13.7e %13.7e" % ( 0.5 * res_norm**2, reg_term_x, curr_obj, abs_obj, rel_obj, abs_x, rel_x ) ) + + if abs_obj < eps : + criterion = "Absolute tolerance on the objective" + break + elif rel_obj < tol_fun : + criterion = "Relative tolerance on the objective" + break + elif abs_x < eps : + criterion = "Absolute tolerance on the unknown" + break + elif rel_x < tol_x : + criterion = "Relative tolerance on the unknown" + break + elif iter >= max_iter : + criterion = "Maximum number of iterations" + break + + # FISTA update + t = 0.5 * ( 1 + sqrt(1+4*told**2) ) + xhat = x + (told-1)/t * (x - prev_x) + + # Gradient computation + res = A.dot(xhat) - y + xarr = np.asarray(x) + + grad = np.asarray(At.dot(res)) + + # Update variables + iter += 1 + prev_obj = curr_obj + prev_x = x.copy() + told = t + qfval = 0.5 * np.linalg.norm(res)**2 + + + if verbose >= 1 : + print( "< Stopping criterion: %s >" % criterion ) + + opt_details = {} + opt_details['residual'] = 0.5*res_norm**2 + opt_details['regterm'] = reg_term_x + opt_details['cost_function'] = curr_obj + opt_details['abs_cost'] = abs_obj + opt_details['rel_cost'] = rel_obj + opt_details['abs_x'] = abs_x + opt_details['rel _x'] = rel_x + opt_details['iterations'] = iter + opt_details['stopping_criterion'] = criterion + + return x, opt_details diff --git a/commit/trk2dictionary/trk2dictionary.pyx b/commit/trk2dictionary/trk2dictionary.pyx index 181222d3..9a0b6099 100755 --- a/commit/trk2dictionary/trk2dictionary.pyx +++ b/commit/trk2dictionary/trk2dictionary.pyx @@ -1,430 +1,430 @@ 
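Stripped of logging and bookkeeping, the algorithm implemented by fista() above boils down to a proximal-gradient loop with backtracking and a momentum term; a compact NumPy sketch (illustrative only; here A and At are plain callables rather than the operator objects used by COMMIT, and prox returns a new array instead of updating in place):

import numpy as np

def fista_sketch(y, A, At, prox, omega, max_iter=100, tol_fun=1e-4):
    # Minimise 0.5*||A(x)-y||^2 + Omega(x) with FISTA and backtracking
    x_prev = np.zeros_like(At(y))
    xhat, t_old = x_prev.copy(), 1.0
    grad = At(A(xhat) - y)
    L = (np.linalg.norm(A(grad)) / np.linalg.norm(grad))**2    # crude Lipschitz estimate
    mu, beta, obj_prev = 1.9 / L, 0.9, np.inf
    for _ in range(max_iter):
        x = prox(xhat - mu * grad)
        # backtracking: shrink the step until the quadratic model majorises the data term
        f_hat = 0.5 * np.linalg.norm(A(xhat) - y)**2
        while (0.5 * np.linalg.norm(A(x) - y)**2 >
               f_hat + grad @ (x - xhat) + 0.5 / mu * np.linalg.norm(x - xhat)**2):
            mu *= beta
            x = prox(xhat - mu * grad)
        obj = 0.5 * np.linalg.norm(A(x) - y)**2 + omega(x)
        if abs(obj - obj_prev) < tol_fun * (obj + 1e-16):      # relative decrease test
            break
        t = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t_old**2))
        xhat = x + (t_old - 1.0) / t * (x - x_prev)            # momentum step
        grad, x_prev, t_old, obj_prev = At(A(xhat) - y), x, t, obj
    return x

The real implementation additionally checks absolute tolerances on the objective and the unknown, and reports the stopping criterion and final residual in opt_details.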
-#!python -# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False -from __future__ import print_function -import cython -import numpy as np -cimport numpy as np -import nibabel -from os.path import join, exists, splitext, dirname, isdir -from os import makedirs, remove -import time -import amico -import pickle -from amico.util import LOG, NOTE, WARNING, ERROR -from pkg_resources import get_distribution - - -# Interface to actual C code -cdef extern from "trk2dictionary_c.cpp": - int trk2dictionary( - char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, - int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* prtHashTable - ) nogil - - -cpdef run( filename_tractogram=None, path_out=None, filename_peaks=None, filename_mask=None, do_intersect=True, - fiber_shift=0, min_seg_len=1e-3, min_fiber_len=0.0, max_fiber_len=250.0, points_to_skip=0, - vf_THR=0.1, peaks_use_affine=False, flip_peaks=[False,False,False], - blur_radii=[], blur_samples=[], blur_sigma=0.0, - filename_trk=None, gen_trk=None, TCK_ref_image=None, ndirs=32761 - ): - """Perform the conversion of a tractoram to the sparse data-structure internally - used by COMMIT to perform the matrix-vector multiplications with the operator A - during the inversion of the linear system. - - Parameters - ---------- - filename_tractogram : string - Path to the tractogram (.trk or .tck) containing the streamlines to load. - - TCK_ref_image: string - When loading a .tck tractogram, path to the NIFTI file containing the information about - the geometry to be used for the tractogram to load. If not specified, it will try to use - the information from filename_peaks or filename_mask. - - path_out : string - Path to the folder for storing the sparse data structure. If not specified (default), - a folder name "COMMIT" will be created in the same folder of the tractogram. - - filename_mask : string - Path to a binary mask for restricting the analysis to specific areas. - Segments outside this mask are discarded. If not specified (default), - the mask is created from all voxels intersected by the tracts. - - do_intersect : boolean - If True then fiber segments that intersect voxel boundaries are splitted (default). - If False then the centroid of the segment is used as its voxel position. - - fiber_shift : float or list of three float - If necessary, apply a translation to fiber coordinates (default : 0) to account - for differences between the reference system of the tracking algorithm and COMMIT. - The value is specified in voxel units, eg 0.5 translates by half voxel. - - min_seg_len : float - Discard segments <= than this length in mm (default : 1e-3). - - min_fiber_len : float - Discard streamlines <= than this length in mm (default : 0.0). - - max_fiber_len : float - Discard streamlines >= than this length in mm (default : 250.0). - - points_to_skip : integer - If necessary, discard first points at beginning/end of a fiber (default : 0). 
- - filename_peaks : string - Path to the NIFTI file containing the peaks to use as extra-cellular contributions. - The data matrix should be 4D with last dimension 3*N, where N is the number - of peaks in each voxel. (default : no extra-cellular contributions). - - peaks_use_affine : boolean - Whether to rotate the peaks according to the affine matrix (default : False). - - vf_THR : float - Discard peaks smaller than vf_THR * max peak (default : 0.1). - - flip_peaks : list of three boolean - If necessary, flips peak orientations along each axis (default : no flipping). - - blur_radii : list of float - Translate each segment to given radii to assign a broader fiber contribution (default : []). - - blur_samples : list of integer - Segments are duplicated along a circle at a given radius; this parameter controls the - number of samples to take over a given circle (defaut : []). - - blur_sigma: float - The contributions of the segments at different radii are damped as a Gaussian (default : 0.0). - - ndirs : int - Number of orientations on the sphere used to discretize the orientation of each - each segment in a streamline (default : 32761). - - filename_trk : string - DEPRECATED. Use filename_tractogram instead. - - gen_trk : string - DEPRECATED. No tractogram will be saved any more, but the returned coefficients will account - for the streamlines that were pre-filtered in this function. - """ - - # check the value of ndirs - if not amico.lut.is_valid(ndirs): - ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) - - # check conflicts of fiber_shift - if np.isscalar(fiber_shift) : - fiber_shiftX = fiber_shift - fiber_shiftY = fiber_shift - fiber_shiftZ = fiber_shift - elif len(fiber_shift) == 3 : - fiber_shiftX = fiber_shift[0] - fiber_shiftY = fiber_shift[1] - fiber_shiftZ = fiber_shift[2] - else : - ERROR( '"fiber_shift" must be a scalar or a vector with 3 elements' ) - - # check for invalid parameters in the blur - if type(blur_radii)==list: - blur_radii = np.ndarray(blur_radii, np.double) - if type(blur_samples)==list: - blur_samples = np.ndarray(blur_samples, np.int32) - - if blur_sigma > 0 : - if blur_radii.size != blur_samples.size : - ERROR( 'The number of blur radii and blur samples must match' ) - - if np.count_nonzero( blur_radii<=0 ): - ERROR( 'A blur radius was <= 0; only positive radii can be used' ) - - if np.count_nonzero( blur_samples<1 ): - ERROR( 'Please specify at least 1 sample per blur radius' ) - - tic = time.time() - LOG( '\n-> Creating the dictionary from tractogram:' ) - - LOG( '\n * Configuration:' ) - print( '\t- Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) - print( '\t- Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) - print( '\t- Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) - print( '\t- Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) - print( '\t- Points to skip = %d' % points_to_skip ) - if min_seg_len >= 1e-3: - print( '\t- Min segment len = %.3f mm' % min_seg_len ) - else: - print( '\t- Min segment len = %.2e mm' % min_seg_len ) - print( '\t- Min fiber len = %.2f mm' % min_fiber_len ) - print( '\t- Max fiber len = %.2f mm' % max_fiber_len ) - - # check blur params - cdef : - double [:] blurRadii - int [:] blurSamples - double [:] blurWeights - double* ptrBlurRadii - int* ptrBlurSamples - double* ptrBlurWeights - int 
nBlurRadii - float [:] ArrayInvM - float* ptrArrayInvM - - # convert to numpy arrays (and add fake radius for original segment) - if blur_sigma == 0: - nBlurRadii = 1 - blurRadii = np.array( [0.0], np.double ) - blurSamples = np.array( [1], np.int32 ) - blurWeights = np.array( [1], np.double ) - else: - nBlurRadii = len(blur_radii)+1 - blurRadii = np.insert( blur_radii, 0, 0.0 ).astype(np.double) - blurSamples = np.insert( blur_samples, 0, 1 ).astype(np.int32) - - # compute weights for gaussian damping - blurWeights = np.empty_like( blurRadii ) - for i in xrange(nBlurRadii): - blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) - - if nBlurRadii == 1 : - print( '\t- Do not blur fibers' ) - else : - print( '\t- Blur fibers:' ) - print( '\t\t- sigma = %.3f' % blur_sigma ) - print( '\t\t- radii = [ ', end="" ) - for i in xrange( 1, blurRadii.size ) : - print( '%.3f ' % blurRadii[i], end="" ) - print( ']' ) - print( '\t\t- weights = [ ', end="" ) - for i in xrange( 1, blurWeights.size ) : - print( '%.3f ' % blurWeights[i], end="" ) - print( ']' ) - print( '\t\t- samples = [ ', end="" ) - for i in xrange( 1, blurSamples.size ) : - print( '%5d ' % blurSamples[i], end="" ) - print( ']' ) - - ptrBlurRadii = &blurRadii[0] - ptrBlurSamples = &blurSamples[0] - ptrBlurWeights = &blurWeights[0] - - if min_seg_len < 0 : - ERROR( '"min_seg_len" must be >= 0' ) - if min_fiber_len < 0 : - ERROR( '"min_fiber_len" must be >= 0' ) - if max_fiber_len < min_fiber_len : - ERROR( '"max_fiber_len" must be >= "min_fiber_len"' ) - - if filename_trk is None and filename_tractogram is None: - ERROR( '"filename_tractogram" not defined' ) - - if filename_trk is not None and filename_tractogram is not None: - WARNING('"filename_trk" will not be considered, "filename_tractogram" will be used') - - if filename_trk is not None and filename_tractogram is None: - filename_tractogram = filename_trk - WARNING('"filename_trk" parameter is deprecated, use "filename_tractogram" instead') - - if path_out is None: - path_out = dirname(filename_tractogram) - if path_out == '': - path_out = '.' 
- if not isdir(path_out): - ERROR( '"path_out" cannot be inferred from "filename_tractogram"' ) - path_out = join(path_out,'COMMIT') - - if gen_trk is not None: - WARNING('"gen_trk" parameter is deprecated') - - # create output path - print( '\t- Output written to "%s"' % path_out ) - if not exists( path_out ): - makedirs( path_out ) - - # Load data from files - LOG( '\n * Loading data:' ) - cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs) - cdef short* ptrHashTable = &htable[0] - - # Streamlines from tractogram - print( '\t- Tractogram' ) - - extension = splitext(filename_tractogram)[1] - if extension != ".trk" and extension != ".tck": - ERROR( 'Invalid input file: only .trk and .tck are supported' ) - try : - hdr = nibabel.streamlines.load( filename_tractogram, lazy_load=True ).header - except : - ERROR( 'Tractogram file not found' ) - - if extension == ".trk": - Nx = hdr['dimensions'][0] - Ny = hdr['dimensions'][1] - Nz = hdr['dimensions'][2] - Px = hdr['voxel_sizes'][0] - Py = hdr['voxel_sizes'][1] - Pz = hdr['voxel_sizes'][2] - - data_offset = 1000 - n_count = hdr['nb_streamlines'] - n_scalars = hdr['nb_scalars_per_point'] - n_properties = hdr['nb_properties_per_streamline'] - - if extension == ".tck": - if TCK_ref_image is None: - if filename_peaks is not None: - TCK_ref_image = filename_peaks - elif filename_mask is not None: - TCK_ref_image = filename_mask - else: - ERROR( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that' ) - - print ('\t\t- geometry taken from "%s"' %TCK_ref_image) - - nii_image = nibabel.load(TCK_ref_image) - nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header() - Nx = nii_image.shape[0] - Ny = nii_image.shape[1] - Nz = nii_image.shape[2] - Px = nii_hdr['pixdim'][1] - Py = nii_hdr['pixdim'][2] - Pz = nii_hdr['pixdim'][3] - data_offset = int(hdr['_offset_data']) #set offset - n_count = int(hdr['count']) #set number of fibers - n_scalars = 0 - n_properties = 0 - - print( '\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) ) - print( '\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) ) - print( '\t\t- %d fibers' % n_count ) - if Nx >= 2**16 or Nz >= 2**16 or Nz >= 2**16 : - ERROR( 'The max dim size is 2^16 voxels' ) - - # get the affine matrix - if extension == ".tck": - scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz])) - M = nii_hdr.get_best_affine() - - # Affine matrix without scaling, i.e. 
diagonal is 1 - M[:3, :3] = np.dot(scaleMat, M[:3, :3]) - M = M.astype('= '2.0.0' else niiMASK.get_header() - print( '\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) ) - print( '\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) ) - if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or - abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) : - WARNING( 'Dataset does not have the same geometry as the tractogram' ) - niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) ) - ptrMASK = &niiMASK_img[0,0,0] - else : - print( '\t- No mask specified to filter IC compartments' ) - ptrMASK = NULL - - # peaks file for EC contributions - cdef float* ptrPEAKS - cdef float [:, :, :, ::1] niiPEAKS_img - cdef int Np - cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) ) - cdef float* ptrTDI = &niiTDI_img[0,0,0] - cdef double [:, ::1] affine - cdef double* ptrAFFINE - if filename_peaks is not None : - print( '\t- EC orientations' ) - niiPEAKS = nibabel.load( filename_peaks ) - niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header() - print( '\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) ) - print( '\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) ) - print( '\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR ) - print( '\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) ) - print( '\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) ) - if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or - abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) : - WARNING( "Dataset does not have the same geometry as the tractogram" ) - if niiPEAKS.shape[3] % 3 : - ERROR( 'PEAKS dataset must have 3*k volumes' ) - if vf_THR < 0 or vf_THR > 1 : - ERROR( '"vf_THR" must be between 0 and 1' ) - niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) ) - ptrPEAKS = &niiPEAKS_img[0,0,0,0] - Np = niiPEAKS.shape[3]/3 - - # affine matrix to rotate gradien directions (if required) - if peaks_use_affine : - affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T ) - else : - affine = np.ascontiguousarray( np.eye(3) ) - ptrAFFINE = &affine[0,0] - else : - print( '\t- No dataset specified for EC compartments' ) - Np = 0 - ptrPEAKS = NULL - ptrAFFINE = NULL - - # write dictionary information info file - dictionary_info = {} - dictionary_info['filename_tractogram'] = filename_tractogram - dictionary_info['TCK_ref_image'] = TCK_ref_image - dictionary_info['path_out'] = path_out - dictionary_info['filename_peaks'] = filename_peaks - dictionary_info['filename_mask'] = filename_mask - dictionary_info['do_intersect'] = do_intersect - dictionary_info['fiber_shift'] = fiber_shift - dictionary_info['min_seg_len'] = min_seg_len - dictionary_info['min_fiber_len'] = min_fiber_len - dictionary_info['max_fiber_len'] = max_fiber_len - dictionary_info['points_to_skip'] = points_to_skip - dictionary_info['vf_THR'] = vf_THR - dictionary_info['peaks_use_affine'] = peaks_use_affine - dictionary_info['flip_peaks'] = flip_peaks - dictionary_info['blur_radii'] = blur_radii - 
dictionary_info['blur_samples'] = blur_samples - dictionary_info['blur_sigma'] = blur_sigma - dictionary_info['ndirs'] = ndirs - with open( join(path_out,'dictionary_info.pickle'), 'wb+' ) as dictionary_info_file: - pickle.dump(dictionary_info, dictionary_info_file, protocol=2) - - # calling actual C code - ret = trk2dictionary( filename_tractogram, data_offset, - Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, - fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, min_fiber_len, max_fiber_len, - ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, - ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, - nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); - if ret == 0 : - WARNING( 'DICTIONARY not generated' ) - return None - - # save TDI and MASK maps - if filename_mask is not None : - affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() - elif filename_peaks is not None : - affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() - else : - affine = np.diag( [Px, Py, Pz, 1] ) - - niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) - nii_hdr = niiTDI.header if nibabel.__version__ >= '2.0.0' else niiTDI.get_header() - nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version - nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) - - if filename_mask is not None : - niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) - else : - niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) - nii_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() - nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version - nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) - +#!python +# cython: language_level=3, c_string_type=str, c_string_encoding=ascii, boundscheck=False, wraparound=False, profile=False +from __future__ import print_function +import cython +import numpy as np +cimport numpy as np +import nibabel +from os.path import join, exists, splitext, dirname, isdir +from os import makedirs, remove +import time +import amico +import pickle +from amico.util import LOG, NOTE, WARNING, ERROR +from pkg_resources import get_distribution + + +# Interface to actual C code +cdef extern from "trk2dictionary_c.cpp": + int trk2dictionary( + char* filename_tractogram, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, + int n_properties, float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, + float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, + float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, + int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* prtHashTable + ) nogil + + +cpdef run( filename_tractogram=None, path_out=None, filename_peaks=None, filename_mask=None, do_intersect=True, + fiber_shift=0, min_seg_len=1e-3, min_fiber_len=0.0, max_fiber_len=250.0, points_to_skip=0, + vf_THR=0.1, peaks_use_affine=False, flip_peaks=[False,False,False], + blur_radii=[], blur_samples=[], blur_sigma=0.0, + filename_trk=None, gen_trk=None, TCK_ref_image=None, ndirs=32761 + ): + """Perform the conversion of a 
tractoram to the sparse data-structure internally + used by COMMIT to perform the matrix-vector multiplications with the operator A + during the inversion of the linear system. + + Parameters + ---------- + filename_tractogram : string + Path to the tractogram (.trk or .tck) containing the streamlines to load. + + TCK_ref_image: string + When loading a .tck tractogram, path to the NIFTI file containing the information about + the geometry to be used for the tractogram to load. If not specified, it will try to use + the information from filename_peaks or filename_mask. + + path_out : string + Path to the folder for storing the sparse data structure. If not specified (default), + a folder name "COMMIT" will be created in the same folder of the tractogram. + + filename_mask : string + Path to a binary mask for restricting the analysis to specific areas. + Segments outside this mask are discarded. If not specified (default), + the mask is created from all voxels intersected by the tracts. + + do_intersect : boolean + If True then fiber segments that intersect voxel boundaries are splitted (default). + If False then the centroid of the segment is used as its voxel position. + + fiber_shift : float or list of three float + If necessary, apply a translation to fiber coordinates (default : 0) to account + for differences between the reference system of the tracking algorithm and COMMIT. + The value is specified in voxel units, eg 0.5 translates by half voxel. + + min_seg_len : float + Discard segments <= than this length in mm (default : 1e-3). + + min_fiber_len : float + Discard streamlines <= than this length in mm (default : 0.0). + + max_fiber_len : float + Discard streamlines >= than this length in mm (default : 250.0). + + points_to_skip : integer + If necessary, discard first points at beginning/end of a fiber (default : 0). + + filename_peaks : string + Path to the NIFTI file containing the peaks to use as extra-cellular contributions. + The data matrix should be 4D with last dimension 3*N, where N is the number + of peaks in each voxel. (default : no extra-cellular contributions). + + peaks_use_affine : boolean + Whether to rotate the peaks according to the affine matrix (default : False). + + vf_THR : float + Discard peaks smaller than vf_THR * max peak (default : 0.1). + + flip_peaks : list of three boolean + If necessary, flips peak orientations along each axis (default : no flipping). + + blur_radii : list of float + Translate each segment to given radii to assign a broader fiber contribution (default : []). + + blur_samples : list of integer + Segments are duplicated along a circle at a given radius; this parameter controls the + number of samples to take over a given circle (defaut : []). + + blur_sigma: float + The contributions of the segments at different radii are damped as a Gaussian (default : 0.0). + + ndirs : int + Number of orientations on the sphere used to discretize the orientation of each + each segment in a streamline (default : 32761). + + filename_trk : string + DEPRECATED. Use filename_tractogram instead. + + gen_trk : string + DEPRECATED. No tractogram will be saved any more, but the returned coefficients will account + for the streamlines that were pre-filtered in this function. 
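For orientation (illustrative only; every path below is a hypothetical placeholder), a typical call to the function documented above would be:

from commit import trk2dictionary

trk2dictionary.run(
    filename_tractogram='subject01/fibers.trk',   # input tractogram (placeholder path)
    filename_peaks='subject01/peaks.nii.gz',      # EC peaks (placeholder path)
    filename_mask='subject01/wm_mask.nii.gz',     # WM mask (placeholder path)
    fiber_shift=0.5,                              # half-voxel shift, in voxel-size units
    ndirs=32761
)

As the docstring states, the sparse structure is written to a "COMMIT" folder next to the tractogram unless path_out is specified.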
+ """ + + # check the value of ndirs + if not amico.lut.is_valid(ndirs): + ERROR( 'Unsupported value for ndirs.\nNote: Supported values for ndirs are [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 32761 (default)]' ) + + # check conflicts of fiber_shift + if np.isscalar(fiber_shift) : + fiber_shiftX = fiber_shift + fiber_shiftY = fiber_shift + fiber_shiftZ = fiber_shift + elif len(fiber_shift) == 3 : + fiber_shiftX = fiber_shift[0] + fiber_shiftY = fiber_shift[1] + fiber_shiftZ = fiber_shift[2] + else : + ERROR( '"fiber_shift" must be a scalar or a vector with 3 elements' ) + + # check for invalid parameters in the blur + if type(blur_radii)==list: + blur_radii = np.ndarray(blur_radii, np.double) + if type(blur_samples)==list: + blur_samples = np.ndarray(blur_samples, np.int32) + + if blur_sigma > 0 : + if blur_radii.size != blur_samples.size : + ERROR( 'The number of blur radii and blur samples must match' ) + + if np.count_nonzero( blur_radii<=0 ): + ERROR( 'A blur radius was <= 0; only positive radii can be used' ) + + if np.count_nonzero( blur_samples<1 ): + ERROR( 'Please specify at least 1 sample per blur radius' ) + + tic = time.time() + LOG( '\n-> Creating the dictionary from tractogram:' ) + + LOG( '\n * Configuration:' ) + print( '\t- Segment position = %s' % ( 'COMPUTE INTERSECTIONS' if do_intersect else 'CENTROID' ) ) + print( '\t- Fiber shift X = %.3f (voxel-size units)' % fiber_shiftX ) + print( '\t- Fiber shift Y = %.3f (voxel-size units)' % fiber_shiftY ) + print( '\t- Fiber shift Z = %.3f (voxel-size units)' % fiber_shiftZ ) + print( '\t- Points to skip = %d' % points_to_skip ) + if min_seg_len >= 1e-3: + print( '\t- Min segment len = %.3f mm' % min_seg_len ) + else: + print( '\t- Min segment len = %.2e mm' % min_seg_len ) + print( '\t- Min fiber len = %.2f mm' % min_fiber_len ) + print( '\t- Max fiber len = %.2f mm' % max_fiber_len ) + + # check blur params + cdef : + double [:] blurRadii + int [:] blurSamples + double [:] blurWeights + double* ptrBlurRadii + int* ptrBlurSamples + double* ptrBlurWeights + int nBlurRadii + float [:] ArrayInvM + float* ptrArrayInvM + + # convert to numpy arrays (and add fake radius for original segment) + if blur_sigma == 0: + nBlurRadii = 1 + blurRadii = np.array( [0.0], np.double ) + blurSamples = np.array( [1], np.int32 ) + blurWeights = np.array( [1], np.double ) + else: + nBlurRadii = len(blur_radii)+1 + blurRadii = np.insert( blur_radii, 0, 0.0 ).astype(np.double) + blurSamples = np.insert( blur_samples, 0, 1 ).astype(np.int32) + + # compute weights for gaussian damping + blurWeights = np.empty_like( blurRadii ) + for i in xrange(nBlurRadii): + blurWeights[i] = np.exp( -blurRadii[i]**2 / (2.0*blur_sigma**2) ) + + if nBlurRadii == 1 : + print( '\t- Do not blur fibers' ) + else : + print( '\t- Blur fibers:' ) + print( '\t\t- sigma = %.3f' % blur_sigma ) + print( '\t\t- radii = [ ', end="" ) + for i in xrange( 1, blurRadii.size ) : + print( '%.3f ' % blurRadii[i], end="" ) + print( ']' ) + print( '\t\t- weights = [ ', end="" ) + for i in xrange( 1, blurWeights.size ) : + print( '%.3f ' % blurWeights[i], end="" ) + print( ']' ) + print( '\t\t- samples = [ ', end="" ) + for i in xrange( 1, blurSamples.size ) : + print( '%5d ' % blurSamples[i], end="" ) + print( ']' ) + + ptrBlurRadii = &blurRadii[0] + ptrBlurSamples = &blurSamples[0] + ptrBlurWeights = &blurWeights[0] + + if min_seg_len < 0 : + ERROR( '"min_seg_len" must be >= 0' ) + if min_fiber_len < 0 : + ERROR( 
+
+    if min_seg_len < 0 :
+        ERROR( '"min_seg_len" must be >= 0' )
+    if min_fiber_len < 0 :
+        ERROR( '"min_fiber_len" must be >= 0' )
+    if max_fiber_len < min_fiber_len :
+        ERROR( '"max_fiber_len" must be >= "min_fiber_len"' )
+
+    if filename_trk is None and filename_tractogram is None:
+        ERROR( '"filename_tractogram" not defined' )
+
+    if filename_trk is not None and filename_tractogram is not None:
+        WARNING('"filename_trk" will not be considered, "filename_tractogram" will be used')
+
+    if filename_trk is not None and filename_tractogram is None:
+        filename_tractogram = filename_trk
+        WARNING('"filename_trk" parameter is deprecated, use "filename_tractogram" instead')
+
+    if path_out is None:
+        path_out = dirname(filename_tractogram)
+        if path_out == '':
+            path_out = '.'
+        if not isdir(path_out):
+            ERROR( '"path_out" cannot be inferred from "filename_tractogram"' )
+        path_out = join(path_out,'COMMIT')
+
+    if gen_trk is not None:
+        WARNING('"gen_trk" parameter is deprecated')
+
+    # create output path
+    print( '\t- Output written to "%s"' % path_out )
+    if not exists( path_out ):
+        makedirs( path_out )
+
+    # Load data from files
+    LOG( '\n * Loading data:' )
+    cdef short [:] htable = amico.lut.load_precomputed_hash_table(ndirs)
+    cdef short* ptrHashTable = &htable[0]
+
+    # Streamlines from tractogram
+    print( '\t- Tractogram' )
+
+    extension = splitext(filename_tractogram)[1]
+    if extension != ".trk" and extension != ".tck":
+        ERROR( 'Invalid input file: only .trk and .tck are supported' )
+    try :
+        hdr = nibabel.streamlines.load( filename_tractogram, lazy_load=True ).header
+    except :
+        ERROR( 'Tractogram file not found' )
+
+    if extension == ".trk":
+        Nx = hdr['dimensions'][0]
+        Ny = hdr['dimensions'][1]
+        Nz = hdr['dimensions'][2]
+        Px = hdr['voxel_sizes'][0]
+        Py = hdr['voxel_sizes'][1]
+        Pz = hdr['voxel_sizes'][2]
+
+        data_offset = 1000
+        n_count = hdr['nb_streamlines']
+        n_scalars = hdr['nb_scalars_per_point']
+        n_properties = hdr['nb_properties_per_streamline']
+
+    if extension == ".tck":
+        if TCK_ref_image is None:
+            if filename_peaks is not None:
+                TCK_ref_image = filename_peaks
+            elif filename_mask is not None:
+                TCK_ref_image = filename_mask
+            else:
+                ERROR( 'TCK files do not contain information about the geometry. Use "TCK_ref_image" for that' )
+
+        print( '\t\t- geometry taken from "%s"' % TCK_ref_image )
+
+        nii_image = nibabel.load(TCK_ref_image)
+        nii_hdr = nii_image.header if nibabel.__version__ >= '2.0.0' else nii_image.get_header()
+        Nx = nii_image.shape[0]
+        Ny = nii_image.shape[1]
+        Nz = nii_image.shape[2]
+        Px = nii_hdr['pixdim'][1]
+        Py = nii_hdr['pixdim'][2]
+        Pz = nii_hdr['pixdim'][3]
+        data_offset = int(hdr['_offset_data'])  # set offset
+        n_count = int(hdr['count'])             # set number of fibers
+        n_scalars = 0
+        n_properties = 0
+
+    print( '\t\t- %d x %d x %d' % ( Nx, Ny, Nz ) )
+    print( '\t\t- %.4f x %.4f x %.4f' % ( Px, Py, Pz ) )
+    print( '\t\t- %d fibers' % n_count )
+    if Nx >= 2**16 or Ny >= 2**16 or Nz >= 2**16 :
+        ERROR( 'The max dim size is 2^16 voxels' )
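The header fields read above differ between the two supported formats; a short nibabel snippet (standard nibabel streamlines API, file names are placeholders) can be used to inspect what a given tractogram actually stores before running the conversion:

    import nibabel

    # lazy_load=True reads only the header, not the streamlines themselves
    hdr = nibabel.streamlines.load('tracts.trk', lazy_load=True).header
    print(hdr['dimensions'], hdr['voxel_sizes'], hdr['nb_streamlines'])

    hdr = nibabel.streamlines.load('tracts.tck', lazy_load=True).header
    print(hdr['count'])    # .tck stores no geometry, which is why TCK_ref_image is needed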
+
+    # get the affine matrix
+    if extension == ".tck":
+        scaleMat = np.diag(np.divide(1.0, [Px,Py,Pz]))
+        M = nii_hdr.get_best_affine()
+
+        # Affine matrix without scaling, i.e. diagonal is 1
+        M[:3, :3] = np.dot(scaleMat, M[:3, :3])
+        M = M.astype('<f4')
+
+        # inverse of the affine matrix, used to bring streamline coordinates back to voxel space
+        ArrayInvM = np.ravel(np.linalg.inv(M))
+        ptrArrayInvM = &ArrayInvM[0]
+
+    # binary mask to restrict the analysis to specific areas
+    cdef float* ptrMASK
+    cdef float [:, :, ::1] niiMASK_img
+    if filename_mask is not None :
+        print( '\t- Filtering mask' )
+        niiMASK = nibabel.load( filename_mask )
+        niiMASK_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header()
+        print( '\t\t- %d x %d x %d' % ( niiMASK.shape[0], niiMASK.shape[1], niiMASK.shape[2] ) )
+        print( '\t\t- %.4f x %.4f x %.4f' % ( niiMASK_hdr['pixdim'][1], niiMASK_hdr['pixdim'][2], niiMASK_hdr['pixdim'][3] ) )
+        if ( Nx!=niiMASK.shape[0] or Ny!=niiMASK.shape[1] or Nz!=niiMASK.shape[2] or
+             abs(Px-niiMASK_hdr['pixdim'][1])>1e-3 or abs(Py-niiMASK_hdr['pixdim'][2])>1e-3 or abs(Pz-niiMASK_hdr['pixdim'][3])>1e-3 ) :
+            WARNING( 'Dataset does not have the same geometry as the tractogram' )
+        niiMASK_img = np.ascontiguousarray( niiMASK.get_data().astype(np.float32) )
+        ptrMASK = &niiMASK_img[0,0,0]
+    else :
+        print( '\t- No mask specified to filter IC compartments' )
+        ptrMASK = NULL
+
+    # peaks file for EC contributions
+    cdef float* ptrPEAKS
+    cdef float [:, :, :, ::1] niiPEAKS_img
+    cdef int Np
+    cdef float [:, :, ::1] niiTDI_img = np.ascontiguousarray( np.zeros((Nx,Ny,Nz),dtype=np.float32) )
+    cdef float* ptrTDI = &niiTDI_img[0,0,0]
+    cdef double [:, ::1] affine
+    cdef double* ptrAFFINE
+    if filename_peaks is not None :
+        print( '\t- EC orientations' )
+        niiPEAKS = nibabel.load( filename_peaks )
+        niiPEAKS_hdr = niiPEAKS.header if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_header()
+        print( '\t\t- %d x %d x %d x %d' % ( niiPEAKS.shape[0], niiPEAKS.shape[1], niiPEAKS.shape[2], niiPEAKS.shape[3] ) )
+        print( '\t\t- %.4f x %.4f x %.4f' % ( niiPEAKS_hdr['pixdim'][1], niiPEAKS_hdr['pixdim'][2], niiPEAKS_hdr['pixdim'][3] ) )
+        print( '\t\t- ignoring peaks < %.2f * MaxPeak' % vf_THR )
+        print( '\t\t- %susing affine matrix' % ( "" if peaks_use_affine else "not " ) )
+        print( '\t\t- flipping axes : [ x=%s, y=%s, z=%s ]' % ( flip_peaks[0], flip_peaks[1], flip_peaks[2] ) )
+        if ( Nx!=niiPEAKS.shape[0] or Ny!=niiPEAKS.shape[1] or Nz!=niiPEAKS.shape[2] or
+             abs(Px-niiPEAKS_hdr['pixdim'][1])>1e-3 or abs(Py-niiPEAKS_hdr['pixdim'][2])>1e-3 or abs(Pz-niiPEAKS_hdr['pixdim'][3])>1e-3 ) :
+            WARNING( "Dataset does not have the same geometry as the tractogram" )
+        if niiPEAKS.shape[3] % 3 :
+            ERROR( 'PEAKS dataset must have 3*k volumes' )
+        if vf_THR < 0 or vf_THR > 1 :
+            ERROR( '"vf_THR" must be between 0 and 1' )
+        niiPEAKS_img = np.ascontiguousarray( niiPEAKS.get_data().astype(np.float32) )
+        ptrPEAKS = &niiPEAKS_img[0,0,0,0]
+        Np = niiPEAKS.shape[3]/3
+
+        # affine matrix to rotate gradient directions (if required)
+        if peaks_use_affine :
+            affine = np.ascontiguousarray( niiPEAKS.affine[:3,:3].T )
+        else :
+            affine = np.ascontiguousarray( np.eye(3) )
+        ptrAFFINE = &affine[0,0]
+    else :
+        print( '\t- No dataset specified for EC compartments' )
+        Np = 0
+        ptrPEAKS = NULL
+        ptrAFFINE = NULL
+
+    # write dictionary info file
+    dictionary_info = {}
+    dictionary_info['filename_tractogram'] = filename_tractogram
+    dictionary_info['TCK_ref_image'] = TCK_ref_image
+    dictionary_info['path_out'] = path_out
+    dictionary_info['filename_peaks'] = filename_peaks
+    dictionary_info['filename_mask'] = filename_mask
+    dictionary_info['do_intersect'] = do_intersect
+    dictionary_info['fiber_shift'] = fiber_shift
+    dictionary_info['min_seg_len'] = min_seg_len
+    dictionary_info['min_fiber_len'] = min_fiber_len
+    dictionary_info['max_fiber_len'] = max_fiber_len
+    dictionary_info['points_to_skip'] = points_to_skip
+    dictionary_info['vf_THR'] = vf_THR
+    dictionary_info['peaks_use_affine'] = peaks_use_affine
+    dictionary_info['flip_peaks'] = flip_peaks
+    dictionary_info['blur_radii'] = blur_radii
+
dictionary_info['blur_samples'] = blur_samples + dictionary_info['blur_sigma'] = blur_sigma + dictionary_info['ndirs'] = ndirs + with open( join(path_out,'dictionary_info.pickle'), 'wb+' ) as dictionary_info_file: + pickle.dump(dictionary_info, dictionary_info_file, protocol=2) + + # calling actual C code + ret = trk2dictionary( filename_tractogram, data_offset, + Nx, Ny, Nz, Px, Py, Pz, n_count, n_scalars, n_properties, + fiber_shiftX, fiber_shiftY, fiber_shiftZ, points_to_skip, min_seg_len, min_fiber_len, max_fiber_len, + ptrPEAKS, Np, vf_THR, -1 if flip_peaks[0] else 1, -1 if flip_peaks[1] else 1, -1 if flip_peaks[2] else 1, + ptrMASK, ptrTDI, path_out, 1 if do_intersect else 0, ptrAFFINE, + nBlurRadii, blur_sigma, ptrBlurRadii, ptrBlurSamples, ptrBlurWeights, ptrArrayInvM, ndirs, ptrHashTable ); + if ret == 0 : + WARNING( 'DICTIONARY not generated' ) + return None + + # save TDI and MASK maps + if filename_mask is not None : + affine = niiMASK.affine if nibabel.__version__ >= '2.0.0' else niiMASK.get_affine() + elif filename_peaks is not None : + affine = niiPEAKS.affine if nibabel.__version__ >= '2.0.0' else niiPEAKS.get_affine() + else : + affine = np.diag( [Px, Py, Pz, 1] ) + + niiTDI = nibabel.Nifti1Image( niiTDI_img, affine ) + nii_hdr = niiTDI.header if nibabel.__version__ >= '2.0.0' else niiTDI.get_header() + nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version + nibabel.save( niiTDI, join(path_out,'dictionary_tdi.nii.gz') ) + + if filename_mask is not None : + niiMASK = nibabel.Nifti1Image( niiMASK_img, affine ) + else : + niiMASK = nibabel.Nifti1Image( (np.asarray(niiTDI_img)>0).astype(np.float32), affine ) + nii_hdr = niiMASK.header if nibabel.__version__ >= '2.0.0' else niiMASK.get_header() + nii_hdr['descrip'] = 'Created with COMMIT %s'%get_distribution('dmri-commit').version + nibabel.save( niiMASK, join(path_out,'dictionary_mask.nii.gz') ) + LOG( '\n [ %.1f seconds ]' % ( time.time() - tic ) ) \ No newline at end of file diff --git a/commit/trk2dictionary/trk2dictionary_c.cpp b/commit/trk2dictionary/trk2dictionary_c.cpp index c8991b1c..7a295102 100644 --- a/commit/trk2dictionary/trk2dictionary_c.cpp +++ b/commit/trk2dictionary/trk2dictionary_c.cpp @@ -1,598 +1,598 @@ -#include -#include -#include -#include -#include -#include "Vector.h" -#include "ProgressBar.h" -#include -#include - -#define MAX_FIB_LEN 10000 - - -// CLASS to store the segments of one fiber -class segKey -{ - public: - unsigned short x, y, z; - unsigned short o; - segKey(){} - - void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) - { - x = _x; - y = _y; - z = _z; - o = _o; - } - - bool const operator <(const segKey& seg) const - { - return o < seg.o || (o==seg.o && z FiberSegments; -float FiberLen; - -Vector dim; -Vector pixdim; -float* ptrMASK; -unsigned int nPointsToSkip; -float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; -bool doIntersect; -float minSegLen, minFiberLen, maxFiberLen; - -std::vector radii; // radii for the extrusion -std::vector weights; // damping weight -std::vector sectors; // number of duplicates across the extrusion circle -double radiusSigma; // modulates the impact of each segment as function of radius - - -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); -void segmentForwardModel( const Vector& P1, const 
Vector& P2, int k, double w, short* ptrHashTable ); -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); -unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); - - -// ========================= -// Function called by CYTHON -// ========================= -int trk2dictionary( - char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, - float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, - float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, - float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, - int nBlurRadii, double blurSigma, double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* ptrHashTable -) -{ - /*=========================*/ - /* IC compartments */ - /*=========================*/ - float fiber[3][MAX_FIB_LEN]; - float fiberNorm; - unsigned int N, totICSegments = 0, totFibers = 0, v; - unsigned short o; - unsigned char kept; - Vector P; - std::string filename; - std::string OUTPUT_path(path_out); - std::map::iterator it; - - std::map FiberNorm; - std::map::iterator itNorm; - segInVoxKey inVoxKey; - - printf( "\n \033[0;32m* Exporting IC compartments:\033[0m\n" ); - - int isTRK; // var to check - - char *ext = strrchr(str_filename, '.'); //get the extension of input file - - if (strcmp(ext,".trk")==0) //for .trk file - isTRK = 1; - else if (strcmp(ext,".tck")==0)// for .tck file - isTRK = 0; - else - return 0; - - FILE* fpTractogram = fopen(str_filename,"rb"); //open - if (fpTractogram == NULL) return 0; - fseek(fpTractogram,data_offset,SEEK_SET); //skip header - - // set global variables - dim.Set( Nx, Ny, Nz ); - pixdim.Set( Px, Py, Pz ); - nPointsToSkip = points_to_skip; - fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates - fiberShiftYmm = fiber_shiftY * pixdim.y; - fiberShiftZmm = fiber_shiftZ * pixdim.z; - ptrMASK = _ptrMASK; - doIntersect = c > 0; - minSegLen = min_seg_len; - minFiberLen = min_fiber_len; - maxFiberLen = max_fiber_len; - - radii.clear(); - sectors.clear(); - weights.clear(); - for(int i=0; i 0 ) - { - if ( FiberLen > minFiberLen && FiberLen < maxFiberLen ) - { - // add segments to files - for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) - { - // NB: plese note inverted ordering for 'v' - v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); - o = it->first.o; - fwrite( &totFibers, 4, 1, pDict_IC_f ); - fwrite( &v, 4, 1, pDict_IC_v ); - fwrite( &o, 2, 1, pDict_IC_o ); - fwrite( &(it->second), 4, 1, pDict_IC_len ); - ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; - inVoxKey.set( it->first.x, it->first.y, it->first.z ); - FiberNorm[inVoxKey] += it->second; - } - for (fiberNorm=0, itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) - fiberNorm += pow(itNorm->second,2); - fiberNorm = sqrt(fiberNorm); - FiberNorm.clear(); - fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // actual length considered in optimization - fwrite( &FiberLen, 1, 4, pDict_TRK_len ); - totICSegments += FiberSegments.size(); - totFibers++; - kept = 1; - } - } - fwrite( &kept, 1, 1, pDict_TRK_kept ); - } - PROGRESS.close(); - - fclose( fpTractogram ); - fclose( pDict_TRK_norm ); - fclose( pDict_IC_f ); - fclose( pDict_IC_v ); - fclose( pDict_IC_o ); - fclose( 
pDict_IC_len ); - fclose( pDict_TRK_len ); - fclose( pDict_TRK_kept ); - - printf(" [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); - - - /*=========================*/ - /* EC compartments */ - /*=========================*/ - unsigned int totECSegments = 0, totECVoxels = 0; - - printf( "\n \033[0;32m* Exporting EC compartments:\033[0m\n" ); - - filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); - filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); - - if ( ptrPEAKS != NULL ) - { - Vector dir; - double longitude, colatitude; - segKey ec_seg; - int ix, iy, iz, id, atLeastOne; - float peakMax; - float norms[ Np ]; - float *ptr; - int ox, oy; - - PROGRESS.reset( dim.z ); - for(iz=0; iz peakMax ) - peakMax = norms[id]; - } - - if ( peakMax > 0 ) - { - ec_seg.x = ix; - ec_seg.y = iy; - ec_seg.z = iz; - atLeastOne = 0; - for(id=0; id0 ) - totECVoxels++; - } - } - } - PROGRESS.close(); - } - - fclose( pDict_EC_v ); - fclose( pDict_EC_o ); - - printf(" [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); - - return 1; -} - - -/********************************************************************************************************************/ -/* fiberForwardModel */ -/********************************************************************************************************************/ -void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) -{ - static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; - static Vector vox, vmin, vmax, dir; - static double len, t, alpha, w, R; - static int i, j, k; - - FiberLen = 0.0; - FiberSegments.clear(); - if ( pts <= 2*nPointsToSkip ) - return; - - for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, int k, double w, short* ptrHashTable ) -{ - static Vector vox; - static Vector dir, dirTrue; - static double longitude, colatitude, len; - static segKey key; - static int ox, oy; - - // direction of the segment - dir.y = P2.y-P1.y; - if ( dir.y >= 0 ) - { - dir.x = P2.x-P1.x; - dir.z = P2.z-P1.z; - } - else - { - dir.x = P1.x-P2.x; - dir.y = P1.y-P2.y; - dir.z = P1.z-P2.z; - } - - // length of the segment - len = dir.norm(); - if ( len <= minSegLen ) - return; - dir.Normalize(); - - // voxel of the segment is the centroid - vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); - vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); - vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); - if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) - return; - if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) - return; - - // add the segment to the data structure - longitude = atan2(dir.y, dir.x); - colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); - ox = (int)round(colatitude/M_PI*180.0); // theta // i1 - oy = (int)round(longitude/M_PI*180.0); // phi // i2 - key.set( vox.x, vox.y, vox.z, (unsigned short) ptrHashTable[ox*181 + oy] ); - FiberSegments[key] += w * len; - if ( k==0 ) // fiber length computed only from origianl segments - FiberLen += len; -} - - -/********************************************************************************************************************/ -/* rayBoxIntersection */ -/********************************************************************************************************************/ -bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& 
vmax, double & t) -{ - static double tmin, tmax, tymin, tymax, tzmin, tzmax; - static Vector invrd; - - // inverse direction to catch float problems - invrd.x = 1.0 / direction.x; - invrd.y = 1.0 / direction.y; - invrd.z = 1.0 / direction.z; - - if (invrd.x >= 0) - { - tmin = (vmin.x - origin.x) * invrd.x; - tmax = (vmax.x - origin.x) * invrd.x; - } - else - { - tmin = (vmax.x - origin.x) * invrd.x; - tmax = (vmin.x - origin.x) * invrd.x; - } - - if (invrd.y >= 0) - { - tymin = (vmin.y - origin.y) * invrd.y; - tymax = (vmax.y - origin.y) * invrd.y; - } - else - { - tymin = (vmax.y - origin.y) * invrd.y; - tymax = (vmin.y - origin.y) * invrd.y; - } - - if ( (tmin > tymax) || (tymin > tmax) ) return false; - if ( tymin > tmin) tmin = tymin; - if ( tymax < tmax) tmax = tymax; - - if (invrd.z >= 0) - { - tzmin = (vmin.z - origin.z) * invrd.z; - tzmax = (vmax.z - origin.z) * invrd.z; - }else - { - tzmin = (vmax.z - origin.z) * invrd.z; - tzmax = (vmin.z - origin.z) * invrd.z; - } - - if ( (tmin > tzmax) || (tzmin > tmax) ) return false; - if ( tzmin > tmin) tmin = tzmin; - if ( tzmax < tmax) tmax = tzmax; - - // check if values are valid - t = tmin; - if (t <= 0) t = tmax; - - return true; -} - - -// Read a fiber from file .trk -unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) -{ - int N; - fread((char*)&N, 1, 4, fp); - - if ( N >= MAX_FIB_LEN || N <= 0 ) - return 0; - - float tmp[3]; - for(int i=0; i +#include +#include +#include +#include +#include "Vector.h" +#include "ProgressBar.h" +#include +#include + +#define MAX_FIB_LEN 10000 + + +// CLASS to store the segments of one fiber +class segKey +{ + public: + unsigned short x, y, z; + unsigned short o; + segKey(){} + + void set(unsigned short _x, unsigned short _y, unsigned short _z, unsigned short _o) + { + x = _x; + y = _y; + z = _z; + o = _o; + } + + bool const operator <(const segKey& seg) const + { + return o < seg.o || (o==seg.o && z FiberSegments; +float FiberLen; + +Vector dim; +Vector pixdim; +float* ptrMASK; +unsigned int nPointsToSkip; +float fiberShiftXmm, fiberShiftYmm, fiberShiftZmm; +bool doIntersect; +float minSegLen, minFiberLen, maxFiberLen; + +std::vector radii; // radii for the extrusion +std::vector weights; // damping weight +std::vector sectors; // number of duplicates across the extrusion circle +double radiusSigma; // modulates the impact of each segment as function of radius + + +bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t); +void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weight, short* ptrHashTable ); +void segmentForwardModel( const Vector& P1, const Vector& P2, int k, double w, short* ptrHashTable ); +unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ); +unsigned int read_fiberTCK( FILE* fp, float fiber[3][MAX_FIB_LEN] , float affine[4][4]); + + +// ========================= +// Function called by CYTHON +// ========================= +int trk2dictionary( + char* str_filename, int data_offset, int Nx, int Ny, int Nz, float Px, float Py, float Pz, int n_count, int n_scalars, int n_properties, + float fiber_shiftX, float fiber_shiftY, float fiber_shiftZ, int points_to_skip, float min_seg_len, float min_fiber_len, float max_fiber_len, + float* ptrPEAKS, int Np, float vf_THR, int ECix, int ECiy, int ECiz, + float* _ptrMASK, float* ptrTDI, char* path_out, int c, double* ptrPeaksAffine, + int nBlurRadii, double blurSigma, 
double* ptrBlurRadii, int* ptrBlurSamples, double* ptrBlurWeights, float* ptrTractsAffine, unsigned short ndirs, short* ptrHashTable +) +{ + /*=========================*/ + /* IC compartments */ + /*=========================*/ + float fiber[3][MAX_FIB_LEN]; + float fiberNorm; + unsigned int N, totICSegments = 0, totFibers = 0, v; + unsigned short o; + unsigned char kept; + Vector P; + std::string filename; + std::string OUTPUT_path(path_out); + std::map::iterator it; + + std::map FiberNorm; + std::map::iterator itNorm; + segInVoxKey inVoxKey; + + printf( "\n \033[0;32m* Exporting IC compartments:\033[0m\n" ); + + int isTRK; // var to check + + char *ext = strrchr(str_filename, '.'); //get the extension of input file + + if (strcmp(ext,".trk")==0) //for .trk file + isTRK = 1; + else if (strcmp(ext,".tck")==0)// for .tck file + isTRK = 0; + else + return 0; + + FILE* fpTractogram = fopen(str_filename,"rb"); //open + if (fpTractogram == NULL) return 0; + fseek(fpTractogram,data_offset,SEEK_SET); //skip header + + // set global variables + dim.Set( Nx, Ny, Nz ); + pixdim.Set( Px, Py, Pz ); + nPointsToSkip = points_to_skip; + fiberShiftXmm = fiber_shiftX * pixdim.x; // shift in mm for the coordinates + fiberShiftYmm = fiber_shiftY * pixdim.y; + fiberShiftZmm = fiber_shiftZ * pixdim.z; + ptrMASK = _ptrMASK; + doIntersect = c > 0; + minSegLen = min_seg_len; + minFiberLen = min_fiber_len; + maxFiberLen = max_fiber_len; + + radii.clear(); + sectors.clear(); + weights.clear(); + for(int i=0; i 0 ) + { + if ( FiberLen > minFiberLen && FiberLen < maxFiberLen ) + { + // add segments to files + for (it=FiberSegments.begin(); it!=FiberSegments.end(); it++) + { + // NB: plese note inverted ordering for 'v' + v = it->first.x + dim.x * ( it->first.y + dim.y * it->first.z ); + o = it->first.o; + fwrite( &totFibers, 4, 1, pDict_IC_f ); + fwrite( &v, 4, 1, pDict_IC_v ); + fwrite( &o, 2, 1, pDict_IC_o ); + fwrite( &(it->second), 4, 1, pDict_IC_len ); + ptrTDI[ it->first.z + dim.z * ( it->first.y + dim.y * it->first.x ) ] += it->second; + inVoxKey.set( it->first.x, it->first.y, it->first.z ); + FiberNorm[inVoxKey] += it->second; + } + for (fiberNorm=0, itNorm=FiberNorm.begin(); itNorm!=FiberNorm.end(); itNorm++) + fiberNorm += pow(itNorm->second,2); + fiberNorm = sqrt(fiberNorm); + FiberNorm.clear(); + fwrite( &fiberNorm, 1, 4, pDict_TRK_norm ); // actual length considered in optimization + fwrite( &FiberLen, 1, 4, pDict_TRK_len ); + totICSegments += FiberSegments.size(); + totFibers++; + kept = 1; + } + } + fwrite( &kept, 1, 1, pDict_TRK_kept ); + } + PROGRESS.close(); + + fclose( fpTractogram ); + fclose( pDict_TRK_norm ); + fclose( pDict_IC_f ); + fclose( pDict_IC_v ); + fclose( pDict_IC_o ); + fclose( pDict_IC_len ); + fclose( pDict_TRK_len ); + fclose( pDict_TRK_kept ); + + printf(" [ %d fibers kept, %d segments in total ]\n", totFibers, totICSegments ); + + + /*=========================*/ + /* EC compartments */ + /*=========================*/ + unsigned int totECSegments = 0, totECVoxels = 0; + + printf( "\n \033[0;32m* Exporting EC compartments:\033[0m\n" ); + + filename = OUTPUT_path+"/dictionary_EC_v.dict"; FILE* pDict_EC_v = fopen( filename.c_str(), "wb" ); + filename = OUTPUT_path+"/dictionary_EC_o.dict"; FILE* pDict_EC_o = fopen( filename.c_str(), "wb" ); + + if ( ptrPEAKS != NULL ) + { + Vector dir; + double longitude, colatitude; + segKey ec_seg; + int ix, iy, iz, id, atLeastOne; + float peakMax; + float norms[ Np ]; + float *ptr; + int ox, oy; + + PROGRESS.reset( dim.z ); + for(iz=0; iz peakMax ) 
+ peakMax = norms[id]; + } + + if ( peakMax > 0 ) + { + ec_seg.x = ix; + ec_seg.y = iy; + ec_seg.z = iz; + atLeastOne = 0; + for(id=0; id0 ) + totECVoxels++; + } + } + } + PROGRESS.close(); + } + + fclose( pDict_EC_v ); + fclose( pDict_EC_o ); + + printf(" [ %d voxels, %d segments ]\n", totECVoxels, totECSegments ); + + return 1; +} + + +/********************************************************************************************************************/ +/* fiberForwardModel */ +/********************************************************************************************************************/ +void fiberForwardModel( float fiber[3][MAX_FIB_LEN], unsigned int pts, std::vector sectors, std::vector radii, std::vector weights, short* ptrHashTable ) +{ + static Vector S1, S2, S1m, S2m, P, q, n, qxn, qxqxn; + static Vector vox, vmin, vmax, dir; + static double len, t, alpha, w, R; + static int i, j, k; + + FiberLen = 0.0; + FiberSegments.clear(); + if ( pts <= 2*nPointsToSkip ) + return; + + for(i=nPointsToSkip; i0 && t& P1, const Vector& P2, int k, double w, short* ptrHashTable ) +{ + static Vector vox; + static Vector dir, dirTrue; + static double longitude, colatitude, len; + static segKey key; + static int ox, oy; + + // direction of the segment + dir.y = P2.y-P1.y; + if ( dir.y >= 0 ) + { + dir.x = P2.x-P1.x; + dir.z = P2.z-P1.z; + } + else + { + dir.x = P1.x-P2.x; + dir.y = P1.y-P2.y; + dir.z = P1.z-P2.z; + } + + // length of the segment + len = dir.norm(); + if ( len <= minSegLen ) + return; + dir.Normalize(); + + // voxel of the segment is the centroid + vox.x = floor( 0.5 * (P1.x + P2.x) / pixdim.x ); + vox.y = floor( 0.5 * (P1.y + P2.y) / pixdim.y ); + vox.z = floor( 0.5 * (P1.z + P2.z) / pixdim.z ); + if ( vox.x>=dim.x || vox.x<0 || vox.y>=dim.y || vox.y<0 || vox.z>=dim.z || vox.z<0 ) + return; + if ( ptrMASK && ptrMASK[ vox.z + dim.z * ( vox.y + dim.y * vox.x ) ]==0 ) + return; + + // add the segment to the data structure + longitude = atan2(dir.y, dir.x); + colatitude = atan2( sqrt(dir.x*dir.x + dir.y*dir.y), dir.z ); + ox = (int)round(colatitude/M_PI*180.0); // theta // i1 + oy = (int)round(longitude/M_PI*180.0); // phi // i2 + key.set( vox.x, vox.y, vox.z, (unsigned short) ptrHashTable[ox*181 + oy] ); + FiberSegments[key] += w * len; + if ( k==0 ) // fiber length computed only from origianl segments + FiberLen += len; +} + + +/********************************************************************************************************************/ +/* rayBoxIntersection */ +/********************************************************************************************************************/ +bool rayBoxIntersection( Vector& origin, Vector& direction, Vector& vmin, Vector& vmax, double & t) +{ + static double tmin, tmax, tymin, tymax, tzmin, tzmax; + static Vector invrd; + + // inverse direction to catch float problems + invrd.x = 1.0 / direction.x; + invrd.y = 1.0 / direction.y; + invrd.z = 1.0 / direction.z; + + if (invrd.x >= 0) + { + tmin = (vmin.x - origin.x) * invrd.x; + tmax = (vmax.x - origin.x) * invrd.x; + } + else + { + tmin = (vmax.x - origin.x) * invrd.x; + tmax = (vmin.x - origin.x) * invrd.x; + } + + if (invrd.y >= 0) + { + tymin = (vmin.y - origin.y) * invrd.y; + tymax = (vmax.y - origin.y) * invrd.y; + } + else + { + tymin = (vmax.y - origin.y) * invrd.y; + tymax = (vmin.y - origin.y) * invrd.y; + } + + if ( (tmin > tymax) || (tymin > tmax) ) return false; + if ( tymin > tmin) tmin = tymin; + if ( tymax < tmax) tmax = tymax; + + if (invrd.z >= 0) + { + tzmin = 
(vmin.z - origin.z) * invrd.z; + tzmax = (vmax.z - origin.z) * invrd.z; + }else + { + tzmin = (vmax.z - origin.z) * invrd.z; + tzmax = (vmin.z - origin.z) * invrd.z; + } + + if ( (tmin > tzmax) || (tzmin > tmax) ) return false; + if ( tzmin > tmin) tmin = tzmin; + if ( tzmax < tmax) tmax = tzmax; + + // check if values are valid + t = tmin; + if (t <= 0) t = tmax; + + return true; +} + + +// Read a fiber from file .trk +unsigned int read_fiberTRK( FILE* fp, float fiber[3][MAX_FIB_LEN], int ns, int np ) +{ + int N; + fread((char*)&N, 1, 4, fp); + + if ( N >= MAX_FIB_LEN || N <= 0 ) + return 0; + + float tmp[3]; + for(int i=0; i -#include -#include -#include -using namespace std; - - -/* COLOR constants (abckground is foreground+10) */ -#define COLOR_black 30 -#define COLOR_red 31 -#define COLOR_green 32 -#define COLOR_yellow 33 -#define COLOR_blue 34 -#define COLOR_magenta 35 -#define COLOR_cyan 36 -#define COLOR_white 37 - -#define COLOR_normal 0 -#define COLOR_bold 1 -#define COLOR_underline 4 -#define COLOR_blink 5 - -#define COLOR(FG,BG,FONT) "\033["#FONT";"#FG";"#BG"m" -#define COLOR_reset "\033[0m" -#define COLOR_strERR COLOR(31,48,7) "[ERROR]" COLOR(31,48,0) " " -#define COLOR_strWAR COLOR(33,48,7) "[WARNING]" COLOR(33,48,0) " " - - -void COLOR_print(string str, short int FG=COLOR_white, short int BG=COLOR_black, short int FONT=COLOR_normal) -{ - printf("\033[%d;%d;%dm%s\033[0m", FONT,FG,BG+10, str.c_str()); -} - - -void COLOR_log(string str, short int FG=COLOR_green, short int BG=COLOR_black, short int FONT=COLOR_normal) -{ - char buffer [80]; - time_t rawtime = time(0); - struct tm * timeinfo = localtime ( &rawtime ); - strftime (buffer,80,"%H:%M:%S",timeinfo); - - printf("\n\033[0;%d;%dm[ %s ]\033[%d;%d;%dm %s\033[0m\n", BG,FG+10,buffer, FONT,FG,BG+10,str.c_str()); -} - - -void COLOR_msg( string msg, string prefix="" ) -{ - if ( !prefix.empty() ) - cerr << prefix; - cerr << "\033[0;34m "<< msg.c_str() <<"\033[0m\n"; -} - - -void COLOR_error( string msg, string prefix="" ) -{ - if ( !prefix.empty() ) - cerr << prefix; - cerr << "\033[0;30;41m[ ERROR ]\033[0;31m "<< msg.c_str() <<"\033[0m\n"; -} - - -void COLOR_warning( string msg, string prefix="" ) -{ - if ( !prefix.empty() ) - cerr << prefix; - cerr << "\033[0;30;43m[ WARNING ]\033[0;33m "<< msg.c_str() <<"\033[0m\n"; -} - -#endif +#ifndef __UI_H__ +#define __UI_H__ + + +#include +#include +#include +#include +using namespace std; + + +/* COLOR constants (abckground is foreground+10) */ +#define COLOR_black 30 +#define COLOR_red 31 +#define COLOR_green 32 +#define COLOR_yellow 33 +#define COLOR_blue 34 +#define COLOR_magenta 35 +#define COLOR_cyan 36 +#define COLOR_white 37 + +#define COLOR_normal 0 +#define COLOR_bold 1 +#define COLOR_underline 4 +#define COLOR_blink 5 + +#define COLOR(FG,BG,FONT) "\033["#FONT";"#FG";"#BG"m" +#define COLOR_reset "\033[0m" +#define COLOR_strERR COLOR(31,48,7) "[ERROR]" COLOR(31,48,0) " " +#define COLOR_strWAR COLOR(33,48,7) "[WARNING]" COLOR(33,48,0) " " + + +void COLOR_print(string str, short int FG=COLOR_white, short int BG=COLOR_black, short int FONT=COLOR_normal) +{ + printf("\033[%d;%d;%dm%s\033[0m", FONT,FG,BG+10, str.c_str()); +} + + +void COLOR_log(string str, short int FG=COLOR_green, short int BG=COLOR_black, short int FONT=COLOR_normal) +{ + char buffer [80]; + time_t rawtime = time(0); + struct tm * timeinfo = localtime ( &rawtime ); + strftime (buffer,80,"%H:%M:%S",timeinfo); + + printf("\n\033[0;%d;%dm[ %s ]\033[%d;%d;%dm %s\033[0m\n", BG,FG+10,buffer, FONT,FG,BG+10,str.c_str()); +} 
+ + +void COLOR_msg( string msg, string prefix="" ) +{ + if ( !prefix.empty() ) + cerr << prefix; + cerr << "\033[0;34m "<< msg.c_str() <<"\033[0m\n"; +} + + +void COLOR_error( string msg, string prefix="" ) +{ + if ( !prefix.empty() ) + cerr << prefix; + cerr << "\033[0;30;41m[ ERROR ]\033[0;31m "<< msg.c_str() <<"\033[0m\n"; +} + + +void COLOR_warning( string msg, string prefix="" ) +{ + if ( !prefix.empty() ) + cerr << prefix; + cerr << "\033[0;30;43m[ WARNING ]\033[0;33m "<< msg.c_str() <<"\033[0m\n"; +} + +#endif diff --git a/requirements.txt b/requirements.txt index 9234880c..1c03d182 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -Cython>=0.29 -dipy>=1.0 -dmri-amico>=1.2.3 -numpy>=1.12 -setuptools>=46.1 +Cython>=0.29 +dipy>=1.0 +dmri-amico>=1.2.3 +numpy>=1.12 +setuptools>=46.1 diff --git a/setup.cfg b/setup.cfg index a96a1715..3463cc53 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ -# Inside of setup.cfg -[metadata] -description-file = README.md - -[bdist_wheel] +# Inside of setup.cfg +[metadata] +description-file = README.md + +[bdist_wheel] universal = 1 \ No newline at end of file diff --git a/setup.py b/setup.py index 515e988d..b0a29135 100644 --- a/setup.py +++ b/setup.py @@ -1,205 +1,205 @@ -from setuptools import Extension, setup -from setuptools.command.build_ext import build_ext -import os -from os.path import join as pjoin - -# taken from https://github.com/rmcgibbo/npcuda-example/blob/master/cython/setup.py -def find_in_path(name, path): - """Find a file in a search path""" - - # Adapted fom http://code.activestate.com/recipes/52224 - for dir in path.split(os.pathsep): - binpath = pjoin(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - -def locate_cuda(): - """Locate the CUDA environment on the system - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - Starts by looking for the CUDAHOME env variable. If not found, - everything is based on finding 'nvcc' in the PATH. - """ - - # First check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = pjoin(home, 'bin', 'nvcc') - else: - # Otherwise, search the PATH for NVCC - nvcc = find_in_path('nvcc', os.environ['PATH']) - if nvcc is None: - return None - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home': home, 'nvcc': nvcc, - 'include': pjoin(home, 'include'), - 'lib64': pjoin(home, 'lib64')} - for k, v in iter(cudaconfig.items()): - if not os.path.exists(v): - return None - - return cudaconfig - -def customize_compiler_for_nvcc(self): - """Inject deep into distutils to customize how the dispatch - to gcc/nvcc works. - If you subclass UnixCCompiler, it's not trivial to get your subclass - injected in, and still have the right customizations (i.e. - distutils.sysconfig.customize_compiler) run on it. So instead of going - the OO route, I have this. Note, it's kindof like a wierd functional - subclassing going on. - """ - - # Tell the compiler it can processes .cu - self.src_extensions.append('.cu') - - # Save references to the default compiler_so and _comple methods - default_compiler_so = self.compiler_so - super = self._compile - - # Now redefine the _compile method. This gets executed for each - # object but distutils doesn't have the ability to change compilers - # based on source extension: we add it. 
- def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): - if os.path.splitext(src)[1] == '.cu': - # use the cuda for .cu files - self.set_executable('compiler_so', CUDA['nvcc']) - # use only a subset of the extra_postargs, which are 1-1 - # translated from the extra_compile_args in the Extension class - print(type(extra_postargs)) - print(extra_postargs) - postargs = extra_postargs['nvcc'] - else: - print(type(extra_postargs)) - print(extra_postargs) - postargs = extra_postargs['gcc'] - - super(obj, src, ext, cc_args, postargs, pp_opts) - # Reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - # Inject our redefined _compile method into the class - self._compile = _compile - -# Locate CUDA -CUDA = locate_cuda() - -def get_extensions(): - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - ext1 = Extension(name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - extra_compile_args=['-w'], - language='c++') - - ext2 = Extension(name='commit.core', - sources=['commit/core.pyx'], - extra_compile_args=['-w'], - language='c++') - - ext3 = Extension(name='commit.proximals', - sources=['commit/proximals.pyx'], - extra_compile_args=['-w'], - language='c++') - - return [ext1, ext2, ext3] - -def get_extensions_with_cuda(): - # Cython extension to create the sparse data structure from a tractogram - # for the computation of matrix-vector multiplications - - ext1 = Extension(name='commit.trk2dictionary', - sources=['commit/trk2dictionary/trk2dictionary.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - extra_link_args=[], - language='c++') - - ext2 = Extension(name='commit.core', - sources=['commit/core.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - extra_link_args=[], - language='c++') - - ext3 = Extension(name='commit.proximals', - sources=['commit/proximals.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - extra_link_args=[], - language='c++') - - ext4 = Extension(name='commit.cudaoperator.operator', - sources = ['commit/cudaoperator/operator_withCUDA.cu', 'commit/cudaoperator/operator.pyx'], - extra_compile_args= {'gcc': ['-w'], - 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, - language = 'c++', - library_dirs = [CUDA['lib64']], - libraries = ['cudart'], - runtime_library_dirs = [CUDA['lib64']]) - - return [ext1, ext2, ext3, ext4] - -if CUDA == None: - extensions = get_extensions() -else: - extensions = get_extensions_with_cuda() - -if CUDA == None: - class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. """ - - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None - self.include_dirs = [get_include()] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) -else: - class CustomBuildExtCommand(build_ext): - """ build_ext command to use when numpy headers are needed. 
""" - - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) - - def run(self): - # Now that the requirements are installed, get everything from numpy - from Cython.Build import cythonize - from numpy import get_include - - # Add everything requires for build - self.swig_opts = None - self.include_dirs = [get_include(), CUDA['include'], 'commit/cudaoperator'] - self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) - - # Call original build_ext command - build_ext.finalize_options(self) - build_ext.run(self) - -description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' - -opts = dict(name='dmri-commit', - version='1.5.0', - description=description, - long_description=description, - author='Alessandro Daducci', - author_email='alessandro.daducci@univr.it', - url='https://github.com/daducci/COMMIT', - packages=['commit', 'commit.operator'], - cmdclass={'build_ext': CustomBuildExtCommand}, - ext_modules=extensions, - setup_requires=['Cython>=0.29', 'numpy>=1.12'], - install_requires=['Cython>=0.29', 'dmri-amico>=1.2.6', 'dipy>=1.0', 'numpy>=1.12'], - package_data={'commit.operator': ["*.*"]}) - +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext +import os +from os.path import join as pjoin + +# taken from https://github.com/rmcgibbo/npcuda-example/blob/master/cython/setup.py +def find_in_path(name, path): + """Find a file in a search path""" + + # Adapted fom http://code.activestate.com/recipes/52224 + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + +def locate_cuda(): + """Locate the CUDA environment on the system + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + Starts by looking for the CUDAHOME env variable. If not found, + everything is based on finding 'nvcc' in the PATH. + """ + + # First check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # Otherwise, search the PATH for NVCC + nvcc = find_in_path('nvcc', os.environ['PATH']) + if nvcc is None: + return None + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = {'home': home, 'nvcc': nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64')} + for k, v in iter(cudaconfig.items()): + if not os.path.exists(v): + return None + + return cudaconfig + +def customize_compiler_for_nvcc(self): + """Inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. Note, it's kindof like a wierd functional + subclassing going on. + """ + + # Tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # Save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # Now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. 
+ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 + # translated from the extra_compile_args in the Extension class + print(type(extra_postargs)) + print(extra_postargs) + postargs = extra_postargs['nvcc'] + else: + print(type(extra_postargs)) + print(extra_postargs) + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # Reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # Inject our redefined _compile method into the class + self._compile = _compile + +# Locate CUDA +CUDA = locate_cuda() + +def get_extensions(): + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + ext1 = Extension(name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + extra_compile_args=['-w'], + language='c++') + + ext2 = Extension(name='commit.core', + sources=['commit/core.pyx'], + extra_compile_args=['-w'], + language='c++') + + ext3 = Extension(name='commit.proximals', + sources=['commit/proximals.pyx'], + extra_compile_args=['-w'], + language='c++') + + return [ext1, ext2, ext3] + +def get_extensions_with_cuda(): + # Cython extension to create the sparse data structure from a tractogram + # for the computation of matrix-vector multiplications + + ext1 = Extension(name='commit.trk2dictionary', + sources=['commit/trk2dictionary/trk2dictionary.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + extra_link_args=[], + language='c++') + + ext2 = Extension(name='commit.core', + sources=['commit/core.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + extra_link_args=[], + language='c++') + + ext3 = Extension(name='commit.proximals', + sources=['commit/proximals.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + extra_link_args=[], + language='c++') + + ext4 = Extension(name='commit.cudaoperator.operator', + sources = ['commit/cudaoperator/operator_withCUDA.cu', 'commit/cudaoperator/operator.pyx'], + extra_compile_args= {'gcc': ['-w'], + 'nvcc': ['-arch=sm_50', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'"]}, + language = 'c++', + library_dirs = [CUDA['lib64']], + libraries = ['cudart'], + runtime_library_dirs = [CUDA['lib64']]) + + return [ext1, ext2, ext3, ext4] + +if CUDA == None: + extensions = get_extensions() +else: + extensions = get_extensions_with_cuda() + +if CUDA == None: + class CustomBuildExtCommand(build_ext): + """ build_ext command to use when numpy headers are needed. """ + + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + self.include_dirs = [get_include()] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) +else: + class CustomBuildExtCommand(build_ext): + """ build_ext command to use when numpy headers are needed. 
""" + + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + def run(self): + # Now that the requirements are installed, get everything from numpy + from Cython.Build import cythonize + from numpy import get_include + + # Add everything requires for build + self.swig_opts = None + self.include_dirs = [get_include(), CUDA['include'], 'commit/cudaoperator'] + self.distribution.ext_modules[:] = cythonize(self.distribution.ext_modules) + + # Call original build_ext command + build_ext.finalize_options(self) + build_ext.run(self) + +description = 'Convex Optimization Modeling for Microstructure Informed Tractography (COMMIT)' + +opts = dict(name='dmri-commit', + version='1.5.0', + description=description, + long_description=description, + author='Alessandro Daducci', + author_email='alessandro.daducci@univr.it', + url='https://github.com/daducci/COMMIT', + packages=['commit', 'commit.operator'], + cmdclass={'build_ext': CustomBuildExtCommand}, + ext_modules=extensions, + setup_requires=['Cython>=0.29', 'numpy>=1.12'], + install_requires=['Cython>=0.29', 'dmri-amico>=1.2.6', 'dipy>=1.0', 'numpy>=1.12'], + package_data={'commit.operator': ["*.*"]}) + setup(**opts) \ No newline at end of file From bd0aa44efc931f43d774e267d9c447bde544467e Mon Sep 17 00:00:00 2001 From: ErickHernandezGutierrez Date: Tue, 28 Sep 2021 19:07:43 -0500 Subject: [PATCH 190/190] GPU acceleration added to CHANGELOG --- CHANGELOG.md | 13 +++++ commit/cudaoperator/operator_withCUDA.cu | 61 ++++-------------------- setup.py | 2 +- 3 files changed, 24 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f7dc64d..b3863589 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,19 @@ # Change Log All notable changes to COMMIT will be documented in this file. 
+## [1.6.0] - 2021-09-28 + +### Added +- GPU acceleration with CUDA for faster model fitting +- 'cudaoperator' extension to handle operator in GPU memory + +### Fixed +- Changed end of line from CRLF to LF in several files + +### Changed +- setupy.py: Added custom cython compilation for .cu files with nvcc +- set_threads(): 'n' parameter was renamed to 'nthreads' + ## [1.5.0] - 2021-06-19 ### Added diff --git a/commit/cudaoperator/operator_withCUDA.cu b/commit/cudaoperator/operator_withCUDA.cu index ea014db5..0156913c 100644 --- a/commit/cudaoperator/operator_withCUDA.cu +++ b/commit/cudaoperator/operator_withCUDA.cu @@ -1,7 +1,7 @@ #include "operator_withCUDA.cuh" // ==================================================== -// Textures for LUT in the GPU +// Textures for LUT in the GPU memory // ==================================================== texture tex_lutIC; texture tex_lutEC; @@ -352,12 +352,11 @@ void cudaCheckKernel(){ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ // Copy vector x to the GPU - cudaMemcpy(gpu_x, v_in, ncols*sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_x, v_in, ncols*sizeof(float64_t), cudaMemcpyHostToDevice); //cudaCheckLastError(); // Multiply IC part in the GPU - //multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); - multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); + multiply_Ax_ICpart<<>>(gpu_voxelIC, gpu_fiberIC, gpu_orienIC, gpu_lengthIC, gpu_segmentsPerBlockIC, gpu_offsetPerBlockIC, gpu_lutIC, gpu_x, gpu_y); //cudaCheckLastError(); // Multiply EC part in the GPU @@ -369,14 +368,14 @@ void CudaLinearOperator::dot(float64_t* v_in, float64_t* v_out){ //cudaCheckLastError(); // Copy back result to CPU - cudaMemcpy(v_out, gpu_y, nrows*sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(v_out, gpu_y, nrows*sizeof(float64_t), cudaMemcpyDeviceToHost); //cudaCheckLastError(); } void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ // Copy vector y to the GPU - cudaMemcpy(gpu_y, v_in, nrows*sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_y, v_in, nrows*sizeof(float64_t), cudaMemcpyHostToDevice); //cudaCheckLastError(); // Multiply IC part in the GPU @@ -392,12 +391,14 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ //cudaCheckLastError(); // Copy back result to CPU - cudaMemcpy(v_out, gpu_x, ncols*sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(v_out, gpu_x, ncols*sizeof(float64_t), cudaMemcpyDeviceToHost); //cudaCheckLastError(); } -// ------------------------------------------------------- KERNELS ------------------------------------------------------- // -/*__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, +// ============================================================================================================================================================ +// Function Kernels that are called from CPU and executed in GPU +// ============================================================================================================================================================ +__global__ void multiply_Ax_ICpart(uint32_t* voxelIDs, uint32_t* fiberIDs, uint16_t* orienIDs, float32_t* lengths, @@ -449,48 +450,6 @@ void CudaLinearOperator::Tdot(float64_t* v_in, float64_t* v_out){ if(tid < NUM_SAMPLES) y[(*voxel)*NUM_SAMPLES + sid] = sum + shmem[tid+512]; -}//*/ - -__global__ void 
multiply_Ax_ICpart(uint32_t* voxelIDs, - uint32_t* fiberIDs, - uint16_t* orienIDs, - float32_t* lengths, - uint32_t* segmentsPerVoxel, - uint32_t* offsetPerVoxel, - float32_t* lut, - float64_t* x, - float64_t* y) -{ - uint32_t bid = blockIdx.x; - uint32_t tid = threadIdx.x; - - uint32_t vid = bid*256 + tid; - - if (vid >= NUM_VOXELS) return; - - uint32_t offset = offsetPerVoxel[ vid ]; - uint32_t nsegments = segmentsPerVoxel[ vid ]; - - uint32_t* voxel = voxelIDs + offset; - uint32_t* fiber = fiberIDs + offset; - uint16_t* orien = orienIDs + offset; - float32_t* length = lengths + offset; - - for(int i=0; i
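The kernel above maps one CUDA block to each voxel and one thread to each diffusion sample, accumulating the contributions of all intra-cellular segments stored for that voxel. As a plain reference for what is being parallelised, the following simplified CPU sketch of the IC part of y = A*x (single compartment, no blur, NumPy only) mirrors the per-segment arithmetic of the kernel arguments; it is an illustration of the data layout, not the library's actual CPU operator:

    import numpy as np

    def Ax_IC_reference(voxelIDs, fiberIDs, orienIDs, lengths, lut, x, n_voxels, n_samples):
        # lut has shape (n_orientations, n_samples); x holds one coefficient per streamline
        y = np.zeros(n_voxels * n_samples)
        for v, f, o, l in zip(voxelIDs, fiberIDs, orienIDs, lengths):
            # each segment adds its length-weighted LUT signal to the samples of its voxel
            y[v*n_samples:(v+1)*n_samples] += x[f] * l * lut[o]
        return y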