diff --git a/scripts/staging/cuda-counter-based-prng/PhiloxJNvrtcExample.java b/scripts/staging/cuda-counter-based-prng/PhiloxJNvrtcExample.java new file mode 100644 index 00000000000..04855bd83ea --- /dev/null +++ b/scripts/staging/cuda-counter-based-prng/PhiloxJNvrtcExample.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import jcuda.*; +import jcuda.driver.*; +import jcuda.nvrtc.*; +import jcuda.runtime.JCuda; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; + +import static jcuda.driver.JCudaDriver.cuCtxCreate; + +public class PhiloxJNvrtcExample { + + public static void main(String[] args) { + // Enable exceptions and omit error checks + JCuda.setExceptionsEnabled(true); + JCudaDriver.setExceptionsEnabled(true); + JNvrtc.setExceptionsEnabled(true); + + String ptx = ""; + try { + ptx = new String(Files.readAllBytes(Paths.get("philox_kernel.ptx"))); + } catch (IOException e) { + System.out.println(e.getMessage()); + } + + // Print the PTX for debugging + //System.out.println("Generated PTX:"); + // System.out.println(ptx); + + // Initialize the driver API and create a context + JCudaDriver.cuInit(0); + CUdevice device = new CUdevice(); + JCudaDriver.cuDeviceGet(device, 0); + CUcontext context = new CUcontext(); + cuCtxCreate(context, 0, device); + + CUmodule module = new CUmodule(); + JCudaDriver.cuModuleLoadData(module, ptx); + + // Get a function pointer to the kernel + CUfunction function = new CUfunction(); + JCudaDriver.cuModuleGetFunction(function, module, "philox_4_64"); + + // Prepare data + int n = 1000; // Number of random numbers to generate + long[] hostOut = new long[n]; + CUdeviceptr deviceOut = new CUdeviceptr(); + JCudaDriver.cuMemAlloc(deviceOut, n * Sizeof.LONG); + + // Direkte Werte für seed und startingCounter + long seed = 0L; // Fester Seed-Wert + long startingCounter = 0L; // Startwert für Counter + + Pointer kernelParameters = Pointer.to( + Pointer.to(deviceOut), // ulong* output + Pointer.to(new long[]{seed}), // uint64_t seed + Pointer.to(new long[]{startingCounter}), // uint64_t startingCounter + Pointer.to(new long[]{n}) // size_t numElements + ); + + // Launch the kernel + int blockSizeX = 128; + int gridSizeX = (int) Math.ceil((double)n / blockSizeX); + JCudaDriver.cuLaunchKernel( + function, + gridSizeX, 1, 1, // Grid dimension + blockSizeX, 1, 1, // Block dimension + 0, null, // Shared memory size and stream + kernelParameters, null // Kernel- und extra parameters + ); + JCudaDriver.cuCtxSynchronize(); + + // Copy result back + JCudaDriver.cuMemcpyDtoH(Pointer.to(hostOut), deviceOut, n * Sizeof.LONG); + + // Print results + System.out.println("Generated random numbers with seed=" + + String.format("0x%016X", seed) + + " and startingCounter=" + startingCounter); + for (int i = 0; i < Math.min(10, n); i++) { + System.out.printf("hostOut[%d] = 0x%016X\n", i, hostOut[i]); + } + + // Cleanup + JCudaDriver.cuMemFree(deviceOut); + JCudaDriver.cuCtxDestroy(context); + } +} diff --git a/scripts/staging/cuda-counter-based-prng/PhiloxRuntimeCompilationExample.java b/scripts/staging/cuda-counter-based-prng/PhiloxRuntimeCompilationExample.java new file mode 100644 index 00000000000..72fb5a05c45 --- /dev/null +++ b/scripts/staging/cuda-counter-based-prng/PhiloxRuntimeCompilationExample.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import jcuda.*; +import jcuda.driver.*; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileWriter; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import static java.nio.file.Files.readAllBytes; +import static jcuda.driver.JCudaDriver.*; + +public class PhiloxRuntimeCompilationExample implements AutoCloseable { + private static String philox4x64KernelSource = "#include \n" + + "#include \n" + + "extern \"C\" __global__ void philox_4_64(ulong* output, uint64_t startingCounter, uint64_t seed, size_t numElements) {\n" + + + " uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;\n" + + " if (idx * 4 < numElements) {\n" + + " r123::Philox4x64 rng;\n" + + " r123::Philox4x64::ctr_type ctr = {{startingCounter + idx, 0, 0, 0}};\n" + + " r123::Philox4x64::key_type key = {{seed}};\n" + + " r123::Philox4x64::ctr_type result = rng(ctr, key);\n" + + " for (int i = 0; i < 4; ++i) {\n" + + " size_t outputIdx = idx * 4 + i;\n" + + " if (outputIdx < numElements) {\n" + + " output[outputIdx] = result[i];\n" + + " }\n" + + " }\n" + + " }\n" + + "}\n"; + + private final CUcontext context; + private final CUmodule module; + private final CUfunction function; + private final int blockSize; + + public PhiloxRuntimeCompilationExample() { + JCudaDriver.setExceptionsEnabled(true); + // Initialize CUDA + cuInit(0); + CUdevice device = new CUdevice(); + cuDeviceGet(device, 0); + context = new CUcontext(); + int result = cuCtxCreate(context, 0, device); + if (result != CUresult.CUDA_SUCCESS) { + throw new RuntimeException( + "Kontext-Erstellung fehlgeschlagen: " + result + ", " + CUresult.stringFor(result)); + } + + // Compile to PTX + String ptx = compileToTPX(philox4x64KernelSource); + + // Load the PTX + module = new CUmodule(); + cuModuleLoadData(module, ptx); + function = new CUfunction(); + cuModuleGetFunction(function, module, "philox_4_64"); + + // Set block size based on device capabilities + blockSize = 64; // Can be adjusted based on device properties + } + + private String compileToTPX(String source) { + try { + // Temporäre Dateien erstellen + File sourceFile = File.createTempFile("philox_kernel", ".cu"); + File outputFile = File.createTempFile("philox_kernel", ".ptx"); + + // CUDA-Quellcode in temporäre Datei schreiben + try (FileWriter writer = new FileWriter(sourceFile)) { + writer.write(philox4x64KernelSource); + } + + // nvcc Kommando zusammenbauen + List command = new ArrayList<>(); + command.add("/usr/local/cuda/bin/nvcc"); + command.add("-ccbin"); + command.add("gcc-8"); + command.add("--ptx"); // PTX-Output generieren + command.add("-o"); + command.add(outputFile.getAbsolutePath()); + command.add("-I"); + command.add("./lib/random123/include"); + command.add(sourceFile.getAbsolutePath()); + + // Prozess erstellen und ausführen + ProcessBuilder pb = new ProcessBuilder(command); + pb.redirectErrorStream(true); + Process process = pb.start(); + + // Output des Kompilers lesen + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream()))) { + String line; + StringBuilder output = new StringBuilder(); + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + } + System.out.println("Compiler Output: " + output.toString()); + } + + // Auf Prozessende warten + int exitCode = process.waitFor(); + if (exitCode != 0) { + throw new RuntimeException("nvcc Kompilierung fehlgeschlagen mit Exit-Code: " + exitCode); + } + + // PTX-Datei einlesen + String ptxCode = new String(readAllBytes(outputFile.toPath())); + + // Aufräumen + sourceFile.delete(); + outputFile.delete(); + + return ptxCode; + + } catch (Exception e) { + throw new RuntimeException("Fehler bei der CUDA-Kompilierung: " + e.getMessage(), e); + } + } + + /** + * Generates random numbers using the Philox4x64 algorithm + * + * @param startingCounter Initial counter value + * @param seed Random seed + * @param numElements Number of random numbers to generate + * @return Array of random numbers + */ + public CUdeviceptr Philox4x64(long startingCounter, long seed, int numElements) { + // Allocate host memory for results + // long[] hostOutput = new long[numElements]; + + // Allocate device memory + CUdeviceptr deviceOutput = new CUdeviceptr(); + cuMemAlloc(deviceOutput, (long) numElements * Sizeof.LONG); + + try { + // Set up kernel parameters mit Debugging + System.out.printf("numElements: %d, seed: %d, startingCounter: %d%n", + numElements, seed, startingCounter); + + Pointer kernelParams = Pointer.to( + Pointer.to(deviceOutput), + Pointer.to(new long[] { startingCounter }), + Pointer.to(new long[] { seed }), + Pointer.to(new long[] { numElements })); + + // Calculate grid size + int gridSize = (numElements + (blockSize * 4) - 1) / (blockSize * 4); + + // Launch kernel mit Fehlerprüfung + int kernelResult = cuLaunchKernel(function, + gridSize, 1, 1, // Grid dimension + blockSize, 1, 1, // Block dimension + 0, null, // Shared memory size and stream + kernelParams, null // Kernel parameters and extra parameters + ); + if (kernelResult != CUresult.CUDA_SUCCESS) { + throw new RuntimeException( + "Kernel-Launch fehlgeschlagen: " + kernelResult + ", " + CUresult.stringFor(kernelResult)); + } + + // Copy results back to host + // cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, (long) numElements * + // Sizeof.LONG); + } finally { + // Free device memory + // cuMemFree(deviceOutput); + } + + // return hostOutput; + return deviceOutput; + } + + /** + * Cleans up CUDA resources + */ + public void close() { + cuModuleUnload(module); + cuCtxDestroy(context); + } + + // Example usage + public static void main(String[] args) { + try (PhiloxRuntimeCompilationExample generator = new PhiloxRuntimeCompilationExample()) { + // Generate 1 million random numbers + int numElements = 1_000_000; + long seed = 0L; + long startingCounter = 0L; + + CUdeviceptr randomNumbers = generator.Philox4x64(startingCounter, seed, numElements); + + long[] elements = new long[10]; + cuMemcpyDtoH(Pointer.to(elements), randomNumbers, 10L * Sizeof.LONG); + cuMemFree(randomNumbers); + + // Print first few numbers + System.out.println("First 10 random numbers:"); + for (int i = 0; i < 10; i++) { + System.out.printf("%d: %x%n", i, elements[i]); + } + + int size = 10_000_000; + long start = System.currentTimeMillis(); + CUdeviceptr ptr = generator.Philox4x64(0L, 0L, size); + long end = System.currentTimeMillis(); + System.out.println("philox4x64 speed test: " + (end - start) * 1000 + " microseconds"); + cuMemFree(ptr); + Random r = new Random(); + long javaStart = System.currentTimeMillis(); + for (int i = 0; i < size; i++) { + r.nextLong(); + } + long javaEnd = System.currentTimeMillis(); + System.out.println("java speed test: " + (javaEnd - javaStart) * 1000 + " microseconds"); + System.out.println("philox4x64 is " + (double) (javaEnd - javaStart) / (double) (end - start) + + " times faster than java"); + + } + } +} diff --git a/scripts/staging/cuda-counter-based-prng/kernel.cu b/scripts/staging/cuda-counter-based-prng/kernel.cu new file mode 100644 index 00000000000..8ecce451c66 --- /dev/null +++ b/scripts/staging/cuda-counter-based-prng/kernel.cu @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +// CUDA kernel to generate random doubles between 0 and 1 using all 4 integers from Philox +extern "C" __global__ void philox_4_64_uniform(double* output, uint64_t originalKey, r123::Philox4x64::ctr_type startingCounter, size_t numElements) { + // Calculate the thread's unique index + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + double UINT_TO_ZERO_ONE = 1.0 / LONG_MAX; + + // Ensure the thread index is within bounds + if (idx * 4 < numElements) { + // Initialize the Philox generator with a unique counter and key + r123::Philox4x64 rng; + r123::Philox4x64::ctr_type ctr; + uint64_t sum0 = startingCounter[0] + idx; + uint64_t sum1 = startingCounter[1] + (sum0 < startingCounter[0] ? 1 : 0); // Carry-Bit + + ctr[0] = sum0; + ctr[1] = sum1; + ctr[2] = startingCounter[2]; + ctr[3] = startingCounter[3]; + r123::Philox4x64::key_type key = {{originalKey}}; // Key (seed) + + // Generate 4 random integers + r123::Philox4x64::ctr_type result = rng(ctr, key); + + // Convert each 64-bit integer to a double in [-1, 1] + for (int i = 0; i < 4; ++i) { + double randomDouble = static_cast((long)result[i]) * UINT_TO_ZERO_ONE; + size_t outputIdx = idx * 4 + i; + + // Ensure we don't exceed the output array bounds + if (outputIdx < numElements) { + output[outputIdx] = randomDouble; + } + } + } +} + +// CUDA kernel to generate random doubles between 0 and 1 using all 4 integers from Philox +extern "C" __global__ void philox_4_64_standard(double* output, uint64_t originalKey, r123::Philox4x64::ctr_type startingCounter, size_t numElements) { + // Calculate the thread's unique index + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t idx2 = idx + numElements; + double UINT_TO_ZERO_ONE = 1.0 / LONG_MAX; + + // Ensure the thread index is within bounds + if (idx * 4 < numElements) { + // Initialize the Philox generator with a unique counter and key + r123::Philox4x64 rng; + r123::Philox4x64::ctr_type ctr1; + uint64_t sum0 = startingCounter[0] + idx; + uint64_t sum1 = startingCounter[1] + (sum0 < startingCounter[0] ? 1 : 0); // Carry-Bit + + ctr1[0] = sum0; + ctr1[1] = sum1; + ctr1[2] = startingCounter[2]; + ctr1[3] = startingCounter[3]; + r123::Philox4x64::ctr_type ctr2; + sum0 = startingCounter[0] + idx2; + sum1 = startingCounter[1] + (sum0 < startingCounter[0] ? 1 : 0); // Carry-Bit + + ctr2[0] = sum0; + ctr2[1] = sum1; + ctr2[2] = startingCounter[2]; + ctr2[3] = startingCounter[3]; + + r123::Philox4x64::key_type key1 = {{originalKey}}; + r123::Philox4x64::key_type key2 = {{originalKey}}; + + // Generate 4 random integers + r123::Philox4x64::ctr_type result1 = rng(ctr1, key1); + r123::Philox4x64::ctr_type result2 = rng(ctr2, key2); + + // Convert each 64-bit integer to a double in [-1, 1] + for (int i = 0; i < 4; ++i) { + double randomDouble1 = static_cast((long)result1[i]) * UINT_TO_ZERO_ONE; + double randomDouble2 = static_cast((long)result2[i]) * UINT_TO_ZERO_ONE; + + size_t outputIdx = idx * 4 + i; + + // Ensure we don't exceed the output array bounds + if (outputIdx < numElements) { + output[outputIdx] = (randomDouble1 + randomDouble2) / 2; + } + } + } +} + + +// CUDA kernel to generate random integers from Philox +extern "C" __global__ void philox_4_32(uint* output, uint32_t seed, uint32_t startingCounter, size_t numElements) { + // Calculate the thread's unique index + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Ensure the thread index is within bounds + if (idx * 4 < numElements) { + // Initialize the Philox generator with a unique counter and key + r123::Philox4x32 rng; + r123::Philox4x32::ctr_type ctr = {{startingCounter + idx, 0, 0, 0}}; // Counter (startingCounter + thread index) + r123::Philox4x32::key_type key = {{seed}}; // Key (seed) + + // Generate 4 random integers + r123::Philox4x32::ctr_type result = rng(ctr, key); + + for (int i = 0; i < 4; ++i) { + size_t outputIdx = idx * 4 + i; + + // Ensure we don't exceed the output array bounds + if (outputIdx < numElements) { + output[outputIdx] = result[i]; + } + } + } +} + + +// CUDA kernel to generate random longs from Philox +extern "C" __global__ void philox_4_64(ulong* output, uint64_t seed, uint64_t startingCounter, size_t numElements) { + // Calculate the thread's unique index + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Ensure the thread index is within bounds + if (idx * 4 < numElements) { + // Initialize the Philox generator with a unique counter and key + r123::Philox4x64 rng; + r123::Philox4x64::ctr_type ctr = {{startingCounter + idx, 0, 0, 0}}; // Counter (startingCounter + thread index) + r123::Philox4x64::key_type key = {{seed}}; // Key (seed) + + // Generate 4 random integers + r123::Philox4x64::ctr_type result = rng(ctr, key); + + for (int i = 0; i < 4; ++i) { + size_t outputIdx = idx * 4 + i; + + // Ensure we don't exceed the output array bounds + if (outputIdx < numElements) { + output[outputIdx] = result[i]; + } + } + } +} + + +int main(int argc, char** argv) { + // Check command-line arguments + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " \n"; + return 1; + } + + // Parse command-line arguments + uint64_t seed = std::stoull(argv[1]); // Seed (key) + uint64_t startingCounter = std::stoull(argv[2]); // Starting counter + size_t numElements = std::stoul(argv[3]); // Number of random numbers to generate + + // Allocate host and device memory + double* h_output = new double[numElements]; + double* d_output; + cudaMalloc(&d_output, numElements * sizeof(double)); + + // Launch the CUDA kernel + const int blockSize = 512; + const int gridSize = (numElements + blockSize * 4 - 1) / (blockSize * 4); // Adjust grid size for 4 outputs per thread + r123::Philox4x64::ctr_type uniformCounter = {{startingCounter, 0, 0, 0}}; + + auto start = std::chrono::high_resolution_clock::now(); + philox_4_64_standard<<>>(d_output, seed, uniformCounter, numElements); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + + std::cout << "Time: " << duration.count() << " microseconds" << std::endl; + + // Copy the results back to the host + cudaMemcpy(h_output, d_output, numElements * sizeof(double), cudaMemcpyDeviceToHost); + + // Print the first 10 random doubles + std::cout << "First 10 random doubles:\n"; + for (int i = 0; i < 10; ++i) { + std::cout << h_output[i] << "\n"; + } + + double avg = 0.0; + for (int i = 0; i < numElements; i++) { + avg += (double)h_output[i] / numElements; + } + printf("Average: %f\n", avg); + double standardDeviation = 0.0; + for (int i = 0; i < numElements; i++) { + standardDeviation += std::pow((double)h_output[i] - avg, 2); + } + standardDeviation = sqrt(standardDeviation / numElements); + printf("standardDeviation: %f\n", standardDeviation); + + + // Free memory + delete[] h_output; + cudaFree(d_output); + + // -------------------------------------------------------------------------------- + + seed = std::stoull(argv[1]); // Seed (key) + startingCounter = std::stoull(argv[2]); // Starting counter + numElements = std::stoul(argv[3]); // Number of random numbers to generate + + // Allocate host and device memory + uint* h_output_int = new uint[numElements]; + uint* d_output_int; + cudaMalloc(&d_output_int, numElements * sizeof(uint)); + + // Launch the CUDA kernel + philox_4_32<<>>(d_output_int, seed, startingCounter, numElements); + + // Copy the results back to the host + cudaMemcpy(h_output_int, d_output_int, numElements * sizeof(uint), cudaMemcpyDeviceToHost); + + // Print the first 10 random doubles + std::cout << "First 10 random doubles:\n"; + for (int i = 0; i < 10; ++i) { + std::cout << std::hex << h_output_int[i] << " " << r123::uneg11(h_output_int[i]) << "\n"; + } + + // Free memory + delete[] h_output_int; + cudaFree(d_output_int); + + // -------------------------------------------------------------------------------- + + seed = std::stoull(argv[1]); // Seed (key) + startingCounter = std::stoull(argv[2]); // Starting counter + numElements = std::stoul(argv[3]); // Number of random numbers to generate + + // Allocate host and device memory + ulong* h_output_long = new ulong[numElements]; + ulong* d_output_long; + cudaMalloc(&d_output_long, numElements * sizeof(ulong)); + + // Launch the CUDA kernel + philox_4_64<<>>(d_output_long, seed, startingCounter, numElements); + + // Copy the results back to the host + cudaMemcpy(h_output_long, d_output_long, numElements * sizeof(ulong), cudaMemcpyDeviceToHost); + + // Print the first 10 random doubles + std::cout << "First 10 random doubles:\n"; + for (int i = 0; i < 10; ++i) { + std::cout << std::setprecision(17) << std::hex << h_output_long[i] << " " << (static_cast((long)h_output_long[i]) / LONG_MAX) << "\n"; + } + + // Free memory + delete[] h_output_long; + cudaFree(d_output_long); + + return 0; +} diff --git a/scripts/staging/cuda-counter-based-prng/pom.xml b/scripts/staging/cuda-counter-based-prng/pom.xml new file mode 100644 index 00000000000..ff96112cfc4 --- /dev/null +++ b/scripts/staging/cuda-counter-based-prng/pom.xml @@ -0,0 +1,56 @@ + + + + + 4.0.0 + + org.apache.systemds + CudaCounterBasedRandom + 1.0-SNAPSHOT + jar + + + + org.jcuda + jcuda + 10.2.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-jar-plugin + + + + diff --git a/scripts/staging/cuda-counter-based-prng/readme.md b/scripts/staging/cuda-counter-based-prng/readme.md new file mode 100644 index 00000000000..b8649d6a913 --- /dev/null +++ b/scripts/staging/cuda-counter-based-prng/readme.md @@ -0,0 +1,121 @@ + + +# CUDA counter based PRNG + +Currently, random matrix generation is done using Java implementations. Either the Java Random class or the custom +counter based Philox4x64 implementation is used. This is not efficient for large matrices because first, Java is slow +and second, the matrix has to be copied from the main memory to the GPUs memory for performing matrix operations there. +We propose to implement a counter-based PRNG on CUDA to generate random matrices directly on the GPU. + +To be consistent with the current counter based PRNG implementation, we will use the Philox4x64 algorithm. +Unfortunately, the CUDA curand library is not open source, and we failed to replicate the numbers generated by the +curand library using a Java implementation. We therefore propose to use the random123 library, which is an open-source +library that implements the Philox4x64 algorithm under BSD-3 license. The random123 library is available +at https://github.com/DEShawResearch/random123. It is well tested using statistical tests as described in the +paper [Parallel random numbers: as easy as 1, 2, 3](https://doi.org/10.1145/2063384.2063405). + +## How to implement + +There are two ways how to integrate cuda kernels into the SystemDS project. The first way is to ship a precompiled +cuda ptx file with the SystemDS project. This has the drawback that the cuda ptx file has to be compiled for each +cuda version and each gpu architecture. + +The second way is to compile the cuda kernels during runtime. This means, the cuda build tools need to be installed +on the system where the SystemDS project is running, but the cuda ptx file can be compiled for the specific cuda +version and gpu architecture. + +### Precompiled cuda ptx file + +Example cuda kernel: + +```c++ +extern "C" __global__ void philox_4_64(ulong* output, uint64_t seed, uint64_t startingCounter, size_t numElements) { + // Calculate the thread's unique index + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Ensure the thread index is within bounds + if (idx * 4 < numElements) { + // Initialize the Philox generator with a unique counter and key + r123::Philox4x64 rng; + r123::Philox4x64::ctr_type ctr = {{startingCounter + idx, 0, 0, 0}}; // Counter (startingCounter + thread index) + r123::Philox4x64::key_type key = {{seed}}; // Key (seed) + + // Generate 4 random integers + r123::Philox4x64::ctr_type result = rng(ctr, key); + + for (int i = 0; i < 4; ++i) { + size_t outputIdx = idx * 4 + i; + + // Ensure we don't exceed the output array bounds + if (outputIdx < numElements) { + output[outputIdx] = result[i]; + } + } + } +} +``` + +To compile the cuda kernel to a ptx file, you can use the following command: + +```bash +/usr/local/cuda/bin/nvcc kernel.cu -ccbin gcc-8 -lstdc++ -I ./random123/include -o cuda_test.ptx -lm --ptx -std=c++11 --gpu-architecture=sm_70 +``` + +This will compile the cuda kernel to a ptx file that can be shipped with the SystemDS project. + +```ptx +.version 6.5 +.target sm_70 +.address_size 64 + +.visible .entry philox_4_64( + .param .u64 philox_4_64_param_0, + .param .u64 philox_4_64_param_1, + .param .u64 philox_4_64_param_2, + .param .u64 philox_4_64_param_3 +) +{ + ... cuda kernel code ... +} + +``` +To use this ptx file in the SystemDS project, you can use this code: + +[PhiloxJNvrtcExample.java](/scripts/staging/cuda-counter-based-prng/PhiloxJNvrtcExample.java) + +Run the code with the following command: + +```bash +javac -cp .:./target/dependency/jcuda-10.2.0.jar:./target/dependency/jcuda-natives-10.2.0-linux-x86_64.jar PhiloxJNvrtcExample.java && java -cp .:./target/dependency/jcuda-10.2.0.jar:./target/dependency/jcuda-natives-10.2.0-linux-x86_64.jar PhiloxJNvrtcExample +``` + +### Compile cuda kernels during runtime + +To compile the cuda kernel during runtime, you can use the following code: + +[PhiloxRuntimeCompilationExample.java](/scripts/staging/cuda-counter-based-prng/PhiloxRuntimeCompilationExample.java) + +Run the code with the following command: + +```bash +javac -cp .:./target/dependency/jcuda-10.2.0.jar:./target/dependency/jcuda-natives-10.2.0-linux-x86_64.jar Random123_cuda.java && java -cp .:./target/dependency/jcuda-10.2.0.jar:./target/dependency/jcuda-natives-10.2.0-linux-x86_64.jar Random123_cuda +``` + + diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java index c934ca02adf..58fb8e4fa8d 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java @@ -21,11 +21,14 @@ package org.apache.sysds.runtime.matrix.data; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; import java.util.Random; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.stream.LongStream; +import java.util.stream.Stream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -37,12 +40,17 @@ import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.data.SparseBlock; import org.apache.sysds.runtime.util.CommonThreadPool; +import org.apache.sysds.runtime.util.CounterBasedPRNGenerator; +import org.apache.sysds.runtime.util.IPRNGenerator; import org.apache.sysds.runtime.util.NormalPRNGenerator; import org.apache.sysds.runtime.util.PRNGenerator; +import org.apache.sysds.runtime.util.PhiloxNormalCBPRNGenerator; +import org.apache.sysds.runtime.util.PhiloxUniformCBPRNGenerator; import org.apache.sysds.runtime.util.PoissonPRNGenerator; import org.apache.sysds.runtime.util.UniformPRNGenerator; import org.apache.sysds.runtime.util.UtilFunctions; + public class LibMatrixDatagen { private static final Log LOG = LogFactory.getLog(LibMatrixDatagen.class.getName()); @@ -140,9 +148,11 @@ public static RandomMatrixGenerator createRandomMatrixGenerator(String pdfStr, i RandomMatrixGenerator rgen = null; switch (pdf) { case UNIFORM: + case CB_UNIFORM: rgen = new RandomMatrixGenerator(pdf, r, c, blen, sp, min, max); break; case NORMAL: + case CB_NORMAL: rgen = new RandomMatrixGenerator(pdf, r, c, blen, sp); break; case POISSON: @@ -465,13 +475,17 @@ private static void genRandomNumbers(boolean invokedFromCP, int rl, int ru, int int ncb = (int) Math.ceil((double)cols/blen); int counter = 0; + long[] ctr = {0, 0, 0, 0}; // Counter based RNG counter + // Setup Pseudo Random Number Generator for cell values based on 'pdf'. - PRNGenerator valuePRNG = rgen._valuePRNG; + IPRNGenerator valuePRNG = rgen._valuePRNG; if (valuePRNG == null) { switch (rgen._pdf) { case UNIFORM: valuePRNG = new UniformPRNGenerator(); break; case NORMAL: valuePRNG = new NormalPRNGenerator(); break; case POISSON: valuePRNG = new PoissonPRNGenerator(); break; + case CB_UNIFORM: valuePRNG = new PhiloxUniformCBPRNGenerator(); break; + case CB_NORMAL: valuePRNG = new PhiloxNormalCBPRNGenerator(); break; default: throw new DMLRuntimeException("Unsupported distribution function for Rand: " + rgen._pdf); } @@ -505,7 +519,13 @@ private static void genRandomNumbers(boolean invokedFromCP, int rl, int ru, int // Also note that we cannot use the same seed here, because for ultra-sparse generation // the number of calls to the valuePRNG and nnzPRNG are the same, thus creating correlated // outcomes (bias toward the end of the value range) - nnzPRNG.setSeed((long)(valuePRNG.nextDouble()*Long.MAX_VALUE)); + + if (valuePRNG instanceof CounterBasedPRNGenerator) { + nnzPRNG.setSeed((long)(((CounterBasedPRNGenerator) valuePRNG).getDoubles(ctr, 1)[0]*Long.MAX_VALUE )); + } else { + nnzPRNG.setSeed((long)(((PRNGenerator) valuePRNG).nextDouble()*Long.MAX_VALUE)); + } + boolean localSparse = sparsity < 1 && MatrixBlock.evalSparseFormatInMemory( blockrows, blockcols, (long)(sparsity*blockrows*blockcols)); if ( localSparse) { @@ -515,17 +535,22 @@ private static void genRandomNumbers(boolean invokedFromCP, int rl, int ru, int out.sparse = true; //otherwise ignored c = out.sparseBlock; } - genSparse(c, clen, blockrows, blockcols, rowoffset, coloffset, - sparsity, estnnzRow, min, range, valuePRNG, nnzPRNG); + if (valuePRNG instanceof PRNGenerator) { + genSparse(c, clen, blockrows, blockcols, rowoffset, coloffset, + sparsity, estnnzRow, min, range, (PRNGenerator) valuePRNG, nnzPRNG); + } } else { if (sparsity == 1.0) { genFullyDense(out.getDenseBlock(), blockrows, blockcols, - rowoffset, coloffset, min, range, valuePRNG); + rowoffset, coloffset, min, range, valuePRNG, ctr); } else { - genDense(out, clen, blockrows, blockcols, rowoffset, coloffset, - sparsity, estnnzRow, min, range, valuePRNG, nnzPRNG); + if (valuePRNG instanceof PRNGenerator) { + genDense(out, clen, blockrows, blockcols, rowoffset, coloffset, + sparsity, estnnzRow, min, range, (PRNGenerator) valuePRNG, nnzPRNG); + } + } } // sparse or dense } // cbj @@ -590,13 +615,22 @@ private static void genDense(MatrixBlock out, int clen, int blockrows, int block } private static void genFullyDense(DenseBlock c, int blockrows, int blockcols, int rowoffset, int coloffset, - double min, double range, PRNGenerator valuePRNG) + double min, double range, IPRNGenerator valuePRNG, long[] ctr) { - for(int i = rowoffset; i < rowoffset+blockrows; i++) { + Iterator rngStream; + if (valuePRNG instanceof PRNGenerator) { + rngStream = Stream.generate(() -> min + (range * ((PRNGenerator) valuePRNG).nextDouble())).iterator(); + } else if (valuePRNG instanceof CounterBasedPRNGenerator) { + rngStream = Arrays.stream(((CounterBasedPRNGenerator)valuePRNG).getDoubles(ctr, blockrows * blockcols)).map(i -> min + (range * i)).iterator(); + } else { + throw new DMLRuntimeException("Unsupported PRNGenerator for fully dense generation"); + } + for (int i = rowoffset; i < rowoffset + blockrows; i++) { double[] cvals = c.values(i); int cix = c.pos(i, coloffset); - for(int j = 0; j < blockcols; j++) - cvals[cix+j] = min + (range * valuePRNG.nextDouble()); + for (int j = 0; j < blockcols; j++) { + cvals[cix + j] = rngStream.next(); + } } } diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/RandomMatrixGenerator.java b/src/main/java/org/apache/sysds/runtime/matrix/data/RandomMatrixGenerator.java index 38f92be4cbd..408f2475fb5 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/RandomMatrixGenerator.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/RandomMatrixGenerator.java @@ -20,8 +20,10 @@ package org.apache.sysds.runtime.matrix.data; import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.util.IPRNGenerator; import org.apache.sysds.runtime.util.NormalPRNGenerator; -import org.apache.sysds.runtime.util.PRNGenerator; +import org.apache.sysds.runtime.util.PhiloxNormalCBPRNGenerator; +import org.apache.sysds.runtime.util.PhiloxUniformCBPRNGenerator; import org.apache.sysds.runtime.util.PoissonPRNGenerator; import org.apache.sysds.runtime.util.UniformPRNGenerator; @@ -31,14 +33,14 @@ public class RandomMatrixGenerator { * Types of Probability density functions */ public enum PDF { - NORMAL, UNIFORM, POISSON + NORMAL, UNIFORM, POISSON, CB_UNIFORM, CB_NORMAL } PDF _pdf; int _rows, _cols, _blocksize; double _sparsity, _mean; double _min, _max; - PRNGenerator _valuePRNG; + IPRNGenerator _valuePRNG; public RandomMatrixGenerator() { _pdf = PDF.UNIFORM; @@ -166,6 +168,12 @@ protected void setupValuePRNG() { throw new DMLRuntimeException("Invalid parameter (" + _mean + ") for Poisson distribution."); _valuePRNG = new PoissonPRNGenerator(_mean); break; + case CB_UNIFORM: + _valuePRNG = new PhiloxUniformCBPRNGenerator(); + break; + case CB_NORMAL: + _valuePRNG = new PhiloxNormalCBPRNGenerator(); + break; default: throw new DMLRuntimeException("Unsupported probability density function"); } diff --git a/src/main/java/org/apache/sysds/runtime/util/CounterBasedPRNGenerator.java b/src/main/java/org/apache/sysds/runtime/util/CounterBasedPRNGenerator.java new file mode 100644 index 00000000000..d163012ec2c --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/util/CounterBasedPRNGenerator.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.runtime.util; + +public abstract class CounterBasedPRNGenerator implements IPRNGenerator { + + public abstract void setSeed(long sd); + + public abstract double[] getDoubles(long[] ctr, int size); +} diff --git a/src/main/java/org/apache/sysds/runtime/util/IPRNGenerator.java b/src/main/java/org/apache/sysds/runtime/util/IPRNGenerator.java new file mode 100644 index 00000000000..e348874cee1 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/util/IPRNGenerator.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.runtime.util; + +public interface IPRNGenerator { + public void setSeed(long seed); +} diff --git a/src/main/java/org/apache/sysds/runtime/util/PRNGenerator.java b/src/main/java/org/apache/sysds/runtime/util/PRNGenerator.java index ec1fc512efd..ce59978cbae 100644 --- a/src/main/java/org/apache/sysds/runtime/util/PRNGenerator.java +++ b/src/main/java/org/apache/sysds/runtime/util/PRNGenerator.java @@ -20,7 +20,7 @@ package org.apache.sysds.runtime.util; -public abstract class PRNGenerator { +public abstract class PRNGenerator implements IPRNGenerator { public abstract void setSeed(long sd); diff --git a/src/main/java/org/apache/sysds/runtime/util/PhiloxNormalCBPRNGenerator.java b/src/main/java/org/apache/sysds/runtime/util/PhiloxNormalCBPRNGenerator.java new file mode 100644 index 00000000000..f1d9594858d --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/util/PhiloxNormalCBPRNGenerator.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.runtime.util; + +public class PhiloxNormalCBPRNGenerator extends CounterBasedPRNGenerator { + private long[] seed; + private PhiloxUniformCBPRNGenerator uniformGen; + + public void setSeed(long sd) { + this.seed = new long[2]; + this.seed[0] = sd; + this.seed[1] = sd; + uniformGen = new PhiloxUniformCBPRNGenerator(); + uniformGen.setSeed(this.seed[0]); + } + + /** + * Generate a sequence of random doubles using the Philox4x64 counter-based PRNG. + * + * @param ctr The start counter to use for the PRNG + * @param size The number of doubles to generate + * @return An array of random doubles distributed normally with mean 0 and variance 1 + */ + public double[] getDoubles(long[] ctr, int size) { + // Ensure the key is correct size + if (this.seed.length != 2) { + throw new IllegalArgumentException("Key must be 128 bits"); + } + // Ensure the counter is correct size + if (ctr.length != 4) { + throw new IllegalArgumentException("Counter must be 256 bits"); + } + + double[] uniform = uniformGen.getDoubles(ctr, size + size % 2); + double[] normal = new double[size]; + for (int i = 0; i < size; i+=2) { + double v1 = Math.sqrt(-2*Math.log(uniform[i])); + double v2 = 2*Math.PI*uniform[i + 1]; + normal[i] = v1 * Math.cos(v2); + if (i + 1 < size) { + normal[i + 1] = v1 * Math.sin(v2); + } + } + + return normal; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/util/PhiloxUniformCBPRNGenerator.java b/src/main/java/org/apache/sysds/runtime/util/PhiloxUniformCBPRNGenerator.java new file mode 100644 index 00000000000..9815eca8682 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/util/PhiloxUniformCBPRNGenerator.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.runtime.util; + +public class PhiloxUniformCBPRNGenerator extends CounterBasedPRNGenerator { + + // Constants for Philox + private static final long PHILOX_M4x64_0_hi = 0xD2E7470EE14C6C93L >>> 32; + private static final long PHILOX_M4x64_0_lo = 0xD2E7470EE14C6C93L & 0xFFFFFFFFL; + private static final long PHILOX_M4x64_1_hi = 0xCA5A826395121157L >>> 32; + private static final long PHILOX_M4x64_1_lo = 0xCA5A826395121157L & 0xFFFFFFFFL; + private static final long PHILOX_W64_0 = 0x9E3779B97F4A7C15L; + private static final long PHILOX_W64_1 = 0xBB67AE8584CAA73BL; + private static final double LONG_TO_01 = 0.5 / Long.MAX_VALUE; + + // Default number of rounds + private static final int PHILOX4x64_DEFAULT_ROUNDS = 10; + private long[] seed; + + public void setSeed(long sd) { + this.seed = new long[2]; + this.seed[0] = sd; + this.seed[1] = sd; + } + + /** + * Generate a sequence of random doubles using the Philox4x64 counter-based PRNG. + * + * @param ctr The start counter to use for the PRNG + * @param size The number of doubles to generate + * @return An array of random doubles distributed uniformly between 0 and 1 + */ + public double[] getDoubles(long[] ctr, int size) { + // Ensure the key is correct size + if (this.seed.length != 2) { + throw new IllegalArgumentException("Key must be 128 bits"); + } + // Ensure the counter is correct size + if (ctr.length != 4) { + throw new IllegalArgumentException("Counter must be 256 bits"); + } + + int iterations = size / 4; + long[] result = new long[size]; + long[] currentKey = new long[]{this.seed[0], this.seed[1]}; // Create a copy of the key + + // Reusable arrays for counters + long[] currentCtr = ctr.clone(); + + for (int i = 0; i < iterations; i++) { + for (int j = 0; j < PHILOX4x64_DEFAULT_ROUNDS; j++) { + // Multiply as 128-bit + long bHigh = currentCtr[0] >>> 32; + long bLow = currentCtr[0] & 0xFFFFFFFFL; + + long hi0 = PHILOX_M4x64_0_hi * bHigh; + long mid1 = PHILOX_M4x64_0_hi * bLow; + long mid2 = PHILOX_M4x64_0_lo * bHigh; + long lo0 = PHILOX_M4x64_0_lo * bLow; + + // Combine results + long carry = (lo0 >>> 32) + (mid1 & 0xFFFFFFFFL) + (mid2 & 0xFFFFFFFFL); + hi0 += (mid1 >>> 32) + (mid2 >>> 32) + (carry >>> 32); + lo0 = (lo0 & 0xFFFFFFFFL) | (carry << 32); + + // Multiply as 128-bit + bHigh = currentCtr[2] >>> 32; + bLow = currentCtr[2] & 0xFFFFFFFFL; + + long hi1 = PHILOX_M4x64_1_hi * bHigh; + mid1 = PHILOX_M4x64_1_hi * bLow; + mid2 = PHILOX_M4x64_1_lo * bHigh; + long lo1 = PHILOX_M4x64_1_lo * bLow; + + // Combine results + carry = (lo1 >>> 32) + (mid1 & 0xFFFFFFFFL) + (mid2 & 0xFFFFFFFFL); + hi1 += (mid1 >>> 32) + (mid2 >>> 32) + (carry >>> 32); + lo1 = (lo1 & 0xFFFFFFFFL) | (carry << 32); + + currentCtr[0] = hi1 ^ currentCtr[1] ^ currentKey[0]; + currentCtr[2] = hi0 ^ currentCtr[3] ^ currentKey[1]; + currentCtr[1] = lo1; + currentCtr[3] = lo0; + + currentKey[0] += PHILOX_W64_0; + currentKey[1] += PHILOX_W64_1; + } + + // Unpack the results + result[i * 4] = currentCtr[0]; + result[i * 4 + 1] = currentCtr[1]; + result[i * 4 + 2] = currentCtr[2]; + result[i * 4 + 3] = currentCtr[3]; + + // Increment the counter + if (++ctr[0] == 0 && ++ctr[1] == 0 && ++ctr[2] == 0) { + ++ctr[3]; + } + currentCtr[0] = ctr[0]; + currentCtr[1] = ctr[1]; + currentCtr[2] = ctr[2]; + currentCtr[3] = ctr[3]; + currentKey[0] = this.seed[0]; + currentKey[1] = this.seed[1]; + } + + // Handle remaining elements + if (size % 4 != 0) { + for (int j = 0; j < PHILOX4x64_DEFAULT_ROUNDS; j++) { + // Multiply as 128-bit + long bHigh = currentCtr[0] >>> 32; + long bLow = currentCtr[0] & 0xFFFFFFFFL; + + long hi0 = PHILOX_M4x64_0_hi * bHigh; + long mid1 = PHILOX_M4x64_0_hi * bLow; + long mid2 = PHILOX_M4x64_0_lo * bHigh; + long lo0 = PHILOX_M4x64_0_lo * bLow; + + // Combine results + long carry = (lo0 >>> 32) + (mid1 & 0xFFFFFFFFL) + (mid2 & 0xFFFFFFFFL); + hi0 += (mid1 >>> 32) + (mid2 >>> 32) + (carry >>> 32); + lo0 = (lo0 & 0xFFFFFFFFL) | (carry << 32); + + // Multiply as 128-bit + bHigh = currentCtr[2] >>> 32; + bLow = currentCtr[2] & 0xFFFFFFFFL; + + long hi1 = PHILOX_M4x64_1_hi * bHigh; + mid1 = PHILOX_M4x64_1_hi * bLow; + mid2 = PHILOX_M4x64_1_lo * bHigh; + long lo1 = PHILOX_M4x64_1_lo * bLow; + + // Combine results + carry = (lo1 >>> 32) + (mid1 & 0xFFFFFFFFL) + (mid2 & 0xFFFFFFFFL); + hi1 += (mid1 >>> 32) + (mid2 >>> 32) + (carry >>> 32); + lo1 = (lo1 & 0xFFFFFFFFL) | (carry << 32); + + currentCtr[0] = hi1 ^ currentCtr[1] ^ currentKey[0]; + currentCtr[2] = hi0 ^ currentCtr[3] ^ currentKey[1]; + currentCtr[1] = lo1; + currentCtr[3] = lo0; + + currentKey[0] += PHILOX_W64_0; + currentKey[1] += PHILOX_W64_1; + } + + // Store the remaining results + switch (size % 4) { + case 3: + result[iterations * 4 + 2] = currentCtr[2]; + case 2: + result[iterations * 4 + 1] = currentCtr[1]; + case 1: + result[iterations * 4] = currentCtr[0]; + } + } + double[] double_result = new double[result.length]; + for (int i = 0; i < result.length; i++) { + double_result[i] = result[i] * LONG_TO_01 + .5; + } + return double_result; + } +} diff --git a/src/test/java/org/apache/sysds/test/component/matrix/LibMatrixDatagenTest.java b/src/test/java/org/apache/sysds/test/component/matrix/LibMatrixDatagenTest.java new file mode 100644 index 00000000000..30e0c75fbd2 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/matrix/LibMatrixDatagenTest.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.matrix; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.matrix.data.LibMatrixDatagen; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.data.RandomMatrixGenerator; +import org.junit.Test; + +import java.util.Arrays; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertArrayEquals; + +public class LibMatrixDatagenTest { + protected static final Log LOG = LogFactory.getLog(LibMatrixDatagenTest.class.getName()); + + @Test + public void testGenerateUniformMatrixPhilox() { + MatrixBlock mb = new MatrixBlock(); + RandomMatrixGenerator rgen = new RandomMatrixGenerator(RandomMatrixGenerator.PDF.CB_UNIFORM, 10, 10, 10, 1, 0., 1.); + LibMatrixDatagen.generateRandomMatrix(mb, rgen, null, 0L); + for(int i = 0; i < 10; i++) { + for(int j = 0; j < 10; j++) { + assertTrue("Value: " + mb.get(i, j) + "needs to be less than 1", mb.get(i, j) < 1); + assertTrue("Value: " + mb.get(i, j) + "needs to be greater than 0", mb.get(i, j) > 0); + } + } + } + + @Test + public void testGenerateNormalMatrixPhilox() { + MatrixBlock mb = new MatrixBlock(); + RandomMatrixGenerator rgen = new RandomMatrixGenerator(RandomMatrixGenerator.PDF.CB_NORMAL, 1000, 1000, 1000 * 1000, 1); + LibMatrixDatagen.generateRandomMatrix(mb, rgen, null, 123123123123L); + double mean = mb.mean(); + double[] bv = mb.getDenseBlockValues(); + double variance = Arrays.stream(bv).map(x -> Math.pow(x - mean, 2)).sum() / bv.length; + assertEquals("Mean should be 0", 0, mean, 0.01); + assertEquals("Variance should be 1", 1, variance, 0.001); + } + + @Test + public void testGenerateUniformMatrixPhiloxShouldHaveGoodStatistics() { + MatrixBlock mb = new MatrixBlock(); + RandomMatrixGenerator rgen = new RandomMatrixGenerator(RandomMatrixGenerator.PDF.CB_UNIFORM, 1000, 1000, 100, 1, 0., 1.); + LibMatrixDatagen.generateRandomMatrix(mb, rgen, null, 0L); + + double mean = mb.mean(); + assertEquals("Mean should be 0.5", 0.5, mean, 0.001); + + double[] bv = mb.getDenseBlockValues(); + assertEquals(1000 * 1000, bv.length); + double variance = Arrays.stream(bv).map(x -> Math.pow(x - mean, 2)).sum() / bv.length; + assertEquals("Variance should be 1", 0.0833, variance, 0.001); + } + + @Test + public void testGenerateUniformMatrixShouldReturnSameValuesUsingStreams() { + MatrixBlock mb = new MatrixBlock(); + RandomMatrixGenerator rgen = new RandomMatrixGenerator(RandomMatrixGenerator.PDF.UNIFORM, 1000, 1000, 100, 1, 0., 1.); + LibMatrixDatagen.generateRandomMatrix(mb, rgen, null, 0L); + + double[] bv = Arrays.copyOf(mb.getDenseBlockValues(), 100); + double[] previous = new double[] {0.24053641567148587, 0.6374174253501083, 0.5504370051176339, 0.5975452777972018, 0.3332183994766498, 0.3851891847407185, 0.984841540199809, 0.8791825178724801, 0.9412491794821144, 0.27495396603548483, 0.12889715087377673, 0.14660165764651822, 0.023238122483889456, 0.5467397571984656, 0.9644868606768501, 0.10449068625097169, 0.6251463634655593, 0.4107961954910617, 0.7763122912749325, 0.990722785714783, 0.4872328470301428, 0.7462414053223305, 0.7331520701949938, 0.8172970714093244, 0.8388903500470183, 0.5266994346048661, 0.8993350116114935, 0.13393984058689223, 0.0830623982249149, 0.9785743401478403, 0.7223571191888487, 0.7150310138504744, 0.14322038530059678, 0.4629578184224229, 0.004485602182885184, 0.07149831487989411, 0.34842022979166454, 0.3387696535357536, 0.859356551354648, 0.9715469888517128, 0.8657458802140383, 0.6125811047098682, 0.17898798452881726, 0.21757041220968598, 0.8544871670422907, 0.009673497300974332, 0.6922930069529333, 0.7713129661706796, 0.7126874281456893, 0.2112353749298962, 0.7830924897671794, 0.945333238959629, 0.014236355103667941, 0.3942035527773311, 0.8537907753080728, 0.7860424508145526, 0.993471955005814, 0.883104405981479, 0.17029153024770394, 0.9620689182075386, 0.7242950335788688, 0.6773541612498745, 0.8043954172246357, 0.44142677367579175, 0.46208799028599445, 0.8528274665994607, 0.501834850205735, 0.9919429804102169, 0.9692699099404161, 0.35310607217911816, 0.047265869196129406, 0.0716236234178006, 0.02910751272163581, 0.48367019010510015, 0.9719501209537452, 0.9891171507514055, 0.7674421030154899, 0.5013973510122299, 0.2555253108964435, 0.30915818724818767, 0.8482805002723425, 0.052084538173983286, 0.010175454536229256, 0.35385296970871194, 0.08673785516572752, 0.8503115152643057, 0.0036769023557003955, 0.3078931676344727, 0.5316085562487977, 0.9188142018385732, 0.27721002606871137, 0.8742622102831944, 0.6098815135127635, 0.9086392096967358, 0.04449062015679506, 0.6467239010388895, 0.4968037636226561, 0.5067015959528527, 0.5206888198929495, 0.36636074451399603}; + assertArrayEquals(previous, bv, 0.0001); + } +} diff --git a/src/test/scripts/functions/data/RandCounterBasedNormal.dml b/src/test/scripts/functions/data/RandCounterBasedNormal.dml new file mode 100644 index 00000000000..f8263d966a0 --- /dev/null +++ b/src/test/scripts/functions/data/RandCounterBasedNormal.dml @@ -0,0 +1,24 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +A = Rand(rows=$1, cols=$2, sparsity=$3, seed=$4, pdf="CB_NORMAL"); +write(A, $5); \ No newline at end of file diff --git a/src/test/scripts/functions/data/RandCounterBasedUniform.dml b/src/test/scripts/functions/data/RandCounterBasedUniform.dml new file mode 100644 index 00000000000..65dc1f07363 --- /dev/null +++ b/src/test/scripts/functions/data/RandCounterBasedUniform.dml @@ -0,0 +1,24 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +A = Rand(rows=$1, cols=$2, sparsity=$3, seed=$4, pdf="CB_UNIFORM"); +write(A, $5); \ No newline at end of file