diff --git a/.github/workflows/rc.yml b/.github/workflows/rc.yml index 61281964..9dc04fee 100644 --- a/.github/workflows/rc.yml +++ b/.github/workflows/rc.yml @@ -421,8 +421,8 @@ jobs: - name: Prepare docs run: | mkdir -p docs - cp -a target/site/apidocs docs/reference - tar -cvzf docs.tar.gz docs + cp -a target/site/apidocs reference + tar -cvzf reference.tar.gz reference - name: Upload binaries uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: @@ -431,8 +431,46 @@ jobs: - name: Upload docs uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: - name: release-docs - path: docs.tar.gz + name: reference + path: reference.tar.gz + docs: + name: Docs + needs: + - binaries + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + cache: 'pip' + - name: Download source archive + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: release-source + - name: Download Javadocs + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: reference + - name: Extract source archive + run: | + tar -xf apache-arrow-java-*.tar.gz --strip-components=1 + - name: Build + run: | + cd docs + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + make html + tar -xf ../reference.tar.gz -C build/html + - name: Compress into single artifact to keep directory structure + run: tar -cvzf html.tar.gz -C docs/build html + - name: Upload artifacts + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + with: + name: release-html + path: html.tar.gz verify: name: Verify needs: @@ -473,6 +511,7 @@ jobs: name: Upload if: github.ref_type == 'tag' needs: + - docs - verify runs-on: ubuntu-latest permissions: diff --git a/.gitignore b/.gitignore index 205be77e..b57597af 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ /dev/release/apache-rat-0.16.1.jar /dev/release/filtered_rat.txt /dev/release/rat.xml +/docs/build/ CMakeCache.txt CMakeFiles/ Makefile diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 8efd379a..8324d32c 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -17,3 +17,4 @@ .gitmodules dataset/src/test/resources/data/student.csv +docs/Makefile diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..a4de0bff --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -W +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..70c2ef2a --- /dev/null +++ b/docs/README.md @@ -0,0 +1,28 @@ + + +# Documentation + +Build with Sphinx. 
+
+```bash
+cd docs
+pip install -r requirements.txt
+make html
+```
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..fa2d0bbe
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+furo==2024.8.6
+myst-parser==4.0.0
+Sphinx==8.1.3
+sphinx-autobuild==2024.10.3
+sphinx-basic-ng==1.0.0b2
+sphinxcontrib-applehelp==2.0.0
+sphinxcontrib-devhelp==2.0.0
+sphinxcontrib-htmlhelp==2.1.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==2.0.0
+sphinxcontrib-serializinghtml==2.0.0
diff --git a/docs/source/_static/.gitignore b/docs/source/_static/.gitignore
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/source/algorithm.rst b/docs/source/algorithm.rst
new file mode 100644
index 00000000..d4838967
--- /dev/null
+++ b/docs/source/algorithm.rst
@@ -0,0 +1,92 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Java Algorithms
+===============
+
+Arrow's Java library provides algorithms for some commonly used
+functionality. The algorithms live in the ``org.apache.arrow.algorithm``
+package of the ``algorithm`` module.
+
+Comparing Vector Elements
+-------------------------
+
+Comparing vector elements is the basis of many algorithms. Vector
+elements can be compared in one of two ways:
+
+1. **Equality comparison**: this type of comparison has two possible results: ``equal`` and ``unequal``.
+It is currently supported through the ``org.apache.arrow.vector.compare.VectorValueEqualizer``
+interface.
+
+2. **Ordering comparison**: this type of comparison has three possible results: ``less than``, ``equal to``,
+and ``greater than``. It is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``.
+
+We provide default implementations for comparing vector elements, but users
+can also define customized comparisons.
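+
+For example, here is a sketch of a customized ordering comparison. The
+``LengthComparator`` name and its length-only ordering are illustrative, not
+part of the library, but ``compareNotNull`` and ``createNew`` are the
+extension points of ``VectorValueComparator``:
+
+.. code-block:: java
+
+   import org.apache.arrow.algorithm.sort.VectorValueComparator;
+   import org.apache.arrow.vector.VarCharVector;
+
+   /** Illustrative comparator ordering VarChar elements by length only. */
+   public class LengthComparator extends VectorValueComparator<VarCharVector> {
+
+     @Override
+     public int compareNotNull(int index1, int index2) {
+       // vector1 and vector2 are attached before comparison; both indices
+       // are known to be non-null when this method is called.
+       return Integer.compare(
+           vector1.getValueLength(index1), vector2.getValueLength(index2));
+     }
+
+     @Override
+     public VectorValueComparator<VarCharVector> createNew() {
+       return new LengthComparator();
+     }
+   }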
+
+Vector Element Search
+---------------------
+
+A search algorithm tries to find a particular value in a vector. When successful, a vector index is
+returned; otherwise, ``-1`` is returned. The following search algorithms are provided:
+
+1. **Linear search**: this algorithm simply traverses the vector from the beginning until a match is
+found or the end of the vector is reached, so it takes ``O(n)`` time, where ``n`` is the number of elements
+in the vector. It is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``.
+
+2. **Binary search**: this is a more efficient search algorithm, as it runs in ``O(log(n))`` time.
+However, it is only applicable to sorted vectors. To get a sorted vector,
+one can use one of our sorting algorithms, which are discussed in the next section. This algorithm
+is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``.
+
+3. **Parallel search**: when the vector is large, it takes a long time to traverse its elements searching
+for a value. To make this faster, one can split the vector into multiple partitions and search
+each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``.
+
+4. **Range search**: in many scenarios, there can be multiple matching values in the vector.
+If the vector is sorted, the matching values reside in a contiguous region of the vector. The
+range search algorithm finds the upper/lower bound of that region in ``O(log(n))`` time.
+An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``.
+
+Vector Sorting
+--------------
+
+Given a vector, a sorting algorithm turns it into a sorted one. The sorting criteria must
+be specified by some ordering comparison operation. The sorting algorithms can be
+classified into the following categories:
+
+1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original
+vector, without creating any new vector, so it returns the original vector after sorting.
+Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place
+sorting in ``O(n log(n))`` time. As the name suggests, it only supports fixed-width vectors.
+
+2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead,
+it copies vector elements to a new vector in sorted order, and returns the new vector.
+We have ``org.apache.arrow.algorithm.sort.FixedWidthOutOfPlaceVectorSorter``
+and ``org.apache.arrow.algorithm.sort.VariableWidthOutOfPlaceVectorSorter``
+for fixed-width and variable-width vectors, respectively. Both algorithms run in ``O(n log(n))`` time.
+
+3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer
+vector whose values are the indices of the vector elements in sorted order. With the index vector, one can
+easily construct a sorted vector. In addition, some other tasks can be achieved easily, like finding the
+``k``-th smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``,
+which runs in ``O(n log(n))`` time. It is applicable to vectors of any type.
+
+Other Algorithms
+----------------
+
+Other algorithms in the ``algorithm`` module include vector deduplication, dictionary encoding, and more.
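+
+To tie the search and sorting utilities above together, here is a sketch that
+sorts an ``IntVector`` in place and then locates a value by binary search. It
+assumes the classes described on this page plus the default comparator factory
+``DefaultVectorComparators#createDefaultComparator``; the sample values are
+illustrative:
+
+.. code-block:: java
+
+   import org.apache.arrow.algorithm.search.VectorSearcher;
+   import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
+   import org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter;
+   import org.apache.arrow.algorithm.sort.VectorValueComparator;
+   import org.apache.arrow.memory.BufferAllocator;
+   import org.apache.arrow.memory.RootAllocator;
+   import org.apache.arrow.vector.IntVector;
+
+   try (BufferAllocator allocator = new RootAllocator();
+        IntVector vector = new IntVector("values", allocator);
+        IntVector key = new IntVector("key", allocator)) {
+     vector.allocateNew(4);
+     vector.set(0, 3);
+     vector.set(1, 1);
+     vector.set(2, 4);
+     vector.set(3, 2);
+     vector.setValueCount(4);
+
+     // Sort the vector in place using the default ordering comparator.
+     VectorValueComparator<IntVector> comparator =
+         DefaultVectorComparators.createDefaultComparator(vector);
+     new FixedWidthInPlaceVectorSorter<IntVector>().sortInPlace(vector, comparator);
+
+     // Binary search takes the key as a one-element vector plus an index into it.
+     key.allocateNew(1);
+     key.set(0, 4);
+     key.setValueCount(1);
+     int index = VectorSearcher.binarySearch(vector, comparator, key, 0);
+     // After sorting, the vector is [1, 2, 3, 4], so index == 3.
+   }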
diff --git a/docs/source/cdata.rst b/docs/source/cdata.rst new file mode 100644 index 00000000..9643d88d --- /dev/null +++ b/docs/source/cdata.rst @@ -0,0 +1,468 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +================ +C Data Interface +================ + +Arrow supports exchanging data without copying or serialization within the same process +through :external+arrow:ref:`c-data-interface`, even between different language runtimes. + +Java to Python +-------------- + +See :external+arrow:doc:`python/integration/python_java` to implement Java to +Python communication using the C Data Interface. + +Java to C++ +----------- + +See :external+arrow:doc:`developers/cpp/building` to build the Arrow C++ libraries: + +.. code-block:: shell + + $ git clone https://github.com/apache/arrow.git + $ cd arrow/cpp + $ mkdir build # from inside the `cpp` subdirectory + $ cd build + $ cmake .. --preset ninja-debug-minimal + $ cmake --build . + $ tree debug/ + debug/ + ├── libarrow.800.0.0.dylib + ├── libarrow.800.dylib -> libarrow.800.0.0.dylib + └── libarrow.dylib -> libarrow.800.dylib + +Share an Int64 array from C++ to Java +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**C++ Side** + +Implement a function in CDataCppBridge.h that exports an array via the C Data Interface: + +.. code-block:: cpp + + #include + #include + #include + + void FillInt64Array(const uintptr_t c_schema_ptr, const uintptr_t c_array_ptr) { + arrow::Int64Builder builder; + builder.Append(1); + builder.Append(2); + builder.Append(3); + builder.AppendNull(); + builder.Append(5); + builder.Append(6); + builder.Append(7); + builder.Append(8); + builder.Append(9); + builder.Append(10); + std::shared_ptr array = *builder.Finish(); + + struct ArrowSchema* c_schema = reinterpret_cast(c_schema_ptr); + auto c_schema_status = arrow::ExportType(*array->type(), c_schema); + if (!c_schema_status.ok()) c_schema_status.Abort(); + + struct ArrowArray* c_array = reinterpret_cast(c_array_ptr); + auto c_array_status = arrow::ExportArray(*array, c_array); + if (!c_array_status.ok()) c_array_status.Abort(); + } + +**Java Side** + +For this example, we will use `JavaCPP`_ to call our C++ function from Java, +without writing JNI bindings ourselves. + +.. code-block:: xml + + + + 4.0.0 + + org.example + java-cdata-example + 1.0-SNAPSHOT + + + 8 + 8 + 9.0.0 + + + + org.bytedeco + javacpp + 1.5.7 + + + org.apache.arrow + arrow-c-data + ${arrow.version} + + + org.apache.arrow + arrow-vector + ${arrow.version} + + + org.apache.arrow + arrow-memory-core + ${arrow.version} + + + org.apache.arrow + arrow-memory-netty + ${arrow.version} + + + org.apache.arrow + arrow-format + ${arrow.version} + + + + +.. 
code-block:: java + + import org.bytedeco.javacpp.annotation.Platform; + import org.bytedeco.javacpp.annotation.Properties; + import org.bytedeco.javacpp.tools.InfoMap; + import org.bytedeco.javacpp.tools.InfoMapper; + + @Properties( + target = "CDataJavaToCppExample", + value = @Platform( + include = { + "CDataCppBridge.h" + }, + compiler = {"cpp17"}, + linkpath = {"/arrow/cpp/build/debug/"}, + link = {"arrow"} + ) + ) + public class CDataJavaConfig implements InfoMapper { + + @Override + public void map(InfoMap infoMap) { + } + } + +.. code-block:: shell + + # Compile our Java code + $ javac -cp javacpp-1.5.7.jar CDataJavaConfig.java + + # Generate CDataInterfaceLibrary + $ java -jar javacpp-1.5.7.jar CDataJavaConfig.java + + # Generate libjniCDataInterfaceLibrary.dylib + $ java -jar javacpp-1.5.7.jar CDataJavaToCppExample.java + + # Validate libjniCDataInterfaceLibrary.dylib created + $ otool -L macosx-x86_64/libjniCDataJavaToCppExample.dylib + macosx-x86_64/libjniCDataJavaToCppExample.dylib: + libjniCDataJavaToCppExample.dylib (compatibility version 0.0.0, current version 0.0.0) + @rpath/libarrow.800.dylib (compatibility version 800.0.0, current version 800.0.0) + /usr/lib/libc++.1.dylib (compatibility version 1.0.0, current version 1200.3.0) + /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1311.0.0) + +**Java Test** + +Let's create a Java class to test our bridge: + +.. code-block:: java + + import org.apache.arrow.c.ArrowArray; + import org.apache.arrow.c.ArrowSchema; + import org.apache.arrow.c.Data; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.BigIntVector; + + public class TestCDataInterface { + public static void main(String[] args) { + try( + BufferAllocator allocator = new RootAllocator(); + ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator); + ArrowArray arrowArray = ArrowArray.allocateNew(allocator) + ){ + CDataJavaToCppExample.FillInt64Array( + arrowSchema.memoryAddress(), arrowArray.memoryAddress()); + try( + BigIntVector bigIntVector = (BigIntVector) Data.importVector( + allocator, arrowArray, arrowSchema, null) + ){ + System.out.println("C++-allocated array: " + bigIntVector); + } + } + } + } + +.. code-block:: shell + + C++-allocated array: [1, 2, 3, null, 5, 6, 7, 8, 9, 10] + +Share an Int32 array from Java to C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Java Side** + +For this example, we will build a JAR with all dependencies bundled. + +.. code-block:: xml + + + + 4.0.0 + org.example + cpptojava + 1.0-SNAPSHOT + + 8 + 8 + 9.0.0 + + + + org.apache.arrow + arrow-c-data + ${arrow.version} + + + org.apache.arrow + arrow-memory-netty + ${arrow.version} + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + package + + single + + + + jar-with-dependencies + + + + + + + + + +.. 
code-block:: java + + import org.apache.arrow.c.ArrowArray; + import org.apache.arrow.c.ArrowSchema; + import org.apache.arrow.c.Data; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.FieldVector; + import org.apache.arrow.vector.IntVector; + import org.apache.arrow.vector.VectorSchemaRoot; + + import java.util.Arrays; + + public class ToBeCalledByCpp { + final static BufferAllocator allocator = new RootAllocator(); + + /** + * Create a {@link FieldVector} and export it via the C Data Interface + * @param schemaAddress Schema memory address to wrap + * @param arrayAddress Array memory address to wrap + */ + public static void fillVector(long schemaAddress, long arrayAddress){ + try (ArrowArray arrow_array = ArrowArray.wrap(arrayAddress); + ArrowSchema arrow_schema = ArrowSchema.wrap(schemaAddress) ) { + Data.exportVector(allocator, populateFieldVectorToExport(), null, arrow_array, arrow_schema); + } + } + + /** + * Create a {@link VectorSchemaRoot} and export it via the C Data Interface + * @param schemaAddress Schema memory address to wrap + * @param arrayAddress Array memory address to wrap + */ + public static void fillVectorSchemaRoot(long schemaAddress, long arrayAddress){ + try (ArrowArray arrow_array = ArrowArray.wrap(arrayAddress); + ArrowSchema arrow_schema = ArrowSchema.wrap(schemaAddress) ) { + Data.exportVectorSchemaRoot(allocator, populateVectorSchemaRootToExport(), null, arrow_array, arrow_schema); + } + } + + private static FieldVector populateFieldVectorToExport(){ + IntVector intVector = new IntVector("int-to-export", allocator); + intVector.allocateNew(3); + intVector.setSafe(0, 1); + intVector.setSafe(1, 2); + intVector.setSafe(2, 3); + intVector.setValueCount(3); + System.out.println("[Java] FieldVector: \n" + intVector); + return intVector; + } + + private static VectorSchemaRoot populateVectorSchemaRootToExport(){ + IntVector intVector = new IntVector("age-to-export", allocator); + intVector.setSafe(0, 10); + intVector.setSafe(1, 20); + intVector.setSafe(2, 30); + VectorSchemaRoot root = new VectorSchemaRoot(Arrays.asList(intVector)); + root.setRowCount(3); + System.out.println("[Java] VectorSchemaRoot: \n" + root.contentToTSVString()); + return root; + } + } + +Build the JAR and copy it to the C++ project. + +.. code-block:: shell + + $ mvn clean install + $ cp target/cpptojava-1.0-SNAPSHOT-jar-with-dependencies.jar /cpptojava.jar + +**C++ Side** + +This application uses JNI to call Java code, but transfers data (zero-copy) via the C Data Interface instead. + +.. 
code-block:: cpp + + #include + #include + + #include + #include + + JNIEnv *CreateVM(JavaVM **jvm) { + JNIEnv *env; + JavaVMInitArgs vm_args; + JavaVMOption options[2]; + options[0].optionString = "-Djava.class.path=cpptojava.jar"; + options[1].optionString = "-DXcheck:jni:pedantic"; + vm_args.version = JNI_VERSION_10; + vm_args.nOptions = 2; + vm_args.options = options; + int status = JNI_CreateJavaVM(jvm, (void **) &env, &vm_args); + if (status < 0) { + std::cerr << "\n<<<<< Unable to Launch JVM >>>>>\n" << std::endl; + return nullptr; + } + return env; + } + + int main() { + JNIEnv *env; + JavaVM *jvm; + env = CreateVM(&jvm); + if (env == nullptr) return EXIT_FAILURE; + jclass javaClassToBeCalledByCpp = env->FindClass("ToBeCalledByCpp"); + if (javaClassToBeCalledByCpp != nullptr) { + jmethodID fillVector = env->GetStaticMethodID(javaClassToBeCalledByCpp, + "fillVector", + "(JJ)V"); + if (fillVector != nullptr) { + struct ArrowSchema arrowSchema; + struct ArrowArray arrowArray; + std::cout << "\n<<<<< C++ to Java for Arrays >>>>>\n" << std::endl; + env->CallStaticVoidMethod(javaClassToBeCalledByCpp, fillVector, + static_cast(reinterpret_cast(&arrowSchema)), + static_cast(reinterpret_cast(&arrowArray))); + auto resultImportArray = arrow::ImportArray(&arrowArray, &arrowSchema); + std::shared_ptr array = resultImportArray.ValueOrDie(); + std::cout << "[C++] Array: " << array->ToString() << std::endl; + } else { + std::cerr << "Could not find fillVector method\n" << std::endl; + return EXIT_FAILURE; + } + jmethodID fillVectorSchemaRoot = env->GetStaticMethodID(javaClassToBeCalledByCpp, + "fillVectorSchemaRoot", + "(JJ)V"); + if (fillVectorSchemaRoot != nullptr) { + struct ArrowSchema arrowSchema; + struct ArrowArray arrowArray; + std::cout << "\n<<<<< C++ to Java for RecordBatch >>>>>\n" << std::endl; + env->CallStaticVoidMethod(javaClassToBeCalledByCpp, fillVectorSchemaRoot, + static_cast(reinterpret_cast(&arrowSchema)), + static_cast(reinterpret_cast(&arrowArray))); + auto resultImportVectorSchemaRoot = arrow::ImportRecordBatch(&arrowArray, &arrowSchema); + std::shared_ptr recordBatch = resultImportVectorSchemaRoot.ValueOrDie(); + std::cout << "[C++] RecordBatch: " << recordBatch->ToString() << std::endl; + } else { + std::cerr << "Could not find fillVectorSchemaRoot method\n" << std::endl; + return EXIT_FAILURE; + } + } else { + std::cout << "Could not find ToBeCalledByCpp class\n" << std::endl; + return EXIT_FAILURE; + } + jvm->DestroyJavaVM(); + return EXIT_SUCCESS; + } + +CMakeLists.txt definition file: + +.. code-block:: cmake + + cmake_minimum_required(VERSION 3.19) + project(cdatacpptojava) + find_package(JNI REQUIRED) + find_package(Arrow REQUIRED) + message(STATUS "Arrow version: ${ARROW_VERSION}") + include_directories(${JNI_INCLUDE_DIRS}) + set(CMAKE_CXX_STANDARD 17) + add_executable(${PROJECT_NAME} main.cpp) + target_link_libraries(cdatacpptojava PRIVATE Arrow::arrow_shared) + target_link_libraries(cdatacpptojava PRIVATE ${JNI_LIBRARIES}) + +**Result** + +.. code-block:: text + + <<<<< C++ to Java for Arrays >>>>> + [Java] FieldVector: + [1, 2, 3] + [C++] Array: [ + 1, + 2, + 3 + ] + + <<<<< C++ to Java for RecordBatch >>>>> + [Java] VectorSchemaRoot: + age-to-export + 10 + 20 + 30 + + [C++] RecordBatch: age-to-export: [ + 10, + 20, + 30 + ] + +.. 
_`JavaCPP`: https://github.com/bytedeco/javacpp
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 00000000..166a3bc4
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = 'arrow-java'
+copyright = '2025, Apache Arrow Developers'
+author = 'Apache Arrow Developers'
+release = '18.1.0'
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = ["sphinx.ext.intersphinx"]
+
+templates_path = ['_templates']
+exclude_patterns = []
+
+# -- Intersphinx --------------------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html
+
+intersphinx_mapping = {
+    'arrow': ('https://arrow.apache.org/docs/', None),
+    'cookbook': ('https://arrow.apache.org/cookbook/java/', None),
+}
+
+# -- Options for HTML output --------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'furo'
+html_static_path = ['_static']
diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst
new file mode 100644
index 00000000..deaa0095
--- /dev/null
+++ b/docs/source/dataset.rst
@@ -0,0 +1,309 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=======
+Dataset
+=======
+
+.. warning::
+
+   Experimental: The Java module ``dataset`` is currently under early
+   development. The API may change in each release of Apache Arrow until it
+   matures.
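+
+The module is published on Maven Central as ``org.apache.arrow:arrow-dataset``.
+A minimal dependency declaration looks like the following sketch; match the
+version to the Arrow release you are using:
+
+.. code-block:: xml
+
+   <dependency>
+     <groupId>org.apache.arrow</groupId>
+     <artifactId>arrow-dataset</artifactId>
+     <version>18.1.0</version>
+   </dependency>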
+
+Dataset is a universal layer in Apache Arrow for querying data in different
+formats or in different partitioning strategies. Usually the data to be
+queried lives in a traditional file system, but Arrow Dataset is not designed
+only for querying files: it can be extended to serve all possible data
+sources, such as inter-process communication or other network locations.
+
+Getting Started
+===============
+
+Currently supported file formats are:
+
+- Apache Arrow (``.arrow``)
+- Apache ORC (``.orc``)
+- Apache Parquet (``.parquet``)
+- Comma-Separated Values (``.csv``)
+- Line-delimited JSON Values (``.json``)
+
+Below is a minimal example of using Dataset to query a Parquet file in Java:
+
+.. code-block:: Java
+
+   // read data from file /opt/example.parquet
+   String uri = "file:/opt/example.parquet";
+   ScanOptions options = new ScanOptions(/*batchSize*/ 32768);
+   try (
+       BufferAllocator allocator = new RootAllocator();
+       DatasetFactory datasetFactory = new FileSystemDatasetFactory(
+           allocator, NativeMemoryPool.getDefault(),
+           FileFormat.PARQUET, uri);
+       Dataset dataset = datasetFactory.finish();
+       Scanner scanner = dataset.newScan(options);
+       ArrowReader reader = scanner.scanBatches()
+   ) {
+     List<ArrowRecordBatch> batches = new ArrayList<>();
+     while (reader.loadNextBatch()) {
+       try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) {
+         final VectorUnloader unloader = new VectorUnloader(root);
+         batches.add(unloader.getRecordBatch());
+       }
+     }
+
+     // do something with the read record batches, for example:
+     analyzeArrowData(batches);
+
+     // once the analysis of the data is finished, close all resources:
+     AutoCloseables.close(batches);
+   } catch (Exception e) {
+     e.printStackTrace();
+   }
+
+.. note::
+   ``ArrowRecordBatch`` is a low-level composite Arrow data exchange format
+   that doesn't provide an API for reading typed data from it directly.
+   It's recommended to use the utility ``VectorLoader`` to load it into a
+   schema-aware container, ``VectorSchemaRoot``, through which the user can
+   conveniently access the decoded data in Java.
+
+   The ``ScanOptions`` ``batchSize`` argument takes effect only if it is set
+   to a value smaller than the number of rows in the record batch.
+
+.. seealso::
+   Load record batches with :doc:`VectorSchemaRoot <vector_schema_root>`.
+
+Schema
+======
+
+The schema of the data to be queried can be inspected via the method
+``DatasetFactory#inspect()`` before actually reading it. For example:
+
+.. code-block:: Java
+
+   // read data from local file /opt/example.parquet
+   String uri = "file:/opt/example.parquet";
+   BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+   DatasetFactory factory = new FileSystemDatasetFactory(allocator,
+       NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
+
+   // inspect schema
+   Schema schema = factory.inspect();
+
+For data formats that are compatible with a user-defined schema, the dataset
+can be created with the method ``DatasetFactory#finish(Schema schema)``:
+
+.. code-block:: Java
+
+   Schema schema = createUserSchema();
+   Dataset dataset = factory.finish(schema);
+
+Otherwise, when the no-argument method ``DatasetFactory#finish()`` is called,
+the schema is inferred automatically from the data source and is the same as
+the one returned by ``DatasetFactory#inspect()``.
+
+Also, if a projector is specified during scanning (see the next section,
+:ref:`java-dataset-projection`), the actual schema of the output data can be
+obtained via the method ``Scanner::schema()``:
+
+..
code-block:: Java + + Scanner scanner = dataset.newScan( + new ScanOptions(32768, Optional.of(new String[] {"id", "name"}))); + Schema projectedSchema = scanner.schema(); + +.. _java-dataset-projection: + +Projection (Subset of Columns) +============================== + +User can specify projections in ScanOptions. For example: + +.. code-block:: Java + + String[] projection = new String[] {"id", "name"}; + ScanOptions options = new ScanOptions(32768, Optional.of(projection)); + +If no projection is needed, leave the optional projection argument absent in +ScanOptions: + +.. code-block:: Java + + ScanOptions options = new ScanOptions(32768, Optional.empty()); + +Or use shortcut constructor: + +.. code-block:: Java + + ScanOptions options = new ScanOptions(32768); + +Then all columns will be emitted during scanning. + +Projection (Produce New Columns) and Filters +============================================ + +User can specify projections (new columns) or filters in ScanOptions using Substrait. For example: + +.. code-block:: Java + + ByteBuffer substraitExpressionFilter = getSubstraitExpressionFilter(); + ByteBuffer substraitExpressionProject = getSubstraitExpressionProjection(); + // Use Substrait APIs to create an Expression and serialize to a ByteBuffer + ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) + .columns(Optional.empty()) + .substraitExpressionFilter(substraitExpressionFilter) + .substraitExpressionProjection(getSubstraitExpressionProjection()) + .build(); + +.. seealso:: + + :doc:`Executing Projections and Filters Using Extended Expressions ` + Projections and Filters using Substrait. + +Read Data from HDFS +=================== + +``FileSystemDataset`` supports reading data from non-local file systems. HDFS +support is included in the official Apache Arrow Java package releases and +can be used directly without re-building the source code. + +To access HDFS data using Dataset API, pass a general HDFS URI to +``FilesSystemDatasetFactory``: + +.. code-block:: Java + + String uri = "hdfs://{hdfs_host}:{port}/data/example.parquet"; + BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); + DatasetFactory factory = new FileSystemDatasetFactory(allocator, + NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); + +Native Memory Management +======================== + +To gain better performance and reduce code complexity, Java +``FileSystemDataset`` internally relies on C++ +``arrow::dataset::FileSystemDataset`` via JNI. +As a result, all Arrow data read from ``FileSystemDataset`` is supposed to be +allocated off the JVM heap. To manage this part of memory, an utility class +``NativeMemoryPool`` is provided to users. + +As a basic example, by using a listenable ``NativeMemoryPool``, user can pass +a listener hooking on C++ buffer allocation/deallocation: + +.. code-block:: Java + + AtomicLong reserved = new AtomicLong(0L); + ReservationListener listener = new ReservationListener() { + @Override + public void reserve(long size) { + reserved.getAndAdd(size); + } + + @Override + public void unreserve(long size) { + reserved.getAndAdd(-size); + } + }; + NativeMemoryPool pool = NativeMemoryPool.createListenable(listener); + FileSystemDatasetFactory factory = new FileSystemDatasetFactory(allocator, + pool, FileFormat.PARQUET, uri); + + +Also, it's a very common case to reserve the same amount of JVM direct memory +for the data read from datasets. For this use a built-in utility +class ``DirectReservationListener`` is provided: + +.. 
code-block:: Java + + NativeMemoryPool pool = NativeMemoryPool.createListenable( + DirectReservationListener.instance()); + +This way, once the allocated byte count of Arrow buffers reaches the limit of +JVM direct memory, ``OutOfMemoryError: Direct buffer memory`` will +be thrown during scanning. + +.. note:: + The default instance ``NativeMemoryPool.getDefaultMemoryPool()`` does + nothing on buffer allocation/deallocation. It's OK to use it in + the case of POC or testing, but for production use in complex environment, + it's recommended to manage memory by using a listenable memory pool. + +.. note:: + The ``BufferAllocator`` instance passed to ``FileSystemDatasetFactory``'s + constructor is also aware of the overall memory usage of the produced + dataset instances. Once the Java buffers are created the passed allocator + will become their parent allocator. + +Usage Notes +=========== + +Native Object Resource Management +--------------------------------- + +As another result of relying on JNI, all components related to +``FileSystemDataset`` should be closed manually or use try-with-resources to +release the corresponding native objects after using. For example: + +.. code-block:: Java + + String uri = "file:/opt/example.parquet"; + ScanOptions options = new ScanOptions(/*batchSize*/ 32768); + try ( + BufferAllocator allocator = new RootAllocator(); + DatasetFactory factory = new FileSystemDatasetFactory( + allocator, NativeMemoryPool.getDefault(), + FileFormat.PARQUET, uri); + Dataset dataset = factory.finish(); + Scanner scanner = dataset.newScan(options) + ) { + + // do something + + } catch (Exception e) { + e.printStackTrace(); + } + +If user forgets to close them then native object leakage might be caused. + +BatchSize +--------- + +The ``batchSize`` argument of ``ScanOptions`` is a limit on the size of an individual batch. + +For example, let's try to read a Parquet file with gzip compression and 3 row groups: + +.. code-block:: + + # Let configure ScanOptions as: + ScanOptions options = new ScanOptions(/*batchSize*/ 32768); + + $ parquet-tools meta data4_3rg_gzip.parquet + file schema: schema + age: OPTIONAL INT64 R:0 D:1 + name: OPTIONAL BINARY L:STRING R:0 D:1 + row group 1: RC:4 TS:182 OFFSET:4 + row group 2: RC:4 TS:190 OFFSET:420 + row group 3: RC:3 TS:179 OFFSET:838 + +Here, we set the batchSize in ScanOptions to 32768. Because that's greater +than the number of rows in the next batch, which is 4 rows because the first +row group has only 4 rows, then the program gets only 4 rows. The scanner +will not combine smaller batches to reach the limit, but it will split +large batches to stay under the limit. So in the case the row group had more +than 32768 rows, it would get split into blocks of 32768 rows or less. diff --git a/docs/source/developers/building.rst b/docs/source/developers/building.rst new file mode 100644 index 00000000..f9ef7dae --- /dev/null +++ b/docs/source/developers/building.rst @@ -0,0 +1,624 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. highlight:: console + +.. _building-arrow-java: + +=================== +Building Arrow Java +=================== + +.. contents:: + +System Setup +============ + +Arrow Java uses the `Maven `_ build system. + +Building requires: + +* JDK 11+ +* Maven 3+ + +.. note:: + CI will test all supported JDK LTS versions, plus the latest non-LTS version. + +Building +======== + +All the instructions below assume that you have cloned the Arrow git +repository: + +.. code-block:: + + $ git clone https://github.com/apache/arrow.git + $ cd arrow + $ git submodule update --init --recursive + +These are the options available to compile Arrow Java modules with: + +* Maven build tool. +* Docker Compose. +* Archery. + +Building Java Modules +--------------------- + +To build the default modules, go to the project root and execute: + +Maven +~~~~~ + +.. code-block:: + + $ cd arrow/java + $ export JAVA_HOME= + $ java --version + $ mvn clean install + +Docker compose +~~~~~~~~~~~~~~ + +.. code-block:: + + $ cd arrow/java + $ export JAVA_HOME= + $ java --version + $ docker compose run java + +Archery +~~~~~~~ + +.. code-block:: + + $ cd arrow/java + $ export JAVA_HOME= + $ java --version + $ archery docker run java + +Building JNI Libraries (\*.dylib / \*.so / \*.dll) +-------------------------------------------------- + +First, we need to build the `C++ shared libraries`_ that the JNI bindings will use. +We can build these manually or we can use `Archery`_ to build them using a Docker container +(This will require installing Docker, Docker Compose, and Archery). + +.. note:: + If you are building on Apple Silicon, be sure to use a JDK version that was compiled + for that architecture. See, for example, the `Azul JDK `_. + + If you are building on Windows OS, see :ref:`Developing on Windows `. + +Maven +~~~~~ + +- To build only the JNI C Data Interface library (macOS / Linux): + + .. code-block:: text + + $ cd arrow/java + $ export JAVA_HOME= + $ java --version + $ mvn generate-resources -Pgenerate-libs-cdata-all-os -N + $ ls -latr ../java-dist/lib + |__ arrow_cdata_jni/ + +- To build only the JNI C Data Interface library (Windows): + + .. code-block:: + + $ cd arrow/java + $ mvn generate-resources -Pgenerate-libs-cdata-all-os -N + $ dir "../java-dist/bin" + |__ arrow_cdata_jni/ + +- To build all JNI libraries (macOS / Linux) except the JNI C Data Interface library: + + .. code-block:: text + + $ cd arrow/java + $ export JAVA_HOME= + $ java --version + $ mvn generate-resources -Pgenerate-libs-jni-macos-linux -N + $ ls -latr java-dist/lib + |__ arrow_dataset_jni/ + |__ arrow_orc_jni/ + |__ gandiva_jni/ + +- To build all JNI libraries (Windows) except the JNI C Data Interface library: + + .. code-block:: + + $ cd arrow/java + $ mvn generate-resources -Pgenerate-libs-jni-windows -N + $ dir "../java-dist/bin" + |__ arrow_dataset_jni/ + +CMake +~~~~~ + +- To build only the JNI C Data Interface library (macOS / Linux): + + .. 
code-block:: text + + $ cd arrow + $ mkdir -p java-dist java-cdata + $ cmake \ + -S java \ + -B java-cdata \ + -DARROW_JAVA_JNI_ENABLE_C=ON \ + -DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF \ + -DBUILD_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=java-dist + $ cmake --build java-cdata --target install --config Release + $ ls -latr java-dist/lib + |__ arrow_cdata_jni/ + +- To build only the JNI C Data Interface library (Windows): + + .. code-block:: text + + $ cd arrow + $ mkdir java-dist, java-cdata + $ cmake ^ + -S java ^ + -B java-cdata ^ + -DARROW_JAVA_JNI_ENABLE_C=ON ^ + -DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF ^ + -DBUILD_TESTING=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX=java-dist + $ cmake --build java-cdata --target install --config Release + $ dir "java-dist/bin" + |__ arrow_cdata_jni/ + +- To build all JNI libraries (macOS / Linux) except the JNI C Data Interface library: + + .. code-block:: text + + $ cd arrow + $ brew bundle --file=cpp/Brewfile + # Homebrew Bundle complete! 25 Brewfile dependencies now installed. + $ brew uninstall aws-sdk-cpp + # (We can't use aws-sdk-cpp installed by Homebrew because it has + # an issue: https://github.com/aws/aws-sdk-cpp/issues/1809 ) + $ export JAVA_HOME= + $ mkdir -p java-dist cpp-jni + $ cmake \ + -S cpp \ + -B cpp-jni \ + -DARROW_BUILD_SHARED=OFF \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + -DARROW_DEPENDENCY_USE_SHARED=OFF \ + -DARROW_FILESYSTEM=ON \ + -DARROW_GANDIVA=ON \ + -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ + -DARROW_JSON=ON \ + -DARROW_ORC=ON \ + -DARROW_PARQUET=ON \ + -DARROW_S3=ON \ + -DARROW_SUBSTRAIT=ON \ + -DARROW_USE_CCACHE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=java-dist \ + -DCMAKE_UNITY_BUILD=ON + $ cmake --build cpp-jni --target install --config Release + $ cmake \ + -S java \ + -B java-jni \ + -DARROW_JAVA_JNI_ENABLE_C=OFF \ + -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON \ + -DBUILD_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=java-dist \ + -DCMAKE_PREFIX_PATH=$PWD/java-dist \ + -DProtobuf_ROOT=$PWD/../cpp-jni/protobuf_ep-install \ + -DProtobuf_USE_STATIC_LIBS=ON + $ cmake --build java-jni --target install --config Release + $ ls -latr java-dist/lib/ + |__ arrow_dataset_jni/ + |__ arrow_orc_jni/ + |__ gandiva_jni/ + +- To build all JNI libraries (Windows) except the JNI C Data Interface library: + + .. 
code-block:: + + $ cd arrow + $ mkdir java-dist, cpp-jni + $ cmake ^ + -S cpp ^ + -B cpp-jni ^ + -DARROW_BUILD_SHARED=OFF ^ + -DARROW_CSV=ON ^ + -DARROW_DATASET=ON ^ + -DARROW_DEPENDENCY_USE_SHARED=OFF ^ + -DARROW_FILESYSTEM=ON ^ + -DARROW_GANDIVA=OFF ^ + -DARROW_JSON=ON ^ + -DARROW_ORC=ON ^ + -DARROW_PARQUET=ON ^ + -DARROW_S3=ON ^ + -DARROW_SUBSTRAIT=ON ^ + -DARROW_USE_CCACHE=ON ^ + -DARROW_WITH_BROTLI=ON ^ + -DARROW_WITH_LZ4=ON ^ + -DARROW_WITH_SNAPPY=ON ^ + -DARROW_WITH_ZLIB=ON ^ + -DARROW_WITH_ZSTD=ON ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX=java-dist ^ + -DCMAKE_UNITY_BUILD=ON ^ + -GNinja + $ cd cpp-jni + $ ninja install + $ cd ../ + $ cmake ^ + -S java ^ + -B java-jni ^ + -DARROW_JAVA_JNI_ENABLE_C=OFF ^ + -DARROW_JAVA_JNI_ENABLE_DATASET=ON ^ + -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON ^ + -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF ^ + -DARROW_JAVA_JNI_ENABLE_ORC=ON ^ + -DBUILD_TESTING=OFF ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DCMAKE_INSTALL_PREFIX=java-dist ^ + -DCMAKE_PREFIX_PATH=$PWD/java-dist + $ cmake --build java-jni --target install --config Release + $ dir "java-dist/bin" + |__ arrow_orc_jni/ + |__ arrow_dataset_jni/ + +Archery +~~~~~~~ + +.. code-block:: text + + $ cd arrow + $ archery docker run java-jni-manylinux-2014 + $ ls -latr java-dist + |__ arrow_cdata_jni/ + |__ arrow_dataset_jni/ + |__ arrow_orc_jni/ + |__ gandiva_jni/ + +Building Java JNI Modules +------------------------- + +- To compile the JNI bindings, use the ``arrow-c-data`` Maven profile: + + .. code-block:: + + $ cd arrow/java + $ mvn -Darrow.c.jni.dist.dir=/java-dist/lib -Parrow-c-data clean install + +- To compile the JNI bindings for ORC / Gandiva / Dataset, use the ``arrow-jni`` Maven profile: + + .. code-block:: + + $ cd arrow/java + $ mvn \ + -Darrow.cpp.build.dir=/java-dist/lib/ \ + -Darrow.c.jni.dist.dir=/java-dist/lib/ \ + -Parrow-jni clean install + +Testing +======= + +By default, Maven uses the same Java version to both build the code and run the tests. + +It is also possible to use a different JDK version for the tests. This requires Maven +toolchains to be configured beforehand, and then a specific test property needs to be set. + +Configuring Maven toolchains +---------------------------- + +To be able to use a JDK version for testing, it needs to be registered first in Maven ``toolchains.xml`` +configuration file usually located under ``${HOME}/.m2`` with the following snippet added to it: + + .. code-block:: + + + + + [...] + + + jdk + + 21 + temurin + + + path/to/jdk/home + + + + [...] + + + +Testing with a specific JDK +--------------------------- + +To run Arrow tests with a specific JDK version, use the ``arrow.test.jdk-version`` property. + +For example, to run Arrow tests with JDK 17, use the following snippet: + + .. code-block:: + + $ cd arrow/java + $ mvn -Darrow.test.jdk-version=17 clean verify + +IDE Configuration +================= + +IntelliJ +-------- + +To start working on Arrow in IntelliJ: build the project once from the command +line using ``mvn clean install``. Then open the ``java/`` subdirectory of the +Arrow repository, and update the following settings: + +* In the Files tool window, find the path ``vector/target/generated-sources``, + right click the directory, and select Mark Directory as > Generated Sources + Root. There is no need to mark other generated sources directories, as only + the ``vector`` module generates sources. 
+* For JDK 11, due to an `IntelliJ bug + `__, you must go into + Settings > Build, Execution, Deployment > Compiler > Java Compiler and disable + "Use '--release' option for cross-compilation (Java 9 and later)". Otherwise + you will get an error like "package sun.misc does not exist". +* You may want to disable error-prone entirely if it gives spurious + warnings (disable both error-prone profiles in the Maven tool window + and "Reload All Maven Projects"). +* If using IntelliJ's Maven integration to build, you may need to change + ```` to ``false`` in the pom.xml files due to an `IntelliJ bug + `__. +* To enable debugging JNI-based modules like ``dataset``, + activate specific profiles in the Maven tab under "Profiles". + Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + IDE can build them and enable debugging. + +You may not need to update all of these settings if you build/test with the +IntelliJ Maven integration instead of with IntelliJ directly. + +Common Errors +============= + +* When working with the JNI code: if the C++ build cannot find dependencies, with errors like these: + + .. code-block:: + + Could NOT find Boost (missing: Boost_INCLUDE_DIR system filesystem) + Could NOT find Lz4 (missing: LZ4_LIB) + Could NOT find zstd (missing: ZSTD_LIB) + + Specify that the dependencies should be downloaded at build time (more details at `Dependency Resolution`_): + + .. code-block:: + + -Dre2_SOURCE=BUNDLED \ + -DBoost_SOURCE=BUNDLED \ + -Dutf8proc_SOURCE=BUNDLED \ + -DSnappy_SOURCE=BUNDLED \ + -DORC_SOURCE=BUNDLED \ + -DZLIB_SOURCE=BUNDLED + +.. _Archery: https://github.com/apache/arrow/blob/main/dev/archery/README.md +.. _Dependency Resolution: https://arrow.apache.org/docs/developers/cpp/building.html#individual-dependency-resolution +.. _C++ shared libraries: https://arrow.apache.org/docs/cpp/build_system.html + + +Installing Nightly Packages +=========================== + +.. warning:: + These packages are not official releases. Use them at your own risk. + +Arrow nightly builds are posted on the mailing list at `builds@arrow.apache.org`_. +The artifacts are uploaded to GitHub. For example, for 2022/07/30, they can be found at `GitHub Nightly`_. + + +Installing from Apache Nightlies +-------------------------------- +1. Look up the nightly version number for the Arrow libraries used. + + For example, for ``arrow-memory``, visit https://nightlies.apache.org/arrow/java/org/apache/arrow/arrow-memory/ and see what versions are available (e.g. 9.0.0.dev501). +2. Add Apache Nightlies Repository to the Maven/Gradle project. + + .. code-block:: xml + + + 9.0.0.dev501 + + ... + + + arrow-apache-nightlies + https://nightlies.apache.org/arrow/java + + + ... + + + org.apache.arrow + arrow-vector + ${arrow.version} + + + ... + +Installing Manually +------------------- + +1. Decide nightly packages repository to use, for example: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars +2. Add packages to your pom.xml, for example: flight-core (it depends on: arrow-format, arrow-vector, arrow-memory-core and arrow-memory-netty). + + .. code-block:: xml + + + 8 + 8 + 9.0.0.dev501 + + + + + org.apache.arrow + flight-core + ${arrow.version} + + + +3. Download the necessary pom and jar files to a temporary directory: + + .. 
code-block:: shell + + $ mkdir nightly-packaging-2022-07-30-0-github-java-jars + $ cd nightly-packaging-2022-07-30-0-github-java-jars + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-java-root-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-format-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-format-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-vector-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-vector-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-core-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-netty-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-core-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-memory-netty-9.0.0.dev501.jar + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/arrow-flight-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/flight-core-9.0.0.dev501.pom + $ wget https://github.com/ursacomputing/crossbow/releases/download/nightly-packaging-2022-07-30-0-github-java-jars/flight-core-9.0.0.dev501.jar + $ tree + . + ├── arrow-flight-9.0.0.dev501.pom + ├── arrow-format-9.0.0.dev501.jar + ├── arrow-format-9.0.0.dev501.pom + ├── arrow-java-root-9.0.0.dev501.pom + ├── arrow-memory-9.0.0.dev501.pom + ├── arrow-memory-core-9.0.0.dev501.jar + ├── arrow-memory-core-9.0.0.dev501.pom + ├── arrow-memory-netty-9.0.0.dev501.jar + ├── arrow-memory-netty-9.0.0.dev501.pom + ├── arrow-vector-9.0.0.dev501.jar + ├── arrow-vector-9.0.0.dev501.pom + ├── flight-core-9.0.0.dev501.jar + └── flight-core-9.0.0.dev501.pom + +4. Install the artifacts to the local Maven repository with ``mvn install:install-file``: + + .. 
code-block:: shell + + $ mvn install:install-file -Dfile="$(pwd)/arrow-java-root-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-java-root -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-format-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-format -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-format-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-format -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-vector-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-vector -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-vector-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-vector -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-core-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-core -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-netty-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-netty -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-core-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-core -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-memory-netty-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=arrow-memory-netty -Dversion=9.0.0.dev501 -Dpackaging=jar + $ mvn install:install-file -Dfile="$(pwd)/arrow-flight-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=arrow-flight -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/flight-core-9.0.0.dev501.pom" -DgroupId=org.apache.arrow -DartifactId=flight-core -Dversion=9.0.0.dev501 -Dpackaging=pom + $ mvn install:install-file -Dfile="$(pwd)/flight-core-9.0.0.dev501.jar" -DgroupId=org.apache.arrow -DartifactId=flight-core -Dversion=9.0.0.dev501 -Dpackaging=jar + +5. Validate that the packages were installed: + + .. code-block:: shell + + $ tree ~/.m2/repository/org/apache/arrow + . + ├── arrow-flight + │   ├── 9.0.0.dev501 + │   │   └── arrow-flight-9.0.0.dev501.pom + ├── arrow-format + │   ├── 9.0.0.dev501 + │   │   ├── arrow-format-9.0.0.dev501.jar + │   │   └── arrow-format-9.0.0.dev501.pom + ├── arrow-java-root + │   ├── 9.0.0.dev501 + │   │   └── arrow-java-root-9.0.0.dev501.pom + ├── arrow-memory + │   ├── 9.0.0.dev501 + │   │   └── arrow-memory-9.0.0.dev501.pom + ├── arrow-memory-core + │   ├── 9.0.0.dev501 + │   │   ├── arrow-memory-core-9.0.0.dev501.jar + │   │   └── arrow-memory-core-9.0.0.dev501.pom + ├── arrow-memory-netty + │   ├── 9.0.0.dev501 + │   │   ├── arrow-memory-netty-9.0.0.dev501.jar + │   │   └── arrow-memory-netty-9.0.0.dev501.pom + ├── arrow-vector + │   ├── 9.0.0.dev501 + │   │   ├── _remote.repositories + │   │   ├── arrow-vector-9.0.0.dev501.jar + │   │   └── arrow-vector-9.0.0.dev501.pom + └── flight-core + ├── 9.0.0.dev501 + │   ├── flight-core-9.0.0.dev501.jar + │   └── flight-core-9.0.0.dev501.pom + +6. Compile your project like usual with ``mvn clean install``. + +.. _builds@arrow.apache.org: https://lists.apache.org/list.html?builds@arrow.apache.org +.. 
_GitHub Nightly: https://github.com/ursacomputing/crossbow/releases/tag/nightly-packaging-2022-07-30-0-github-java-jars
+
+Installing Staging Packages
+===========================
+
+.. warning::
+    These packages are not official releases. Use them at your own risk.
+
+Arrow staging builds are created when a Release Candidate (RC) is being
+prepared. This allows users to test the RC in their applications before
+voting on the release.
+
+
+Installing from Apache Staging
+--------------------------------
+1. Look up the next version number for the Arrow libraries used.
+
+2. Add the Apache Staging Repository to the Maven/Gradle project.
+
+   .. code-block:: xml
+
+      <properties>
+        <arrow.version>9.0.0</arrow.version>
+      </properties>
+      ...
+      <repositories>
+        <repository>
+          <id>arrow-apache-staging</id>
+          <url>https://repository.apache.org/content/repositories/staging</url>
+        </repository>
+      </repositories>
+      ...
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.arrow</groupId>
+          <artifactId>arrow-vector</artifactId>
+          <version>${arrow.version}</version>
+        </dependency>
+      </dependencies>
+      ...
diff --git a/docs/source/developers/development.rst b/docs/source/developers/development.rst
new file mode 100644
index 00000000..dd183925
--- /dev/null
+++ b/docs/source/developers/development.rst
@@ -0,0 +1,197 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. highlight:: console
+
+======================
+Development Guidelines
+======================
+
+.. contents::
+
+Logger Abstraction
+==================
+
+Apache Arrow Java uses the SLF4J API, so please configure SLF4J to see logs
+(e.g. via Logback/Apache Log4j):
+
+1. If no jar dependencies for Logback or Apache Log4j are added by the user, then SLF4J will default
+   to no-operation (NOP) logging.
+
+2. If a user adds any dependencies for Logback or Apache Log4j but does not configure/add/define
+   logback.xml/log4j2.xml, then logs will default to DEBUG mode.
+
+3. To disable debug logs, the user must define their own rules within their logback.xml/log4j2.xml
+   and define their own loggers.
+
+Unit Testing
+============
+Unit tests are run by Maven during the build.
+
+To speed up the build, you can skip them by passing ``-DskipTests``.
+
+.. code-block::
+
+   $ cd arrow/java
+   $ mvn \
+       -Darrow.cpp.build.dir=../java-dist/lib -Parrow-jni \
+       -Darrow.c.jni.dist.dir=../java-dist/lib -Parrow-c-data \
+       clean install
+
+Performance Testing
+===================
+
+The ``arrow-performance`` module contains benchmarks.
+
+Let's configure our environment to run the performance tests:
+
+- Install `benchmark`_
+- Install `archery`_
+
+If you want to see your performance test results in a UI, also configure (optional):
+
+- Install `conbench`_
+
+Let's execute the benchmark tests:
+
+..
+.. code-block::
+
+   $ cd benchmarks
+   $ conbench java-micro --help
+   $ conbench java-micro
+       --iterations=1
+       --commit=e90472e35b40f58b17d408438bb8de1641bfe6ef
+       --java-home=<absolute path to your Java home>
+       --src=<absolute path to your Arrow checkout>
+       --benchmark-filter=org.apache.arrow.adapter.AvroAdapterBenchmarks.testAvroToArrow
+   Benchmark                              Mode  Cnt       Score   Error  Units
+   AvroAdapterBenchmarks.testAvroToArrow  avgt       725545.783          ns/op
+   Time to POST http://localhost:5000/api/login/ 0.14911699295043945
+   Time to POST http://localhost:5000/api/benchmarks/ 0.06116318702697754
+
+Then go to http://127.0.0.1:5000/ to see the reports:
+
+UI Home:
+
+.. image:: img/conbench_ui.png
+
+UI Runs:
+
+.. image:: img/conbench_runs.png
+
+UI Benchmark:
+
+.. image:: img/conbench_benchmark.png
+
+Integration Testing
+===================
+
+Integration tests can be run :ref:`via Archery <integration>`.
+For example, assuming you only built Arrow Java and want to run the IPC
+integration tests, you would do:
+
+.. code-block:: console
+
+   $ archery integration --run-ipc --with-java 1
+
+Code Style
+==========
+
+The current Java code follows the `Google Java Style`_ with Apache license headers.
+
+Java code style is checked by `Spotless`_ during the build, and the continuous integration build will verify
+that changes adhere to the style guide.
+
+Automatically fixing code style issues
+--------------------------------------
+
+- You can check the style without building the project with ``mvn spotless:check``.
+- You can autoformat the source with ``mvn spotless:apply``.
+
+Example:
+
+.. code-block:: text
+
+   The following files had format violations:
+       src/main/java/org/apache/arrow/algorithm/rank/VectorRank.java
+           @@ -15,7 +15,6 @@
+            ·*·limitations·under·the·License.
+            ·*/
+
+           -
+            package·org.apache.arrow.algorithm.rank;
+
+            import·java.util.stream.IntStream;
+   Run 'mvn spotless:apply' to fix these violations.
+
+Code Formatter for IntelliJ IDEA and Eclipse
+--------------------------------------------
+
+Follow the instructions to set up google-java-format for:
+
+- `Eclipse`_
+- `IntelliJ`_
+
+
+Checkstyle
+----------
+
+Checkstyle is also used for general linting; the configuration is located at `checkstyle`_.
+You can check the style without building the project.
+This checks the code style of all source code under the current directory or from within an individual module.
+
+.. code-block::
+
+   $ mvn checkstyle:check
+
+Maven ``pom.xml`` style is enforced with Spotless using the `Apache Maven pom.xml guidelines`_.
+You can check the style without building the project.
+This checks the style of all ``pom.xml`` files under the current directory or from within an individual module.
+
+.. code-block::
+
+   $ mvn spotless:check
+
+This applies the style to all ``pom.xml`` files under the current directory or from within an individual module.
+
+.. code-block::
+
+   $ mvn spotless:apply
+
+.. _benchmark: https://github.com/ursacomputing/benchmarks
+.. _archery: https://github.com/apache/arrow/blob/main/dev/conbench_envs/README.md#L188
+.. _conbench: https://github.com/conbench/conbench
+.. _checkstyle: https://github.com/apache/arrow/blob/main/java/dev/checkstyle/checkstyle.xml
+.. _Apache Maven pom.xml guidelines: https://maven.apache.org/developers/conventions/code.html#pom-code-convention
+.. _Spotless: https://github.com/diffplug/spotless
+.. _Google Java Style: https://google.github.io/styleguide/javaguide.html
+.. _Eclipse: https://github.com/google/google-java-format?tab=readme-ov-file#eclipse
+.. _IntelliJ: https://github.com/google/google-java-format?tab=readme-ov-file#intellij-android-studio-and-other-jetbrains-ides
+
+Build Caching
+=============
+
+Build caching is done through Develocity (formerly Gradle Enterprise). To force
+a build without the cache, run::
+
+   mvn clean install -Ddevelocity.cache.local.enabled=false -Ddevelocity.cache.remote.enabled=false
+
+This can be useful to make sure you see all warnings from ErrorProne, for example.
+
+ErrorProne
+==========
+
+ErrorProne should be disabled for generated code.
diff --git a/docs/source/developers/img/conbench_benchmark.png b/docs/source/developers/img/conbench_benchmark.png
new file mode 100644
index 00000000..3adf3e8c
Binary files /dev/null and b/docs/source/developers/img/conbench_benchmark.png differ
diff --git a/docs/source/developers/img/conbench_runs.png b/docs/source/developers/img/conbench_runs.png
new file mode 100644
index 00000000..3a9c0507
Binary files /dev/null and b/docs/source/developers/img/conbench_runs.png differ
diff --git a/docs/source/developers/img/conbench_ui.png b/docs/source/developers/img/conbench_ui.png
new file mode 100644
index 00000000..2f72df02
Binary files /dev/null and b/docs/source/developers/img/conbench_ui.png differ
diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst
new file mode 100644
index 00000000..976d1825
--- /dev/null
+++ b/docs/source/developers/index.rst
@@ -0,0 +1,28 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _java-development:
+
+****************
+Java Development
+****************
+
+.. toctree::
+   :maxdepth: 2
+
+   building
+   development
diff --git a/docs/source/flight.rst b/docs/source/flight.rst
new file mode 100644
index 00000000..fabced80
--- /dev/null
+++ b/docs/source/flight.rst
@@ -0,0 +1,239 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+================
+Arrow Flight RPC
+================
+
+Arrow Flight is an RPC framework for efficient transfer of Arrow data
+over the network.
+
+.. seealso::
+
+   :external+arrow:doc:`Flight protocol documentation <format/Flight>`
+        Documentation of the Flight protocol, including how to use
+        Flight conceptually.
+
+   :external+cookbook:doc:`Java Cookbook <flight>`
+        Recipes for using Arrow Flight in Java.
+
+Writing a Flight Service
+========================
+
+Flight servers implement the `FlightProducer`_ interface. For convenience,
+they can subclass `NoOpFlightProducer`_ instead, which offers default
+implementations of all the RPC methods.
+
+.. code-block:: Java
+
+    public class TutorialFlightProducer implements FlightProducer {
+        // Override only the methods you need, or subclass
+        // NoOpFlightProducer to inherit default implementations.
+    }
+
+Each RPC method always takes a ``CallContext`` for common parameters. To indicate
+failure, pass an exception to the "listener" if present, or else raise an
+exception.
+
+.. code-block:: Java
+
+    // Server
+    @Override
+    public void listFlights(CallContext context, Criteria criteria, StreamListener<FlightInfo> listener) {
+        // ...
+        listener.onError(
+            CallStatus.UNAUTHENTICATED.withDescription(
+                "Custom UNAUTHENTICATED description message.").toRuntimeException());
+        // ...
+    }
+
+    // Client
+    try {
+        Iterable<FlightInfo> flightInfosBefore = flightClient.listFlights(Criteria.ALL);
+        // ...
+    } catch (FlightRuntimeException e) {
+        // Catch UNAUTHENTICATED exception
+    }
+
+To start a server, create a `Location`_ to specify where to listen, and then create
+a `FlightServer`_ with an instance of a producer. This will start the server, but
+won't block the rest of the program. Call ``FlightServer.awaitTermination``
+to block until the server stops.
+
+.. code-block:: Java
+
+    class TutorialFlightProducer implements FlightProducer {
+        // Override only the methods you need, or subclass
+        // NoOpFlightProducer to inherit default implementations.
+    }
+
+    Location location = Location.forGrpcInsecure("0.0.0.0", 0);
+    try (
+        BufferAllocator allocator = new RootAllocator();
+        FlightServer server = FlightServer.builder(
+            allocator,
+            location,
+            new TutorialFlightProducer()
+        ).build();
+    ) {
+        server.start();
+        System.out.println("Server listening on port " + server.getPort());
+        server.awaitTermination();
+    } catch (Exception e) {
+        e.printStackTrace();
+    }
+
+.. code-block:: shell
+
+    Server listening on port 58104
+
+Using the Flight Client
+=======================
+
+To connect to a Flight service, create a `FlightClient`_ with a location.
+
+.. code-block:: Java
+
+    Location location = Location.forGrpcInsecure("0.0.0.0", 58104);
+
+    try (BufferAllocator allocator = new RootAllocator();
+         FlightClient client = FlightClient.builder(allocator, location).build()) {
+        // ... Consume operations exposed by Flight server
+    } catch (Exception e) {
+        e.printStackTrace();
+    }
+
+Cancellation and Timeouts
+=========================
+
+When making a call, clients can optionally provide ``CallOptions``. This allows
+clients to set a timeout on calls. Also, some objects returned by client RPC calls
+expose a cancel method which allows terminating a call early.
+
+.. code-block:: Java
+
+    Location location = Location.forGrpcInsecure("0.0.0.0", 58609);
+
+    try (BufferAllocator allocator = new RootAllocator();
+         FlightClient tutorialFlightClient = FlightClient.builder(allocator, location).build()) {
+
+        Iterator<Result> resultIterator = tutorialFlightClient.doAction(
+            new Action("test-timeout"),
+            CallOptions.timeout(2, TimeUnit.SECONDS)
+        );
+    } catch (Exception e) {
+        e.printStackTrace();
+    }
+
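+A server-side handler for the hypothetical ``test-timeout`` action used above
+could be a minimal sketch like the following, where the artificial delay simply
+outlasts the client's two-second deadline:
+
+.. code-block:: Java
+
+    // Server: a deliberately slow action handler; the client-side timeout
+    // above causes the call to fail before this completes.
+    @Override
+    public void doAction(CallContext context, Action action, StreamListener<Result> listener) {
+        try {
+            Thread.sleep(5_000); // simulate slow work (illustrative)
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+        listener.onNext(new Result(new byte[]{}));
+        listener.onCompleted();
+    }
+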
+On the server side, timeouts are transparent. For cancellation, the server
+needs to manually poll ``isCancelled``, or register a callback via
+``setOnCancelHandler``, to check whether the client has cancelled the call,
+and if so, break out of any processing the server is currently doing.
+
+.. code-block:: Java
+
+    // Client
+    Location location = Location.forGrpcInsecure("0.0.0.0", 58609);
+    try (BufferAllocator allocator = new RootAllocator();
+         FlightClient tutorialFlightClient = FlightClient.builder(allocator, location).build()) {
+        try (FlightStream flightStream = tutorialFlightClient.getStream(new Ticket(new byte[]{}))) {
+            // ...
+            flightStream.cancel("tutorial-cancel", new Exception("Testing cancellation option!"));
+        }
+    } catch (Exception e) {
+        e.printStackTrace();
+    }
+
+    // Server
+    @Override
+    public void getStream(CallContext context, Ticket ticket, ServerStreamListener listener) {
+        // ...
+        listener.setOnCancelHandler(() -> {
+            // Implement logic to handle cancellation
+        });
+    }
+
+Enabling TLS
+============
+
+TLS can be enabled when setting up a server by providing a
+certificate and key pair to ``FlightServer.Builder.useTls``.
+
+On the client side, use ``Location.forGrpcTls`` to create the Location for the client.
+
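+As a minimal sketch (the port and the PEM certificate/key file paths are
+placeholders, and error handling is simplified):
+
+.. code-block:: Java
+
+    // Server: enable TLS with a certificate chain and private key
+    try (BufferAllocator allocator = new RootAllocator();
+         FlightServer server = FlightServer.builder(
+                 allocator,
+                 Location.forGrpcTls("0.0.0.0", 8815),
+                 new TutorialFlightProducer())
+             .useTls(new File("server.crt"), new File("server.key"))
+             .build()) {
+        server.start();
+        // Client: connect using a TLS Location
+        try (FlightClient client = FlightClient.builder(
+                allocator, Location.forGrpcTls("localhost", 8815)).build()) {
+            // ... issue calls over the encrypted connection
+        }
+    } catch (Exception e) {
+        e.printStackTrace();
+    }
+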
+Enabling Authentication
+=======================
+
+.. warning:: Authentication is insecure without enabling TLS.
+
+Handshake-based authentication can be enabled by implementing
+``ServerAuthHandler``. Authentication consists of two parts: on
+initial client connection, the server and client authentication
+implementations can perform any negotiation needed. The client authentication
+handler then provides a token that will be attached to future calls.
+
+The client sends the data to be validated through ``ClientAuthHandler.authenticate``,
+and the server validates the data it receives through ``ServerAuthHandler.authenticate``.
+
+Custom Middleware
+=================
+
+Servers and clients support custom middleware (or interceptors) that are called on every
+request and can modify the request in a limited fashion. Middleware can be defined by
+implementing the ``FlightServerMiddleware`` and ``FlightClientMiddleware`` interfaces.
+
+Middleware are fairly limited, but they can add headers to a
+request/response. On the server, they can inspect incoming headers and
+fail the request; hence, they can be used to implement custom
+authentication methods.
+
+Adding Services
+===============
+
+Servers can add other gRPC services. For example, to add the `Health Check service <https://github.com/grpc/grpc/blob/master/doc/health-checking.md>`_:
+
+.. code-block:: Java
+
+    final HealthStatusManager statusManager = new HealthStatusManager();
+    final Consumer<NettyServerBuilder> consumer = (builder) -> {
+        builder.addService(statusManager.getHealthService());
+    };
+    final Location location = forGrpcInsecure(LOCALHOST, 5555);
+    try (
+        BufferAllocator a = new RootAllocator(Long.MAX_VALUE);
+        Producer producer = new Producer(a);
+        FlightServer s = FlightServer.builder(a, location, producer)
+            .transportHint("grpc.builderConsumer", consumer).build().start();
+    ) {
+        Channel channel = NettyChannelBuilder.forAddress(location.toSocketAddress()).usePlaintext().build();
+        HealthCheckResponse response = HealthGrpc
+            .newBlockingStub(channel)
+            .check(HealthCheckRequest.getDefaultInstance());
+
+        System.out.println(response.getStatus());
+    }
+
+
+:external+arrow:ref:`Flight best practices <flight-best-practices>`
+====================================================================
+
+See the :external+arrow:ref:`best practices for C++ <flight-best-practices>`.
+
+
+.. _`FlightClient`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/FlightClient.html
+.. _`FlightProducer`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/FlightProducer.html
+.. _`FlightServer`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/FlightServer.html
+.. _`NoOpFlightProducer`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/NoOpFlightProducer.html
+.. _`Location`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/Location.html
diff --git a/docs/source/flight_sql.rst b/docs/source/flight_sql.rst
new file mode 100644
index 00000000..169a0e24
--- /dev/null
+++ b/docs/source/flight_sql.rst
@@ -0,0 +1,32 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+================
+Arrow Flight SQL
+================
+
+Arrow Flight SQL is a protocol for interacting with SQL databases using the
+Arrow in-memory format and the Flight RPC framework.
+
+.. seealso::
+
+   :external+arrow:doc:`Flight SQL protocol documentation <format/FlightSql>`
+      Documentation of the Flight SQL protocol.
+
+For usage information, see the `API documentation`_.
+
+.. _API documentation: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/sql/package-summary.html
diff --git a/docs/source/flight_sql_jdbc_driver.rst b/docs/source/flight_sql_jdbc_driver.rst
new file mode 100644
index 00000000..18069309
--- /dev/null
+++ b/docs/source/flight_sql_jdbc_driver.rst
@@ -0,0 +1,175 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+============================
+Arrow Flight SQL JDBC Driver
+============================
+
+The Flight SQL JDBC driver is a JDBC driver implementation that uses
+the :external+arrow:doc:`Flight SQL protocol <format/FlightSql>` under
+the hood. This driver can be used with any database that implements
+Flight SQL.
+
+Installation and Requirements
+=============================
+
+The driver is compatible with JDK 11+. Note that the following JVM
+parameter is required:
+
+.. code-block:: shell
+
+   java --add-opens=java.base/java.nio=ALL-UNNAMED ...
+
+To add a dependency via Maven, use a ``pom.xml`` like the following:
+
+.. code-block:: xml
+
+   <project>
+     <modelVersion>4.0.0</modelVersion>
+     <groupId>org.example</groupId>
+     <artifactId>demo</artifactId>
+     <version>1.0-SNAPSHOT</version>
+     <properties>
+       <arrow.version>18.1.0</arrow.version>
+     </properties>
+     <dependencies>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>flight-sql-jdbc-driver</artifactId>
+         <version>${arrow.version}</version>
+       </dependency>
+     </dependencies>
+   </project>
+
+Connecting to a Database
+========================
+
+The URI format is as follows::
+
+    jdbc:arrow-flight-sql://HOSTNAME:PORT[/?param1=val1&param2=val2&...]
+
+For example, take this URI::
+
+    jdbc:arrow-flight-sql://localhost:12345/?username=admin&password=pass&useEncryption=1
+
+This will connect to a Flight SQL service running on ``localhost`` on
+port 12345. It will create a secure, encrypted connection, and
+authenticate using the username ``admin`` and the password ``pass``.
+
+The components of the URI are as follows.
+
+* The URI scheme must be ``jdbc:arrow-flight-sql://``.
+* **HOSTNAME** is the hostname of the Flight SQL service.
+* **PORT** is the port of the Flight SQL service.
+
+Additional options can be passed as query parameters. Parameter names are
+case-sensitive. The supported parameters are:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Parameter
+     - Default
+     - Description
+
+   * - disableCertificateVerification
+     - false
+     - When TLS is enabled, whether to verify the server certificate
+
+   * - password
+     - null
+     - The password for user/password authentication
+
+   * - threadPoolSize
+     - 1
+     - The size of an internal thread pool
+
+   * - token
+     - null
+     - The token used for token authentication
+
+   * - trustStore
+     - null
+     - When TLS is enabled, the path to the certificate store
+
+   * - trustStorePassword
+     - null
+     - When TLS is enabled, the password for the certificate store
+
+   * - tlsRootCerts
+     - null
+     - Path to PEM-encoded root certificates for TLS - use this as
+       an alternative to ``trustStore``
+
+   * - clientCertificate
+     - null
+     - Path to PEM-encoded client mTLS certificate when the Flight
+       SQL server requires client verification.
+
+   * - clientKey
+     - null
+     - Path to PEM-encoded client mTLS key when the Flight
+       SQL server requires client verification.
+
+   * - useEncryption
+     - true
+     - Whether to use TLS (the default is an encrypted connection)
+
+   * - user
+     - null
+     - The username for user/password authentication
+
+   * - useSystemTrustStore
+     - true
+     - When TLS is enabled, whether to use the system certificate store
+
+   * - retainCookies
+     - true
+     - Whether to use cookies from the initial connection in subsequent
+       internal connections when retrieving streams from separate endpoints.
+
+   * - retainAuth
+     - true
+     - Whether to use bearer tokens obtained from the initial connection
+       in subsequent internal connections used for retrieving streams
+       from separate endpoints.
+
+Note that URI values must be URI-encoded if they contain characters such
+as !, @, $, etc.
+
+Any URI parameters that are not handled by the driver are passed to
+the Flight SQL service as gRPC headers. For example, the following URI::
+
+    jdbc:arrow-flight-sql://localhost:12345/?useEncryption=0&database=mydb
+
+will connect without authentication or encryption to a Flight
+SQL service running on ``localhost`` on port 12345. Each request will
+also include a ``database=mydb`` gRPC header.
+
+Connection parameters may also be supplied using the Properties object
+when using the JDBC Driver Manager to connect. When supplied via
+the Properties object, values should *not* be URI-encoded.
+
+Parameters specified by the URI supersede parameters supplied by the
+Properties object. When calling the `user/password overload of
+DriverManager#getConnection()
+<https://docs.oracle.com/javase/8/docs/api/java/sql/DriverManager.html#getConnection-java.lang.String-java.lang.String-java.lang.String->`_,
+the username and password supplied on the URI supersede the username and
+password arguments to the function call.
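+
+As a minimal sketch of using the driver through the standard JDBC API (the
+endpoint, credentials, and query below are placeholders for illustration):
+
+.. code-block:: java
+
+   import java.sql.Connection;
+   import java.sql.DriverManager;
+   import java.sql.ResultSet;
+   import java.sql.Statement;
+   import java.util.Properties;
+
+   Properties properties = new Properties();
+   properties.put("user", "admin");      // hypothetical credentials
+   properties.put("password", "pass");
+   properties.put("useEncryption", "0"); // plaintext; for local testing only
+
+   try (Connection connection = DriverManager.getConnection(
+            "jdbc:arrow-flight-sql://localhost:12345/", properties);
+        Statement statement = connection.createStatement();
+        ResultSet rs = statement.executeQuery("SELECT 1")) {
+       while (rs.next()) {
+           System.out.println(rs.getInt(1));
+       }
+   }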
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..5cdf41e1
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,48 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _java:
+
+Java Implementation
+===================
+
+This is the documentation of the Java API of Apache Arrow. For more details
+on the Arrow format and other language bindings see the :doc:`parent documentation <../index>`.
+
+.. toctree::
+   :maxdepth: 2
+
+   quickstartguide
+   overview
+   install
+   developers/index
+
+   memory
+   vector
+   vector_schema_root
+   table
+   ipc
+   algorithm
+   flight
+   flight_sql
+   flight_sql_jdbc_driver
+   dataset
+   substrait
+   cdata
+   jdbc
+   Reference (javadoc) <reference/index.html#://>
+   Cookbook <https://arrow.apache.org/cookbook/java/>
diff --git a/docs/source/install.rst b/docs/source/install.rst
new file mode 100644
index 00000000..b2b1c716
--- /dev/null
+++ b/docs/source/install.rst
@@ -0,0 +1,230 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=======================
+Installing Java Modules
+=======================
+
+System Compatibility
+====================
+
+Java modules are regularly built and tested on macOS and Linux distributions.
+
+Java Compatibility
+==================
+
+Java modules are compatible with JDK 11 and above. Currently, JDK versions
+11, 17, 21, and latest are tested in CI.
+
+Note that some JDK internals must be exposed by
+adding ``--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED`` to the ``java`` command:
+
+.. code-block:: shell
+
+   # Directly on the command line
+   $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ...
+   # Indirectly via environment variables
+   $ env JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ...
+
+Otherwise, you may see errors like ``module java.base does not "opens
+java.nio" to unnamed module`` or ``module java.base does not "opens
+java.nio" to org.apache.arrow.memory.core``.
+
+Note that the flags have changed from Arrow 15 and earlier. If you are still using the flag from that version
+(``--add-opens=java.base/java.nio=ALL-UNNAMED``) you will see the
+``module java.base does not "opens java.nio" to org.apache.arrow.memory.core`` error.
+
+If you are using flight-core or dependent modules, you will need to mark that flight-core can read unnamed modules.
+Modifying the command above for Flight:
+
+.. code-block:: shell
+
+   # Directly on the command line
+   $ java --add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ...
+   # Indirectly via environment variables
+   $ env JDK_JAVA_OPTIONS="--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ...
+
+Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class
+org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core)
+cannot access class io.netty.buffer.CompositeByteBuf (in unnamed module ...) because module
+org.apache.arrow.flight.core does not read unnamed module ...``
+
+Finally, if you are using arrow-dataset, you'll also need to open JDK internals to that module.
+Modifying the command above for arrow-dataset:
+
+.. code-block:: shell
+
+   # Directly on the command line
+   $ java --add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED -jar ...
+   # Indirectly via environment variables
+   $ env JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ...
+
+Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException:
+Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module
+java.base does not "opens java.nio" to module org.apache.arrow.dataset``
+
+If using Maven and Surefire for unit testing, :ref:`this argument must
+be added to Surefire as well <java-install-maven-testing>`.
+
+Installing from Maven
+=====================
+
+By default, Maven will download from the central repository: https://repo.maven.apache.org/maven2/org/apache/arrow/
+
+Configure your ``pom.xml`` with the Java modules needed, for example
+arrow-vector and arrow-memory-netty:
+
+.. code-block:: xml
+
+   <project>
+     <modelVersion>4.0.0</modelVersion>
+     <groupId>org.example</groupId>
+     <artifactId>demo</artifactId>
+     <version>1.0-SNAPSHOT</version>
+     <properties>
+       <arrow.version>9.0.0</arrow.version>
+     </properties>
+     <dependencies>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>arrow-vector</artifactId>
+         <version>${arrow.version}</version>
+       </dependency>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>arrow-memory-netty</artifactId>
+         <version>${arrow.version}</version>
+       </dependency>
+     </dependencies>
+   </project>
+
+A bill of materials (BOM) module has been provided to simplify adding
+Arrow modules. This eliminates the need to specify the version for
+every module. An alternative to the above would be:
+
+.. code-block:: xml
+
+   <project>
+     <modelVersion>4.0.0</modelVersion>
+     <groupId>org.example</groupId>
+     <artifactId>demo</artifactId>
+     <version>1.0-SNAPSHOT</version>
+     <properties>
+       <arrow.version>15.0.0</arrow.version>
+     </properties>
+     <dependencies>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>arrow-vector</artifactId>
+       </dependency>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>arrow-memory-netty</artifactId>
+       </dependency>
+     </dependencies>
+     <dependencyManagement>
+       <dependencies>
+         <dependency>
+           <groupId>org.apache.arrow</groupId>
+           <artifactId>arrow-bom</artifactId>
+           <version>${arrow.version}</version>
+           <type>pom</type>
+           <scope>import</scope>
+         </dependency>
+       </dependencies>
+     </dependencyManagement>
+   </project>
+
+To use the Arrow Flight dependencies, also add the ``os-maven-plugin``
+plugin. This plugin generates useful platform-dependent properties
+such as ``os.detected.name`` and ``os.detected.arch`` needed to resolve
+transitive dependencies of Flight.
+
+.. code-block:: xml
+
+   <project>
+     <modelVersion>4.0.0</modelVersion>
+     <groupId>org.example</groupId>
+     <artifactId>demo</artifactId>
+     <version>1.0-SNAPSHOT</version>
+     <properties>
+       <arrow.version>9.0.0</arrow.version>
+     </properties>
+     <dependencies>
+       <dependency>
+         <groupId>org.apache.arrow</groupId>
+         <artifactId>flight-core</artifactId>
+         <version>${arrow.version}</version>
+       </dependency>
+     </dependencies>
+     <build>
+       <extensions>
+         <extension>
+           <groupId>kr.motd.maven</groupId>
+           <artifactId>os-maven-plugin</artifactId>
+           <version>1.7.0</version>
+         </extension>
+       </extensions>
+     </build>
+   </project>
+
+.. _java-install-maven-testing:
+
+The ``--add-opens`` flag must be added when running unit tests through Maven:
+
+.. code-block:: xml
+
+   <plugins>
+     <plugin>
+       <groupId>org.apache.maven.plugins</groupId>
+       <artifactId>maven-surefire-plugin</artifactId>
+       <version>3.0.0-M6</version>
+       <configuration>
+         <argLine>--add-opens=java.base/java.nio=ALL-UNNAMED</argLine>
+       </configuration>
+     </plugin>
+   </plugins>
+
+Or the flags can be added via an environment variable, for example when executing your code:
+
+.. code-block::
+
+   JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED" mvn exec:java -Dexec.mainClass="YourMainCode"
+
+Installing from Source
+======================
+
+See :ref:`java-development`.
+
+IDE Configuration
+=================
+
+Generally, no additional configuration should be needed. However,
+ensure your Maven or other build configuration has the ``--add-opens``
+flag as described above, so that the IDE picks it up and runs tests
+with that flag as well.
diff --git a/docs/source/ipc.rst b/docs/source/ipc.rst
new file mode 100644
index 00000000..f5939179
--- /dev/null
+++ b/docs/source/ipc.rst
@@ -0,0 +1,202 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+===========================
+Reading/Writing IPC formats
+===========================
+Arrow defines two types of binary formats for serializing record batches:
+
+* **Streaming format**: for sending an arbitrary number of record
+  batches. The format must be processed from start to end, and does not support
+  random access.
+
+* **File or Random Access format**: for serializing a fixed number of record
+  batches. It supports random access, and thus is very useful when used with
+  memory maps.
+
+Writing and Reading Streaming Format
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+First, let's populate a :class:`VectorSchemaRoot` with a small batch of records:
+
+.. code-block:: Java
+
+    BitVector bitVector = new BitVector("boolean", allocator);
+    VarCharVector varCharVector = new VarCharVector("varchar", allocator);
+    for (int i = 0; i < 10; i++) {
+      bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
+      varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
+    }
+    bitVector.setValueCount(10);
+    varCharVector.setValueCount(10);
+
+    List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField());
+    List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
+    VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
+
+Now, we can begin writing a stream containing some number of these batches. For this we use :class:`ArrowStreamWriter`
+(the ``DictionaryProvider``, used for any vectors that are dictionary encoded, is optional and can be null):
+
+.. code-block:: Java
+
+    try (
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        ArrowStreamWriter writer = new ArrowStreamWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out));
+    ) {
+        // ... do write into the ArrowStreamWriter
+    }
+
+Here we used an in-memory stream, but this could have been a socket or some other IO stream. Then we can do:
+
+.. code-block:: Java
+
+    writer.start();
+    // write the first batch
+    writer.writeBatch();
+
+    // write another four batches.
+    for (int i = 0; i < 4; i++) {
+        // populate the VectorSchemaRoot with data for this batch
+        BitVector childVector1 = (BitVector) root.getVector(0);
+        VarCharVector childVector2 = (VarCharVector) root.getVector(1);
+        childVector1.reset();
+        childVector2.reset();
+        // ... do some populate work here, could be different for each batch
+        writer.writeBatch();
+    }
+
+    writer.end();
+
+Note that the :class:`VectorSchemaRoot` in the writer is a container through which successive batches flow
+as part of a pipeline. Data must therefore be populated before each call to ``writeBatch``, and each new
+batch overwrites the previous one.
+
+Now the :class:`ByteArrayOutputStream` contains the complete stream, which holds 5 record batches.
+We can read such a stream with :class:`ArrowStreamReader`. Note that the :class:`VectorSchemaRoot` within the reader
+will be loaded with new values on every call to ``loadNextBatch()``:
+
+.. code-block:: Java
+
+    try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
+        // This will be loaded with new values on every call to loadNextBatch
+        VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
+        Schema schema = readRoot.getSchema();
+        for (int i = 0; i < 5; i++) {
+            reader.loadNextBatch();
+            // ... do something with readRoot
+        }
+    }
+
+Here we also give a simple example with dictionary encoded vectors:
+
+.. code-block:: Java
+
+    // create provider
+    DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider();
+
+    try (
+        final VarCharVector dictVector = new VarCharVector("dict", allocator);
+        final VarCharVector vector = new VarCharVector("vector", allocator);
+    ) {
+        // create dictionary vector
+        dictVector.allocateNewSafe();
+        dictVector.setSafe(0, "aa".getBytes());
+        dictVector.setSafe(1, "bb".getBytes());
+        dictVector.setSafe(2, "cc".getBytes());
+        dictVector.setValueCount(3);
+
+        // create dictionary
+        Dictionary dictionary =
+            new Dictionary(dictVector, new DictionaryEncoding(1L, false, /*indexType=*/null));
+        provider.put(dictionary);
+
+        // create original data vector
+        vector.allocateNewSafe();
+        vector.setSafe(0, "bb".getBytes());
+        vector.setSafe(1, "bb".getBytes());
+        vector.setSafe(2, "cc".getBytes());
+        vector.setSafe(3, "aa".getBytes());
+        vector.setValueCount(4);
+
+        // get the encoded vector
+        IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary);
+
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+
+        // create VectorSchemaRoot
+        List<Field> fields = Arrays.asList(encodedVector.getField());
+        List<FieldVector> vectors = Arrays.asList(encodedVector);
+        try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors)) {
+
+            // write data
+            ArrowStreamWriter writer = new ArrowStreamWriter(root, provider, Channels.newChannel(out));
+            writer.start();
+            writer.writeBatch();
+            writer.end();
+        }
+
+        // read data
+        try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) {
+            reader.loadNextBatch();
+            VectorSchemaRoot readRoot = reader.getVectorSchemaRoot();
+            // get the encoded vector
+            IntVector intVector = (IntVector) readRoot.getVector(0);
+
+            // get dictionaries and decode the vector
+            Map<Long, Dictionary> dictionaryMap = reader.getDictionaryVectors();
+            long dictionaryId = intVector.getField().getDictionary().getId();
+            try (VarCharVector varCharVector =
+                     (VarCharVector) DictionaryEncoder.decode(intVector, dictionaryMap.get(dictionaryId))) {
+                // ... use decoded vector
+            }
+        }
+    }
+
+Writing and Reading Random Access Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The :class:`ArrowFileWriter` has the same API as :class:`ArrowStreamWriter`:
+
+.. code-block:: Java
+
+    try (
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        ArrowFileWriter writer = new ArrowFileWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out));
+    ) {
+        writer.start();
+        // write the first batch
+        writer.writeBatch();
+        // write another four batches.
+        for (int i = 0; i < 4; i++) {
+            // ... do populate work
+            writer.writeBatch();
+        }
+        writer.end();
+    }
+
+The difference between :class:`ArrowFileReader` and :class:`ArrowStreamReader` is that the input source
+must have a ``seek`` method for random access. Because we have access to the entire payload, we know the
+number of record batches in the file, and can read any at random:
+
+.. code-block:: Java
+
+    try (ArrowFileReader reader = new ArrowFileReader(
+            new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) {
+
+        // read the fourth batch
+        ArrowBlock block = reader.getRecordBlocks().get(3);
+        reader.loadRecordBatch(block);
+        VectorSchemaRoot readBatch = reader.getVectorSchemaRoot();
+    }
diff --git a/docs/source/jdbc.rst b/docs/source/jdbc.rst
new file mode 100644
index 00000000..c0477cb0
--- /dev/null
+++ b/docs/source/jdbc.rst
@@ -0,0 +1,278 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+==================
+Arrow JDBC Adapter
+==================
+
+The Arrow JDBC Adapter assists with working with JDBC and Arrow
+data. Currently, it supports reading JDBC ResultSets into Arrow
+VectorSchemaRoots.
+
+ResultSet to VectorSchemaRoot Conversion
+========================================
+
+This can be accessed via the JdbcToArrow class. The resulting
+ArrowVectorIterator will convert a ResultSet to Arrow data in batches
+of rows.
+
+.. code-block:: java
+
+   try (ArrowVectorIterator it = JdbcToArrow.sqlToArrowVectorIterator(resultSet, allocator)) {
+     while (it.hasNext()) {
+       VectorSchemaRoot root = it.next();
+       // Consume the root…
+     }
+   }
+
+The batch size and type mapping can both be customized:
+
+.. code-block:: java
+
+   JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, /*calendar=*/null)
+       .setReuseVectorSchemaRoot(reuseVectorSchemaRoot)
+       .setJdbcToArrowTypeConverter((jdbcFieldInfo -> {
+         switch (jdbcFieldInfo.getJdbcType()) {
+           case Types.BIGINT:
+             // Assume actual value range is SMALLINT
+             return new ArrowType.Int(16, true);
+           default:
+             return null;
+         }
+       }))
+       .build();
+   try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
+     while (iter.hasNext()) {
+       VectorSchemaRoot root = iter.next();
+       // Consume the root…
+     }
+   }
+
+The JDBC type can be explicitly specified, which is useful since JDBC
+drivers can give spurious type information. For example, the Postgres
+driver has been observed to use Decimal types with scale and precision
+0; these cases can be handled by specifying the type explicitly before
+reading. Also, some JDBC drivers may return BigDecimal values with
+inconsistent scale. A RoundingMode can be set to handle these cases:
+
+.. code-block:: java
+
+   Map<Integer, JdbcFieldInfo> mapping = new HashMap<>();
+   mapping.put(1, new JdbcFieldInfo(Types.DECIMAL, 20, 7));
+   JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, /*calendar=*/null)
+       .setBigDecimalRoundingMode(RoundingMode.UNNECESSARY)
+       .setExplicitTypesByColumnIndex(mapping)
+       .build();
+   try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
+     while (iter.hasNext()) {
+       VectorSchemaRoot root = iter.next();
+       // Consume the root…
+     }
+   }
+
+The mapping from JDBC type to Arrow type can be overridden via the
+``JdbcToArrowConfig``, but it is not possible to customize the
+conversion from JDBC value to Arrow value itself, nor is it possible
+to define a conversion for an unsupported type.
+
+Type Mapping
+------------
+
+The JDBC to Arrow type mapping can be obtained at runtime from
+`JdbcToArrowUtils.getArrowTypeFromJdbcType`_.
+
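+As a minimal sketch of querying the default mapping (the ``Types.INTEGER``
+input and null calendar are illustrative):
+
+.. code-block:: java
+
+   import java.sql.Types;
+   import org.apache.arrow.adapter.jdbc.JdbcFieldInfo;
+   import org.apache.arrow.adapter.jdbc.JdbcToArrowUtils;
+   import org.apache.arrow.vector.types.pojo.ArrowType;
+
+   // Expected to yield a signed 32-bit Arrow integer type for a JDBC INTEGER column
+   ArrowType type = JdbcToArrowUtils.getArrowTypeFromJdbcType(
+       new JdbcFieldInfo(Types.INTEGER), /*calendar=*/null);
+   System.out.println(type);
+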
+.. _JdbcToArrowUtils.getArrowTypeFromJdbcType: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.html#getArrowTypeFromJdbcType-org.apache.arrow.adapter.jdbc.JdbcFieldInfo-java.util.Calendar-
+
++--------------------+--------------------+-------+
+| JDBC Type          | Arrow Type         | Notes |
++====================+====================+=======+
+| ARRAY              | List               | \(1)  |
++--------------------+--------------------+-------+
+| BIGINT             | Int64              |       |
++--------------------+--------------------+-------+
+| BINARY             | Binary             |       |
++--------------------+--------------------+-------+
+| BIT                | Bool               |       |
++--------------------+--------------------+-------+
+| BLOB               | Binary             |       |
++--------------------+--------------------+-------+
+| BOOLEAN            | Bool               |       |
++--------------------+--------------------+-------+
+| CHAR               | Utf8               |       |
++--------------------+--------------------+-------+
+| CLOB               | Utf8               |       |
++--------------------+--------------------+-------+
+| DATE               | Date32             |       |
++--------------------+--------------------+-------+
+| DECIMAL            | Decimal128         | \(2)  |
++--------------------+--------------------+-------+
+| DOUBLE             | Double             |       |
++--------------------+--------------------+-------+
+| FLOAT              | Float32            |       |
++--------------------+--------------------+-------+
+| INTEGER            | Int32              |       |
++--------------------+--------------------+-------+
+| LONGVARBINARY      | Binary             |       |
++--------------------+--------------------+-------+
+| LONGNVARCHAR       | Utf8               |       |
++--------------------+--------------------+-------+
+| LONGVARCHAR        | Utf8               |       |
++--------------------+--------------------+-------+
+| NCHAR              | Utf8               |       |
++--------------------+--------------------+-------+
+| NULL               | Null               |       |
++--------------------+--------------------+-------+
+| NUMERIC            | Decimal128         |       |
++--------------------+--------------------+-------+
+| NVARCHAR           | Utf8               |       |
++--------------------+--------------------+-------+
+| REAL               | Float32            |       |
++--------------------+--------------------+-------+
+| SMALLINT           | Int16              |       |
++--------------------+--------------------+-------+
+| STRUCT             | Struct             | \(3)  |
++--------------------+--------------------+-------+
+| TIME               | Time32[ms]         |       |
++--------------------+--------------------+-------+
+| TIMESTAMP          | Timestamp[ms]      | \(4)  |
++--------------------+--------------------+-------+
+| TINYINT            | Int8               |       |
++--------------------+--------------------+-------+
+| VARBINARY          | Binary             |       |
++--------------------+--------------------+-------+
+| VARCHAR            | Utf8               |       |
++--------------------+--------------------+-------+
+
+* \(1) The list value type must be explicitly configured and cannot be
+  inferred. Use `setArraySubTypeByColumnIndexMap`_ or
+  `setArraySubTypeByColumnNameMap`_.
+* \(2) By default, the scale of decimal values must match the scale in
+  the type exactly; precision is allowed to be any value greater than or
+  equal to the type precision. If there is a mismatch, by default, an
+  exception will be thrown. This can be configured by setting a
+  different RoundingMode with setBigDecimalRoundingMode.
+* \(3) Not fully supported: while the type conversion is defined, the
+  value conversion is not. See ARROW-17006_.
+* \(4) If a Calendar is provided, then the timestamp will have the
+  timezone of the calendar, else it will be a timestamp without
+  timezone.
+
+.. _setArraySubTypeByColumnIndexMap: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.html#setArraySubTypeByColumnIndexMap-java.util.Map-
+.. _setArraySubTypeByColumnNameMap: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.html#setArraySubTypeByColumnNameMap-java.util.Map-
+.. _ARROW-17006: https://issues.apache.org/jira/browse/ARROW-17006
+
+VectorSchemaRoot to PreparedStatement Parameter Conversion
+==========================================================
+
+The adapter can bind rows of Arrow data from a VectorSchemaRoot to
+parameters of a JDBC PreparedStatement. This can be accessed via the
+JdbcParameterBinder class. Each call to next() will bind parameters
+from the next row of data, and then the application can execute the
+statement, call addBatch(), etc. as desired. Null values will lead to
+a setNull call with an appropriate JDBC type code (listed below).
+
+.. code-block:: java
+
+   final JdbcParameterBinder binder =
+       JdbcParameterBinder.builder(statement, root).bindAll().build();
+   while (binder.next()) {
+     statement.executeUpdate();
+   }
+   // Use a VectorLoader to update the root
+   binder.reset();
+   while (binder.next()) {
+     statement.executeUpdate();
+   }
+
+The mapping of vectors to parameters, the JDBC type code used by the
+converters, and the type conversions themselves can all be customized:
+
+.. code-block:: java
+
+   final JdbcParameterBinder binder =
+       JdbcParameterBinder.builder(statement, root)
+           .bind(/*parameterIndex*/2, /*columnIndex*/0)
+           .bind(/*parameterIndex*/1, customColumnBinderInstance)
+           .build();
+
+Type Mapping
+------------
+
+The Arrow to JDBC type mapping can be obtained at runtime via
+a method on ColumnBinder.
+
++----------------------------+----------------------------+-------+
+| Arrow Type                 | JDBC Type                  | Notes |
++============================+============================+=======+
+| Binary                     | VARBINARY (setBytes)       |       |
++----------------------------+----------------------------+-------+
+| Bool                       | BOOLEAN (setBoolean)       |       |
++----------------------------+----------------------------+-------+
+| Date32                     | DATE (setDate)             |       |
++----------------------------+----------------------------+-------+
+| Date64                     | DATE (setDate)             |       |
++----------------------------+----------------------------+-------+
+| Decimal128                 | DECIMAL (setBigDecimal)    |       |
++----------------------------+----------------------------+-------+
+| Decimal256                 | DECIMAL (setBigDecimal)    |       |
++----------------------------+----------------------------+-------+
+| FixedSizeBinary            | BINARY (setBytes)          |       |
++----------------------------+----------------------------+-------+
+| Float32                    | REAL (setFloat)            |       |
++----------------------------+----------------------------+-------+
+| Int8                       | TINYINT (setByte)          |       |
++----------------------------+----------------------------+-------+
+| Int16                      | SMALLINT (setShort)        |       |
++----------------------------+----------------------------+-------+
+| Int32                      | INTEGER (setInt)           |       |
++----------------------------+----------------------------+-------+
+| Int64                      | BIGINT (setLong)           |       |
++----------------------------+----------------------------+-------+
+| LargeBinary                | LONGVARBINARY (setBytes)   |       |
++----------------------------+----------------------------+-------+
+| LargeUtf8                  | LONGVARCHAR (setString)    | \(1)  |
++----------------------------+----------------------------+-------+
+| Time[s]                    | TIME (setTime)             |       |
++----------------------------+----------------------------+-------+
+| Time[ms]                   | TIME (setTime)             |       |
++----------------------------+----------------------------+-------+
+| Time[us]                   | TIME (setTime)             |       |
++----------------------------+----------------------------+-------+
+| Time[ns]                   | TIME (setTime)             |       |
++----------------------------+----------------------------+-------+
+| Timestamp[s]               | TIMESTAMP (setTimestamp)   | \(2)  |
++----------------------------+----------------------------+-------+
+| Timestamp[ms]              | TIMESTAMP (setTimestamp)   | \(2)  |
++----------------------------+----------------------------+-------+
+| Timestamp[us]              | TIMESTAMP (setTimestamp)   | \(2)  |
++----------------------------+----------------------------+-------+
+| Timestamp[ns]              | TIMESTAMP (setTimestamp)   | \(2)  |
++----------------------------+----------------------------+-------+
+| Utf8                       | VARCHAR (setString)        |       |
++----------------------------+----------------------------+-------+
+
+* \(1) Strings longer than Integer.MAX_VALUE bytes (the maximum length
+  of a Java ``byte[]``) will cause a runtime exception.
+* \(2) If the timestamp has a timezone, the JDBC type defaults to
+  TIMESTAMP_WITH_TIMEZONE. If the timestamp has no timezone,
+  technically there is not a correct conversion from Arrow value to
+  JDBC value, because a JDBC Timestamp is in UTC, and we have no
+  timezone information. In this case, the default binder will call
+  `setTimestamp(int, Timestamp)
+  <https://docs.oracle.com/javase/8/docs/api/java/sql/PreparedStatement.html#setTimestamp-int-java.sql.Timestamp->`_,
+  which will lead to the driver using the "default timezone" (that of
+  the Java VM).
diff --git a/docs/source/memory.rst b/docs/source/memory.rst
new file mode 100644
index 00000000..58ef382d
--- /dev/null
+++ b/docs/source/memory.rst
@@ -0,0 +1,499 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+=================
+Memory Management
+=================
+
+The memory modules contain all the functionality that Arrow uses to allocate and deallocate memory. This document is divided into two parts:
+The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details.
+
+Memory Basics
+=============
+This section will introduce you to the major concepts in Arrow Java's memory management:
+
+* `ArrowBuf`_
+* `BufferAllocator`_
+* Reference counting
+
+It also provides some guidelines for working with memory in Arrow, and describes how to debug memory issues when they arise.
+
+Getting Started
+---------------
+
+Arrow's memory management is built around the needs of the columnar format and off-heap memory.
+Arrow Java has its own independent implementation. It does not wrap the C++ implementation, although the framework is flexible enough
+to be used with memory allocated in C++ that is used by Java code.
+
+Arrow provides multiple modules: the core interfaces, and implementations of the interfaces.
+Users need the core interfaces, and exactly one of the implementations.
+
+* ``memory-core``: Provides the interfaces used by the Arrow libraries and applications.
+* ``memory-netty``: An implementation of the memory interfaces based on the `Netty`_ library.
+* ``memory-unsafe``: An implementation of the memory interfaces based on the `sun.misc.Unsafe`_ library.
+
+
+ArrowBuf
+--------
+
+ArrowBuf represents a single, contiguous region of `direct memory`_. It consists of an address and a length,
+and provides low-level interfaces for working with the contents, similar to ByteBuffer.
+
+Unlike (Direct)ByteBuffer, it has reference counting built in, as discussed later.
+
+Why Arrow Uses Direct Memory
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* The JVM can optimize I/O operations when using direct memory/direct buffers; it will attempt to avoid copying buffer contents to/from an intermediate buffer. This can speed up IPC in Arrow.
+* Since Arrow always uses direct memory, JNI modules can directly wrap native memory addresses instead of copying data. We use this in modules like the C Data Interface.
+* Conversely, on the C++ side of the JNI boundary, we can directly access the memory in ArrowBuf without copying data.
+
+BufferAllocator
+---------------
+
+The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances).
+As the name suggests, it can allocate new buffers associated with itself, but it can also
+handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for
+memory allocated in C++ and shared with Java using the C Data Interface. The code below performs an allocation:
+
+.. code-block:: Java
+
+    import org.apache.arrow.memory.ArrowBuf;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+
+    try (BufferAllocator bufferAllocator = new RootAllocator(8 * 1024)) {
+      ArrowBuf arrowBuf = bufferAllocator.buffer(4 * 1024);
+      System.out.println(arrowBuf);
+      arrowBuf.close();
+    }
+
+.. code-block:: shell
+
+    ArrowBuf[2], address:140363641651200, length:4096
+
+The concrete implementation of the BufferAllocator interface is `RootAllocator`_. Applications should generally create
+one RootAllocator at the start of the program, and use it through the BufferAllocator interface. Allocators implement
+AutoCloseable and must be closed after the application is done with them; this will check that all outstanding memory
+has been freed (see the next section).
+
+Arrow provides a tree-based model for memory allocation. The RootAllocator is created first, then more allocators
+are created as children of an existing allocator via `newChildAllocator`_. When creating a RootAllocator or a child
+allocator, a memory limit is provided, and when allocating memory, the limit is checked. Furthermore, when allocating
+memory from a child allocator, those allocations are also reflected in all parent allocators. Hence, the RootAllocator
+effectively sets the program-wide memory limit, and serves as the master bookkeeper for all memory allocations.
+
+Child allocators are not strictly required, but can help better organize code. For instance, a lower memory limit can
+be set for a particular section of code. The child allocator can be closed when that section completes,
+at which point it checks that that section didn't leak any memory.
+Child allocators can also be named, which makes it easier to tell where an ArrowBuf came from during debugging.
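+
+As a minimal sketch of the tree-based model (the child allocator's name and the
+limits here are illustrative):
+
+.. code-block:: Java
+
+    import org.apache.arrow.memory.ArrowBuf;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+
+    try (BufferAllocator root = new RootAllocator(8 * 1024);
+         // child allocator with its own limit; its allocations are also
+         // reflected in the root's accounting
+         BufferAllocator child = root.newChildAllocator("tutorial-child", 0, 4 * 1024)) {
+      try (ArrowBuf buf = child.buffer(1024)) {
+        // ... use the buffer; closing it returns the memory to the allocator
+      }
+    } // closing the child verifies this section released everything it allocated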
+
+Reference counting
+------------------
+
+Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To manage shared buffers
+deterministically, we use manual reference counting instead of the garbage collector.
+This simply means that each buffer has a counter keeping track of the number of references to
+the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used.
+
+In Arrow, each ArrowBuf has an associated `ReferenceManager`_ that tracks the reference count. You can retrieve
+it with ArrowBuf.getReferenceManager(). The reference count is updated using `ReferenceManager.release`_ to decrement the count,
+and `ReferenceManager.retain`_ to increment it.
+
+Of course, this is tedious and error-prone, so instead of directly working with buffers, we typically use
+higher-level APIs like ValueVector. Such classes generally implement Closeable/AutoCloseable and will automatically
+decrement the reference count when closed.
+
+Allocators implement AutoCloseable as well. In this case, closing the allocator will check that all buffers
+obtained from the allocator are closed. If not, the ``close()`` method will raise an exception; this helps track
+memory leaks from unclosed buffers.
+
+Reference counting needs to be handled carefully. To ensure that an
+independent section of code has fully cleaned up all allocated buffers, use a new child allocator.
+
+Development Guidelines
+----------------------
+
+Applications should generally:
+
+* Use the BufferAllocator interface in APIs instead of RootAllocator.
+* Create one RootAllocator at the start of the program and explicitly pass it when needed.
+* ``close()`` allocators after use (whether they are child allocators or the RootAllocator), either manually or preferably via a try-with-resources statement.
+
+
+Debugging Memory Leaks/Allocation
+---------------------------------
+
+In ``DEBUG`` mode, the allocator and supporting classes will record additional
+debug tracking information to better track down memory leaks and issues. To
+enable DEBUG mode, pass the following system property to the VM when starting:
+``-Darrow.memory.debug.allocator=true``.
+
+When DEBUG is enabled, a log will be kept of allocations. Configure SLF4J to see these logs (e.g. via Logback/Apache Log4j).
+Consider the following example to see how it helps us with the tracking of allocations:
+
+.. code-block:: Java
+
+    import org.apache.arrow.memory.ArrowBuf;
+    import org.apache.arrow.memory.BufferAllocator;
+    import org.apache.arrow.memory.RootAllocator;
+
+    try (BufferAllocator bufferAllocator = new RootAllocator(8 * 1024)) {
+      ArrowBuf arrowBuf = bufferAllocator.buffer(4 * 1024);
+      System.out.println(arrowBuf);
+    }
+
+Without debug mode enabled, when we close the allocator, we get this:
+
+.. code-block:: shell
+
+    11:56:48.944 [main] INFO o.apache.arrow.memory.BaseAllocator - Debug mode disabled.
+    ArrowBuf[2], address:140508391276544, length:4096
+    16:28:08.847 [main] ERROR o.apache.arrow.memory.BaseAllocator - Memory was leaked by query. Memory leaked: (4096)
+    Allocator(ROOT) 0/4096/4096/8192 (res/actual/peak/limit)
+
+Enabling the debug mode, we get more details:
+
+.. code-block:: shell
+
+    11:56:48.944 [main] INFO o.apache.arrow.memory.BaseAllocator - Debug mode enabled.
+    ArrowBuf[2], address:140437894463488, length:4096
+    Exception in thread "main" java.lang.IllegalStateException: Allocator[ROOT] closed with outstanding buffers allocated (1).

Debugging Memory Leaks/Allocation
---------------------------------

In ``DEBUG`` mode, the allocator and supporting classes will record additional
debug tracking information to better track down memory leaks and issues. To
enable DEBUG mode, pass the following system property to the VM when starting:
``-Darrow.memory.debug.allocator=true``.

When DEBUG is enabled, a log will be kept of allocations. Configure SLF4J to see these logs (e.g. via Logback/Apache Log4j).
The following example shows how this helps track down allocators that weren't cleaned up:

.. code-block:: Java

    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;

    try (BufferAllocator bufferAllocator = new RootAllocator(8 * 1024)) {
      ArrowBuf arrowBuf = bufferAllocator.buffer(4 * 1024);
      System.out.println(arrowBuf);
    }

Without debug mode enabled, we get this when the allocator is closed:

.. code-block:: shell

    11:56:48.944 [main] INFO o.apache.arrow.memory.BaseAllocator - Debug mode disabled.
    ArrowBuf[2], address:140508391276544, length:4096
    16:28:08.847 [main] ERROR o.apache.arrow.memory.BaseAllocator - Memory was leaked by query. Memory leaked: (4096)
    Allocator(ROOT) 0/4096/4096/8192 (res/actual/peak/limit)

With debug mode enabled, we get more details:

.. code-block:: shell

    11:56:48.944 [main] INFO o.apache.arrow.memory.BaseAllocator - Debug mode enabled.
    ArrowBuf[2], address:140437894463488, length:4096
    Exception in thread "main" java.lang.IllegalStateException: Allocator[ROOT] closed with outstanding buffers allocated (1).
    Allocator(ROOT) 0/4096/4096/8192 (res/actual/peak/limit)
      child allocators: 0
      ledgers: 1
        ledger[1] allocator: ROOT), isOwning: , size: , references: 1, life: 261438177096661..0, allocatorManager: [, life: ] holds 1 buffers.
            ArrowBuf[2], address:140437894463488, length:4096
      reservations: 0

Additionally, in debug mode, `ArrowBuf.print()`_ can be used to obtain a debug string.
This includes information about allocation operations on the buffer, with stack traces showing when/where the buffer was allocated.

.. code-block:: java

    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;

    try (final BufferAllocator allocator = new RootAllocator()) {
      try (final ArrowBuf buf = allocator.buffer(1024)) {
        final StringBuilder sb = new StringBuilder();
        buf.print(sb, /*indent*/ 0);
        System.out.println(sb.toString());
      }
    }

.. code-block:: text

    ArrowBuf[2], address:140433199984656, length:1024
     event log for: ArrowBuf[2]
       675959093395667 create()
          at org.apache.arrow.memory.util.HistoricalLog$Event.<init>(HistoricalLog.java:175)
          at org.apache.arrow.memory.util.HistoricalLog.recordEvent(HistoricalLog.java:83)
          at org.apache.arrow.memory.ArrowBuf.<init>(ArrowBuf.java:96)
          at org.apache.arrow.memory.BufferLedger.newArrowBuf(BufferLedger.java:271)
          at org.apache.arrow.memory.BaseAllocator.bufferWithoutReservation(BaseAllocator.java:300)
          at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:276)
          at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29)
          at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:240)
          at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29)
          at REPL.$JShell$14.do_it$($JShell$14.java:10)
          at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(NativeMethodAccessorImpl.java:-2)
          at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
          at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
          at java.lang.reflect.Method.invoke(Method.java:566)
          at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:209)
          at jdk.jshell.execution.RemoteExecutionControl.invoke(RemoteExecutionControl.java:116)
          at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:119)
          at jdk.jshell.execution.ExecutionControlForwarder.processCommand(ExecutionControlForwarder.java:144)
          at jdk.jshell.execution.ExecutionControlForwarder.commandLoop(ExecutionControlForwarder.java:262)
          at jdk.jshell.execution.Util.forwardExecutionControl(Util.java:76)
          at jdk.jshell.execution.Util.forwardExecutionControlAndIO(Util.java:137)
          at jdk.jshell.execution.RemoteExecutionControl.main(RemoteExecutionControl.java:70)

The BufferAllocator also provides a ``BufferAllocator.toVerboseString()`` method which can be used in
``DEBUG`` mode to get extensive stack trace information and events associated with various allocator behaviors.

Finally, enabling the ``TRACE`` logging level will automatically provide this stack trace when the allocator is closed:

.. code-block:: java

    // Assumes use of Logback; adjust for Log4j, etc. as appropriate
    import ch.qos.logback.classic.Level;
    import ch.qos.logback.classic.Logger;
    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.slf4j.LoggerFactory;

    // Set log level to TRACE to get tracebacks
    ((Logger) LoggerFactory.getLogger("org.apache.arrow")).setLevel(Level.TRACE);
    try (final BufferAllocator allocator = new RootAllocator()) {
      // Leak buffer
      allocator.buffer(1024);
    }

.. code-block:: text

    |  Exception java.lang.IllegalStateException: Allocator[ROOT] closed with outstanding buffers allocated (1).
    Allocator(ROOT) 0/1024/1024/9223372036854775807 (res/actual/peak/limit)
      child allocators: 0
      ledgers: 1
        ledger[1] allocator: ROOT), isOwning: , size: , references: 1, life: 712040870231544..0, allocatorManager: [, life: ] holds 1 buffers.
            ArrowBuf[2], address:139926571810832, length:1024
         event log for: ArrowBuf[2]
           712040888650134 create()
              at org.apache.arrow.memory.util.StackTrace.<init>(StackTrace.java:34)
              at org.apache.arrow.memory.util.HistoricalLog$Event.<init>(HistoricalLog.java:175)
              at org.apache.arrow.memory.util.HistoricalLog.recordEvent(HistoricalLog.java:83)
              at org.apache.arrow.memory.ArrowBuf.<init>(ArrowBuf.java:96)
              at org.apache.arrow.memory.BufferLedger.newArrowBuf(BufferLedger.java:271)
              at org.apache.arrow.memory.BaseAllocator.bufferWithoutReservation(BaseAllocator.java:300)
              at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:276)
              at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29)
              at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:240)
              at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29)
              at REPL.$JShell$18.do_it$($JShell$18.java:13)
              at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(NativeMethodAccessorImpl.java:-2)
              at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
              at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
              at java.lang.reflect.Method.invoke(Method.java:566)
              at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:209)
              at jdk.jshell.execution.RemoteExecutionControl.invoke(RemoteExecutionControl.java:116)
              at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:119)
              at jdk.jshell.execution.ExecutionControlForwarder.processCommand(ExecutionControlForwarder.java:144)
              at jdk.jshell.execution.ExecutionControlForwarder.commandLoop(ExecutionControlForwarder.java:262)
              at jdk.jshell.execution.Util.forwardExecutionControl(Util.java:76)
              at jdk.jshell.execution.Util.forwardExecutionControlAndIO(Util.java:137)

      reservations: 0

    |  at BaseAllocator.close (BaseAllocator.java:405)
    |  at RootAllocator.close (RootAllocator.java:29)
    |  at (#8:1)

Sometimes, explicitly passing allocators around is difficult. For example, it
can be hard to pass around extra state, like an allocator, through layers of
existing application or framework code. A global or singleton allocator instance
can be useful here, though it should not be your first choice.

How this works:

1. Set up a global allocator in a singleton class.
2. Provide methods to create child allocators from the global allocator.
3. Give child allocators proper names to make it easier to figure out where
   allocations occurred in case of errors.
4. Ensure that resources are properly closed.
5. Check that the global allocator is empty at some suitable point, such as
   right before program shutdown.
6. If it is not empty, the remaining allocations are leaks and should be
   investigated with the debugging techniques described above.

.. code-block:: java

    //1
    private static final BufferAllocator allocator = new RootAllocator();
    private static final AtomicInteger childNumber = new AtomicInteger(0);
    ...
    //2
    public static BufferAllocator getChildAllocator() {
      return allocator.newChildAllocator(nextChildName(), 0, Long.MAX_VALUE);
    }
    ...
    //3
    private static String nextChildName() {
      return "Allocator-Child-" + childNumber.incrementAndGet();
    }
    ...
    //4: Business code
    try (BufferAllocator allocator = GlobalAllocator.getChildAllocator()) {
      ...
    }
    ...
    //5
    public static void checkGlobalCleanUpResources() {
      ...
      if (!allocator.getChildAllocators().isEmpty()) {
        throw new IllegalStateException(...);
      } else if (allocator.getAllocatedMemory() != 0) {
        throw new IllegalStateException(...);
      }
    }

.. _`ArrowBuf`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ArrowBuf.html
.. _`ArrowBuf.print()`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ArrowBuf.html#print-java.lang.StringBuilder-int-org.apache.arrow.memory.BaseAllocator.Verbosity-
.. _`BufferAllocator`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/BufferAllocator.html
.. _`BufferLedger`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/BufferLedger.html
.. _`RootAllocator`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/RootAllocator.html
.. _`newChildAllocator`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/RootAllocator.html#newChildAllocator-java.lang.String-org.apache.arrow.memory.AllocationListener-long-long-
.. _`Netty`: https://netty.io/wiki/
.. _`sun.misc.unsafe`: https://web.archive.org/web/20210929024401/http://www.docjar.com/html/api/sun/misc/Unsafe.java.html
.. _`Direct Memory`: https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/ByteBuffer.html
.. _`ReferenceManager`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ReferenceManager.html
.. _`ReferenceManager.release`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ReferenceManager.html#release--
.. _`ReferenceManager.retain`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ReferenceManager.html#retain--

Arrow Memory In-Depth
=====================

Design Principles
-----------------
Arrow’s memory model is based on the following basic concepts:

- Memory can be allocated up to some limit. That limit could be a real
  limit (OS/JVM) or a locally imposed limit.
- Allocation operates in two phases: accounting then actual allocation.
  Allocation could fail at either point.
- Allocation failure should be recoverable. In all cases, the Allocator
  infrastructure should expose memory allocation failures (OS or
  internal limit-based) as ``OutOfMemoryException``\ s; a sketch of
  handling these follows this list.
- Any allocator can reserve memory when created. This memory shall be
  held such that this allocator will always be able to allocate that
  amount of memory.
- Each application component should use a local allocator so that local
  memory usage can be understood and memory leaks can be debugged more
  easily.
- The same physical memory can be shared by multiple allocators, and the
  allocator must provide an accounting paradigm for this purpose.
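
As a minimal sketch of the recoverable-failure principle above (the limit and request sizes are arbitrary),
an allocation that would exceed an allocator's limit raises ``org.apache.arrow.memory.OutOfMemoryException``,
which the application can catch and react to:

.. code-block:: java

    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.OutOfMemoryException;
    import org.apache.arrow.memory.RootAllocator;

    try (BufferAllocator allocator = new RootAllocator(4 * 1024)) {
      try (ArrowBuf buf = allocator.buffer(8 * 1024)) { // exceeds the 4 KiB limit
        // unreachable
      } catch (OutOfMemoryException e) {
        // Recover: free other buffers, spill to disk, retry with a smaller size, etc.
        System.err.println("Allocation failed: " + e.getMessage());
      }
    }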

Reserving Memory
----------------

Arrow provides two different ways to reserve memory:

- BufferAllocator accounting reservations: When a new allocator (other
  than the ``RootAllocator``) is initialized, it can set aside memory
  that it will keep locally for its lifetime. This is memory that will
  never be released back to its parent allocator until the allocator is
  closed.
- ``AllocationReservation`` via BufferAllocator.newReservation():
  Allows a short-term preallocation strategy so that a particular
  subsystem can ensure future memory is available to support a
  particular request (see the sketch below).
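
The following is a minimal, hedged sketch of the second mechanism. It assumes the
``AllocationReservation`` interface's ``add(int)`` and ``allocateBuffer()`` methods behave as
described above; consult the javadoc for the authoritative contract:

.. code-block:: java

    import org.apache.arrow.memory.AllocationReservation;
    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;

    try (BufferAllocator allocator = new RootAllocator();
         AllocationReservation reservation = allocator.newReservation()) {
      // Phase 1: accounting only; no buffer exists yet.
      if (reservation.add(4 * 1024)) {
        // Phase 2: materialize a buffer backed by the reserved accounting.
        try (ArrowBuf buf = reservation.allocateBuffer()) {
          // The reservation guarantees this allocation cannot fail for lack of quota.
        }
      }
    }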
Reference Counting Details
--------------------------

Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_.
A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``,
a ``BufferAllocator`` and one or more individual ``ArrowBuf``\ s.

All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination
share the same reference count, and either all will be valid or all will be invalid.

Allocation Details
------------------

There are several Allocator types in Arrow Java:

- ``BufferAllocator`` - The public interface application users should be leveraging
- ``BaseAllocator`` - The base implementation of memory allocation, containing the meat of the Arrow allocator implementation
- ``RootAllocator`` - The root allocator. Typically only one is created per JVM. It serves as the parent/ancestor for child allocators
- ``ChildAllocator`` - A child allocator that derives from the root allocator

Many BufferAllocators can reference the same piece of physical memory at the same
time. It is the AllocationManager’s responsibility to ensure that in this situation,
all memory is accurately accounted for from the Root’s perspective
and also to ensure that the memory is correctly released once all
BufferAllocators have stopped using that memory.

For simplicity of accounting, we treat that memory as being used by one
of the BufferAllocators associated with the memory. When that allocator
releases its claim on that memory, the memory ownership is then moved to
another BufferLedger belonging to the same AllocationManager. Note that
because an ArrowBuf.release() is what actually causes memory ownership
transfer to occur, we always proceed with ownership transfer (even if
that violates an allocator limit). It is the responsibility of the
application owning a particular allocator to frequently confirm whether
the allocator is over its memory limit (BufferAllocator.isOverLimit())
and, if so, attempt to aggressively release memory to ameliorate the
situation.


Object Hierarchy
----------------

There are two main ways that someone can look at the object hierarchy
for Arrow’s memory management scheme. The first is a memory-based
perspective, as below:

Memory Perspective
~~~~~~~~~~~~~~~~~~

.. code-block:: none

    + AllocationManager
    |
    |-- UnsignedDirectLittleEndian (One per AllocationManager)
    |
    |-+ BufferLedger 1 ==> Allocator A (owning)
    |  ` - ArrowBuf 1
    |-+ BufferLedger 2 ==> Allocator B (non-owning)
    |  ` - ArrowBuf 2
    |-+ BufferLedger 3 ==> Allocator C (non-owning)
       |  - ArrowBuf 3
       |  - ArrowBuf 4
       ` - ArrowBuf 5

In this picture, a piece of memory is owned by an allocation manager. An
allocation manager is responsible for that piece of memory no matter
which allocator(s) it is working with. An allocation manager will have
relationships with a piece of raw memory (via its reference to
UnsignedDirectLittleEndian) as well as references to each
BufferAllocator it has a relationship to.

Allocator Perspective
~~~~~~~~~~~~~~~~~~~~~

.. code-block:: none

    + RootAllocator
    |-+ ChildAllocator 1
    | | - ChildAllocator 1.1
    | ` ...
    |
    |-+ ChildAllocator 2
    |-+ ChildAllocator 3
    | |
    | |-+ BufferLedger 1 ==> AllocationManager 1 (owning) ==> UDLE
    | | `- ArrowBuf 1
    | `-+ BufferLedger 2 ==> AllocationManager 2 (non-owning) ==> UDLE
    |   `- ArrowBuf 2
    |
    |-+ BufferLedger 3 ==> AllocationManager 1 (non-owning) ==> UDLE
    | ` - ArrowBuf 3
    |-+ BufferLedger 4 ==> AllocationManager 2 (owning) ==> UDLE
    |   - ArrowBuf 4
    |   - ArrowBuf 5
    ` - ArrowBuf 6

In this picture, a RootAllocator owns three ChildAllocators. The first
ChildAllocator (ChildAllocator 1) owns a subsequent ChildAllocator.
ChildAllocator 3 has two BufferLedger/AllocationManager references.
Coincidentally, each of these AllocationManagers is also associated
with the RootAllocator. In this case, one of these AllocationManagers
is owned by ChildAllocator 3 (AllocationManager 1), while the other
AllocationManager (AllocationManager 2) is owned/accounted for by the
RootAllocator. Note that in this scenario, ArrowBuf 1 shares the same
underlying memory as ArrowBuf 3, though the subsets of that memory they
expose (e.g. through slicing) might differ. Likewise, ArrowBuf 2 shares
the same underlying memory as ArrowBufs 4, 5, and 6, and ArrowBufs 4, 5,
and 6 all share the same reference count and fate.

diff --git a/docs/source/overview.rst b/docs/source/overview.rst
new file mode 100644
index 00000000..be579c14
--- /dev/null
+++ b/docs/source/overview.rst
@@ -0,0 +1,90 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

===================
High-Level Overview
===================

The Apache Arrow Java modules implement various specifications, including the
columnar format and IPC. Most modules are native Java implementations,
but some modules are JNI bindings to the C++ library.
.. list-table:: Arrow Java Modules
   :widths: 25 50 25
   :header-rows: 1

   * - Module
     - Description
     - Implementation
   * - arrow-format
     - Generated Java files from the IPC Flatbuffer definitions.
     - Native
   * - arrow-memory-core
     - Core off-heap memory management libraries for Arrow ValueVectors.
     - Native
   * - arrow-memory-unsafe
     - Memory management implementation based on sun.misc.Unsafe.
     - Native
   * - arrow-memory-netty
     - Memory management implementation based on Netty.
     - Native
   * - arrow-vector
     - An off-heap reference implementation for the Arrow columnar data format.
     - Native
   * - arrow-tools
     - Java applications for working with Arrow ValueVectors.
     - Native
   * - arrow-jdbc
     - (Experimental) A library for converting JDBC data to Arrow data.
     - Native
   * - flight-core
     - An RPC mechanism for transferring ValueVectors.
     - Native
   * - flight-sql
     - Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight.
     - Native
   * - flight-integration-tests
     - Integration tests for Flight RPC.
     - Native
   * - arrow-performance
     - JMH benchmarks for the Arrow libraries.
     - Native
   * - arrow-algorithm
     - (Experimental) A collection of algorithms for working with ValueVectors.
     - Native
   * - arrow-avro
     - (Experimental) A library for converting Avro data to Arrow data.
     - Native
   * - arrow-compression
     - (Experimental) A library for working with compression/decompression of Arrow data.
     - Native
   * - arrow-c-data
     - Java implementation of the `C Data Interface`_.
     - JNI
   * - arrow-orc
     - (Experimental) A JNI wrapper for the C++ ORC reader implementation.
     - JNI
   * - arrow-gandiva
     - Java wrappers around the native Gandiva SQL expression compiler.
     - JNI
   * - arrow-dataset
     - Java bindings to the Arrow Datasets library.
     - JNI

Arrow Java modules support working with data (1) in memory, (2) at rest, and (3) on the wire.

.. _`C Data Interface`: https://arrow.apache.org/docs/format/CDataInterface.html

diff --git a/docs/source/quickstartguide.rst b/docs/source/quickstartguide.rst
new file mode 100644
index 00000000..adb07d70
--- /dev/null
+++ b/docs/source/quickstartguide.rst
@@ -0,0 +1,314 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

=================
Quick Start Guide
=================

Arrow Java provides several building blocks: data types describe the types of values;
ValueVectors are sequences of typed values; fields describe the types of columns in
tabular data; schemas describe a sequence of columns in tabular data; and
VectorSchemaRoot represents tabular data itself. Arrow also provides readers and
writers for loading data from and persisting data to storage.

Create a ValueVector
********************

**ValueVectors** represent a sequence of values of the same type.
They are also known as "arrays" in the columnar format.

Example: create a vector of 32-bit integers representing ``[1, null, 2]``:

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;

    try (
        BufferAllocator allocator = new RootAllocator();
        IntVector intVector = new IntVector("fixed-size-primitive-layout", allocator)
    ) {
        intVector.allocateNew(3);
        intVector.set(0, 1);
        intVector.setNull(1);
        intVector.set(2, 2);
        intVector.setValueCount(3);
        System.out.println("Vector created in memory: " + intVector);
    }

.. code-block:: shell

    Vector created in memory: [1, null, 2]


Example: create a vector of UTF-8 encoded strings representing ``["one", "two", "three"]``:

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VarCharVector;
    import java.nio.charset.StandardCharsets;

    try (
        BufferAllocator allocator = new RootAllocator();
        VarCharVector varCharVector = new VarCharVector("variable-size-primitive-layout", allocator)
    ) {
        varCharVector.allocateNew(3);
        varCharVector.set(0, "one".getBytes(StandardCharsets.UTF_8));
        varCharVector.set(1, "two".getBytes(StandardCharsets.UTF_8));
        varCharVector.set(2, "three".getBytes(StandardCharsets.UTF_8));
        varCharVector.setValueCount(3);
        System.out.println("Vector created in memory: " + varCharVector);
    }

.. code-block:: shell

    Vector created in memory: [one, two, three]

Create a Field
**************

**Fields** are used to denote the particular columns of tabular data.
They consist of a name, a data type, a flag indicating whether the column can have null values,
and optional key-value metadata.

Example: create a field named "document" of string type:

.. code-block:: Java

    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import java.util.HashMap;
    import java.util.Map;

    Map<String, String> metadata = new HashMap<>();
    metadata.put("A", "Id card");
    metadata.put("B", "Passport");
    metadata.put("C", "Visa");
    Field document = new Field("document",
        new FieldType(true, new ArrowType.Utf8(), /*dictionary*/ null, metadata),
        /*children*/ null);
    System.out.println("Field created: " + document + ", Metadata: " + document.getMetadata());

.. code-block:: shell

    Field created: document: Utf8, Metadata: {A=Id card, B=Passport, C=Visa}
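
The ``children`` argument is used for nested types. As a hedged sketch (the field names are
arbitrary; ``"$data$"`` follows Arrow Java's convention for list data vectors), a nullable list
field can be built by passing the element field as a child:

.. code-block:: Java

    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import java.util.Collections;

    // Element type of the list.
    Field element = new Field("$data$",
        FieldType.nullable(new ArrowType.Int(32, true)), /*children*/ null);
    // The list field itself, with the element field as its only child.
    Field tags = new Field("tags",
        FieldType.nullable(new ArrowType.List()),
        Collections.singletonList(element));
    System.out.println("Field created: " + tags);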
Create a Schema
***************

**Schemas** hold a sequence of fields together with some optional metadata.

Example: create a schema describing datasets with two columns:
an int32 column "A" and a UTF-8 encoded string column "B":

.. code-block:: Java

    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import java.util.HashMap;
    import java.util.Map;
    import static java.util.Arrays.asList;

    Map<String, String> metadata = new HashMap<>();
    metadata.put("K1", "V1");
    metadata.put("K2", "V2");
    Field a = new Field("A", FieldType.nullable(new ArrowType.Int(32, true)), /*children*/ null);
    Field b = new Field("B", FieldType.nullable(new ArrowType.Utf8()), /*children*/ null);
    Schema schema = new Schema(asList(a, b), metadata);
    System.out.println("Schema created: " + schema);

.. code-block:: shell

    Schema created: Schema<A: Int(32, true), B: Utf8>(metadata: {K1=V1, K2=V2})

Create a VectorSchemaRoot
*************************

A **VectorSchemaRoot** combines ValueVectors with a Schema to represent tabular data.

Example: create a dataset of names (strings) and ages (32-bit signed integers).

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import java.nio.charset.StandardCharsets;
    import static java.util.Arrays.asList;

    Field age = new Field("age",
        FieldType.nullable(new ArrowType.Int(32, true)),
        /*children*/ null);
    Field name = new Field("name",
        FieldType.nullable(new ArrowType.Utf8()),
        /*children*/ null);
    Schema schema = new Schema(asList(age, name), /*metadata*/ null);
    try (
        BufferAllocator allocator = new RootAllocator();
        VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)
    ) {
        IntVector ageVector = (IntVector) root.getVector("age");
        VarCharVector nameVector = (VarCharVector) root.getVector("name");
        ageVector.allocateNew(3);
        ageVector.set(0, 10);
        ageVector.set(1, 20);
        ageVector.set(2, 30);
        nameVector.allocateNew(3);
        nameVector.set(0, "Dave".getBytes(StandardCharsets.UTF_8));
        nameVector.set(1, "Peter".getBytes(StandardCharsets.UTF_8));
        nameVector.set(2, "Mary".getBytes(StandardCharsets.UTF_8));
        root.setRowCount(3);
        System.out.println("VectorSchemaRoot created: \n" + root.contentToTSVString());
    }

.. code-block:: shell

    VectorSchemaRoot created:
    age    name
    10    Dave
    20    Peter
    30    Mary


Interprocess Communication (IPC)
********************************

Arrow data can be written to and read from disk, and both of these operations can be done in
a streaming and/or random-access fashion, depending on application requirements.

**Write data to an arrow file**

Example: write the dataset from the previous example to an Arrow IPC file (random-access).
.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;
    import org.apache.arrow.vector.VarCharVector;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.ipc.ArrowFileWriter;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import static java.util.Arrays.asList;

    Field age = new Field("age",
        FieldType.nullable(new ArrowType.Int(32, true)),
        /*children*/ null);
    Field name = new Field("name",
        FieldType.nullable(new ArrowType.Utf8()),
        /*children*/ null);
    Schema schema = new Schema(asList(age, name));
    try (
        BufferAllocator allocator = new RootAllocator();
        VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)
    ) {
        IntVector ageVector = (IntVector) root.getVector("age");
        VarCharVector nameVector = (VarCharVector) root.getVector("name");
        ageVector.allocateNew(3);
        ageVector.set(0, 10);
        ageVector.set(1, 20);
        ageVector.set(2, 30);
        nameVector.allocateNew(3);
        nameVector.set(0, "Dave".getBytes(StandardCharsets.UTF_8));
        nameVector.set(1, "Peter".getBytes(StandardCharsets.UTF_8));
        nameVector.set(2, "Mary".getBytes(StandardCharsets.UTF_8));
        root.setRowCount(3);
        File file = new File("random_access_file.arrow");
        try (
            FileOutputStream fileOutputStream = new FileOutputStream(file);
            ArrowFileWriter writer = new ArrowFileWriter(root, /*provider*/ null, fileOutputStream.getChannel())
        ) {
            writer.start();
            writer.writeBatch();
            writer.end();
            System.out.println("Record batches written: " + writer.getRecordBlocks().size()
                + ". Number of rows written: " + root.getRowCount());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

.. code-block:: shell

    Record batches written: 1. Number of rows written: 3

**Read data from an arrow file**

Example: read the dataset from the previous example from an Arrow IPC file (random-access).

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.ipc.ArrowFileReader;
    import org.apache.arrow.vector.ipc.message.ArrowBlock;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;

    try (
        BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
        FileInputStream fileInputStream = new FileInputStream(new File("random_access_file.arrow"));
        ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator)
    ) {
        System.out.println("Record batches in file: " + reader.getRecordBlocks().size());
        for (ArrowBlock arrowBlock : reader.getRecordBlocks()) {
            reader.loadRecordBatch(arrowBlock);
            VectorSchemaRoot root = reader.getVectorSchemaRoot();
            System.out.println("VectorSchemaRoot read: \n" + root.contentToTSVString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

.. code-block:: shell

    Record batches in file: 1
    VectorSchemaRoot read:
    age    name
    10    Dave
    20    Peter
    30    Mary
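
The random-access format above suits files; for sockets and pipes, the streaming format can be used
instead. A minimal sketch (reusing the ``root`` populated above, and an arbitrary file name standing
in for an arbitrary channel) with ``ArrowStreamWriter``/``ArrowStreamReader``:

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowStreamReader;
    import org.apache.arrow.vector.ipc.ArrowStreamWriter;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;

    // Write: same start/writeBatch/end protocol as ArrowFileWriter.
    try (FileOutputStream out = new FileOutputStream("streamed.arrow");
         ArrowStreamWriter writer = new ArrowStreamWriter(root, /*provider*/ null, out.getChannel())) {
        writer.start();
        writer.writeBatch();
        writer.end();
    }

    // Read: batches arrive in order; there is no random access to earlier batches.
    try (BufferAllocator allocator = new RootAllocator();
         FileInputStream in = new FileInputStream("streamed.arrow");
         ArrowStreamReader reader = new ArrowStreamReader(in, allocator)) {
        while (reader.loadNextBatch()) {
            System.out.println(reader.getVectorSchemaRoot().contentToTSVString());
        }
    }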
More examples are available in the `Arrow Java Cookbook`_.

.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java

diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
new file mode 100644
index 00000000..523ac0c7
--- /dev/null
+++ b/docs/source/reference/index.rst
@@ -0,0 +1,21 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

Java Reference (javadoc)
========================

Stub page for the Java reference docs; the actual source is located in the java/ directory.

diff --git a/docs/source/substrait.rst b/docs/source/substrait.rst
new file mode 100644
index 00000000..b3678ac8
--- /dev/null
+++ b/docs/source/substrait.rst
@@ -0,0 +1,201 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

=========
Substrait
=========

The ``arrow-dataset`` module can execute Substrait_ plans via the `Acero`_
query engine.

Executing Queries Using Substrait Plans
=======================================

Plans can reference data in files via URIs, or "named tables" that must be provided along with the plan.

Here is an example of a Java program that queries a Parquet file via a Substrait plan
(this example uses the `Substrait Java`_ project to compile a SQL query into a Substrait plan):
.. code-block:: Java

    import com.google.common.collect.ImmutableList;
    import io.substrait.isthmus.SqlToSubstrait;
    import io.substrait.proto.Plan;
    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.scanner.ScanOptions;
    import org.apache.arrow.dataset.scanner.Scanner;
    import org.apache.arrow.dataset.source.Dataset;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.dataset.substrait.AceroSubstraitConsumer;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowReader;
    import org.apache.calcite.sql.parser.SqlParseException;

    import java.nio.ByteBuffer;
    import java.util.HashMap;
    import java.util.Map;

    public class ClientSubstrait {
        public static void main(String[] args) {
            String uri = "file:///data/tpch_parquet/nation.parquet";
            ScanOptions options = new ScanOptions(/*batchSize*/ 32768);
            try (
                BufferAllocator allocator = new RootAllocator();
                DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(),
                    FileFormat.PARQUET, uri);
                Dataset dataset = datasetFactory.finish();
                Scanner scanner = dataset.newScan(options);
                ArrowReader reader = scanner.scanBatches()
            ) {
                // map the named table to the reader providing its data
                Map<String, ArrowReader> mapTableToArrowReader = new HashMap<>();
                mapTableToArrowReader.put("NATION", reader);
                // get the binary plan
                Plan plan = getPlan();
                ByteBuffer substraitPlan = ByteBuffer.allocateDirect(plan.toByteArray().length);
                substraitPlan.put(plan.toByteArray());
                // run the query
                try (ArrowReader arrowReader = new AceroSubstraitConsumer(allocator).runQuery(
                        substraitPlan,
                        mapTableToArrowReader)) {
                    while (arrowReader.loadNextBatch()) {
                        System.out.println(arrowReader.getVectorSchemaRoot().contentToTSVString());
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        static Plan getPlan() throws SqlParseException {
            String sql = "SELECT * from nation";
            String nation = "CREATE TABLE NATION (N_NATIONKEY BIGINT NOT NULL, N_NAME CHAR(25), "
                + "N_REGIONKEY BIGINT NOT NULL, N_COMMENT VARCHAR(152))";
            SqlToSubstrait sqlToSubstrait = new SqlToSubstrait();
            Plan plan = sqlToSubstrait.execute(sql, ImmutableList.of(nation));
            return plan;
        }
    }

.. code-block:: text

    // Results example:
    FieldPath(0)    FieldPath(1)    FieldPath(2)    FieldPath(3)
    0    ALGERIA    0    haggle. carefully final deposits detect slyly agai
    1    ARGENTINA    1    al foxes promise slyly according to the regular accounts. bold requests alon

Executing Projections and Filters Using Extended Expressions
============================================================

Dataset also supports projections and filters with Substrait's `Extended Expression`_.
This requires the `Substrait Java`_ library.

This Java program:

- loads a Parquet file containing the "nation" table from the TPC-H benchmark;
- applies a filter:

  - ``N_NATIONKEY > 18``

- projects two new columns:

  - ``N_REGIONKEY + 10``
  - ``N_NAME || ' - ' || N_COMMENT``
.. code-block:: Java

    import com.google.common.collect.ImmutableList;
    import io.substrait.isthmus.SqlExpressionToSubstrait;
    import io.substrait.proto.ExtendedExpression;
    import org.apache.arrow.dataset.file.FileFormat;
    import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
    import org.apache.arrow.dataset.jni.NativeMemoryPool;
    import org.apache.arrow.dataset.scanner.ScanOptions;
    import org.apache.arrow.dataset.scanner.Scanner;
    import org.apache.arrow.dataset.source.Dataset;
    import org.apache.arrow.dataset.source.DatasetFactory;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ipc.ArrowReader;
    import org.apache.calcite.sql.parser.SqlParseException;

    import java.nio.ByteBuffer;
    import java.util.Optional;

    public class ClientSubstraitExtendedExpressionsCookbook {

        public static void main(String[] args) throws SqlParseException {
            projectAndFilterDataset();
        }

        private static void projectAndFilterDataset() throws SqlParseException {
            String uri = "file:///Users/data/tpch_parquet/nation.parquet";
            ScanOptions options =
                new ScanOptions.Builder(/*batchSize*/ 32768)
                    .columns(Optional.empty())
                    .substraitFilter(getByteBuffer(new String[]{"N_NATIONKEY > 18"}))
                    .substraitProjection(getByteBuffer(new String[]{"N_REGIONKEY + 10",
                        "N_NAME || CAST(' - ' as VARCHAR) || N_COMMENT"}))
                    .build();
            try (BufferAllocator allocator = new RootAllocator();
                 DatasetFactory datasetFactory =
                     new FileSystemDatasetFactory(
                         allocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri);
                 Dataset dataset = datasetFactory.finish();
                 Scanner scanner = dataset.newScan(options);
                 ArrowReader reader = scanner.scanBatches()) {
                while (reader.loadNextBatch()) {
                    System.out.println(reader.getVectorSchemaRoot().contentToTSVString());
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        private static ByteBuffer getByteBuffer(String[] sqlExpression) throws SqlParseException {
            String schema =
                "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME VARCHAR, "
                    + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)";
            SqlExpressionToSubstrait expressionToSubstrait = new SqlExpressionToSubstrait();
            ExtendedExpression expression =
                expressionToSubstrait.convert(sqlExpression, ImmutableList.of(schema));
            // Serialize the expression directly; the Base64 encode/decode round trip
            // previously done here was a no-op.
            byte[] expressionToByte = expression.toByteArray();
            ByteBuffer byteBuffer = ByteBuffer.allocateDirect(expressionToByte.length);
            byteBuffer.put(expressionToByte);
            return byteBuffer;
        }
    }

.. code-block:: text

    column-1    column-2
    13    ROMANIA - ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account
    14    SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely
    12    VIETNAM - hely enticingly express accounts. even, final
    13    RUSSIA - requests against the platelets use never according to the quickly regular pint
    13    UNITED KINGDOM - eans boost carefully special requests. accounts are. carefull
    11    UNITED STATES - y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be

.. _`Substrait`: https://substrait.io/
.. _`Substrait Java`: https://github.com/substrait-io/substrait-java
.. _`Acero`: https://arrow.apache.org/docs/cpp/streaming_execution.html
.. _`Extended Expression`: https://github.com/substrait-io/substrait/blob/main/site/docs/expressions/extended_expression.md

diff --git a/docs/source/table.rst b/docs/source/table.rst
new file mode 100644
index 00000000..5aa95e15
--- /dev/null
+++ b/docs/source/table.rst
@@ -0,0 +1,378 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

=====
Table
=====

**NOTE**: The Table API is experimental and subject to change. See the list of limitations below.

`Table`_ is an immutable tabular data structure based on `FieldVector`_. Like `VectorSchemaRoot`_, ``Table`` is a columnar data structure backed by Arrow arrays, or more specifically, by ``FieldVector`` objects. It differs from ``VectorSchemaRoot`` mainly in that it is fully immutable and lacks support for batch operations. Anyone processing batches of tabular data in a pipeline should continue to use ``VectorSchemaRoot``. Finally, the ``Table`` API is mainly row-oriented, so in some ways it's more like the JDBC API than the ``VectorSchemaRoot`` API, but you can still use ``FieldReader``\ s to work with data in a columnar fashion.

Mutation in Table and VectorSchemaRoot
======================================

``VectorSchemaRoot`` provides a thin wrapper around the vectors that hold its data. Individual vectors can be retrieved from a vector schema root. These vectors have *setters* for modifying their elements, making ``VectorSchemaRoot`` immutable only by convention. The protocol for mutating a vector is documented in the `ValueVector`_ interface:

- values need to be written in order (e.g. index 0, 1, 2, 5)
- null vectors start with all values as null before writing anything
- for variable-width types, the offset vector should be all zeros before writing
- you must call ``setValueCount`` before a vector can be read
- you should never write to a vector once it has been read.

The rules aren't enforced by the API, so the programmer is responsible for ensuring that they are followed; failure to do so could lead to runtime exceptions. A short sketch of the protocol is shown at the end of this section.

``Table``, on the other hand, is immutable. The underlying vectors are not exposed. When a table is created from existing vectors, their memory is transferred to new vectors, so subsequent changes to the original vectors can't impact the new table's values.
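
As a minimal sketch of the mutation protocol above (the names and values are arbitrary), an
``IntVector`` is written in order, the value count is set, and only then is the vector read:

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;

    try (BufferAllocator allocator = new RootAllocator();
         IntVector v = new IntVector("ints", allocator)) {
        v.allocateNew(4);
        v.set(0, 7);        // write values in ascending index order...
        v.set(1, 8);
        v.setNull(2);       // ...nulls may be set explicitly or simply skipped
        v.set(3, 9);
        v.setValueCount(4); // required before the vector can be read
        int first = v.get(0);
        // From here on, read only; don't write to the vector again.
    }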

Features and limitations
========================

A basic set of table functionality is currently available:

- Create a table from vectors or ``VectorSchemaRoot``
- Iterate tables by row, or set the current row index directly
- Access vector values as primitives, objects, and/or nullable `ValueHolder`_ instances (depending on type)
- Get a ``FieldReader`` for any vector
- Add and remove vectors, creating new tables
- Encode and decode a table's vectors using dictionary encoding
- Export table data for use by native code
- Print representative data to TSV strings
- Get a table's schema
- Slice tables
- Convert a table to a ``VectorSchemaRoot``

Limitations in the 11.0.0 release:

- No support for ``ChunkedArray`` or any form of row group. Support for chunked arrays or row groups will be considered for a future release.
- No support for the C Stream API. Support for the streaming API is contingent on chunked array support.
- No support for creating tables directly from Java POJOs. All data held by a table must be imported via a ``VectorSchemaRoot``, or from collections or arrays of vectors.

The Table API
=============

Like ``VectorSchemaRoot``, a table contains a `Schema`_ and an ordered collection of ``FieldVector`` objects, but it is designed to be accessed via a row-oriented interface.

Creating a Table from a VectorSchemaRoot
****************************************

Tables are created from a ``VectorSchemaRoot`` as shown below. The memory buffers holding the data are transferred from the vector schema root to new vectors in the new table, clearing the source vectors in the process. This ensures that the data in your new table is never changed. Since the buffers are transferred rather than copied, this is a very low overhead operation.

.. code-block:: Java

    Table t = new Table(someVectorSchemaRoot);

If you now update the vectors held by the ``VectorSchemaRoot`` (using some version of ``ValueVector#setSafe()``), the vector schema root would reflect those changes, but the values in table *t* are unchanged.

Creating a Table from FieldVectors
**********************************

Tables can be created from ``FieldVector``\ s as shown below, using 'var-arg' array arguments:

.. code-block:: Java

    IntVector myVector = createMyIntVector();
    Table t = new Table(myVector);

or by passing a collection:

.. code-block:: Java

    IntVector myVector = createMyIntVector();
    List<FieldVector> fvList = List.of(myVector);
    Table t = new Table(fvList);

It is rarely a good idea to share vectors between multiple vector schema roots, and it would not be a good idea to share them between vector schema roots and tables. Creating a ``VectorSchemaRoot`` from a list of vectors does not cause the reference counts for the vectors to be incremented. Unless you manage the counts manually, the code below would lead to more actual references than the reference counts reflect, and that could lead to trouble. There is an implicit assumption that the vectors were created for use by *one* ``VectorSchemaRoot`` that this code violates.

*Don't do this:*

.. code-block:: Java

    IntVector myVector = createMyIntVector();  // Reference count for myVector = 1
    VectorSchemaRoot vsr1 = new VectorSchemaRoot(myVector);  // Still one reference
    VectorSchemaRoot vsr2 = new VectorSchemaRoot(myVector);
    // Ref count is still one, but there are two VSRs with a reference to myVector
    vsr2.clear();  // Reference count for myVector is 0.

What is happening is that the reference counter works at a lower level than the ``VectorSchemaRoot`` interface. A reference counter counts references to `ArrowBuf`_ instances that control memory buffers. It doesn't count references to the vectors that hold those ArrowBufs. In the example above, each ``ArrowBuf`` is held by one vector, so there is only one reference. This distinction is blurred when you call the ``VectorSchemaRoot``'s ``clear()`` method, which frees the memory held by each of the vectors it references, even though another instance references the same vectors.

When you create tables from vectors, it's assumed that there are no external references to those vectors. To be certain, the buffers underlying these vectors are transferred to new vectors in the new table, and the original vectors are cleared.

*Don't do this either, but note the difference from above:*

.. code-block:: Java

    IntVector myVector = createMyIntVector();  // Reference count for myVector = 1
    Table t1 = new Table(myVector);
    // myVector is cleared; Table t1 has a new hidden vector with the data from myVector
    Table t2 = new Table(myVector);
    // t2 has no rows because myVector was just cleared
    // t1 continues to have the data from the original vector
    t2.clear();
    // no change because t2 is already empty and t1 is independent

With tables, memory is explicitly transferred on instantiation, so the buffers held by a table are held by *only* that table.

Creating Tables with dictionary-encoded vectors
***********************************************

Another point of difference is that ``VectorSchemaRoot`` is uninformed about any dictionary encoding of its vectors, while tables hold an optional `DictionaryProvider`_ instance. If any vectors in the source data are encoded, a DictionaryProvider must be set so that the values can be decoded.

.. code-block:: Java

    VectorSchemaRoot vsr = myVsr();
    DictionaryProvider provider = myProvider();
    Table t = new Table(vsr, provider);

In ``Table``, dictionaries are used much as they are with individual vectors. To decode a vector, the user provides the name of the vector to decode and the dictionary id:

.. code-block:: Java

    Table t = new Table(vsr, provider);
    ValueVector decodedName = t.decode("name", 1L);

To encode a vector from a table, a similar approach is used:

.. code-block:: Java

    Table t = new Table(vsr, provider);
    ValueVector encodedName = t.encode("name", 1L);

Freeing memory explicitly
*************************

Tables use off-heap memory that must be freed when it is no longer needed. ``Table`` implements ``AutoCloseable``, so the best way to create one is in a try-with-resources block:

.. code-block:: Java

    try (VectorSchemaRoot vsr = myMethodForGettingVsrs();
         Table t = new Table(vsr)) {
        // do useful things.
    }

If you don't use a try-with-resources block, you must close the table manually. Note that the variables must be declared outside the ``try`` block so that they are in scope in ``finally``:

.. code-block:: Java

    VectorSchemaRoot vsr = null;
    Table t = null;
    try {
        vsr = myMethodForGettingVsrs();
        t = new Table(vsr);
        // do useful things.
    } finally {
        if (vsr != null) vsr.close();
        if (t != null) t.close();
    }

Manual closing should be performed in a finally block.

Getting the schema
******************

You get the table's schema just as you would with a vector schema root:

.. code-block:: Java

    Schema s = table.getSchema();

Adding and removing vectors
***************************

``Table`` provides facilities for adding and removing vectors modeled on the same functionality in ``VectorSchemaRoot``.
These operations return new instances rather than modifying the original instance in place.

.. code-block:: Java

    try (Table t = new Table(vectorList)) {
        IntVector v3 = new IntVector("3", intFieldType, allocator);
        Table t2 = t.addVector(2, v3);
        Table t3 = t2.removeVector(1);
        // don't forget to close t2 and t3
    }

Slicing tables
**************

``Table`` supports *slice()* operations, where a slice of a source table is a second Table that refers to a single, contiguous range of rows in the source.

.. code-block:: Java

    try (Table t = new Table(vectorList)) {
        Table t2 = t.slice(100, 200); // creates a slice of 200 rows, starting at row 100
        ...
    }

This raises the question: if you create a slice with *all* the values in the source table (as shown below), how would that differ from a new Table constructed with the same vectors as the source?

.. code-block:: Java

    try (Table t = new Table(vectorList)) {
        Table t2 = t.slice(0, t.getRowCount()); // creates a slice referencing all the values in t
        ...
    }

The difference is that when you *construct* a new table, the buffers are transferred from the source vectors to new vectors in the destination. With a slice, both tables share the same underlying vectors. That's OK, though, since both tables are immutable.

Using FieldReaders
******************

You can get a `FieldReader`_ for any vector in the Table by passing either the `Field`_, the vector index, or the vector name as an argument. The signatures are the same as in ``VectorSchemaRoot``.

.. code-block:: Java

    FieldReader nameReader = table.getReader("user_name");

Row operations
**************

Row-based access is supported by the `Row`_ object. ``Row`` provides *get()* methods by both vector name and vector position, but no *set()* operations.

It is important to recognize that rows are NOT reified as objects; rather, a row operates like a cursor through which the data from numerous logical rows in the table can be viewed (one at a time) using the same ``Row`` instance. See "Moving from row-to-row" below for information about navigating through the table.

Getting a row
*************

Calling ``immutableRow()`` on any table instance returns a new ``Row`` instance.

.. code-block:: Java

    Row r = table.immutableRow();

Moving from row-to-row
**********************

Since rows are iterable, you can traverse a table using a standard while loop:

.. code-block:: Java

    Row r = table.immutableRow();
    while (r.hasNext()) {
        r.next();
        // do something useful here
    }

``Table`` implements ``Iterable<Row>``, so you can access rows directly from a table in an enhanced *for* loop:

.. code-block:: Java

    for (Row row : table) {
        int age = row.getInt("age");
        boolean nameIsNull = row.isNull("name");
        ...
    }

Finally, rows are usually iterated in the order of the underlying data vectors, but they are also positionable using the ``Row#setPosition()`` method, so you can skip to a specific row. Row numbers are 0-based.

.. code-block:: Java

    Row r = table.immutableRow();
    r.setPosition(101); // change position directly to row 101
    int age101 = r.getInt("age");

Any changes to position are applied to all the columns in the table.

Note that you must call ``next()`` or ``setPosition()`` before accessing values via a row. Failure to do so results in a runtime exception.

Read operations using rows
**************************

Methods are available for getting values by vector name and by vector index, where the index is the 0-based position of the vector in the table. For example, assuming 'age' is the 13th vector in 'table', the following two gets are equivalent:

.. code-block:: Java

    Row r = table.immutableRow();
    r.next(); // position the row at the first value
    int age1 = r.getInt("age"); // gets the value of the vector named 'age' in the table at row 0
    int age2 = r.getInt(12);    // gets the value of the 13th vector in the table at row 0

You can also get a value using a nullable ``ValueHolder``. For example:

.. code-block:: Java

    NullableIntHolder holder = new NullableIntHolder();
    int b = row.getInt("age", holder);

This can be used to retrieve values without creating a new object for each one.

In addition to getting values, you can check if a value is null using ``isNull()``. This is important if the vector contains any nulls, as asking for a value from a vector can cause NullPointerExceptions in some cases.

.. code-block:: Java

    boolean name0isNull = row.isNull("name");

You can also get the current row number:

.. code-block:: Java

    int rowNumber = row.getRowNumber();

Reading values as Objects
*************************

For any given vector type, the basic *get()* method returns a primitive value wherever possible. For example, *getTimeStampMicro()* returns a long value that encodes the timestamp. To get the LocalDateTime object representing that timestamp in Java, another method with 'Obj' appended to the name is provided. For example:

.. code-block:: Java

    long ts = row.getTimeStampMicro();
    LocalDateTime tsObject = row.getTimeStampMicroObj();

The exception to this naming scheme is for complex vector types (List, Map, Schema, Union, DenseUnion, and ExtensionType). These always return objects rather than primitives, so no "Obj" extension is required. It is expected that some users may subclass ``Row`` to add getters that are more specific to their needs.

Reading VarChars and LargeVarChars
**********************************

Strings in Arrow are represented as byte arrays encoded with the UTF-8 charset. You can get either a String result or the actual byte array.

.. code-block:: Java

    byte[] b = row.getVarChar("first_name");
    String s = row.getVarCharObj("first_name"); // uses the default encoding (UTF-8)

Converting a Table to a VectorSchemaRoot
****************************************

Tables can be converted to vector schema roots using the *toVectorSchemaRoot()* method. Buffers are transferred to the vector schema root and the source table is cleared.

.. code-block:: Java

    VectorSchemaRoot root = myTable.toVectorSchemaRoot();

Working with the C-Data interface
*********************************

The ability to work with native code is required for many Arrow features. This section describes how tables can be exported for use with native code.

Exporting works by converting the data to a ``VectorSchemaRoot`` instance and using the existing facilities to transfer the data. You could do it yourself, but that isn't ideal because conversion to a vector schema root breaks the immutability guarantees. Using the ``exportTable()`` methods in the `Data`_ class avoids this concern.
.. code-block:: Java

    Data.exportTable(bufferAllocator, table, dictionaryProvider, outArrowArray);

If the table contains dictionary-encoded vectors and was constructed with a ``DictionaryProvider``, the provider argument to ``exportTable()`` can be omitted and the table's provider attribute will be used:

.. code-block:: Java

    Data.exportTable(bufferAllocator, table, outArrowArray);

.. _`ArrowBuf`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ArrowBuf.html
.. _`Data`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/c/Data.html
.. _`DictionaryProvider`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/dictionary/DictionaryProvider.html
.. _`Field`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Field.html
.. _`FieldReader`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/complex/reader/FieldReader.html
.. _`FieldVector`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html
.. _`Row`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/table/Row.html
.. _`Schema`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Schema.html
.. _`Table`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/table/Table.html
.. _`ValueHolder`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/holders/ValueHolder.html
.. _`ValueVector`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/ValueVector.html
.. _`VectorSchemaRoot`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorSchemaRoot.html

diff --git a/docs/source/vector.rst b/docs/source/vector.rst
new file mode 100644
index 00000000..19962774
--- /dev/null
+++ b/docs/source/vector.rst
@@ -0,0 +1,366 @@

.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

===========
ValueVector
===========

The :class:`ValueVector` interface (called an *Array* in the C++ implementation and in
:external+arrow:doc:`the specification <format/Columnar>`) is an abstraction used to store a
sequence of values of the same type in an individual column. Internally, those values are
represented by one or more buffers, the number and meaning of which depend on the vector’s data type.

There are concrete subclasses of :class:`ValueVector` for each primitive data type
and nested type described in the specification. A few of the vector class names differ from
the type names used in the specification; for example, the vector for 64-bit integers is
``BigIntVector``, and the vector for variable-length UTF-8 strings is ``VarCharVector``.
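
For instance, a minimal sketch (the vector name is arbitrary) using ``BigIntVector``, Arrow Java's
vector for the specification's signed 64-bit integer type:

.. code-block:: Java

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.BigIntVector;

    // The columnar spec calls this type a signed 64-bit integer; Arrow Java names it BigIntVector.
    try (BufferAllocator allocator = new RootAllocator();
         BigIntVector v = new BigIntVector("int64s", allocator)) {
        v.allocateNew(1);
        v.set(0, 1_000_000_000_000L);
        v.setValueCount(1);
        System.out.println(v.get(0));
    }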
It is important that a vector is allocated before attempting to read or write its values. A :class:`ValueVector` "should" strive to guarantee this order of operations: create > allocate > mutate > set value count > access > clear (or allocate again to start the process over). We will go through a concrete example demonstrating each operation in the next section.

Vector Life Cycle
=================

As discussed above, each vector goes through several steps in its life cycle, and each step is triggered by a vector operation. In particular, we have the following vector operations:

1. **Vector creation**: we create a new vector object by, for example, calling the vector constructor.
The following code creates a new ``IntVector`` using the constructor:

.. code-block:: Java

    RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    ...
    IntVector vector = new IntVector("int vector", allocator);

By now, a vector object has been created. However, no underlying memory has been allocated yet, so we need the following step.

2. **Vector allocation**: in this step, we allocate memory for the vector. For most vectors, we have two options: 1) if we know the maximum vector capacity, we can specify it by calling the ``allocateNew(int)`` method; 2) otherwise, we should call the ``allocateNew()`` method, and a default capacity will be allocated. For our running example, we assume that the vector capacity never exceeds 10:

.. code-block:: Java

    vector.allocateNew(10);

3. **Vector mutation**: now we can populate the vector with the values we want. For all vectors, we can populate values through vector writers (an example is given in the next section). For primitive types, we can also mutate the vector with the set methods. There are two classes of set methods: 1) if we can be sure the vector has enough capacity, we can call the ``set(index, value)`` method; 2) if we are not sure about the vector capacity, we should call the ``setSafe(index, value)`` method, which automatically takes care of vector reallocation if the capacity is insufficient. For our running example, we know the vector has enough capacity, so we can call

.. code-block:: Java

    vector.set(/*index*/5, /*value*/25);

4. **Set value count**: in this step, we set the value count of the vector by calling the ``setValueCount(int)`` method:

.. code-block:: Java

    vector.setValueCount(10);

After this step, the vector enters an immutable state. In other words, we should no longer mutate it. (Unless we reuse the vector by allocating it again. This will be discussed shortly.)

5. **Vector access**: it is time to access the vector values. Again, we have two options: 1) the get methods and 2) a vector reader. A vector reader works for all types of vectors, while get methods are only available for primitive vectors. A concrete example of a vector reader is given in the next section. Below is an example of vector access through a get method:

.. code-block:: Java

    int value = vector.get(5); // value == 25

6. **Vector clear**: when we are done with the vector, we should clear it to release its memory. This is done by calling the ``close()`` method:

.. code-block:: Java

    vector.close();
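Putting the six steps together, here is a minimal end-to-end sketch using the same ``IntVector`` APIs shown above, with the clear step handled by a try-with-resources block:

.. code-block:: Java

    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.IntVector;

    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         IntVector vector = new IntVector("int vector", allocator)) { // 1. create
      vector.allocateNew(10);                                         // 2. allocate
      for (int i = 0; i < 10; i++) {
        vector.set(i, i * 10);                                        // 3. mutate
      }
      vector.setValueCount(10);                                       // 4. set value count
      int value = vector.get(5);                                      // 5. access (value == 50)
    }                                                                 // 6. clear, via close()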
Some points to note about the steps above:

* The steps are not necessarily performed in a linear sequence. Instead, they can form a loop. For example, once a vector reaches the access step, we can go back to the mutation step, then set the value count, access the vector again, and so on.

* We should try to make sure the above steps are carried out in order. Otherwise, the vector may be in an undefined state, and some unexpected behavior may occur. However, this restriction is not strict: it is possible to violate the order above and still get correct results.

* When mutating vector values through the set methods, we should prefer the ``set(index, value)`` methods to the ``setSafe(index, value)`` methods whenever possible, to avoid the unnecessary performance overhead of checking the vector capacity.

* All vectors implement the ``AutoCloseable`` interface, so they must be closed explicitly when they are no longer used, to avoid resource leaks. To ensure this, it is recommended to place vector-related operations in a try-with-resources block.

* For fixed-width vectors (e.g. IntVector), we can set values at different indices in arbitrary order. For variable-width vectors (e.g. VarCharVector), however, we must set values in non-decreasing order of the indices. Otherwise, the values after the set position become invalid. For example, suppose we use the following statements to populate a variable-width vector:

.. code-block:: Java

    VarCharVector vector = new VarCharVector("vector", allocator);
    vector.allocateNew();
    vector.setSafe(0, "zero".getBytes(StandardCharsets.UTF_8));
    vector.setSafe(1, "one".getBytes(StandardCharsets.UTF_8));
    ...
    vector.setSafe(9, "nine".getBytes(StandardCharsets.UTF_8));

Then we set the value at position 5 again:

.. code-block:: Java

    vector.setSafe(5, "5".getBytes(StandardCharsets.UTF_8));

After that, the values at positions 6, 7, 8, and 9 of the vector become invalid.

Building ValueVector
====================

Note that the current implementation doesn't enforce the rule that Arrow objects are immutable. :class:`ValueVector` instances can be created directly with the ``new`` keyword; set/setSafe APIs and concrete subclasses of :class:`FieldWriter` are available for populating values.

For example, the code below shows how to build a :class:`BigIntVector`. In this case, we build a vector with eight slots holding the values 1 through 8, except that the fourth slot (index 3) is set to null:

.. code-block:: Java

    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         BigIntVector vector = new BigIntVector("vector", allocator)) {
      vector.allocateNew(8);
      vector.set(0, 1);
      vector.set(1, 2);
      vector.set(2, 3);
      vector.setNull(3);
      vector.set(4, 5);
      vector.set(5, 6);
      vector.set(6, 7);
      vector.set(7, 8);
      vector.setValueCount(8); // this finalizes the vector, by convention
      ...
    }

The :class:`BigIntVector` holds two ArrowBufs. The first buffer holds the null bitmap, which here consists of a single byte with the bits 1|1|1|1|0|1|1|1 (a bit is 1 if the value is non-null). The second buffer contains all of the above values. As the fourth entry is null, the value at that position in the buffer is undefined. Note that, compared with the ``set`` API, the ``setSafe`` API checks the value capacity before setting values and reallocates the buffers if necessary.
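To make the difference concrete, here is a small sketch (the initial capacity of 4 is arbitrary): ``set`` must stay within the allocated capacity, while ``setSafe`` grows the buffers on demand.

.. code-block:: Java

    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         BigIntVector vector = new BigIntVector("vector", allocator)) {
      vector.allocateNew(4);  // room for 4 values
      vector.set(3, 3L);      // fine: index 3 is within the allocated capacity
      // vector.set(4, 4L);   // would write past the allocated capacity
      vector.setSafe(4, 4L);  // fine: reallocates the buffers as needed
      vector.setValueCount(5);
    }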
Here is how to build the same vector using a writer:

.. code-block:: Java

    try (BigIntVector vector = new BigIntVector("vector", allocator);
         BigIntWriter writer = new BigIntWriterImpl(vector)) {
      writer.setPosition(0);
      writer.writeBigInt(1);
      writer.setPosition(1);
      writer.writeBigInt(2);
      writer.setPosition(2);
      writer.writeBigInt(3);
      // writer.setPosition(3) is not called, which means the fourth value is null
      writer.setPosition(4);
      writer.writeBigInt(5);
      writer.setPosition(5);
      writer.writeBigInt(6);
      writer.setPosition(6);
      writer.writeBigInt(7);
      writer.setPosition(7);
      writer.writeBigInt(8);
    }

There is a get API, and there are concrete subclasses of :class:`FieldReader` for accessing vector values. Note that the writer/reader approach is not as efficient as direct access through the get methods:

.. code-block:: Java

    // access via the get API
    for (int i = 0; i < vector.getValueCount(); i++) {
      if (!vector.isNull(i)) {
        System.out.println(vector.get(i));
      }
    }

    // access via a reader
    BigIntReader reader = vector.getReader();
    for (int i = 0; i < vector.getValueCount(); i++) {
      reader.setPosition(i);
      if (reader.isSet()) {
        System.out.println(reader.readLong());
      }
    }

Building ListVector
===================

A :class:`ListVector` is a vector that holds a list of values for each index. Working with one, you follow the same steps as above (create > allocate > mutate > set value count > access > clear), but the details differ slightly since you need to both create the vector and set the list of values for each index.

For example, the code below shows how to build a :class:`ListVector` of ints using the writer :class:`UnionListWriter`. We build a vector with indices 0 to 9, where the list at index i holds the values [0, i, 2i, 3i, 4i], i.e. [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order, so writing a list such as [3, 1, 2] would be just as valid.

.. code-block:: Java

    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         ListVector listVector = ListVector.empty("vector", allocator)) {
      UnionListWriter writer = listVector.getWriter();
      for (int i = 0; i < 10; i++) {
        writer.startList();
        writer.setPosition(i);
        for (int j = 0; j < 5; j++) {
          writer.writeInt(j * i);
        }
        writer.setValueCount(5);
        writer.endList();
      }
      listVector.setValueCount(10);
    }

:class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate the indices, and then enumerate the inner list values.

.. code-block:: Java

    // access via the get API
    for (int i = 0; i < listVector.getValueCount(); i++) {
      if (!listVector.isNull(i)) {
        ArrayList<Integer> elements = (ArrayList<Integer>) listVector.getObject(i);
        for (Integer element : elements) {
          System.out.println(element);
        }
      }
    }

    // access via a reader
    UnionListReader reader = listVector.getReader();
    for (int i = 0; i < listVector.getValueCount(); i++) {
      reader.setPosition(i);
      while (reader.next()) {
        IntReader intReader = reader.reader();
        if (intReader.isSet()) {
          System.out.println(intReader.readInteger());
        }
      }
    }
Dictionary Encoding
===================

Dictionary encoding is a form of compression where values of one type are replaced by values of a smaller type: an array of ints replacing an array of strings is a common example. The mapping between the original values and the replacements is held in a 'dictionary'. Since the dictionary needs only one copy of each of the longer values, the combination of the dictionary and the array of smaller values may use less memory. The more repetitive the original data, the greater the savings.

A ``FieldVector`` can be dictionary encoded for performance or improved memory efficiency. Nearly any type of vector might be encoded if there are many values but few unique ones.

There are a few steps involved in the encoding process:

1. Create a regular, un-encoded vector and populate it.
2. Create a dictionary vector of the same type as the un-encoded vector. This vector must have the same values, but each unique value in the un-encoded vector needs to appear here only once.
3. Create a ``Dictionary``. It will contain the dictionary vector, plus a ``DictionaryEncoding`` object that holds the encoding's metadata and settings.
4. Create a ``DictionaryEncoder``.
5. Call the encode() method on the ``DictionaryEncoder`` to produce an encoded version of the original vector.
6. (Optional) Call the decode() method on the encoded vector to re-create the original values.

The encoded values will be integers. Depending on how many unique values you have, you can use ``TinyIntVector``, ``SmallIntVector``, ``IntVector``, or ``BigIntVector`` to hold them. You specify the type when you create your ``DictionaryEncoding`` instance. You might wonder where those integers come from: the dictionary vector is a regular vector, so a value's index position in that vector is used as its encoded value.

Another critical attribute of ``DictionaryEncoding`` is the id. It's important to understand how the id is used, so we cover that later in this section.

The result will be a new vector (for example, an ``IntVector``) that can act in place of the original vector (for example, a ``VarCharVector``). When you write the data in the Arrow format, both the new ``IntVector`` and the dictionary are written: you will need the dictionary later to retrieve the original values.

.. code-block:: Java

    // 1. create a vector for the un-encoded data and populate it
    VarCharVector unencoded = new VarCharVector("unencoded", allocator);
    // now put some data in it before continuing

    // 2. create a vector to hold the dictionary and populate it
    VarCharVector dictionaryVector = new VarCharVector("dictionary", allocator);

    // 3. create a dictionary object
    Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));

    // 4. create a dictionary encoder
    DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator);

    // 5. encode the data
    IntVector encoded = (IntVector) encoder.encode(unencoded);

    // 6. re-create an un-encoded version from the encoded vector
    VarCharVector decoded = (VarCharVector) encoder.decode(encoded);

One thing we haven't discussed is how to create the dictionary vector from the original un-encoded values. That is left to the library user, since a custom method will likely be more efficient than a general utility. Since the dictionary vector is just a normal vector, you can populate its values with the standard APIs.
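As a rough sketch of one possible approach (this is not a library utility; ``unencoded`` and ``dictionaryVector`` are the vectors from the example above): collect the distinct values in encounter order, then write them into the dictionary vector with the standard APIs.

.. code-block:: Java

    import java.nio.charset.StandardCharsets;
    import java.util.LinkedHashSet;
    import java.util.Set;

    // collect each distinct value once, preserving encounter order
    Set<String> unique = new LinkedHashSet<>();
    for (int i = 0; i < unencoded.getValueCount(); i++) {
      if (!unencoded.isNull(i)) {
        unique.add(new String(unencoded.get(i), StandardCharsets.UTF_8));
      }
    }

    // write the distinct values into the dictionary vector
    dictionaryVector.allocateNew();
    int index = 0;
    for (String value : unique) {
      dictionaryVector.setSafe(index++, value.getBytes(StandardCharsets.UTF_8));
    }
    dictionaryVector.setValueCount(index);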
Finally, you can package a number of dictionaries together, which is useful if you're working with a ``VectorSchemaRoot`` that has several dictionary-encoded vectors. This is done using an object called a ``DictionaryProvider``, as shown in the example below. Note that we don't put the dictionary vectors in the same ``VectorSchemaRoot`` as the data vectors, as they will generally have fewer values.

.. code-block:: Java

    DictionaryProvider.MapDictionaryProvider provider =
        new DictionaryProvider.MapDictionaryProvider();

    provider.put(dictionary);

The ``DictionaryProvider`` is simply a map of identifiers to ``Dictionary`` objects, where each identifier is a long value. In the dictionary-creation code above, you will see the id as the first argument to the ``DictionaryEncoding`` constructor.

This is where the ``DictionaryEncoding``'s 'id' attribute comes in. This value is used to connect dictionaries to instances of ``VectorSchemaRoot``, using a ``DictionaryProvider``. Here's how that works:

* The ``VectorSchemaRoot`` has a ``Schema`` object containing a list of ``Field`` objects.
* The field has an attribute called 'dictionary', but it holds a ``DictionaryEncoding`` rather than a ``Dictionary``.
* As mentioned, the ``DictionaryProvider`` holds dictionaries indexed by a long value. This value is the id from your ``DictionaryEncoding``.
* To retrieve the dictionary for a vector in a ``VectorSchemaRoot``, you get the field associated with the vector, get its dictionary attribute, and use that attribute's id to look up the correct dictionary in the provider.

.. code-block:: Java

    // create the encoded vector, the Dictionary and DictionaryProvider as discussed above

    // Create a VectorSchemaRoot with one encoded vector
    VectorSchemaRoot vsr = new VectorSchemaRoot(List.of(encoded));

    // now we want to decode our vector, so we retrieve its dictionary from the provider
    Field f = vsr.getField(encoded.getName());
    DictionaryEncoding encoding = f.getDictionary();
    Dictionary dictionary = provider.lookup(encoding.getId());

As you can see, a ``DictionaryProvider`` is handy for managing the dictionaries associated with a ``VectorSchemaRoot``. More importantly, it helps package the dictionaries for a ``VectorSchemaRoot`` when it's written. The classes ``ArrowFileWriter`` and ``ArrowStreamWriter`` both accept an optional ``DictionaryProvider`` argument for that purpose. You can find example code for writing dictionaries in the :doc:`ipc` documentation. ``ArrowReader`` and its subclasses also implement the ``DictionaryProvider`` interface, so you can retrieve the actual dictionaries when reading a file.

Slicing
=======

As in the C++ implementation, it is possible to make zero-copy slices of vectors to obtain a vector referring to some logical sub-sequence of the data, through a :class:`TransferPair`:

.. code-block:: Java

    IntVector vector = new IntVector("intVector", allocator);
    for (int i = 0; i < 10; i++) {
      vector.setSafe(i, i);
    }
    vector.setValueCount(10);

    TransferPair tp = vector.getTransferPair(allocator);
    tp.splitAndTransfer(0, 5);
    IntVector sliced = (IntVector) tp.getTo();
    // In this case, 'vector' holds [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and 'sliced' holds [0, 1, 2, 3, 4].
diff --git a/docs/source/vector_schema_root.rst b/docs/source/vector_schema_root.rst
new file mode 100644
index 00000000..3119122d
--- /dev/null
+++ b/docs/source/vector_schema_root.rst
@@ -0,0 +1,163 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0
.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

============
Tabular Data
============

While arrays (aka: :doc:`ValueVector <./vector>`) represent a one-dimensional sequence of homogeneous values, data often comes in the form of two-dimensional sets of heterogeneous data (such as database tables or CSV files). Arrow provides several abstractions to handle such data conveniently and efficiently.

Fields
======

Fields are used to denote the particular columns of tabular data. A field, i.e. an instance of `Field`_, holds together a field name, a data type, and some optional key-value metadata.

.. code-block:: Java

    // Create a column "document" of string type with metadata
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;

    Map<String, String> metadata = new HashMap<>();
    metadata.put("A", "Id card");
    metadata.put("B", "Passport");
    metadata.put("C", "Visa");
    Field document = new Field("document", new FieldType(true, new ArrowType.Utf8(), /*dictionary*/ null, metadata), /*children*/ null);

Schemas
=======

A `Schema`_ describes the overall structure consisting of any number of columns. It holds a sequence of fields together with some optional schema-wide metadata (in addition to per-field metadata).

.. code-block:: Java

    // Create a schema describing datasets with two columns:
    // an int32 column "A" and a utf8-encoded string column "B"
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;
    import org.apache.arrow.vector.types.pojo.Schema;
    import static java.util.Arrays.asList;

    Map<String, String> metadata = new HashMap<>();
    metadata.put("K1", "V1");
    metadata.put("K2", "V2");
    Field a = new Field("A", FieldType.nullable(new ArrowType.Int(32, true)), null);
    Field b = new Field("B", FieldType.nullable(new ArrowType.Utf8()), null);
    Schema schema = new Schema(asList(a, b), metadata);
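For nested types, the third constructor argument carries the child fields. As a small illustrative sketch (the field names here are hypothetical), a list column whose items are 64-bit integers can be described as:

.. code-block:: Java

    import java.util.Collections;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.FieldType;

    // the child field describes the elements stored in each list
    Field item = new Field("item", FieldType.nullable(new ArrowType.Int(64, true)), null);
    // the list field carries the child field in its 'children' argument
    Field intList = new Field("intList", FieldType.nullable(new ArrowType.List()), Collections.singletonList(item));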
VectorSchemaRoot
================

A `VectorSchemaRoot`_ is a container for batches of data. Batches flow through a VectorSchemaRoot as part of a pipeline.

.. note::

    VectorSchemaRoot is somewhat analogous to tables or record batches in the other Arrow implementations, in that they are all 2D datasets, but its usage is different.

The recommended usage is to create a single VectorSchemaRoot based on a known schema and populate data over and over into that root in a stream of batches, rather than creating a new instance each time (see `Flight`_ or ``ArrowFileWriter`` as examples). Thus, at any one point a VectorSchemaRoot may have data or may have no data (say it was transferred downstream or not yet populated).

Here is an example of creating a VectorSchemaRoot:

.. code-block:: Java

    BitVector bitVector = new BitVector("boolean", allocator);
    VarCharVector varCharVector = new VarCharVector("varchar", allocator);
    bitVector.allocateNew();
    varCharVector.allocateNew();
    for (int i = 0; i < 10; i++) {
      bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
      varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
    }
    bitVector.setValueCount(10);
    varCharVector.setValueCount(10);

    List<Field> fields = Arrays.asList(bitVector.getField(), varCharVector.getField());
    List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
    VectorSchemaRoot vectorSchemaRoot = new VectorSchemaRoot(fields, vectors);

Data can be loaded into/unloaded from a VectorSchemaRoot via `VectorLoader`_ and `VectorUnloader`_. They handle converting between VectorSchemaRoot and `ArrowRecordBatch`_ (a representation of a RecordBatch :external+arrow:ref:`IPC ` message). For example:

.. code-block:: Java

    // create a VectorSchemaRoot root1 and convert its data into recordBatch
    VectorSchemaRoot root1 = new VectorSchemaRoot(fields, vectors);
    VectorUnloader unloader = new VectorUnloader(root1);
    ArrowRecordBatch recordBatch = unloader.getRecordBatch();

    // create a VectorSchemaRoot root2 and load the recordBatch
    VectorSchemaRoot root2 = VectorSchemaRoot.create(root1.getSchema(), allocator);
    VectorLoader loader = new VectorLoader(root2);
    loader.load(recordBatch);

A new VectorSchemaRoot can be sliced from an existing root without copying data:

.. code-block:: Java

    // slice the first 5 rows: 0 is the start index (inclusive) and 5 is the length
    VectorSchemaRoot newRoot = vectorSchemaRoot.slice(0, 5);

Table
=====

A `Table`_ is an immutable tabular data structure, very similar to VectorSchemaRoot in that it is also built on ValueVectors and schemas. Unlike VectorSchemaRoot, however, Table is not designed for batch processing. Here is a version of the example above showing how to create a Table rather than a VectorSchemaRoot:

.. code-block:: Java

    BitVector bitVector = new BitVector("boolean", allocator);
    VarCharVector varCharVector = new VarCharVector("varchar", allocator);
    bitVector.allocateNew();
    varCharVector.allocateNew();
    for (int i = 0; i < 10; i++) {
      bitVector.setSafe(i, i % 2 == 0 ? 0 : 1);
      varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8));
    }
    bitVector.setValueCount(10);
    varCharVector.setValueCount(10);

    List<FieldVector> vectors = Arrays.asList(bitVector, varCharVector);
    Table table = new Table(vectors);

See the :doc:`table` documentation for more information.

.. _`ArrowRecordBatch`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.html
.. _`Field`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Field.html
.. _`Flight`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/package-summary.html
.. _`Schema`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Schema.html
.. _`Table`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/table/Table.html
.. _`VectorLoader`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorLoader.html
.. _`VectorSchemaRoot`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorSchemaRoot.html
.. _`VectorUnloader`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorUnloader.html