Skip to content

Commit

Permalink
apacheGH-15060: [JS] Add LargeUtf8 type (apache#35780)
Browse files Browse the repository at this point in the history
This pull request adds support for the LargeUtf8 type in Arrow. Now we
can create, decode, and encode these vectors. However, while the offset
vectors support 64-bit integers, note that the value buffers are limited
to lengths that fit in 32 bits, meaning that LargeUtf8 vectors cannot yet
be larger than Utf8 vectors. We will see how we can address this limitation
in a follow-up pull request. The issue is that JS typed arrays can be at
most 2**31-1 elements long (the exact limit is implementation-defined). This
pull request also fixes a bug in a rounding method that prevented us from
supporting large vectors, so it's already a big step forward.

Fixes apache#15060.
* Closes: apache#15060

---------

Co-authored-by: Kyle Barron <[email protected]>
  • Loading branch information
domoritz and kylebarron authored Dec 16, 2023
1 parent 132b1f7 commit 49fde23
Show file tree
Hide file tree
Showing 36 changed files with 432 additions and 110 deletions.
2 changes: 1 addition & 1 deletion docs/source/status.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Utf8 |||||||||
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Large Utf8 |||| | ||| |
| Large Utf8 |||| | ||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Binary View || || | | | | |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand Down
4 changes: 2 additions & 2 deletions js/src/Arrow.dom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export {
Bool,
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8,
Utf8, LargeUtf8,
Binary,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Expand Down Expand Up @@ -96,5 +96,5 @@ export {
TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder,
TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder,
UnionBuilder, DenseUnionBuilder, SparseUnionBuilder,
Utf8Builder,
Utf8Builder, LargeUtf8Builder
} from './Arrow.js';
3 changes: 2 additions & 1 deletion js/src/Arrow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ export {
Bool,
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8,
Utf8, LargeUtf8,
Binary,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Expand Down Expand Up @@ -78,6 +78,7 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder,
export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js';
export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js';
export { Utf8Builder } from './builder/utf8.js';
export { LargeUtf8Builder } from './builder/largeutf8.js';
export { BinaryBuilder } from './builder/binary.js';
export { ListBuilder } from './builder/list.js';
export { FixedSizeListBuilder } from './builder/fixedsizelist.js';
Expand Down
27 changes: 12 additions & 15 deletions js/src/builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import {
DataType, strideForType,
Float, Int, Decimal, FixedSizeBinary,
Date_, Time, Timestamp, Interval, Duration,
Utf8, Binary, List, Map_,
Utf8, LargeUtf8, Binary, List, Map_,
} from './type.js';
import { createIsValidFunction } from './builder/valid.js';
import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js';
Expand Down Expand Up @@ -198,10 +198,10 @@ export abstract class Builder<T extends DataType = any, TNull = any> {
return this.children.reduce((size, child) => size + child.reservedByteLength, size);
}

declare protected _offsets: DataBufferBuilder<Int32Array>;
declare protected _offsets: DataBufferBuilder<T['TOffsetArray']>;
public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; }

declare protected _values: BufferBuilder<T['TArray'], any>;
declare protected _values: BufferBuilder<T['TArray']>;
public get values() { return this._values ? this._values.buffer : null; }

declare protected _nulls: BitmapBufferBuilder;
Expand Down Expand Up @@ -277,18 +277,15 @@ export abstract class Builder<T extends DataType = any, TNull = any> {
* @returns A `Data<T>` of the buffers and children representing the values written.
*/
public flush(): Data<T> {

let data;
let typeIds;
let nullBitmap;
let valueOffsets;
let data: BufferBuilder<T['TArray']> | undefined;
let typeIds: Int8Array;
let nullBitmap: Uint8Array | undefined;
let valueOffsets: T['TOffsetArray'];
const { type, length, nullCount, _typeIds, _offsets, _values, _nulls } = this;

if (typeIds = _typeIds?.flush(length)) { // Unions
// DenseUnions
if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions
valueOffsets = _offsets?.flush(length);
} else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists
// Binary, Utf8
} else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists
data = _values?.flush(_offsets.last());
} else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval)
data = _values?.flush(length);
Expand Down Expand Up @@ -355,13 +352,13 @@ export abstract class FixedWidthBuilder<T extends Int | Float | FixedSizeBinary
}

/** @ignore */
export abstract class VariableWidthBuilder<T extends Binary | Utf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
export abstract class VariableWidthBuilder<T extends Binary | Utf8 | LargeUtf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
protected _pendingLength = 0;
protected _offsets: OffsetsBufferBuilder;
protected _offsets: OffsetsBufferBuilder<T>;
protected _pending: Map<number, any> | undefined;
constructor(opts: BuilderOptions<T, TNull>) {
super(opts);
this._offsets = new OffsetsBufferBuilder();
this._offsets = new OffsetsBufferBuilder(opts.type);
}
public setValue(index: number, value: T['TValue']) {
const pending = this._pending || (this._pending = new Map());
Expand Down
52 changes: 21 additions & 31 deletions js/src/builder/buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,21 @@
// under the License.

import { memcpy } from '../util/buffer.js';
import {
TypedArray, TypedArrayConstructor,
BigIntArray, BigIntArrayConstructor
} from '../interfaces.js';

/** @ignore */ type DataValue<T> = T extends TypedArray ? number : T extends BigIntArray ? WideValue<T> : T;
/** @ignore */ type WideValue<T extends BigIntArray> = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never;
/** @ignore */ type ArrayCtor<T extends TypedArray | BigIntArray> =
T extends TypedArray ? TypedArrayConstructor<T> :
T extends BigIntArray ? BigIntArrayConstructor<T> :
any;
import { TypedArray, BigIntArray, ArrayCtor } from '../interfaces.js';
import { DataType } from '../type.js';

/** @ignore */
const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE;
// Rounds `len` elements of `BPE` bytes each up to the next multiple of
// 64 bytes and returns that padded size divided by `BPE`. A request for zero
// (or fewer) bytes still yields one full 64-byte chunk.
//
// The rounding uses remainder arithmetic instead of bit masking because
// JavaScript's bitwise operators coerce their operands to 32-bit integers.
function roundLengthUpToNearest64Bytes(len: number, BPE: number) {
    // Index of the last byte required (−1 when nothing is requested).
    const lastByte = (Math.ceil(len) * BPE) - 1;
    // Drop down to the start of the 64-byte chunk holding `lastByte`, then step one chunk up.
    const paddedBytes = lastByte - (lastByte % 64) + 64;
    // A falsy result (0 or NaN) falls back to a single 64-byte chunk.
    return (paddedBytes ? paddedBytes : 64) / BPE;
}
/**
 * Returns a view of exactly `len` elements over `arr`'s contents.
 *
 * When `arr` already holds at least `len` elements the result is a
 * `subarray` view (no copy). Otherwise a new array of the same constructor
 * and length `len` is allocated and `arr` is copied into it via `memcpy`.
 * @ignore
 */
const sliceOrExtendArray = <T extends TypedArray | BigIntArray>(arr: T, len = 0) => {
    if (arr.length >= len) {
        return arr.subarray(0, len) as T;
    }
    // Too short: allocate through the instance's own constructor so the element type is preserved.
    const grown = new (arr.constructor as any)(len);
    return memcpy(grown, arr, 0) as T;
};

/** @ignore */
export interface BufferBuilder<T extends TypedArray | BigIntArray = any, TValue = DataValue<T>> {
readonly offset: number;
}

/** @ignore */
export class BufferBuilder<T extends TypedArray | BigIntArray = any, TValue = DataValue<T>> {
export class BufferBuilder<T extends TypedArray | BigIntArray> {

constructor(buffer: T, stride = 1) {
this.buffer = buffer;
Expand All @@ -64,8 +53,8 @@ export class BufferBuilder<T extends TypedArray | BigIntArray = any, TValue = Da
public get reservedByteLength() { return this.buffer.byteLength; }

// @ts-ignore
public set(index: number, value: TValue) { return this; }
public append(value: TValue) { return this.set(this.length, value); }
public set(index: number, value: T[0]) { return this; }
public append(value: T[0]) { return this.set(this.length, value); }
public reserve(extra: number) {
if (extra > 0) {
this.length += extra;
Expand Down Expand Up @@ -97,13 +86,11 @@ export class BufferBuilder<T extends TypedArray | BigIntArray = any, TValue = Da
}
}

(BufferBuilder.prototype as any).offset = 0;

/** @ignore */
export class DataBufferBuilder<T extends TypedArray> extends BufferBuilder<T, number> {
export class DataBufferBuilder<T extends TypedArray | BigIntArray> extends BufferBuilder<T> {
public last() { return this.get(this.length - 1); }
public get(index: number) { return this.buffer[index]; }
public set(index: number, value: number) {
public get(index: number): T[0] { return this.buffer[index]; }
public set(index: number, value: T[0]) {
this.reserve(index - this.length + 1);
this.buffer[index * this.stride] = value;
return this;
Expand Down Expand Up @@ -134,23 +121,26 @@ export class BitmapBufferBuilder extends DataBufferBuilder<Uint8Array> {
}

/** @ignore */
export class OffsetsBufferBuilder extends DataBufferBuilder<Int32Array> {
constructor(data = new Int32Array(1)) { super(data, 1); }
public append(value: number) {
export class OffsetsBufferBuilder<T extends DataType> extends DataBufferBuilder<T['TOffsetArray']> {
constructor(type: T) {
super(new type.OffsetArrayType(1), 1);
}

public append(value: T['TOffsetArray'][0]) {
return this.set(this.length - 1, value);
}
public set(index: number, value: number) {
public set(index: number, value: T['TOffsetArray'][0]) {
const offset = this.length - 1;
const buffer = this.reserve(index - offset + 1).buffer;
if (offset < index++) {
if (offset < index++ && offset >= 0) {
buffer.fill(buffer[offset], offset, index);
}
buffer[index] = buffer[index - 1] + value;
return this;
}
public flush(length = this.length - 1) {
if (length > this.length) {
this.set(length - 1, 0);
this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? BigInt(0) : 0);
}
return super.flush(length + 1);
}
Expand Down
59 changes: 59 additions & 0 deletions js/src/builder/largeutf8.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { LargeUtf8 } from '../type.js';
import { encodeUtf8 } from '../util/utf8.js';
import { BufferBuilder } from './buffer.js';
import { VariableWidthBuilder, BuilderOptions } from '../builder.js';

/**
 * Builder for `LargeUtf8` columns.
 *
 * String values are passed through `encodeUtf8` and buffered by the
 * `VariableWidthBuilder` base class until `_flushPending` copies them into
 * the `_values` byte buffer and records each value's length (as a `bigint`)
 * in the offsets builder.
 * @ignore
 */
export class LargeUtf8Builder<TNull = any> extends VariableWidthBuilder<LargeUtf8, TNull> {
    constructor(opts: BuilderOptions<LargeUtf8, TNull>) {
        super(opts);
        // Start from an empty byte buffer; `_flushPending` reserves space as needed.
        const emptyBytes = new Uint8Array(0);
        this._values = new BufferBuilder(emptyBytes);
    }

    /**
     * Size estimate in bytes: pending value bytes, a per-element allowance,
     * plus whatever the offsets, values and null-bitmap builders report.
     */
    public get byteLength(): number {
        // NOTE(review): the 4-bytes-per-element allowance looks carried over from the
        // 32-bit-offset builders; LargeUtf8 offsets are 64-bit — confirm whether this should be 8.
        let total = this._pendingLength + (this.length * 4);
        if (this._offsets) {
            total += this._offsets.byteLength;
        }
        if (this._values) {
            total += this._values.byteLength;
        }
        if (this._nulls) {
            total += this._nulls.byteLength;
        }
        return total;
    }

    /** Queues `value` at `index`, handing the base class the output of `encodeUtf8`. */
    public setValue(index: number, value: string) {
        const utf8Bytes = encodeUtf8(value) as any;
        return super.setValue(index, utf8Bytes);
    }

    // TODO: move to largeBinaryBuilder when implemented
    // protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number): void { }
    /**
     * Copies every pending value into the values buffer, in map iteration
     * order, and records its length in the offsets builder. Entries whose
     * value is `undefined` contribute a zero length and no bytes.
     */
    // @ts-ignore
    protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number) {
        const offsets = this._offsets;
        const bytes = this._values.reserve(pendingLength).buffer;
        // Write position inside `bytes` for the next value.
        let cursor = 0;
        pending.forEach((value, index) => {
            if (value === undefined) {
                offsets.set(index, BigInt(0));
                return;
            }
            const byteCount = value.length;
            bytes.set(value, cursor);
            // The offsets builder takes a length here, not an absolute position.
            offsets.set(index, BigInt(byteCount));
            cursor += byteCount;
        });
    }
}

// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending;
4 changes: 2 additions & 2 deletions js/src/builder/list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js';

/** @ignore */
export class ListBuilder<T extends DataType = any, TNull = any> extends VariableWidthBuilder<List<T>, TNull> {
protected _offsets: OffsetsBufferBuilder;
protected _offsets: OffsetsBufferBuilder<List<T>>;
constructor(opts: BuilderOptions<List<T>, TNull>) {
super(opts);
this._offsets = new OffsetsBufferBuilder();
this._offsets = new OffsetsBufferBuilder(opts.type);
}
public addChild(child: Builder<T>, name = '0') {
if (this.numChildren > 0) {
Expand Down
18 changes: 15 additions & 3 deletions js/src/data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import { Vector } from './vector.js';
import { BufferType, Type, UnionMode } from './enum.js';
import { DataType, strideForType } from './type.js';
import { DataType, LargeUtf8, strideForType } from './type.js';
import { popcnt_bit_range, truncateBitmap } from './util/bit.js';

// When slicing, we do not know the null count of the sliced range without
Expand All @@ -30,11 +30,12 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js';
/** @ignore */ export type NullBuffer = Uint8Array | null | undefined;
/** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike<number> | Iterable<number> | undefined;
/** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike<number> | Iterable<number> | undefined;
/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike<bigint> | Iterable<bigint> | undefined;
/** @ignore */ export type DataBuffer<T extends DataType> = T['TArray'] | ArrayLike<number> | Iterable<number> | undefined;

/** @ignore */
export interface Buffers<T extends DataType> {
[BufferType.OFFSET]: Int32Array;
[BufferType.OFFSET]: T['TOffsetArray'];
[BufferType.DATA]: T['TArray'];
[BufferType.VALIDITY]: Uint8Array;
[BufferType.TYPE]: T['TArray'];
Expand Down Expand Up @@ -264,7 +265,7 @@ import {
} from './type.js';

import { Visitor } from './visitor.js';
import { toArrayBufferView, toInt32Array, toUint8Array } from './util/buffer.js';
import { toArrayBufferView, toBigInt64Array, toInt32Array, toUint8Array } from './util/buffer.js';

class MakeDataVisitor extends Visitor {
public visit<T extends DataType>(props: any): Data<T> {
Expand Down Expand Up @@ -307,6 +308,14 @@ class MakeDataVisitor extends Visitor {
const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props;
return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]);
}
/**
 * Creates the `Data` for a LargeUtf8 column from `props`.
 *
 * Defaults (applied only when the prop is `undefined`): `offset` is 0,
 * `length` is `valueOffsets.length - 1`, and `nullCount` is -1 when a
 * `nullBitmap` prop was supplied, otherwise 0.
 *
 * Props are read with bracket/string keys, matching the sibling visitors.
 */
public visitLargeUtf8<T extends LargeUtf8>(props: LargeUtf8DataProps<T>) {
    const type = props['type'];
    const givenOffset = props['offset'];
    const offset = givenOffset === undefined ? 0 : givenOffset;

    const data = toUint8Array(props['data']);
    const nullBitmap = toUint8Array(props['nullBitmap']);
    // NOTE(review): the props type also admits 32-bit offset inputs; confirm that
    // toBigInt64Array converts their values rather than reinterpreting the bytes.
    const valueOffsets = toBigInt64Array(props['valueOffsets']);

    const givenLength = props['length'];
    const length = givenLength === undefined ? valueOffsets.length - 1 : givenLength;

    const givenNullCount = props['nullCount'];
    // The fallback looks at the raw `nullBitmap` prop, not the converted buffer.
    const nullCount = givenNullCount === undefined ? (props['nullBitmap'] ? -1 : 0) : givenNullCount;

    return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]);
}
public visitBinary<T extends Binary>(props: BinaryDataProps<T>) {
const { ['type']: type, ['offset']: offset = 0 } = props;
const data = toUint8Array(props['data']);
Expand Down Expand Up @@ -436,6 +445,7 @@ interface DurationDataProps<T extends Duration> extends DataProps_<T> { data?: D
interface FixedSizeBinaryDataProps<T extends FixedSizeBinary> extends DataProps_<T> { data?: DataBuffer<T> }
interface BinaryDataProps<T extends Binary> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
interface Utf8DataProps<T extends Utf8> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
interface LargeUtf8DataProps<T extends LargeUtf8> extends DataProps_<T> { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer<T> }
interface ListDataProps<T extends List> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; child: Data<T['valueType']> }
interface FixedSizeListDataProps<T extends FixedSizeList> extends DataProps_<T> { child: Data<T['valueType']> }
interface StructDataProps<T extends Struct> extends DataProps_<T> { children: Data[] }
Expand All @@ -459,6 +469,7 @@ export type DataProps<T extends DataType> = (
T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps<T> :
T extends Binary /* */ ? BinaryDataProps<T> :
T extends Utf8 /* */ ? Utf8DataProps<T> :
T extends LargeUtf8 /* */ ? LargeUtf8DataProps<T> :
T extends List /* */ ? ListDataProps<T> :
T extends FixedSizeList /* */ ? FixedSizeListDataProps<T> :
T extends Struct /* */ ? StructDataProps<T> :
Expand All @@ -485,6 +496,7 @@ export function makeData<T extends Duration>(props: DurationDataProps<T>): Data<
export function makeData<T extends FixedSizeBinary>(props: FixedSizeBinaryDataProps<T>): Data<T>;
export function makeData<T extends Binary>(props: BinaryDataProps<T>): Data<T>;
export function makeData<T extends Utf8>(props: Utf8DataProps<T>): Data<T>;
export function makeData<T extends LargeUtf8>(props: LargeUtf8DataProps<T>): Data<T>;
export function makeData<T extends List>(props: ListDataProps<T>): Data<T>;
export function makeData<T extends FixedSizeList>(props: FixedSizeListDataProps<T>): Data<T>;
export function makeData<T extends Struct>(props: StructDataProps<T>): Data<T>;
Expand Down
6 changes: 3 additions & 3 deletions js/src/enum.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ export enum MessageHeader {
* nested type consisting of other data types, or another data type (e.g. a
* timestamp encoded as an int64).
*
* **Note**: Only enum values 0-18 (NONE through Duration) are written to an Arrow
* IPC payload.
* **Note**: Only non-negative enum values are written to an Arrow IPC payload.
*
* The rest of the values are specified here so TypeScript can narrow the type
* signatures further beyond the base Arrow Types. The Arrow DataTypes include
Expand Down Expand Up @@ -175,6 +174,7 @@ export enum Type {
FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */
Map = 17, /** Map of named logical types */
Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */
LargeUtf8 = 20, /** Large variable-length string as List<Char> */

Dictionary = -1, /** Dictionary aka Category type */
Int8 = -2,
Expand Down Expand Up @@ -205,7 +205,7 @@ export enum Type {
DurationSecond = -27,
DurationMillisecond = -28,
DurationMicrosecond = -29,
DurationNanosecond = -30
DurationNanosecond = -30,
}

export enum BufferType {
Expand Down
Loading

0 comments on commit 49fde23

Please sign in to comment.