Skip to content

Commit

Permalink
API: Move variant to API and add extract expression (#12304)
Browse files Browse the repository at this point in the history
  • Loading branch information
rdblue authored Feb 21, 2025
1 parent 30fd752 commit d4fe23a
Show file tree
Hide file tree
Showing 10 changed files with 522 additions and 3 deletions.
75 changes: 75 additions & 0 deletions api/src/main/java/org/apache/iceberg/expressions/BoundExtract.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.expressions;

import org.apache.iceberg.StructLike;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.types.Type;

/**
 * A bound term that extracts a value of a given type from a path within a bound reference.
 *
 * <p>This term only describes the extraction; it cannot be evaluated against a row, and {@link
 * #eval(StructLike)} always throws.
 *
 * @param <T> the Java type of values produced by this term
 */
public class BoundExtract<T> implements BoundTerm<T> {
  private final BoundReference<?> ref;
  private final String path;
  private final String fullFieldName;
  private final Type type;

  /**
   * Creates a bound extract term.
   *
   * @param ref the bound reference to extract from
   * @param path the path to extract; it is parsed (and thereby validated) by {@link PathUtil}
   * @param type the type reported by {@link #type()}
   */
  BoundExtract(BoundReference<?> ref, String path, Type type) {
    this.ref = ref;
    this.path = path;
    this.type = type;
    // the member names that follow the path root, joined with dots
    this.fullFieldName = Joiner.on(".").join(PathUtil.parse(path));
  }

  /** Returns the bound reference this term extracts from. */
  @Override
  public BoundReference<?> ref() {
    return ref;
  }

  /** Returns the path exactly as it was passed to the constructor. */
  public String path() {
    return path;
  }

  /** Returns the parsed member names of the path joined with {@code "."}. */
  String fullFieldName() {
    return fullFieldName;
  }

  /** Returns the type produced by this extraction. */
  @Override
  public Type type() {
    return type;
  }

  /**
   * Returns whether {@code other} is also an extract with an equivalent reference, an equal path
   * string, and an equal type.
   */
  @Override
  public boolean isEquivalentTo(BoundTerm<?> other) {
    if (!(other instanceof BoundExtract)) {
      return false;
    }

    BoundExtract<?> that = (BoundExtract<?>) other;
    return ref.isEquivalentTo(that.ref) && path.equals(that.path) && type.equals(that.type);
  }

  /**
   * Not supported for extract terms.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public T eval(StructLike struct) {
    throw new UnsupportedOperationException("Cannot evaluate " + this);
  }

  @Override
  public String toString() {
    return "extract(" + ref + ", path=" + path + ", type=" + type + ")";
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ public Type type() {
return field.type();
}

/**
 * Returns whether this reference may produce null values.
 *
 * @return true if the referenced field is optional, false otherwise
 */
@Override
public boolean producesNull() {
return field.isOptional();
}

@Override
public String name() {
return name;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ public interface BoundTerm<T> extends Bound<T>, Term {
/** Returns the type produced by this expression. */
Type type();

/**
 * Returns whether values produced by this expression may be null.
 *
 * <p>The default implementation returns true, so a term that does not override this method is
 * treated as possibly producing null.
 *
 * @return true if this term may produce null values
 */
default boolean producesNull() {
return true;
}

/** Returns a {@link Comparator} for values produced by this term. */
default Comparator<T> comparator() {
return Comparators.forType(type().asPrimitiveType());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ public Transform<S, T> transform() {
return transform;
}

/**
 * Returns whether this transform term may produce null values.
 *
 * @return true if the underlying reference may produce null or the transform does not preserve
 *     order
 */
@Override
public boolean producesNull() {
  // transforms must produce null for null input values
  if (ref.producesNull()) {
    return true;
  }

  // transforms may produce null for non-null inputs when not order-preserving
  return !transform.preservesOrder();
}

@Override
public Type type() {
return transform.getResultType(ref.type());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ public static <T> UnboundTerm<T> truncate(String name, int width) {
return new UnboundTransform<>(ref(name), Transforms.truncate(width));
}

/**
 * Creates an unbound term that extracts a value from the column {@code name}.
 *
 * <p>The arguments are passed unchanged to the {@link UnboundExtract} constructor, together with
 * a reference to {@code name}; any validation of {@code path} and {@code type} happens there.
 *
 * @param name name of the column to extract from
 * @param path path of the value to extract
 * @param type string form of the type to extract
 * @param <T> Java type of the values produced by the term
 * @return a new {@link UnboundExtract}
 */
public static <T> UnboundTerm<T> extract(String name, String path, String type) {
return new UnboundExtract<>(ref(name), path, type);
}

/**
 * Creates an unbound {@code IS_NULL} predicate on a reference to the column {@code name}.
 *
 * @param name name of the column to test
 * @param <T> Java type of the referenced values
 * @return a new unbound predicate with operation {@link Expression.Operation#IS_NULL}
 */
public static <T> UnboundPredicate<T> isNull(String name) {
return new UnboundPredicate<>(Expression.Operation.IS_NULL, ref(name));
}
Expand Down
64 changes: 64 additions & 0 deletions api/src/main/java/org/apache/iceberg/expressions/PathUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.expressions;

import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;

/**
 * Parses the restricted path syntax accepted by extract expressions: the root {@code $} followed
 * by zero or more dot-separated member names.
 *
 * <p>Brackets, wildcards, and recursive descent are rejected, and every member name must match
 * the member-name shorthand pattern defined by the constants below.
 */
class PathUtil {
  private static final String ROOT = "$";
  private static final Splitter DOT = Splitter.on(".");

  // first character of a member name: a letter, underscore, or non-ASCII code point
  private static final String RFC9535_NAME_FIRST =
      "[A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]";
  // remaining characters of a member name: the same set plus digits
  private static final String RFC9535_NAME_CHARS =
      "[0-9A-Za-z_\\x{0080}-\\x{D7FF}\\x{E000}-\\x{10FFFF}]*";
  private static final Predicate<String> RFC9535_MEMBER_NAME_SHORTHAND =
      Pattern.compile(RFC9535_NAME_FIRST + RFC9535_NAME_CHARS).asMatchPredicate();

  private PathUtil() {}

  /**
   * Validates {@code path} and returns the member names that follow the root.
   *
   * <p>The checks run in a fixed order: null, brackets, wildcard, recursive descent, root, and
   * finally each member name. The first failed check determines the error message.
   *
   * @param path a path such as {@code $.a.b}
   * @return the member names after the root, in order; empty when the path is only the root
   * @throws IllegalArgumentException if any check fails
   */
  static List<String> parse(String path) {
    Preconditions.checkArgument(path != null, "Invalid path: null");

    boolean hasBracket = path.contains("[") || path.contains("]");
    Preconditions.checkArgument(!hasBracket, "Unsupported path, contains bracket: %s", path);

    boolean hasWildcard = path.contains("*");
    Preconditions.checkArgument(!hasWildcard, "Unsupported path, contains wildcard: %s", path);

    boolean hasRecursiveDescent = path.contains("..");
    Preconditions.checkArgument(
        !hasRecursiveDescent, "Unsupported path, contains recursive descent: %s", path);

    List<String> segments = DOT.splitToList(path);
    String first = segments.get(0);
    Preconditions.checkArgument(
        ROOT.equals(first), "Invalid path, does not start with %s: %s", ROOT, path);

    // everything after the root segment is a member name
    List<String> memberNames = segments.subList(1, segments.size());
    memberNames.forEach(
        member ->
            Preconditions.checkArgument(
                RFC9535_MEMBER_NAME_SHORTHAND.test(member),
                "Invalid path: %s (%s has invalid characters)",
                path,
                member));

    return memberNames;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.expressions;

import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

/**
 * An unbound term that extracts a value of a primitive type from a path within a named column.
 *
 * @param <T> the Java type of values produced by the bound term
 */
public class UnboundExtract<T> implements UnboundTerm<T> {
  private final NamedReference<?> ref;
  private final String path;
  private final Type.PrimitiveType type;

  /**
   * Creates an unbound extract term.
   *
   * @param ref reference to the column to extract from
   * @param path path of the value to extract; it must be accepted by {@link PathUtil}
   * @param type string form of the type, parsed with {@link Types#fromPrimitiveString(String)}
   */
  public UnboundExtract(NamedReference<?> ref, String path, String type) {
    this.ref = ref;
    this.path = path;
    this.type = Types.fromPrimitiveString(type);
    // the parsed names are not needed here; parsing is only used to reject a malformed path
    PathUtil.parse(path);
  }

  /**
   * Binds this term to a struct, producing a {@link BoundExtract}.
   *
   * @param struct the struct type to bind the column reference to
   * @param caseSensitive whether the column name is resolved case sensitively
   * @return a bound extract over the bound reference with this term's path and type
   * @throws ValidationException if the bound column is not a variant, or if the extracted type
   *     is the unknown type
   */
  @Override
  public BoundTerm<T> bind(Types.StructType struct, boolean caseSensitive) {
    BoundReference<?> bound = ref.bind(struct, caseSensitive);

    boolean isVariant = Types.VariantType.get().equals(bound.type());
    ValidationException.check(
        isVariant, "Cannot bind extract, not a variant: %s", bound.name());

    boolean isUnknown = type.equals(Types.UnknownType.get());
    ValidationException.check(!isUnknown, "Invalid type to extract: unknown");

    return new BoundExtract<>(bound, path, type);
  }

  /** Returns the unbound reference to the column this term extracts from. */
  @Override
  public NamedReference<?> ref() {
    return ref;
  }

  /** Returns the path exactly as it was passed to the constructor. */
  public String path() {
    return path;
  }

  /** Returns the primitive type parsed from the constructor's type string. */
  public Type type() {
    return type;
  }

  @Override
  public String toString() {
    return "extract(" + ref + ", path=" + path + ", type=" + type + ")";
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.StructType;
import org.apache.iceberg.util.CharSequenceSet;

Expand Down Expand Up @@ -124,13 +125,17 @@ public Expression bind(StructType struct, boolean caseSensitive) {
private Expression bindUnaryOperation(BoundTerm<T> boundTerm) {
switch (op()) {
case IS_NULL:
if (boundTerm.ref().field().isRequired()) {
if (!boundTerm.producesNull()) {
return Expressions.alwaysFalse();
} else if (boundTerm.type().equals(Types.UnknownType.get())) {
return Expressions.alwaysTrue();
}
return new BoundUnaryPredicate<>(Operation.IS_NULL, boundTerm);
case NOT_NULL:
if (boundTerm.ref().field().isRequired()) {
if (!boundTerm.producesNull()) {
return Expressions.alwaysTrue();
} else if (boundTerm.type().equals(Types.UnknownType.get())) {
return Expressions.alwaysFalse();
}
return new BoundUnaryPredicate<>(Operation.NOT_NULL, boundTerm);
case IS_NAN:
Expand All @@ -155,6 +160,14 @@ private boolean floatingType(Type.TypeID typeID) {
}

private Expression bindLiteralOperation(BoundTerm<T> boundTerm) {
if (op() == Operation.STARTS_WITH || op() == Operation.NOT_STARTS_WITH) {
ValidationException.check(
boundTerm.type().equals(Types.StringType.get()),
"Term for STARTS_WITH or NOT_STARTS_WITH must produce a string: %s: %s",
boundTerm,
boundTerm.type());
}

Literal<T> lit = literal().to(boundTerm.type());

if (lit == null) {
Expand Down
Loading

0 comments on commit d4fe23a

Please sign in to comment.