Skip to content

Commit

Permalink
fix binary field update
Browse files Browse the repository at this point in the history
  • Loading branch information
eddyxu committed Jul 9, 2024
1 parent 2502cdb commit 296cd8f
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
20 changes: 20 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,26 @@ def test_update_dataset_all_types(tmp_path: Path):
assert dataset.to_table() == expected


def test_update_with_binary_field(tmp_path: Path):
    """Updating a binary column with a SQL hex literal stores the decoded bytes."""
    # Create a lance dataset with binary fields.
    # Bytes literals cannot be f-strings (b"bin-{i}" is the literal text
    # "bin-{i}"), so format as str and encode to get a distinct value per row.
    table = pa.Table.from_pydict(
        {
            "a": [f"str-{i}" for i in range(100)],
            "b": [f"bin-{i}".encode() for i in range(100)],
            "c": list(range(100)),
        }
    )
    dataset = lance.write_dataset(table, tmp_path)

    # Update binary field: X'616263' is the hex literal for the bytes b"abc".
    dataset.update({"b": "X'616263'"}, where="c < 2")

    # Re-open the dataset from disk and check the two rows matched by the filter.
    ds = lance.dataset(tmp_path)
    assert ds.scanner(filter="c < 2").to_table().column(
        "b"
    ).combine_chunks() == pa.array([b"abc", b"abc"])


def test_create_update_empty_dataset(tmp_path: Path, provide_pandas: bool):
base_dir = tmp_path / "dataset"

Expand Down
40 changes: 39 additions & 1 deletion rust/lance/src/io/exec/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,9 @@ impl Planner {
Value::DollarQuotedString(_) => todo!(),
Value::EscapedStringLiteral(_) => todo!(),
Value::NationalStringLiteral(_) => todo!(),
Value::HexStringLiteral(_) => todo!(),
Value::HexStringLiteral(hsl) => {
Expr::Literal(ScalarValue::Binary(Self::try_decode_hex_literal(hsl)))
}
Value::DoubleQuotedString(s) => Expr::Literal(ScalarValue::Utf8(Some(s.clone()))),
Value::Boolean(v) => Expr::Literal(ScalarValue::Boolean(Some(*v))),
Value::Null => Expr::Literal(ScalarValue::Null),
Expand Down Expand Up @@ -673,6 +675,42 @@ impl Planner {
Ok(resolved)
}

/// Decode the digits of a hex literal into the bytes they spell out.
///
/// When the literal has an odd number of digits, the leading digit forms a
/// byte on its own; every following pair of digits forms one byte, high
/// nibble first. Returns `None` as soon as a non-hex character is seen.
///
/// Copied from datafusion because this is not public.
///
/// TODO: use SqlToRel from Datafusion directly?
fn try_decode_hex_literal(s: &str) -> Option<Vec<u8>> {
    let digits = s.as_bytes();
    let mut out = Vec::with_capacity((digits.len() + 1) / 2);

    // Peel off the unpaired leading digit (if any) so the rest splits into
    // exact two-digit pairs.
    let (lead, pairs) = digits.split_at(digits.len() % 2);
    if let Some(&single) = lead.first() {
        out.push(Self::try_decode_hex_char(single)?);
    }

    for pair in pairs.chunks_exact(2) {
        let hi = Self::try_decode_hex_char(pair[0])?;
        let lo = Self::try_decode_hex_char(pair[1])?;
        out.push((hi << 4) | lo);
    }

    Some(out)
}

/// Map one ASCII hex digit to its numeric value (0..=15).
///
/// Accepts `0-9`, `a-f` and `A-F`; any other byte yields `None`.
const fn try_decode_hex_char(c: u8) -> Option<u8> {
    let nibble = match c {
        b'0'..=b'9' => c - b'0',
        b'a'..=b'f' => c - b'a' + 10,
        b'A'..=b'F' => c - b'A' + 10,
        // Not a hex digit.
        _ => return None,
    };
    Some(nibble)
}

/// Optimize the filter expression and coerce data types.
pub fn optimize_expr(&self, expr: Expr) -> Result<Expr> {
let df_schema = Arc::new(DFSchema::try_from(self.schema.as_ref().clone())?);
Expand Down

0 comments on commit 296cd8f

Please sign in to comment.