Skip to content

Commit

Permalink
Merge pull request #2380 from dathere/2379-join_right_anti_and_semi_o…
Browse files Browse the repository at this point in the history
…ptions

feat: `join` add `--right-anti` and `--right-semi` options
  • Loading branch information
jqnatividad authored Dec 25, 2024
2 parents 6e3532d + beaab63 commit a9af236
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 11 deletions.
58 changes: 47 additions & 11 deletions src/cmd/join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ join options:
corresponding row in the first data set. When no
corresponding row exists, it is padded out with
empty fields. (This is the reverse of 'outer left'.)
--right-anti This returns only the rows in the second CSV data set
that do not have a corresponding row in the first
data set. The output schema is the same as the
second dataset.
--right-semi This returns only the rows in the second CSV data set
that have a corresponding row in the first data set.
The output schema is the same as the second data set.
--full Do a 'full outer' join. This returns all rows in
both data sets with matching records joined. If
there is no match, the missing side will be padded
Expand All @@ -71,7 +78,7 @@ Common options:
Must be a single character. (default: ,)
"#;

use std::{collections::hash_map::Entry, fmt, io, iter::repeat, str};
use std::{collections::hash_map::Entry, fmt, io, iter::repeat, mem::swap, str};

use ahash::AHashMap;
use byteorder::{BigEndian, WriteBytesExt};
Expand All @@ -96,6 +103,8 @@ struct Args {
flag_left_anti: bool,
flag_left_semi: bool,
flag_right: bool,
flag_right_anti: bool,
flag_right_semi: bool,
flag_full: bool,
flag_cross: bool,
flag_output: Option<String>,
Expand All @@ -113,37 +122,64 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
args.flag_left_anti,
args.flag_left_semi,
args.flag_right,
args.flag_right_anti,
args.flag_right_semi,
args.flag_full,
args.flag_cross,
) {
(true, false, false, false, false, false) => {
// default inner join
(false, false, false, false, false, false, false, false) => {
state.write_headers()?;
state.inner_join()
},
// left join
(true, false, false, false, false, false, false, false) => {
state.write_headers()?;
state.outer_join(false)
},
(false, true, false, false, false, false) => {
// left anti join
(false, true, false, false, false, false, false, false) => {
state.write_headers1()?;
state.left_join(true)
},
(false, false, true, false, false, false) => {
// left semi join
(false, false, true, false, false, false, false, false) => {
state.write_headers1()?;
state.left_join(false)
},
(false, false, false, true, false, false) => {
// right join
(false, false, false, true, false, false, false, false) => {
state.write_headers()?;
state.outer_join(true)
},
(false, false, false, false, true, false) => {
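// right anti join
// swap the left and right data sets, then run a left anti join on the swapped state
// NOTE(review): write_headers1() is called before the swap below, so the header row
// presumably comes from the first data set, while the usage text says the output
// schema is that of the second data set - confirm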
(false, false, false, false, true, false, false, false) => {
state.write_headers1()?;
let mut swapped_join = state;
swap(&mut swapped_join.rdr1, &mut swapped_join.rdr2);
swap(&mut swapped_join.sel1, &mut swapped_join.sel2);
swapped_join.left_join(true)
},
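// right semi join
// swap the left and right data sets, then run a left semi join on the swapped state
// NOTE(review): write_headers1() is called before the swap below, so the header row
// presumably comes from the first data set, while the usage text says the output
// schema is that of the second data set - confirm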
(false, false, false, false, false, true, false, false) => {
state.write_headers1()?;
let mut swapped_join = state;
swap(&mut swapped_join.rdr1, &mut swapped_join.rdr2);
swap(&mut swapped_join.sel1, &mut swapped_join.sel2);
swapped_join.left_join(false)
},
// full outer join
(false, false, false, false, false, false, true, false) => {
state.write_headers()?;
state.full_outer_join()
},
(false, false, false, false, false, true) => {
// cross join
(false, false, false, false, false, false, false, true) => {
state.write_headers()?;
state.cross_join()
},
(false, false, false, false, false, false) => {
state.write_headers()?;
state.inner_join()
},
_ => fail_incorrectusage_clierror!("Please pick exactly one join operation."),
}
}
Expand Down
17 changes: 17 additions & 0 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1341,6 +1341,23 @@ pub fn round_num(dec_f64: f64, places: u32) -> String {
}

#[inline]
/// Transforms a byte slice into a ByteString with optional case-insensitive conversion.
///
/// This function takes a byte slice and attempts to convert it to a UTF-8 string. If successful,
/// it trims whitespace and optionally converts to lowercase. If the input is not valid UTF-8,
/// it returns the original bytes unchanged.
///
/// It's fine-tuned for speed and memory usage, using simdutf8 for UTF-8 validation and
/// to_lowercase_into for non-allocating, in-place lowercase conversion.
///
/// # Arguments
///
/// * `bs` - The input byte slice to transform
/// * `casei` - If true, converts the string to lowercase. If false, leaves case unchanged.
///
/// # Returns
///
/// * A `ByteString` (`Vec<u8>`) containing the transformed bytes
pub fn transform(bs: &[u8], casei: bool) -> ByteString {
if let Ok(s) = simdutf8::basic::from_utf8(bs) {
if casei {
Expand Down
67 changes: 67 additions & 0 deletions tests/test_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ join_test!(
join_outer_full,
|wrk: Workdir, mut cmd: process::Command, headers: bool| {
cmd.arg("--full");

wrk.assert_success(&mut cmd);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
let expected = make_rows(
headers,
Expand Down Expand Up @@ -301,3 +304,67 @@ fn join_cross_no_headers() {
];
assert_eq!(got, expected);
}

// `--right-semi`: checks the command's stdout against the three rows listed below
// (plus whatever `make_rows` prepends when `headers` is set).
join_test!(
    join_right_semi,
    |wrk: Workdir, mut cmd: process::Command, headers: bool| {
        // Build the expectation first; it depends only on `headers`.
        let matched_rows = vec![
            svec!["Boston", "Logan Airport"],
            svec!["Boston", "Boston Garden"],
            svec!["Buffalo", "Ralph Wilson Stadium"],
        ];
        let want = make_rows(headers, true, matched_rows);

        // Run the join with the flag under test and capture its output rows.
        cmd.arg("--right-semi");
        let actual: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);

        assert_eq!(actual, want);
    }
);

// `--right-semi --ignore-case`: same check as the case-sensitive variant, with the
// upper-case "BOSTON" row additionally expected in the output.
join_test!(
    join_right_semi_casei,
    |wrk: Workdir, mut cmd: process::Command, headers: bool| {
        // Build the expectation first; it depends only on `headers`.
        let matched_rows = vec![
            svec!["Boston", "Logan Airport"],
            svec!["Boston", "Boston Garden"],
            svec!["Buffalo", "Ralph Wilson Stadium"],
            svec!["BOSTON", "BOSTON COMMON"],
        ];
        let want = make_rows(headers, true, matched_rows);

        // Both flags are appended in the same order as before.
        cmd.args(&["--right-semi", "--ignore-case"]);
        let actual: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);

        assert_eq!(actual, want);
    }
);

// `--right-anti`: checks the command's stdout against the two rows listed below
// (plus whatever `make_rows` prepends when `headers` is set).
join_test!(
    join_right_anti,
    |wrk: Workdir, mut cmd: process::Command, headers: bool| {
        // Build the expectation first; it depends only on `headers`.
        let unmatched_rows = vec![
            svec!["Orlando", "Disney World"],
            svec!["BOSTON", "BOSTON COMMON"],
        ];
        let want = make_rows(headers, true, unmatched_rows);

        // Run the join with the flag under test and capture its output rows.
        cmd.arg("--right-anti");
        let actual: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);

        assert_eq!(actual, want);
    }
);

// `--right-anti --ignore-case`: only the "Orlando" row is expected in the output.
join_test!(
    join_right_anti_casei,
    |wrk: Workdir, mut cmd: process::Command, headers: bool| {
        // Build the expectation first; it depends only on `headers`.
        let unmatched_rows = vec![svec!["Orlando", "Disney World"]];
        let want = make_rows(headers, true, unmatched_rows);

        // Both flags are appended in the same order as before.
        cmd.args(&["--right-anti", "--ignore-case"]);
        let actual: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);

        assert_eq!(actual, want);
    }
);

0 comments on commit a9af236

Please sign in to comment.