diff --git a/README.md b/README.md index 650d106..dddb169 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,10 @@ Arguments: [INPUT] File path to use as input rather than stdin ("-") [default: -] Options: - -s, --sort Order [default: desc] [possible values: desc, asc, unsorted] - -c, --case Normalization [default: lower] [possible values: original, upper, lower] + -s, --sort Sort order [default: desc] [possible values: desc, asc, unsorted] + -c, --case Case normalization [default: lower] [possible values: original, upper, lower] + -m, --min-chars Exclude words that contain fewer than min chars [default: 1] + -M, --min-count Exclude words that appear fewer than min times [default: 1] -D, --delimiter Delimiter between keys and values [default: " "] -o, --output Write output to file rather than stdout -v, --verbose Print verbose details diff --git a/src/args.rs b/src/args.rs index 4537dfc..fae238b 100644 --- a/src/args.rs +++ b/src/args.rs @@ -7,19 +7,27 @@ use word_tally::{Case, Sort}; #[command(about, version)] pub struct Args { /// File path to use as input rather than stdin ("-"). - #[clap(default_value = "-")] + #[arg(default_value = "-")] pub input: FileOrStdin, - /// Order. + /// Sort order. #[arg(short, long, default_value_t, value_enum, value_name = "ORDER")] pub sort: Sort, - /// Normalization. + /// Case normalization. #[arg(short, long, default_value_t, value_enum, value_name = "FORMAT")] pub case: Case, + /// Exclude words that contain fewer than min chars. + #[arg(short, long, default_value_t = 1, value_name = "COUNT")] + pub min_chars: usize, + + /// Exclude words that appear fewer than min times. + #[arg(short = 'M', long, default_value_t = 1, value_name = "COUNT")] + pub min_count: u64, + /// Delimiter between keys and values. - #[clap(short = 'D', long, default_value = " ", value_name = "VALUE")] + #[arg(short = 'D', long, default_value = " ", value_name = "VALUE")] pub delimiter: String, /// Write output to file rather than stdout. 
diff --git a/src/lib.rs b/src/lib.rs index 0de1dff..4719dd4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,10 +17,10 @@ //! # Examples //! //! ``` -//! use word_tally::{Case, Sort, WordTally}; +//! use word_tally::{Case, Minimums, Sort, WordTally}; //! //! let input = "Cinquedea".as_bytes(); -//! let words = WordTally::new(input, Case::Lower, Sort::Desc); +//! let words = WordTally::new(input, Case::Lower, Sort::Desc, Minimums::default()); //! let expected_tally = vec![("cinquedea".to_string(), 1)]; //! //! assert_eq!(words.tally(), expected_tally); @@ -81,11 +81,23 @@ pub enum Sort { Unsorted, } +/// Minimum requirements for a word to be included in the tally. +#[derive(Clone, Copy, Debug, Default)] +pub struct Minimums { + /// Min number of chars for words to be included. + pub chars: usize, + /// Min count of a word for it to be included. + pub count: u64, +} + /// `WordTally` fields are eagerly populated upon construction and exposed by getter methods. impl WordTally { /// Constructs a new `WordTally` from a source that implements `Read` like file or stdin. - pub fn new(input: T, case: Case, order: Sort) -> Self { - let tally_map = Self::tally_map(input, case); + pub fn new(input: T, case: Case, order: Sort, min: Minimums) -> Self { + let mut tally_map = Self::tally_map(input, case, min.chars); + if min.count > 1 { + tally_map.retain(|_, &mut count| count >= min.count); + } let count = tally_map.values().sum(); let tally = Vec::from_iter(tally_map); let uniq_count = tally.len(); @@ -140,20 +152,22 @@ impl WordTally { } /// Creates a tally of optionally normalized words from input that implements `Read`. 
-    fn tally_map(input: T, case: Case) -> HashMap {
+    fn tally_map(input: T, case: Case, min_chars: usize) -> HashMap {
         let mut tally = HashMap::new();
         let lines = BufReader::new(input).lines();
 
         for line in lines.map_while(Result::ok) {
-            line.unicode_words().for_each(|unicode_word| {
-                let word = match case {
-                    Case::Lower => unicode_word.to_lowercase(),
-                    Case::Upper => unicode_word.to_uppercase(),
-                    Case::Original => unicode_word.to_owned(),
-                };
-
-                *tally.entry(word).or_insert(0) += 1;
-            });
+            line.unicode_words()
+                .filter(|unicode_word| min_chars <= 1 || unicode_word.chars().count() >= min_chars)
+                .for_each(|unicode_word| {
+                    let word = match case {
+                        Case::Lower => unicode_word.to_lowercase(),
+                        Case::Upper => unicode_word.to_uppercase(),
+                        Case::Original => unicode_word.to_owned(),
+                    };
+
+                    *tally.entry(word).or_insert(0) += 1;
+                });
         }
 
         tally
diff --git a/src/main.rs b/src/main.rs
index 34f3b40..cedb0ac 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,7 +9,7 @@ use clap_stdin::Source;
 use std::fs::File;
 use std::io::{self, ErrorKind::BrokenPipe, LineWriter, StderrLock, Write};
 use unescaper::unescape;
-use word_tally::{Case, Sort, WordTally};
+use word_tally::{Case, Minimums, Sort, WordTally};
 
 /// `Writer` is a boxed type for dynamic dispatch of the `Write` trait.
type Writer = Box; @@ -20,7 +20,11 @@ fn main() -> Result<()> { .input .into_reader() .with_context(|| format!("Failed to read {:#?}.", args.input.source))?; - let word_tally = WordTally::new(reader, args.case, args.sort); + let minimums = Minimums { + chars: args.min_chars, + count: args.min_count, + }; + let word_tally = WordTally::new(reader, args.case, args.sort, minimums); let delimiter = unescape(&args.delimiter)?; if args.verbose || args.debug { @@ -53,14 +57,7 @@ fn log_details( } if args.debug { - log_debug( - &mut stderr_lock, - args.case, - args.sort, - args.verbose, - args.debug, - delimiter, - )?; + log_debug(&mut stderr_lock, args, delimiter)?; } if word_tally.count() > 0 { @@ -99,21 +96,14 @@ fn log_verbose( } /// Log debug details to stderr. -fn log_debug( - stderr: &mut StderrLock<'_>, - case: Case, - sort: Sort, - verbose: bool, - debug: bool, - delimiter: &str, -) -> Result<()> { - let case_name = match case { +fn log_debug(stderr: &mut StderrLock<'_>, args: &Args, delimiter: &str) -> Result<()> { + let case_name = match args.case { Case::Lower => "lower", Case::Upper => "upper", Case::Original => "original", }; - let sort_name = match sort { + let sort_name = match args.sort { Sort::Asc => "asc", Sort::Desc => "desc", Sort::Unsorted => "unsorted", @@ -123,8 +113,10 @@ fn log_debug( format!("delimiter{delimiter}{delimiter:#?}\n"), format!("case{delimiter}{case_name}\n"), format!("order{delimiter}{sort_name}\n"), - format!("verbose{delimiter}{verbose}\n"), - format!("debug{delimiter}{debug}\n"), + format!("min-chars{delimiter}{}\n", args.min_chars), + format!("min-count{delimiter}{}\n", args.min_count), + format!("verbose{delimiter}{}\n", args.verbose), + format!("debug{delimiter}{}\n", args.debug), ]; for detail in &details { @@ -162,7 +154,7 @@ fn write_tally( // used to kill the program if it tries to write to a closed pipe. 
fn piping(result: std::io::Result<()>) -> Result<()> { match result { - Ok(_) => Ok(()), + Ok(()) => Ok(()), Err(err) => match err.kind() { BrokenPipe => Ok(()), _ => Err(err.into()), diff --git a/tests/lib.rs b/tests/lib.rs index 6c422b3..4949cae 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -1,28 +1,28 @@ use std::fs::File; use std::hash::{DefaultHasher, Hash, Hasher}; -use word_tally::{Case, Sort, WordTally}; +use word_tally::{Case, Minimums, Sort, WordTally}; const TEST_WORDS_PATH: &str = "tests/files/words.txt"; struct ExpectedFields<'a> { count: u64, uniq_count: usize, - avg: f64, + avg: Option, tally: Vec<(&'a str, u64)>, } -fn word_tally(case: Case, sort: Sort) -> WordTally { +fn word_tally(case: Case, sort: Sort, minimums: Minimums) -> WordTally { let input = File::open(TEST_WORDS_PATH) .expect("Expected test words file (`files/words.txt`) to be readable."); - WordTally::new(input, case, sort) + WordTally::new(input, case, sort, minimums) } -fn word_tally_test(case: Case, sort: Sort, fields: &ExpectedFields<'_>) { - let word_tally = word_tally(case, sort); +fn word_tally_test(case: Case, sort: Sort, minimums: Minimums, fields: &ExpectedFields<'_>) { + let word_tally = word_tally(case, sort, minimums); assert_eq!(word_tally.count(), fields.count); assert_eq!(word_tally.uniq_count(), fields.uniq_count); - assert_eq!(word_tally.avg().unwrap(), fields.avg); + assert_eq!(word_tally.avg(), fields.avg); let expected_tally: Vec<(String, u64)> = fields .tally @@ -37,24 +37,89 @@ fn lower_case_desc_order() { word_tally_test( Case::Lower, Sort::Desc, + Minimums::default(), &ExpectedFields { count: 45, uniq_count: 5, - avg: 9.0, + avg: Some(9.0), tally: vec![("c", 15), ("d", 11), ("123", 9), ("b", 7), ("a", 3)], }, ); } +#[test] +fn min_char_count_at_max() { + word_tally_test( + Case::Lower, + Sort::Desc, + Minimums { chars: 3, count: 1 }, + &ExpectedFields { + count: 9, + uniq_count: 1, + avg: Some(9.0), + tally: vec![("123", 9)], + }, + ); +} + +#[test] +fn 
min_char_count_above_max() { + word_tally_test( + Case::Lower, + Sort::Desc, + Minimums { chars: 4, count: 1 }, + &ExpectedFields { + count: 0, + uniq_count: 0, + avg: None, + tally: vec![], + }, + ); +} + +#[test] +fn min_char_count_at_min() { + word_tally_test( + Case::Lower, + Sort::Desc, + Minimums::default(), + &ExpectedFields { + count: 45, + uniq_count: 5, + avg: Some(9.0), + tally: vec![("c", 15), ("d", 11), ("123", 9), ("b", 7), ("a", 3)], + }, + ); +} + +#[test] +fn min_word_count_at_max() { + word_tally_test( + Case::Lower, + Sort::Desc, + Minimums { + chars: 1, + count: 15, + }, + &ExpectedFields { + count: 15, + uniq_count: 1, + avg: Some(15.0), + tally: vec![("c", 15)], + }, + ); +} + #[test] fn upper_case_desc_order() { word_tally_test( Case::Upper, Sort::Desc, + Minimums::default(), &ExpectedFields { count: 45, uniq_count: 5, - avg: 9.0, + avg: Some(9.0), tally: vec![("C", 15), ("D", 11), ("123", 9), ("B", 7), ("A", 3)], }, ); @@ -65,10 +130,11 @@ fn lower_case_asc_order() { word_tally_test( Case::Lower, Sort::Asc, + Minimums::default(), &ExpectedFields { count: 45, uniq_count: 5, - avg: 9.0, + avg: Some(9.0), tally: vec![("a", 3), ("b", 7), ("123", 9), ("d", 11), ("c", 15)], }, ); @@ -79,10 +145,11 @@ fn upper_case_asc_order() { word_tally_test( Case::Upper, Sort::Asc, + Minimums::default(), &ExpectedFields { count: 45, uniq_count: 5, - avg: 9.0, + avg: Some(9.0), tally: vec![("A", 3), ("B", 7), ("123", 9), ("D", 11), ("C", 15)], }, ); @@ -93,10 +160,11 @@ fn original_case_desc_order() { word_tally_test( Case::Original, Sort::Desc, + Minimums::default(), &ExpectedFields { count: 45, uniq_count: 9, - avg: 5.0, + avg: Some(5.0), tally: vec![ ("123", 9), ("C", 8), @@ -117,10 +185,11 @@ fn original_case_asc_order() { word_tally_test( Case::Original, Sort::Asc, + Minimums::default(), &ExpectedFields { count: 45, uniq_count: 9, - avg: 5.0, + avg: Some(5.0), tally: vec![ ("a", 1), ("A", 2), @@ -163,7 +232,7 @@ fn equality_and_hashing() { let tallies: Vec 
= cases_and_sorts .iter() - .map(|&(case, sort)| word_tally(case, sort)) + .map(|&(case, sort)| word_tally(case, sort, Minimums::default())) .collect(); for tally in &tallies { @@ -183,7 +252,7 @@ fn equality_and_hashing() { #[test] fn vec_from() { - let tally = word_tally(Case::Lower, Sort::Desc); + let tally = word_tally(Case::Lower, Sort::Desc, Minimums::default()); assert_eq!( Vec::from(tally), diff --git a/tests/main.rs b/tests/main.rs index bd2c7da..c7d202b 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -23,7 +23,31 @@ fn debug_without_input() { let assert = word_tally().arg("-d").assert(); assert .success() - .stderr("delimiter \" \"\ncase lower\norder desc\nverbose false\ndebug true\n") + .stderr( + "delimiter \" \"\ncase lower\norder desc\nmin-chars 1\nmin-count 1\nverbose false\ndebug true\n", + ) + .stdout(""); +} + +#[test] +fn debug_with_min_chars() { + let assert = word_tally().arg("-d").arg("--min-chars=42").assert(); + assert + .success() + .stderr( + "delimiter \" \"\ncase lower\norder desc\nmin-chars 42\nmin-count 1\nverbose false\ndebug true\n", + ) + .stdout(""); +} + +#[test] +fn debug_with_min_count() { + let assert = word_tally().arg("-d").arg("--min-count=42").assert(); + assert + .success() + .stderr( + "delimiter \" \"\ncase lower\norder desc\nmin-chars 1\nmin-count 42\nverbose false\ndebug true\n", + ) .stdout(""); } @@ -32,7 +56,7 @@ fn debug_with_input() { let assert = word_tally().write_stdin("wombat").arg("-d").assert(); assert .success() - .stderr("delimiter \" \"\ncase lower\norder desc\nverbose false\ndebug true\n\n") + .stderr("delimiter \" \"\ncase lower\norder desc\nmin-chars 1\nmin-count 1\nverbose false\ndebug true\n\n") .stdout("wombat 1\n"); }