Skip to content

Commit

Permalink
Add --min-chars and --min-count options
Browse files Browse the repository at this point in the history
  • Loading branch information
havenwood committed May 24, 2024
1 parent ac15859 commit b35f0fb
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 60 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ Arguments:
[INPUT] File path to use as input rather than stdin ("-") [default: -]
Options:
-s, --sort <ORDER> Order [default: desc] [possible values: desc, asc, unsorted]
-c, --case <FORMAT> Normalization [default: lower] [possible values: original, upper, lower]
-s, --sort <ORDER> Sort order [default: desc] [possible values: desc, asc, unsorted]
-c, --case <FORMAT> Case normalization [default: lower] [possible values: original, upper, lower]
-m, --min-chars <COUNT> Exclude words that contain fewer than min chars [default: 1]
-M, --min-count <COUNT> Exclude words that appear fewer than min times [default: 1]
-D, --delimiter <VALUE> Delimiter between keys and values [default: " "]
-o, --output <PATH> Write output to file rather than stdout
-v, --verbose Print verbose details
Expand Down
16 changes: 12 additions & 4 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,27 @@ use word_tally::{Case, Sort};
#[command(about, version)]
pub struct Args {
/// File path to use as input rather than stdin ("-").
#[clap(default_value = "-")]
#[arg(default_value = "-")]
pub input: FileOrStdin<PathBuf>,

/// Order.
/// Sort order.
#[arg(short, long, default_value_t, value_enum, value_name = "ORDER")]
pub sort: Sort,

/// Normalization.
/// Case normalization.
#[arg(short, long, default_value_t, value_enum, value_name = "FORMAT")]
pub case: Case,

/// Exclude words that contain fewer than min chars.
#[arg(short, long, default_value_t = 1, value_name = "COUNT")]
pub min_chars: usize,

/// Exclude words that appear fewer than min times.
#[arg(short = 'M', long, default_value_t = 1, value_name = "COUNT")]
pub min_count: u64,

/// Delimiter between keys and values.
#[clap(short = 'D', long, default_value = " ", value_name = "VALUE")]
#[arg(short = 'D', long, default_value = " ", value_name = "VALUE")]
pub delimiter: String,

/// Write output to file rather than stdout.
Expand Down
42 changes: 28 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
//! # Examples
//!
//! ```
//! use word_tally::{Case, Sort, WordTally};
//! use word_tally::{Case, Minimums, Sort, WordTally};
//!
//! let input = "Cinquedea".as_bytes();
//! let words = WordTally::new(input, Case::Lower, Sort::Desc);
//! let words = WordTally::new(input, Case::Lower, Sort::Desc, Minimums::default());
//! let expected_tally = vec![("cinquedea".to_string(), 1)];
//!
//! assert_eq!(words.tally(), expected_tally);
Expand Down Expand Up @@ -81,11 +81,23 @@ pub enum Sort {
Unsorted,
}

/// Minimum requirements for a word to be included in the tally.
///
/// A value of `0` or `1` in either field leaves the corresponding filter off when
/// passed to `WordTally::new`: every word has at least one char and is counted at
/// least once. The derived `Default` therefore yields an unfiltered tally
/// (`chars: 0`, `count: 0`).
///
/// `PartialEq`, `Eq` and `Hash` are derived so that settings can be compared,
/// asserted on in tests, and used as map or set keys.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
pub struct Minimums {
    /// Min number of chars for words to be included.
    pub chars: usize,
    /// Min count of a word for it to be included.
    pub count: u64,
}

/// `WordTally` fields are eagerly populated upon construction and exposed by getter methods.
impl WordTally {
/// Constructs a new `WordTally` from a source that implements `Read` like file or stdin.
pub fn new<T: Read>(input: T, case: Case, order: Sort) -> Self {
let tally_map = Self::tally_map(input, case);
pub fn new<T: Read>(input: T, case: Case, order: Sort, min: Minimums) -> Self {
let mut tally_map = Self::tally_map(input, case, min.chars);
if min.count > 1 {
tally_map.retain(|_, &mut count| count >= min.count);
}
let count = tally_map.values().sum();
let tally = Vec::from_iter(tally_map);
let uniq_count = tally.len();
Expand Down Expand Up @@ -140,20 +152,22 @@ impl WordTally {
}

/// Creates a tally of optionally normalized words from input that implements `Read`.
fn tally_map<T: Read>(input: T, case: Case) -> HashMap<String, u64> {
fn tally_map<T: Read>(input: T, case: Case, min_chars: usize) -> HashMap<String, u64> {
let mut tally = HashMap::new();
let lines = BufReader::new(input).lines();

for line in lines.map_while(Result::ok) {
line.unicode_words().for_each(|unicode_word| {
let word = match case {
Case::Lower => unicode_word.to_lowercase(),
Case::Upper => unicode_word.to_uppercase(),
Case::Original => unicode_word.to_owned(),
};

*tally.entry(word).or_insert(0) += 1;
});
line.unicode_words()
    // `str::len()` is a length in bytes, so it over-counts any word containing
    // multi-byte characters. Count `char`s instead so the filter measures what
    // `--min-chars` describes. A minimum of 0 or 1 can never exclude a word, so
    // the count is skipped entirely in that case.
    .filter(|unicode_word| min_chars <= 1 || unicode_word.chars().count() >= min_chars)
    .for_each(|unicode_word| {
        // Normalize case before tallying so differently-cased spellings share a key
        // (unless the original case was requested).
        let word = match case {
            Case::Lower => unicode_word.to_lowercase(),
            Case::Upper => unicode_word.to_uppercase(),
            Case::Original => unicode_word.to_owned(),
        };

        *tally.entry(word).or_insert(0) += 1;
    });
}

tally
Expand Down
38 changes: 15 additions & 23 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use clap_stdin::Source;
use std::fs::File;
use std::io::{self, ErrorKind::BrokenPipe, LineWriter, StderrLock, Write};
use unescaper::unescape;
use word_tally::{Case, Sort, WordTally};
use word_tally::{Case, Minimums, Sort, WordTally};

/// `Writer` is a boxed type for dynamic dispatch of the `Write` trait.
type Writer = Box<dyn Write>;
Expand All @@ -20,7 +20,11 @@ fn main() -> Result<()> {
.input
.into_reader()
.with_context(|| format!("Failed to read {:#?}.", args.input.source))?;
let word_tally = WordTally::new(reader, args.case, args.sort);
let minimums = Minimums {
chars: args.min_chars,
count: args.min_count,
};
let word_tally = WordTally::new(reader, args.case, args.sort, minimums);
let delimiter = unescape(&args.delimiter)?;

if args.verbose || args.debug {
Expand Down Expand Up @@ -53,14 +57,7 @@ fn log_details(
}

if args.debug {
log_debug(
&mut stderr_lock,
args.case,
args.sort,
args.verbose,
args.debug,
delimiter,
)?;
log_debug(&mut stderr_lock, args, delimiter)?;
}

if word_tally.count() > 0 {
Expand Down Expand Up @@ -99,21 +96,14 @@ fn log_verbose(
}

/// Log debug details to stderr.
fn log_debug(
stderr: &mut StderrLock<'_>,
case: Case,
sort: Sort,
verbose: bool,
debug: bool,
delimiter: &str,
) -> Result<()> {
let case_name = match case {
fn log_debug(stderr: &mut StderrLock<'_>, args: &Args, delimiter: &str) -> Result<()> {
let case_name = match args.case {
Case::Lower => "lower",
Case::Upper => "upper",
Case::Original => "original",
};

let sort_name = match sort {
let sort_name = match args.sort {
Sort::Asc => "asc",
Sort::Desc => "desc",
Sort::Unsorted => "unsorted",
Expand All @@ -123,8 +113,10 @@ fn log_debug(
format!("delimiter{delimiter}{delimiter:#?}\n"),
format!("case{delimiter}{case_name}\n"),
format!("order{delimiter}{sort_name}\n"),
format!("verbose{delimiter}{verbose}\n"),
format!("debug{delimiter}{debug}\n"),
format!("min-chars{delimiter}{}\n", args.min_chars),
format!("min-count{delimiter}{}\n", args.min_count),
format!("verbose{delimiter}{}\n", args.verbose),
format!("debug{delimiter}{}\n", args.debug),
];

for detail in &details {
Expand Down Expand Up @@ -162,7 +154,7 @@ fn write_tally(
// used to kill the program if it tries to write to a closed pipe.
fn piping(result: std::io::Result<()>) -> Result<()> {
match result {
Ok(_) => Ok(()),
Ok(()) => Ok(()),
Err(err) => match err.kind() {
BrokenPipe => Ok(()),
_ => Err(err.into()),
Expand Down
99 changes: 84 additions & 15 deletions tests/lib.rs
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
use std::fs::File;
use std::hash::{DefaultHasher, Hash, Hasher};
use word_tally::{Case, Sort, WordTally};
use word_tally::{Case, Minimums, Sort, WordTally};

const TEST_WORDS_PATH: &str = "tests/files/words.txt";

struct ExpectedFields<'a> {
count: u64,
uniq_count: usize,
avg: f64,
avg: Option<f64>,
tally: Vec<(&'a str, u64)>,
}

fn word_tally(case: Case, sort: Sort) -> WordTally {
fn word_tally(case: Case, sort: Sort, minimums: Minimums) -> WordTally {
let input = File::open(TEST_WORDS_PATH)
.expect("Expected test words file (`files/words.txt`) to be readable.");

WordTally::new(input, case, sort)
WordTally::new(input, case, sort, minimums)
}

fn word_tally_test(case: Case, sort: Sort, fields: &ExpectedFields<'_>) {
let word_tally = word_tally(case, sort);
fn word_tally_test(case: Case, sort: Sort, minimums: Minimums, fields: &ExpectedFields<'_>) {
let word_tally = word_tally(case, sort, minimums);
assert_eq!(word_tally.count(), fields.count);
assert_eq!(word_tally.uniq_count(), fields.uniq_count);
assert_eq!(word_tally.avg().unwrap(), fields.avg);
assert_eq!(word_tally.avg(), fields.avg);

let expected_tally: Vec<(String, u64)> = fields
.tally
Expand All @@ -37,24 +37,89 @@ fn lower_case_desc_order() {
word_tally_test(
Case::Lower,
Sort::Desc,
Minimums::default(),
&ExpectedFields {
count: 45,
uniq_count: 5,
avg: 9.0,
avg: Some(9.0),
tally: vec![("c", 15), ("d", 11), ("123", 9), ("b", 7), ("a", 3)],
},
);
}

#[test]
fn min_char_count_at_max() {
    // Requiring three chars keeps only "123"; the other fixture words are one char long.
    let minimums = Minimums { chars: 3, count: 1 };
    let expected = ExpectedFields {
        count: 9,
        uniq_count: 1,
        avg: Some(9.0),
        tally: vec![("123", 9)],
    };

    word_tally_test(Case::Lower, Sort::Desc, minimums, &expected);
}

#[test]
fn min_char_count_above_max() {
    // Requiring four chars excludes every fixture word, so the tally is empty
    // and no average is reported.
    let minimums = Minimums { chars: 4, count: 1 };
    let expected = ExpectedFields {
        count: 0,
        uniq_count: 0,
        avg: None,
        tally: vec![],
    };

    word_tally_test(Case::Lower, Sort::Desc, minimums, &expected);
}

#[test]
fn min_char_count_at_min() {
    // Pass the lowest meaningful minimums explicitly (the CLI defaults) rather than
    // `Minimums::default()`, so this test exercises the boundary it is named for
    // instead of repeating `lower_case_desc_order`. Nothing may be filtered out.
    word_tally_test(
        Case::Lower,
        Sort::Desc,
        Minimums { chars: 1, count: 1 },
        &ExpectedFields {
            count: 45,
            uniq_count: 5,
            avg: Some(9.0),
            tally: vec![("c", 15), ("d", 11), ("123", 9), ("b", 7), ("a", 3)],
        },
    );
}

#[test]
fn min_word_count_at_max() {
    // Only "c" appears fifteen times; every other fixture word appears less often.
    let minimums = Minimums {
        chars: 1,
        count: 15,
    };
    let expected = ExpectedFields {
        count: 15,
        uniq_count: 1,
        avg: Some(15.0),
        tally: vec![("c", 15)],
    };

    word_tally_test(Case::Lower, Sort::Desc, minimums, &expected);
}

#[test]
fn upper_case_desc_order() {
word_tally_test(
Case::Upper,
Sort::Desc,
Minimums::default(),
&ExpectedFields {
count: 45,
uniq_count: 5,
avg: 9.0,
avg: Some(9.0),
tally: vec![("C", 15), ("D", 11), ("123", 9), ("B", 7), ("A", 3)],
},
);
Expand All @@ -65,10 +130,11 @@ fn lower_case_asc_order() {
word_tally_test(
Case::Lower,
Sort::Asc,
Minimums::default(),
&ExpectedFields {
count: 45,
uniq_count: 5,
avg: 9.0,
avg: Some(9.0),
tally: vec![("a", 3), ("b", 7), ("123", 9), ("d", 11), ("c", 15)],
},
);
Expand All @@ -79,10 +145,11 @@ fn upper_case_asc_order() {
word_tally_test(
Case::Upper,
Sort::Asc,
Minimums::default(),
&ExpectedFields {
count: 45,
uniq_count: 5,
avg: 9.0,
avg: Some(9.0),
tally: vec![("A", 3), ("B", 7), ("123", 9), ("D", 11), ("C", 15)],
},
);
Expand All @@ -93,10 +160,11 @@ fn original_case_desc_order() {
word_tally_test(
Case::Original,
Sort::Desc,
Minimums::default(),
&ExpectedFields {
count: 45,
uniq_count: 9,
avg: 5.0,
avg: Some(5.0),
tally: vec![
("123", 9),
("C", 8),
Expand All @@ -117,10 +185,11 @@ fn original_case_asc_order() {
word_tally_test(
Case::Original,
Sort::Asc,
Minimums::default(),
&ExpectedFields {
count: 45,
uniq_count: 9,
avg: 5.0,
avg: Some(5.0),
tally: vec![
("a", 1),
("A", 2),
Expand Down Expand Up @@ -163,7 +232,7 @@ fn equality_and_hashing() {

let tallies: Vec<WordTally> = cases_and_sorts
.iter()
.map(|&(case, sort)| word_tally(case, sort))
.map(|&(case, sort)| word_tally(case, sort, Minimums::default()))
.collect();

for tally in &tallies {
Expand All @@ -183,7 +252,7 @@ fn equality_and_hashing() {

#[test]
fn vec_from() {
let tally = word_tally(Case::Lower, Sort::Desc);
let tally = word_tally(Case::Lower, Sort::Desc, Minimums::default());

assert_eq!(
Vec::from(tally),
Expand Down
Loading

0 comments on commit b35f0fb

Please sign in to comment.