Implement Optional Footnote References Ignore #311

Merged: 1 commit, Nov 7, 2023
6 changes: 3 additions & 3 deletions doc-chunks/src/chunk.rs
@@ -9,11 +9,11 @@ use std::convert::TryFrom;
use std::fmt;
use std::path::Path;

use crate::PlainOverlay;
use crate::{
util::{sub_char_range, sub_chars},
Range, Span,
};
use crate::{Ignores, PlainOverlay};

/// Definition of the source of a checkable chunk
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
@@ -364,8 +364,8 @@ impl CheckableChunk {

/// Obtain an accessor object containing mapping and string representation,
/// removing the markdown annotations.
pub fn erase_cmark(&self) -> PlainOverlay {
PlainOverlay::erase_cmark(self)
pub fn erase_cmark(&self, ignores: &Ignores) -> PlainOverlay {
PlainOverlay::erase_cmark(self, ignores)
}

/// Obtain the length in characters.
19 changes: 15 additions & 4 deletions doc-chunks/src/markdown.rs
@@ -136,7 +136,10 @@ impl<'a> PlainOverlay<'a> {
}

/// Ranges are mapped `cmark reduced/plain -> raw`.
pub fn extract_plain_with_mapping(cmark: &str) -> (String, IndexMap<Range, SourceRange>) {
pub fn extract_plain_with_mapping(
cmark: &str,
ignores: &Ignores,
) -> (String, IndexMap<Range, SourceRange>) {
let mut plain = String::with_capacity(cmark.len());
let mut mapping = indexmap::IndexMap::with_capacity(128);

@@ -322,7 +325,7 @@ impl<'a> PlainOverlay<'a> {
}
}
Event::FootnoteReference(s) => {
if !s.is_empty() {
if !ignores.footnote_references && !s.is_empty() {
let char_range = Range {
start: char_range.start + 2,
end: char_range.end - 1,
@@ -368,8 +371,8 @@ impl<'a> PlainOverlay<'a> {
/// reference.
// TODO consider returning a `Vec<PlainOverlay<'a>>` to account for list items
// or other non-linear information which might not pass a grammar check as a whole
pub fn erase_cmark(chunk: &'a CheckableChunk) -> Self {
let (plain, mapping) = Self::extract_plain_with_mapping(chunk.as_str());
pub fn erase_cmark(chunk: &'a CheckableChunk, ignores: &Ignores) -> Self {
let (plain, mapping) = Self::extract_plain_with_mapping(chunk.as_str(), ignores);
Self {
raw: chunk,
plain,
@@ -518,3 +521,11 @@ impl<'a> fmt::Debug for PlainOverlay<'a> {
Ok(())
}
}

/// Explicitly ignored markdown entities. The `Default` implementation means we
/// do not ignore anything, which is the backwards-compatible configuration.
#[derive(Clone, Default)]
pub struct Ignores {
/// Ignore [footnote references](Event::FootnoteReference).
pub footnote_references: bool,
}
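
For illustration, a minimal sketch (not part of the diff) of how the new `Ignores` parameter feeds into `extract_plain_with_mapping`. The input and expected outputs mirror the `check_footnote_references` test added in `src/tests.rs` below; it assumes `Ignores` and `PlainOverlay` are reachable from the `doc_chunks` crate root.

use doc_chunks::{Ignores, PlainOverlay};

let source = "Hello[^xyz].\n\n[^xyz]: World.";

// Backwards-compatible default: the reference label is kept, so `Hello` and
// `xyz` fuse into the single (misspelled) word `Helloxyz`.
let (plain, _mapping) =
    PlainOverlay::extract_plain_with_mapping(source, &Ignores::default());
assert_eq!(plain, "Helloxyz.\n\nWorld.");

// With footnote references ignored, only the surrounding word remains.
let ignores = Ignores {
    footnote_references: true,
};
let (plain, _mapping) = PlainOverlay::extract_plain_with_mapping(source, &ignores);
assert_eq!(plain, "Hello.\n\nWorld.");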
8 changes: 8 additions & 0 deletions docs/configuration.md
@@ -97,6 +97,14 @@ allow_concatenation = true
# And the counterpart, which accepts words with dashes, when the suggestion has
# recommendations without the dashes. This is less common.
allow_dashed = false
# Check the expressions in footnote references. By default this is turned on
# to remain backwards compatible, but disabling it can be particularly useful
# when abbreviations instead of numbers are used as footnote references. For
# instance, by default the fragment `hello[^xyz]` would be spellchecked as
# `helloxyz`, which is obviously a misspelled word; turning this check off
# skips validating the reference altogether and only checks the word `hello`.
check_footnote_references = false

[NlpRules]
# Allows the user to override the default included
2 changes: 1 addition & 1 deletion src/checker/dummy.rs
@@ -39,7 +39,7 @@ impl Checker for DummyChecker {
let chunk = chunks
.first()
.expect("DummyChecker expects at least one chunk");
let plain = chunk.erase_cmark();
let plain = chunk.erase_cmark(&Default::default());
let txt = plain.as_str();
for (index, range) in apply_tokenizer(&tokenizer, txt).enumerate() {
log::trace!("****Token[{}]: >{}<", index, sub_chars(txt, range.clone()));
31 changes: 21 additions & 10 deletions src/checker/hunspell.rs
@@ -23,6 +23,8 @@ use std::sync::Arc;

use hunspell_rs::{CheckResult, Hunspell};

use doc_chunks::Ignores;

use crate::errors::*;

use super::quirks::{
@@ -133,22 +135,28 @@ pub struct HunspellCheckerInner {
allow_concatenated: bool,
allow_dashed: bool,
allow_emojis: bool,
check_footnote_references: bool,
ignorelist: String,
}

impl HunspellCheckerInner {
fn new(config: &<HunspellChecker as Checker>::Config) -> Result<Self> {
// TODO allow override
let (transform_regex, allow_concatenated, allow_dashed, allow_emojis) = {
let (
transform_regex,
allow_concatenated,
allow_dashed,
allow_emojis,
check_footnote_references,
) = {
let quirks = &config.quirks;
{
(
quirks.transform_regex().to_vec(),
quirks.allow_concatenated(),
quirks.allow_dashed(),
quirks.allow_emojis(),
)
}
(
quirks.transform_regex().to_vec(),
quirks.allow_concatenated(),
quirks.allow_dashed(),
quirks.allow_emojis(),
quirks.check_footnote_references(),
)
};
// FIXME rename the config option
let ignorelist = config.tokenization_splitchars.clone();
@@ -257,6 +265,7 @@ impl HunspellCheckerInner {
allow_concatenated,
allow_dashed,
allow_emojis,
check_footnote_references,
ignorelist,
})
}
@@ -299,7 +308,9 @@ impl Checker for HunspellChecker {
let mut acc = Vec::with_capacity(chunks.len());

for chunk in chunks {
let plain = chunk.erase_cmark();
let plain = chunk.erase_cmark(&Ignores {
footnote_references: !self.0.check_footnote_references,
});
log::trace!("{plain:?}");
let txt = plain.as_str();
let hunspell = &*self.hunspell.0;
7 changes: 6 additions & 1 deletion src/checker/nlprules.rs
@@ -114,7 +114,12 @@ fn check_chunk<'a>(
tokenizer: &Tokenizer,
rules: &Rules,
) -> Vec<Suggestion<'a>> {
let plain = chunk.erase_cmark();
// TODO We should control which markdown entities get ignored here as well.
// However, the `NlpRulesConfig`, which is the only configuration we receive
// in the constructor, does not contain the same quirks (or any similar
// settings) as the Hunspell one, so we cannot obtain this setting and
// therefore fall back to the default.
let plain = chunk.erase_cmark(&Default::default());
log::trace!("{plain:?}");
let txt = plain.as_str();

16 changes: 15 additions & 1 deletion src/config/hunspell.rs
@@ -30,15 +30,25 @@ pub struct Quirks {
/// Treats sequences of emojis as OK.
#[serde(default = "yes")]
pub allow_emojis: bool,
/// Check the expressions in footnote references. By default this is
/// turned on to remain backwards compatible, but disabling it can be
/// particularly useful when abbreviations instead of numbers are used as
/// footnote references. For instance, by default the fragment
/// `hello[^xyz]` would be spellchecked as `helloxyz`, which is obviously
/// a misspelled word; turning this check off skips validating the
/// reference altogether and only checks the word `hello`.
#[serde(default = "yes")]
pub check_footnote_references: bool,
}

impl Default for Quirks {
fn default() -> Self {
Self {
transform_regex: vec![],
transform_regex: Vec::new(),
allow_concatenation: false,
allow_dashes: false,
allow_emojis: true,
check_footnote_references: true,
}
}
}
@@ -59,6 +69,10 @@ impl Quirks {
pub(crate) fn transform_regex(&self) -> &[WrappedRegex] {
&self.transform_regex
}

pub(crate) fn check_footnote_references(&self) -> bool {
self.check_footnote_references
}
}

fn default_tokenization_splitchars() -> String {
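As a small illustration (again not part of the diff) of how the new quirk and the `Ignores` type fit together: the `Default` implementation keeps `check_footnote_references` enabled, and the Hunspell checker above inverts that flag when building the `Ignores` passed to `erase_cmark`, so an untouched configuration keeps checking footnote references exactly as before. The sketch assumes `Quirks` (from `src/config/hunspell.rs`) is in scope.

use doc_chunks::Ignores;

// Default configuration: footnote references are still checked.
let quirks = Quirks::default();
assert!(quirks.check_footnote_references);

// The checker turns the quirk into the corresponding `Ignores`, i.e. nothing
// gets ignored for the default configuration.
let ignores = Ignores {
    footnote_references: !quirks.check_footnote_references,
};
assert!(!ignores.footnote_references);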
6 changes: 3 additions & 3 deletions src/reflow/tests.rs
@@ -92,7 +92,7 @@ macro_rules! reflow_content {
let chunks = docs.get(&$content_type).expect("Contains test data. qed");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
let _plain = chunk.erase_cmark();
let _plain = chunk.erase_cmark(&Default::default());
let suggestions = reflow(&$content_type, chunk, &CFG).expect("Reflow is working. qed");

let patches = suggestions
@@ -134,7 +134,7 @@ macro_rules! reflow_content {
let chunks = docs.get(&$content_type).expect("Contains test data. qed");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
let _plain = chunk.erase_cmark();
let _plain = chunk.erase_cmark(&Default::default());
let suggestions = reflow(&$content_type, chunk, &CFG).expect("Reflow is working. qed");

assert_eq!(
@@ -159,7 +159,7 @@ macro_rules! reflow_content {
let chunks = docs.get(&$content_type).expect("Contains test data. qed");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
let _plain = chunk.erase_cmark();
let _plain = chunk.erase_cmark(&Default::default());
println!("reflow content:\n {:?}", $content);
let suggestions = reflow(&$content_type, chunk, &CFG).expect("Reflow is working. qed");
let patches = suggestions
45 changes: 36 additions & 9 deletions src/tests.rs
@@ -2,7 +2,7 @@ use super::*;
use crate::checker::Checker;
use crate::util::{load_span_from, sub_char_range, sub_chars};
use crate::{chyrp_up, fluff_up};
use doc_chunks::literalset::testhelper::gen_literal_set;
use doc_chunks::{literalset::testhelper::gen_literal_set, Ignores};

use crate::documentation::{testcase::annotated_literals, SourceRange};
use indexmap::IndexMap;
@@ -31,7 +31,7 @@ fn parse_and_construct() {
// TODO
let chunk = &chunks[0];
assert_eq!(chunk.as_str(), TEST_RAW.to_owned());
let plain = chunk.erase_cmark();
let plain = chunk.erase_cmark(&Default::default());
println!("{plain:?}");

assert_eq!(TEST_PLAIN, plain.as_str());
@@ -98,7 +98,7 @@ macro_rules! end2end {
let chunks = docs.get(&origin).expect("Must contain dummy path");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
let _plain = chunk.erase_cmark();
let _plain = chunk.erase_cmark(&Default::default());
let cfg = $cfg;
dbg!(std::any::type_name::<$checker>());
let checker = <$checker>::new(&cfg).expect("Checker construction works");
@@ -259,7 +259,7 @@ struct CAPI;
#[test]
fn issue_281() {
let dict_path = temp_dir().join(uuid::Uuid::new_v4().to_string() + ".dic");
// Any of the two hypthens cause havoc
// Any of the two hyphens cause havoc
const EXTRA_DICT: &str = r###"2
@@ -326,7 +326,7 @@ struct CAPI;
assert_eq!(chunks.len(), 1);
assert_eq!(RAW, chunk.as_str());

let plain = dbg!(chunk.erase_cmark());
let plain = dbg!(chunk.erase_cmark(&Default::default()));
assert_eq!(dbg!($plain), plain.as_str());

let mut it = suggestions.into_iter();
@@ -600,6 +600,31 @@ Ref4"#;
}
}

#[test]
fn check_footnote_references() {
const SOURCE: &str = "Hello[^xyz].\n\n[^xyz]: World.";
let origin = ContentOrigin::TestEntityCommonMark;

let documentation = Documentation::load_from_str(origin.clone(), SOURCE, false, false);
assert_eq!(documentation.len(), 1);

let chunks = documentation.get(&origin).expect("Must contain dummy path");
assert_eq!(dbg!(chunks).len(), 1);

let chunk = &chunks[0];
assert_eq!(chunk.as_str(), SOURCE);

let plain = chunk.erase_cmark(&Ignores {
footnote_references: false,
});
assert_eq!(plain.as_str(), "Helloxyz.\n\nWorld.");

let plain = chunk.erase_cmark(&Ignores {
footnote_references: true,
});
assert_eq!(plain.as_str(), "Hello.\n\nWorld.");
}

#[test]
fn find_spans_emoji() {
const TEST: &str = r##"ab **🐡** xy"##;
@@ -1050,7 +1075,7 @@ fn drill_span() {
CommentVariant::CommonMark,
);

let plain = chunk.erase_cmark();
let plain = chunk.erase_cmark(&Default::default());
assert_eq!(plain.find_spans(0..2).len(), 1);
assert_eq!(plain.find_spans(3..4).len(), 1);
assert_eq!(plain.find_spans(5..7).len(), 1);
@@ -1100,7 +1125,8 @@


And a line, or a rule."##;
let (reduced, mapping) = PlainOverlay::extract_plain_with_mapping(MARKDOWN);
let (reduced, mapping) =
PlainOverlay::extract_plain_with_mapping(MARKDOWN, &Default::default());

assert_eq!(dbg!(&reduced).as_str(), PLAIN);
assert_eq!(dbg!(&mapping).len(), 22);
@@ -1117,7 +1143,8 @@ fn reduction_leading_space() {
const MARKDOWN: &str = r#" Some __underlined__ **bold** text."#;
const PLAIN: &str = r#"Some underlined bold text."#;

let (reduced, mapping) = PlainOverlay::extract_plain_with_mapping(MARKDOWN);
let (reduced, mapping) =
PlainOverlay::extract_plain_with_mapping(MARKDOWN, &Default::default());

assert_eq!(dbg!(&reduced).as_str(), PLAIN);
assert_eq!(dbg!(&mapping).len(), 5);
@@ -1158,7 +1185,7 @@ fn range_test() {
}

fn cmark_reduction_test(input: &'static str, expected: &'static str, expected_mapping_len: usize) {
let (plain, mapping) = PlainOverlay::extract_plain_with_mapping(input);
let (plain, mapping) = PlainOverlay::extract_plain_with_mapping(input, &Default::default());
assert_eq!(dbg!(&plain).as_str(), expected);
assert_eq!(dbg!(&mapping).len(), expected_mapping_len);
for (reduced_range, markdown_range) in mapping.into_iter() {