Skip to content

Commit

Permalink
perf: Switch to byte processing (#219)
Browse files Browse the repository at this point in the history
The grammar is fairly neutral, so long as we can handle "non-ASCII".

This does introduce `unsafe` for cases where the grammar guarantees
we'll have valid UTF-8, to avoid validation.  There might be some more
cases where we can bypass the checks; I was being cautious and only
using `unsafe` for 7-bit ASCII.  If we ensure our input is only UTF-8
and we only split on 7-bit ASCII, we are probably safe to bypass
validation everywhere.

So this saves us having to encode UTF-8 as `char` at the cost of some
UTF-8 validation.  Looks like this gives us a 6-12% reduction in parsing
time.
  • Loading branch information
epage authored Sep 29, 2021
1 parent adad869 commit 13d027f
Show file tree
Hide file tree
Showing 17 changed files with 495 additions and 243 deletions.
48 changes: 33 additions & 15 deletions src/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,20 @@ impl FromStr for Datetime {

/// Parses a value from a &str
fn from_str(s: &str) -> Result<Self, Self::Err> {
use combine::stream::position::{IndexPositioner, Positioner};
use combine::EasyParser;
let result = parser::datetime::date_time().easy_parse(Stream::new(s));

let b = s.as_bytes();
let result = parser::datetime::date_time().easy_parse(Stream::new(b));
match result {
Ok((_, ref rest)) if !rest.input.is_empty() => {
Err(parser::TomlError::from_unparsed(rest.positioner, s))
}
Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
(&rest.positioner
as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
.position(),
b,
)),
Ok((dt, _)) => Ok(dt),
Err(e) => Err(parser::TomlError::new(e, s)),
Err(e) => Err(parser::TomlError::new(e, b)),
}
}
}
Expand Down Expand Up @@ -205,14 +211,20 @@ impl FromStr for Date {

/// Parses a value from a &str
fn from_str(s: &str) -> Result<Self, Self::Err> {
use combine::stream::position::{IndexPositioner, Positioner};
use combine::EasyParser;
let result = parser::datetime::full_date().easy_parse(Stream::new(s));

let b = s.as_bytes();
let result = parser::datetime::full_date().easy_parse(Stream::new(b));
match result {
Ok((_, ref rest)) if !rest.input.is_empty() => {
Err(parser::TomlError::from_unparsed(rest.positioner, s))
}
Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
(&rest.positioner
as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
.position(),
b,
)),
Ok((dt, _)) => Ok(dt),
Err(e) => Err(parser::TomlError::new(e, s)),
Err(e) => Err(parser::TomlError::new(e, b)),
}
}
}
Expand Down Expand Up @@ -276,14 +288,20 @@ impl FromStr for Time {

/// Parses a value from a &str
fn from_str(s: &str) -> Result<Self, Self::Err> {
use combine::stream::position::{IndexPositioner, Positioner};
use combine::EasyParser;
let result = parser::datetime::partial_time().easy_parse(Stream::new(s));

let b = s.as_bytes();
let result = parser::datetime::partial_time().easy_parse(Stream::new(b));
match result {
Ok((_, ref rest)) if !rest.input.is_empty() => {
Err(parser::TomlError::from_unparsed(rest.positioner, s))
}
Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
(&rest.positioner
as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
.position(),
b,
)),
Ok((dt, _)) => Ok(dt),
Err(e) => Err(parser::TomlError::new(e, s)),
Err(e) => Err(parser::TomlError::new(e, b)),
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,6 @@ impl FromStr for Document {

/// Parses a document from a &str
fn from_str(s: &str) -> Result<Self, Self::Err> {
parser::TomlParser::parse(s)
parser::TomlParser::parse(s.as_bytes())
}
}
18 changes: 12 additions & 6 deletions src/key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,20 @@ impl Key {
}

fn try_parse(s: &str) -> Result<Key, parser::TomlError> {
use combine::stream::position::{IndexPositioner, Positioner};
use combine::EasyParser;
let result = parser::key_parser().easy_parse(Stream::new(s));

let b = s.as_bytes();
let result = parser::key_parser().easy_parse(Stream::new(b));
match result {
Ok((_, ref rest)) if !rest.input.is_empty() => {
Err(parser::TomlError::from_unparsed(rest.positioner, s))
}
Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
(&rest.positioner
as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
.position(),
b,
)),
Ok(((raw, key), _)) => Ok(Key::new(key).with_repr_unchecked(Repr::new_unchecked(raw))),
Err(e) => Err(parser::TomlError::new(e, s)),
Err(e) => Err(parser::TomlError::new(e, b)),
}
}
}
Expand All @@ -126,7 +132,7 @@ impl FromStr for Key {
}

fn to_key_repr(key: &str) -> Repr {
if key.chars().all(is_unquoted_char) && !key.is_empty() {
if key.as_bytes().iter().copied().all(is_unquoted_char) && !key.is_empty() {
Repr::new_unchecked(key)
} else {
to_string_repr(key, Some(StringStyle::OnelineSingle), Some(false))
Expand Down
28 changes: 17 additions & 11 deletions src/parser/array.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::parser::trivia::ws_comment_newline;
use crate::parser::value::value;
use crate::{Array, Value};
use combine::parser::char::char;
use combine::parser::byte::byte;
use combine::parser::range::recognize_with_value;
use combine::stream::RangeStream;
use combine::*;
Expand All @@ -10,18 +10,18 @@ use combine::*;

// array = array-open array-values array-close
parse!(array() -> Array, {
between(char(ARRAY_OPEN), char(ARRAY_CLOSE),
between(byte(ARRAY_OPEN), byte(ARRAY_CLOSE),
array_values())
});

// note: we're omitting ws and newlines here, because
// they should be part of the formatted values
// array-open = %x5B ws-newline ; [
const ARRAY_OPEN: char = '[';
const ARRAY_OPEN: u8 = b'[';
// array-close = ws-newline %x5D ; ]
const ARRAY_CLOSE: char = ']';
const ARRAY_CLOSE: u8 = b']';
// array-sep = ws %x2C ws ; , Comma
const ARRAY_SEP: char = ',';
const ARRAY_SEP: u8 = b',';

// note: this rule is modified
// array-values = [ ( array-value array-sep array-values ) /
Expand All @@ -30,15 +30,15 @@ parse!(array_values() -> Array, {
(
optional(
recognize_with_value(
sep_end_by1(array_value(), char(ARRAY_SEP))
).map(|(r, v): (&'a str, Array)| (v, r.ends_with(',')))
sep_end_by1(array_value(), byte(ARRAY_SEP))
).map(|(r, v): (&'a [u8], Array)| (v, r[r.len() - 1] == b','))
),
ws_comment_newline(),
).map(|(array, trailing)| {
).and_then::<_, _, std::str::Utf8Error>(|(array, trailing)| {
let (mut array, comma) = array.unwrap_or_default();
array.set_trailing_comma(comma);
array.set_trailing(trailing);
array
array.set_trailing(std::str::from_utf8(trailing)?);
Ok(array)
})
});

Expand All @@ -47,5 +47,11 @@ parse!(array_value() -> Value, {
ws_comment_newline(),
value(),
ws_comment_newline(),
)).map(|(ws1, v, ws2)| v.decorated(ws1, ws2))
)).and_then::<_, _, std::str::Utf8Error>(|(ws1, v, ws2)| {
let v = v.decorated(
std::str::from_utf8(ws1)?,
std::str::from_utf8(ws2)?,
);
Ok(v)
})
});
47 changes: 31 additions & 16 deletions src/parser/datetime.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::datetime::*;
use crate::parser::errors::CustomError;
use combine::parser::char::char;
use combine::parser::range::{take, take_while1};
use crate::parser::trivia::from_utf8_unchecked;
use combine::parser::byte::byte;
use combine::parser::range::{recognize, take_while1};
use combine::stream::RangeStream;
use combine::*;

Expand Down Expand Up @@ -48,9 +49,9 @@ parse!(date_time() -> Datetime, {
// full-date = date-fullyear "-" date-month "-" date-mday
parse!(full_date() -> Date, {
(
attempt((date_fullyear(), char('-'))),
attempt((date_fullyear(), byte(b'-'))),
date_month(),
char('-'),
byte(b'-'),
date_mday(),
).map(|((year, _), month, _, day)| {
Date { year, month, day }
Expand All @@ -62,10 +63,10 @@ parse!(partial_time() -> Time, {
(
attempt((
time_hour(),
char(':'),
byte(b':'),
)),
time_minute(),
char(':'),
byte(b':'),
time_second(),
optional(attempt(time_secfrac())),
).map(|((hour, _), minute, _, second, nanosecond)| {
Expand All @@ -76,18 +77,18 @@ parse!(partial_time() -> Time, {
// time-offset = "Z" / time-numoffset
// time-numoffset = ( "+" / "-" ) time-hour ":" time-minute
parse!(time_offset() -> Offset, {
attempt(satisfy(|c| c == 'Z' || c == 'z')).map(|_| Offset::Z)
attempt(satisfy(|c| c == b'Z' || c == b'z')).map(|_| Offset::Z)
.or(
(
attempt(choice([char('+'), char('-')])),
attempt(choice([byte(b'+'), byte(b'-')])),
time_hour(),
char(':'),
byte(b':'),
time_minute(),
).map(|(sign, hours, _, minutes)| {
let hours = hours as i8;
let hours = match sign {
'+' => hours,
'-' => -hours,
b'+' => hours,
b'-' => -hours,
_ => unreachable!("Parser prevents this"),
};
Offset::Custom { hours, minutes }
Expand Down Expand Up @@ -123,8 +124,8 @@ parse!(date_mday() -> u8, {
});

// time-delim = "T" / %x20 ; T, t, or space
fn is_time_delim(c: char) -> bool {
matches!(c, 'T' | 't' | ' ')
fn is_time_delim(c: u8) -> bool {
matches!(c, b'T' | b't' | b' ')
}

// time-hour = 2DIGIT ; 00-23
Expand Down Expand Up @@ -162,7 +163,9 @@ parse!(time_second() -> u8, {

// time-secfrac = "." 1*DIGIT
parse!(time_secfrac() -> u32, {
char('.').and(take_while1(|c: char| c.is_digit(10))).and_then::<_, _, CustomError>(|(_, repr): (char, &str)| {
byte(b'.').and(take_while1(|c: u8| c.is_ascii_digit())).and_then::<_, _, CustomError>(|(_, repr): (u8, &[u8])| {
let repr = unsafe { from_utf8_unchecked(repr, "`is_ascii_digit` filters out on-ASCII") };

let v = repr.parse::<u32>().map_err(|_| CustomError::OutOfRange)?;
let consumed = repr.len();

Expand All @@ -176,9 +179,21 @@ parse!(time_secfrac() -> u32, {
});

parse!(signed_digits(count: usize) -> i32, {
take(*count).and_then(|s: &str| s.parse::<i32>())
recognize(skip_count_min_max(
*count, *count,
satisfy(|c: u8| c.is_ascii_digit()),
)).and_then(|b: &[u8]| {
let s = unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") };
s.parse::<i32>()
})
});

parse!(unsigned_digits(count: usize) -> u32, {
take(*count).and_then(|s: &str| s.parse::<u32>())
recognize(skip_count_min_max(
*count, *count,
satisfy(|c: u8| c.is_ascii_digit()),
)).and_then(|b: &[u8]| {
let s = unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") };
s.parse::<u32>()
})
});
42 changes: 25 additions & 17 deletions src/parser/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ use crate::parser::value::value;
use crate::parser::{TomlError, TomlParser};
use crate::table::TableKeyValue;
use crate::{InternalString, Item};
use combine::parser::char::char;
use combine::parser::range::recognize;
use combine::stream::position::Stream;
use combine::parser::byte::byte;
use combine::stream::position::{IndexPositioner, Positioner, Stream};
use combine::stream::RangeStream;
use combine::Parser;
use combine::*;
Expand All @@ -20,7 +19,11 @@ use std::mem;
use std::ops::DerefMut;

toml_parser!(parse_comment, parser, {
(comment(), line_ending()).map(|(c, e)| parser.borrow_mut().deref_mut().on_comment(c, e))
(comment(), line_ending()).and_then::<_, _, std::str::Utf8Error>(|(c, e)| {
let c = std::str::from_utf8(c)?;
parser.borrow_mut().deref_mut().on_comment(c, e);
Ok(())
})
});

toml_parser!(
Expand All @@ -30,7 +33,7 @@ toml_parser!(
);

toml_parser!(parse_newline, parser, {
recognize(newline()).map(|w| parser.borrow_mut().deref_mut().on_ws(w))
newline().map(|_| parser.borrow_mut().deref_mut().on_ws("\n"))
});

toml_parser!(keyval, parser, {
Expand All @@ -42,31 +45,33 @@ parser! {
fn parse_keyval['a, I]()(I) -> (Vec<Key>, TableKeyValue)
where
[I: RangeStream<
Range = &'a str,
Token = char>,
I::Error: ParseError<char, &'a str, <I as StreamOnce>::Position>,
<I::Error as ParseError<char, &'a str, <I as StreamOnce>::Position>>::StreamError:
Range = &'a [u8],
Token = u8>,
I::Error: ParseError<u8, &'a [u8], <I as StreamOnce>::Position>,
<I::Error as ParseError<u8, &'a [u8], <I as StreamOnce>::Position>>::StreamError:
From<std::num::ParseIntError> +
From<std::num::ParseFloatError> +
From<std::str::Utf8Error> +
From<crate::parser::errors::CustomError>
] {
(
key(),
char(KEYVAL_SEP),
byte(KEYVAL_SEP),
(ws(), value(), line_trailing())
).map(|(key, _, v)| {
).and_then::<_, _, std::str::Utf8Error>(|(key, _, v)| {
let mut path = key;
let key = path.pop().expect("grammar ensures at least 1");

let (pre, v, suf) = v;
let suf = std::str::from_utf8(suf)?;
let v = v.decorated(pre, suf);
(
Ok((
path,
TableKeyValue {
key,
value: Item::Value(v),
}
)
))
})
}
}
Expand All @@ -80,7 +85,7 @@ impl TomlParser {
// ( ws keyval ws [ comment ] ) /
// ( ws table ws [ comment ] ) /
// ws )
pub(crate) fn parse(s: &str) -> Result<Document, TomlError> {
pub(crate) fn parse(s: &[u8]) -> Result<Document, TomlError> {
let mut parser = RefCell::new(Self::default());
let input = Stream::new(s);

Expand All @@ -99,9 +104,12 @@ impl TomlParser {
)))
.easy_parse(input);
match parsed {
Ok((_, ref rest)) if !rest.input.is_empty() => {
Err(TomlError::from_unparsed(rest.positioner, s))
}
Ok((_, ref rest)) if !rest.input.is_empty() => Err(TomlError::from_unparsed(
(&rest.positioner
as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
.position(),
s,
)),
Ok(..) => {
parser
.get_mut()
Expand Down
Loading

0 comments on commit 13d027f

Please sign in to comment.