perf: Switch to byte processing (#219)

The grammar is fairly encoding-neutral, so long as we can handle "non-ASCII" bytes. This does introduce `unsafe` for cases where the grammar guarantees we'll have valid UTF-8, to avoid validation. There might be some more cases where we can bypass the checks; I was being cautious and only using `unsafe` for 7-bit ASCII. If we ensure our input is only UTF-8 and we only split on 7-bit ASCII, we are probably safe to bypass validation everywhere. So this saves us having to decode UTF-8 into `char`, at the cost of some UTF-8 validation. Looks like this gives us a 6-12% reduction in parsing time.
toml-rs · Sep 29, 2021 · 13d027f · 13d027f
1 parent adad869
commit 13d027f
Show file tree

Hide file tree

Showing 17 changed files with 495 additions and 243 deletions.
diff --git a/src/datetime.rs b/src/datetime.rs
@@ -130,14 +130,20 @@ impl FromStr for Datetime {
 
     /// Parses a value from a &str
     fn from_str(s: &str) -> Result<Self, Self::Err> {
+        use combine::stream::position::{IndexPositioner, Positioner};
         use combine::EasyParser;
-        let result = parser::datetime::date_time().easy_parse(Stream::new(s));
+
+        let b = s.as_bytes();
+        let result = parser::datetime::date_time().easy_parse(Stream::new(b));
         match result {
-            Ok((_, ref rest)) if !rest.input.is_empty() => {
-                Err(parser::TomlError::from_unparsed(rest.positioner, s))
-            }
+            Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
+                (&rest.positioner
+                    as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
+                    .position(),
+                b,
+            )),
             Ok((dt, _)) => Ok(dt),
-            Err(e) => Err(parser::TomlError::new(e, s)),
+            Err(e) => Err(parser::TomlError::new(e, b)),
         }
     }
 }
@@ -205,14 +211,20 @@ impl FromStr for Date {
 
     /// Parses a value from a &str
     fn from_str(s: &str) -> Result<Self, Self::Err> {
+        use combine::stream::position::{IndexPositioner, Positioner};
         use combine::EasyParser;
-        let result = parser::datetime::full_date().easy_parse(Stream::new(s));
+
+        let b = s.as_bytes();
+        let result = parser::datetime::full_date().easy_parse(Stream::new(b));
         match result {
-            Ok((_, ref rest)) if !rest.input.is_empty() => {
-                Err(parser::TomlError::from_unparsed(rest.positioner, s))
-            }
+            Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
+                (&rest.positioner
+                    as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
+                    .position(),
+                b,
+            )),
             Ok((dt, _)) => Ok(dt),
-            Err(e) => Err(parser::TomlError::new(e, s)),
+            Err(e) => Err(parser::TomlError::new(e, b)),
         }
     }
 }
@@ -276,14 +288,20 @@ impl FromStr for Time {
 
     /// Parses a value from a &str
     fn from_str(s: &str) -> Result<Self, Self::Err> {
+        use combine::stream::position::{IndexPositioner, Positioner};
         use combine::EasyParser;
-        let result = parser::datetime::partial_time().easy_parse(Stream::new(s));
+
+        let b = s.as_bytes();
+        let result = parser::datetime::partial_time().easy_parse(Stream::new(b));
         match result {
-            Ok((_, ref rest)) if !rest.input.is_empty() => {
-                Err(parser::TomlError::from_unparsed(rest.positioner, s))
-            }
+            Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
+                (&rest.positioner
+                    as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
+                    .position(),
+                b,
+            )),
             Ok((dt, _)) => Ok(dt),
-            Err(e) => Err(parser::TomlError::new(e, s)),
+            Err(e) => Err(parser::TomlError::new(e, b)),
         }
     }
 }

diff --git a/src/document.rs b/src/document.rs
@@ -64,6 +64,6 @@ impl FromStr for Document {
 
     /// Parses a document from a &str
     fn from_str(s: &str) -> Result<Self, Self::Err> {
-        parser::TomlParser::parse(s)
+        parser::TomlParser::parse(s.as_bytes())
     }
 }
diff --git a/src/key.rs b/src/key.rs
@@ -92,14 +92,20 @@ impl Key {
     }
 
     fn try_parse(s: &str) -> Result<Key, parser::TomlError> {
+        use combine::stream::position::{IndexPositioner, Positioner};
         use combine::EasyParser;
-        let result = parser::key_parser().easy_parse(Stream::new(s));
+
+        let b = s.as_bytes();
+        let result = parser::key_parser().easy_parse(Stream::new(b));
         match result {
-            Ok((_, ref rest)) if !rest.input.is_empty() => {
-                Err(parser::TomlError::from_unparsed(rest.positioner, s))
-            }
+            Ok((_, ref rest)) if !rest.input.is_empty() => Err(parser::TomlError::from_unparsed(
+                (&rest.positioner
+                    as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
+                    .position(),
+                b,
+            )),
             Ok(((raw, key), _)) => Ok(Key::new(key).with_repr_unchecked(Repr::new_unchecked(raw))),
-            Err(e) => Err(parser::TomlError::new(e, s)),
+            Err(e) => Err(parser::TomlError::new(e, b)),
         }
     }
 }
@@ -126,7 +132,7 @@ impl FromStr for Key {
 }
 
 fn to_key_repr(key: &str) -> Repr {
-    if key.chars().all(is_unquoted_char) && !key.is_empty() {
+    if key.as_bytes().iter().copied().all(is_unquoted_char) && !key.is_empty() {
         Repr::new_unchecked(key)
     } else {
         to_string_repr(key, Some(StringStyle::OnelineSingle), Some(false))

diff --git a/src/parser/array.rs b/src/parser/array.rs
@@ -1,7 +1,7 @@
 use crate::parser::trivia::ws_comment_newline;
 use crate::parser::value::value;
 use crate::{Array, Value};
-use combine::parser::char::char;
+use combine::parser::byte::byte;
 use combine::parser::range::recognize_with_value;
 use combine::stream::RangeStream;
 use combine::*;
@@ -10,18 +10,18 @@ use combine::*;
 
 // array = array-open array-values array-close
 parse!(array() -> Array, {
-    between(char(ARRAY_OPEN), char(ARRAY_CLOSE),
+    between(byte(ARRAY_OPEN), byte(ARRAY_CLOSE),
             array_values())
 });
 
 // note: we're omitting ws and newlines here, because
 // they should be part of the formatted values
 // array-open  = %x5B ws-newline  ; [
-const ARRAY_OPEN: char = '[';
+const ARRAY_OPEN: u8 = b'[';
 // array-close = ws-newline %x5D  ; ]
-const ARRAY_CLOSE: char = ']';
+const ARRAY_CLOSE: u8 = b']';
 // array-sep = ws %x2C ws  ; , Comma
-const ARRAY_SEP: char = ',';
+const ARRAY_SEP: u8 = b',';
 
 // note: this rule is modified
 // array-values = [ ( array-value array-sep array-values ) /
@@ -30,15 +30,15 @@ parse!(array_values() -> Array, {
     (
         optional(
             recognize_with_value(
-                sep_end_by1(array_value(), char(ARRAY_SEP))
-            ).map(|(r, v): (&'a str, Array)| (v, r.ends_with(',')))
+                sep_end_by1(array_value(), byte(ARRAY_SEP))
+            ).map(|(r, v): (&'a [u8], Array)| (v, r[r.len() - 1] == b','))
         ),
         ws_comment_newline(),
-    ).map(|(array, trailing)| {
+    ).and_then::<_, _, std::str::Utf8Error>(|(array, trailing)| {
         let (mut array, comma) = array.unwrap_or_default();
         array.set_trailing_comma(comma);
-        array.set_trailing(trailing);
-        array
+        array.set_trailing(std::str::from_utf8(trailing)?);
+        Ok(array)
     })
 });
 
@@ -47,5 +47,11 @@ parse!(array_value() -> Value, {
         ws_comment_newline(),
         value(),
         ws_comment_newline(),
-    )).map(|(ws1, v, ws2)| v.decorated(ws1, ws2))
+    )).and_then::<_, _, std::str::Utf8Error>(|(ws1, v, ws2)| {
+        let v = v.decorated(
+            std::str::from_utf8(ws1)?,
+            std::str::from_utf8(ws2)?,
+        );
+        Ok(v)
+    })
 });
diff --git a/src/parser/datetime.rs b/src/parser/datetime.rs
@@ -1,7 +1,8 @@
 use crate::datetime::*;
 use crate::parser::errors::CustomError;
-use combine::parser::char::char;
-use combine::parser::range::{take, take_while1};
+use crate::parser::trivia::from_utf8_unchecked;
+use combine::parser::byte::byte;
+use combine::parser::range::{recognize, take_while1};
 use combine::stream::RangeStream;
 use combine::*;
 
@@ -48,9 +49,9 @@ parse!(date_time() -> Datetime, {
 // full-date      = date-fullyear "-" date-month "-" date-mday
 parse!(full_date() -> Date, {
     (
-        attempt((date_fullyear(), char('-'))),
+        attempt((date_fullyear(), byte(b'-'))),
         date_month(),
-        char('-'),
+        byte(b'-'),
         date_mday(),
     ).map(|((year, _), month, _, day)| {
         Date { year, month, day }
@@ -62,10 +63,10 @@ parse!(partial_time() -> Time, {
     (
         attempt((
             time_hour(),
-            char(':'),
+            byte(b':'),
         )),
         time_minute(),
-        char(':'),
+        byte(b':'),
         time_second(),
         optional(attempt(time_secfrac())),
     ).map(|((hour, _), minute, _, second, nanosecond)| {
@@ -76,18 +77,18 @@ parse!(partial_time() -> Time, {
 // time-offset    = "Z" / time-numoffset
 // time-numoffset = ( "+" / "-" ) time-hour ":" time-minute
 parse!(time_offset() -> Offset, {
-    attempt(satisfy(|c| c == 'Z' || c == 'z')).map(|_| Offset::Z)
+    attempt(satisfy(|c| c == b'Z' || c == b'z')).map(|_| Offset::Z)
         .or(
             (
-                attempt(choice([char('+'), char('-')])),
+                attempt(choice([byte(b'+'), byte(b'-')])),
                 time_hour(),
-                char(':'),
+                byte(b':'),
                 time_minute(),
             ).map(|(sign, hours, _, minutes)| {
                 let hours = hours as i8;
                 let hours = match sign {
-                    '+' => hours,
-                    '-' => -hours,
+                    b'+' => hours,
+                    b'-' => -hours,
                     _ => unreachable!("Parser prevents this"),
                 };
                 Offset::Custom { hours, minutes }
@@ -123,8 +124,8 @@ parse!(date_mday() -> u8, {
 });
 
 // time-delim     = "T" / %x20 ; T, t, or space
-fn is_time_delim(c: char) -> bool {
-    matches!(c, 'T' | 't' | ' ')
+fn is_time_delim(c: u8) -> bool {
+    matches!(c, b'T' | b't' | b' ')
 }
 
 // time-hour      = 2DIGIT  ; 00-23
@@ -162,7 +163,9 @@ parse!(time_second() -> u8, {
 
 // time-secfrac   = "." 1*DIGIT
 parse!(time_secfrac() -> u32, {
-    char('.').and(take_while1(|c: char| c.is_digit(10))).and_then::<_, _, CustomError>(|(_, repr): (char, &str)| {
+    byte(b'.').and(take_while1(|c: u8| c.is_ascii_digit())).and_then::<_, _, CustomError>(|(_, repr): (u8, &[u8])| {
+        let repr = unsafe { from_utf8_unchecked(repr, "`is_ascii_digit` filters out on-ASCII") };
+
         let v = repr.parse::<u32>().map_err(|_| CustomError::OutOfRange)?;
         let consumed = repr.len();
 
@@ -176,9 +179,21 @@ parse!(time_secfrac() -> u32, {
 });
 
 parse!(signed_digits(count: usize) -> i32, {
-    take(*count).and_then(|s: &str| s.parse::<i32>())
+    recognize(skip_count_min_max(
+        *count, *count,
+        satisfy(|c: u8| c.is_ascii_digit()),
+    )).and_then(|b: &[u8]| {
+        let s = unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") };
+        s.parse::<i32>()
+    })
 });
 
 parse!(unsigned_digits(count: usize) -> u32, {
-    take(*count).and_then(|s: &str| s.parse::<u32>())
+    recognize(skip_count_min_max(
+        *count, *count,
+        satisfy(|c: u8| c.is_ascii_digit()),
+    )).and_then(|b: &[u8]| {
+        let s = unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") };
+        s.parse::<u32>()
+    })
 });
diff --git a/src/parser/document.rs b/src/parser/document.rs
@@ -9,9 +9,8 @@ use crate::parser::value::value;
 use crate::parser::{TomlError, TomlParser};
 use crate::table::TableKeyValue;
 use crate::{InternalString, Item};
-use combine::parser::char::char;
-use combine::parser::range::recognize;
-use combine::stream::position::Stream;
+use combine::parser::byte::byte;
+use combine::stream::position::{IndexPositioner, Positioner, Stream};
 use combine::stream::RangeStream;
 use combine::Parser;
 use combine::*;
@@ -20,7 +19,11 @@ use std::mem;
 use std::ops::DerefMut;
 
 toml_parser!(parse_comment, parser, {
-    (comment(), line_ending()).map(|(c, e)| parser.borrow_mut().deref_mut().on_comment(c, e))
+    (comment(), line_ending()).and_then::<_, _, std::str::Utf8Error>(|(c, e)| {
+        let c = std::str::from_utf8(c)?;
+        parser.borrow_mut().deref_mut().on_comment(c, e);
+        Ok(())
+    })
 });
 
 toml_parser!(
@@ -30,7 +33,7 @@ toml_parser!(
 );
 
 toml_parser!(parse_newline, parser, {
-    recognize(newline()).map(|w| parser.borrow_mut().deref_mut().on_ws(w))
+    newline().map(|_| parser.borrow_mut().deref_mut().on_ws("\n"))
 });
 
 toml_parser!(keyval, parser, {
@@ -42,31 +45,33 @@ parser! {
     fn parse_keyval['a, I]()(I) -> (Vec<Key>, TableKeyValue)
     where
         [I: RangeStream<
-         Range = &'a str,
-         Token = char>,
-         I::Error: ParseError<char, &'a str, <I as StreamOnce>::Position>,
-         <I::Error as ParseError<char, &'a str, <I as StreamOnce>::Position>>::StreamError:
+         Range = &'a [u8],
+         Token = u8>,
+         I::Error: ParseError<u8, &'a [u8], <I as StreamOnce>::Position>,
+         <I::Error as ParseError<u8, &'a [u8], <I as StreamOnce>::Position>>::StreamError:
          From<std::num::ParseIntError> +
          From<std::num::ParseFloatError> +
+         From<std::str::Utf8Error> +
          From<crate::parser::errors::CustomError>
     ] {
         (
             key(),
-            char(KEYVAL_SEP),
+            byte(KEYVAL_SEP),
             (ws(), value(), line_trailing())
-        ).map(|(key, _, v)| {
+        ).and_then::<_, _, std::str::Utf8Error>(|(key, _, v)| {
             let mut path = key;
             let key = path.pop().expect("grammar ensures at least 1");
 
             let (pre, v, suf) = v;
+            let suf = std::str::from_utf8(suf)?;
             let v = v.decorated(pre, suf);
-            (
+            Ok((
                 path,
                 TableKeyValue {
                     key,
                     value: Item::Value(v),
                 }
-            )
+            ))
         })
     }
 }
@@ -80,7 +85,7 @@ impl TomlParser {
     //                ( ws keyval ws [ comment ] ) /
     //                ( ws table ws [ comment ] ) /
     //                  ws )
-    pub(crate) fn parse(s: &str) -> Result<Document, TomlError> {
+    pub(crate) fn parse(s: &[u8]) -> Result<Document, TomlError> {
         let mut parser = RefCell::new(Self::default());
         let input = Stream::new(s);
 
@@ -99,9 +104,12 @@ impl TomlParser {
             )))
             .easy_parse(input);
         match parsed {
-            Ok((_, ref rest)) if !rest.input.is_empty() => {
-                Err(TomlError::from_unparsed(rest.positioner, s))
-            }
+            Ok((_, ref rest)) if !rest.input.is_empty() => Err(TomlError::from_unparsed(
+                (&rest.positioner
+                    as &dyn Positioner<usize, Position = usize, Checkpoint = IndexPositioner>)
+                    .position(),
+                s,
+            )),
             Ok(..) => {
                 parser
                     .get_mut()