diff --git a/Cargo.lock b/Cargo.lock index f6a63914..01758206 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -59,6 +59,21 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.18" @@ -450,6 +465,20 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -1344,6 +1373,29 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core 0.52.0", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + 
[[package]] name = "icu_collections" version = "1.5.0" @@ -1833,6 +1885,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "object" version = "0.36.7" @@ -2249,6 +2310,7 @@ dependencies = [ "bitflags", "brotli", "bytes", + "chrono", "const_format", "csv", "flate2", @@ -3514,6 +3576,7 @@ checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] @@ -3621,7 +3684,16 @@ version = "0.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd04d41d93c4992d421894c18c8b43496aa748dd4c081bac0dc93eb0489272b6" dependencies = [ - "windows-core", + "windows-core 0.58.0", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ "windows-targets", ] diff --git a/Cargo.toml b/Cargo.toml index c6ba006c..7fc47265 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ bitflags = "2.4" md5 = "0.7.0" brotli = "7" bytes = "1" +chrono = "0.4.39" clap = { version = "4.5.15", features = ["derive"] } crossterm = "0.27" csv = "1.3.1" diff --git a/rama-http-types/src/lib.rs b/rama-http-types/src/lib.rs index cf60b2e7..7b779b1e 100644 --- a/rama-http-types/src/lib.rs +++ b/rama-http-types/src/lib.rs @@ -131,6 +131,12 @@ pub mod header { "x-real-ip", ]; + // non-std web-crawler info headers + // + // More information at + // . 
+ static_header!["x-robots-tag"]; + /// Static Header Value that is can be used as `User-Agent` or `Server` header. pub static RAMA_ID_HEADER_VALUE: HeaderValue = HeaderValue::from_static( const_format::formatcp!("{}/{}", rama_utils::info::NAME, rama_utils::info::VERSION), diff --git a/rama-http/Cargo.toml b/rama-http/Cargo.toml index ac596aa2..d34fec63 100644 --- a/rama-http/Cargo.toml +++ b/rama-http/Cargo.toml @@ -30,6 +30,7 @@ async-compression = { workspace = true, features = [ base64 = { workspace = true } bitflags = { workspace = true } bytes = { workspace = true } +chrono = { workspace = true } const_format = { workspace = true } csv = { workspace = true } futures-lite = { workspace = true } diff --git a/rama-http/src/headers/mod.rs b/rama-http/src/headers/mod.rs index 06dc4a44..3a82b06d 100644 --- a/rama-http/src/headers/mod.rs +++ b/rama-http/src/headers/mod.rs @@ -102,4 +102,10 @@ pub mod authorization { pub use ::rama_http_types::headers::HeaderExt; pub(crate) mod util; + +pub mod x_robots_tag_components; + +mod x_robots_tag; +pub use x_robots_tag::XRobotsTag; + pub use util::quality_value::{Quality, QualityValue}; diff --git a/rama-http/src/headers/util/csv.rs b/rama-http/src/headers/util/csv.rs index b91f3567..9523510d 100644 --- a/rama-http/src/headers/util/csv.rs +++ b/rama-http/src/headers/util/csv.rs @@ -11,24 +11,28 @@ use crate::HeaderValue; pub(crate) fn from_comma_delimited<'i, I, T, E>(values: &mut I) -> Result where I: Iterator, - T: ::std::str::FromStr, - E: ::std::iter::FromIterator, + T: std::str::FromStr, + E: FromIterator, { values .flat_map(|value| { - value.to_str().into_iter().flat_map(|string| { - string - .split(',') - .filter_map(|x| match x.trim() { - "" => None, - y => Some(y), - }) - .map(|x| x.parse().map_err(|_| Error::invalid())) - }) + value + .to_str() + .into_iter() + .flat_map(|string| split_csv_str(string)) }) .collect() } +pub(crate) fn split_csv_str( + string: &str, +) -> impl Iterator> + use<'_, T> { + 
string.split(',').filter_map(|x| match x.trim() { + "" => None, + y => Some(y.parse().map_err(|_| Error::invalid())), + }) +} + /// Format an array into a comma-delimited string. pub(crate) fn fmt_comma_delimited( f: &mut fmt::Formatter, diff --git a/rama-http/src/headers/util/mod.rs b/rama-http/src/headers/util/mod.rs index af107ff1..277c6dc8 100644 --- a/rama-http/src/headers/util/mod.rs +++ b/rama-http/src/headers/util/mod.rs @@ -1,3 +1,5 @@ pub(crate) mod csv; /// Internal utility functions for headers. pub(crate) mod quality_value; + +pub(crate) mod value_string; diff --git a/rama-http/src/headers/util/value_string.rs b/rama-http/src/headers/util/value_string.rs new file mode 100644 index 00000000..c572decf --- /dev/null +++ b/rama-http/src/headers/util/value_string.rs @@ -0,0 +1,63 @@ +use http::header::HeaderValue; +use std::fmt::{Display, Formatter}; +use std::{ + fmt, + str::{self, FromStr}, +}; + +/// A value that is both a valid `HeaderValue` and `String`. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct HeaderValueString { + /// Care must be taken to only set this value when it is also + /// a valid `String`, since `as_str` will convert to a `&str` + /// in an unchecked manner. + value: HeaderValue, +} + +impl HeaderValueString { + pub(crate) fn as_str(&self) -> &str { + // HeaderValueString is only created from HeaderValues + // that have validated they are also UTF-8 strings. 
+ unsafe { str::from_utf8_unchecked(self.value.as_bytes()) } + } +} + +impl fmt::Debug for HeaderValueString { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + fmt::Debug::fmt(self.as_str(), f) + } +} + +impl Display for HeaderValueString { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + fmt::Display::fmt(self.as_str(), f) + } +} + +impl<'a> From<&'a HeaderValueString> for HeaderValue { + fn from(src: &'a HeaderValueString) -> HeaderValue { + src.value.clone() + } +} + +#[derive(Debug)] +pub struct FromStrError(&'static str); + +impl FromStr for HeaderValueString { + type Err = FromStrError; + + fn from_str(src: &str) -> Result { + // A valid `str` (the argument)... + src.parse() + .map(|value| HeaderValueString { value }) + .map_err(|_| FromStrError("failed to parse header value from string")) + } +} + +impl Display for FromStrError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + writeln!(f, "{}", self.0) + } +} + +impl std::error::Error for FromStrError {} diff --git a/rama-http/src/headers/x_robots_tag.rs b/rama-http/src/headers/x_robots_tag.rs new file mode 100644 index 00000000..e9acf839 --- /dev/null +++ b/rama-http/src/headers/x_robots_tag.rs @@ -0,0 +1,57 @@ +use crate::headers::x_robots_tag_components::robots_tag_components::Parser; +use crate::headers::x_robots_tag_components::RobotsTag; +use crate::headers::Error; +use headers::Header; +use http::{HeaderName, HeaderValue}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct XRobotsTag(Vec); + +impl Header for XRobotsTag { + fn name() -> &'static HeaderName { + &crate::header::X_ROBOTS_TAG + } + + fn decode<'i, I>(values: &mut I) -> Result + where + Self: Sized, + I: Iterator, + { + let elements = values.try_fold(Vec::new(), |mut acc, value| { + acc.extend(Parser::parse_value(value).map_err(|err| { + tracing::debug!(?err, "x-robots-tag header element decoding failure"); + Error::invalid() + })?); + + Ok(acc) + })?; + + Ok(XRobotsTag(elements)) + } + + fn encode>(&self, values: 
&mut E) { + use std::fmt; + struct Format(F); + impl fmt::Display for Format + where + F: Fn(&mut fmt::Formatter<'_>) -> fmt::Result, + { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0(f) + } + } + let s = format!( + "{}", + Format(|f: &mut fmt::Formatter<'_>| { + crate::headers::util::csv::fmt_comma_delimited(&mut *f, self.0.iter()) + }) + ); + values.extend(Some(HeaderValue::from_str(&s).unwrap())) + } +} + +impl FromIterator for XRobotsTag { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} diff --git a/rama-http/src/headers/x_robots_tag_components/custom_rule.rs b/rama-http/src/headers/x_robots_tag_components/custom_rule.rs new file mode 100644 index 00000000..0e67ec84 --- /dev/null +++ b/rama-http/src/headers/x_robots_tag_components/custom_rule.rs @@ -0,0 +1,28 @@ +use crate::headers::util::value_string::HeaderValueString; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) struct CustomRule { + key: HeaderValueString, + value: Option, +} + +impl CustomRule { + pub(super) fn as_tuple(&self) -> (&HeaderValueString, &Option) { + (&self.key, &self.value) + } +} + +impl From for CustomRule { + fn from(key: HeaderValueString) -> Self { + Self { key, value: None } + } +} + +impl From<(HeaderValueString, HeaderValueString)> for CustomRule { + fn from(key_value: (HeaderValueString, HeaderValueString)) -> Self { + Self { + key: key_value.0, + value: Some(key_value.1), + } + } +} diff --git a/rama-http/src/headers/x_robots_tag_components/max_image_preview_setting.rs b/rama-http/src/headers/x_robots_tag_components/max_image_preview_setting.rs new file mode 100644 index 00000000..ed601289 --- /dev/null +++ b/rama-http/src/headers/x_robots_tag_components/max_image_preview_setting.rs @@ -0,0 +1,60 @@ +use rama_core::error::OpaqueError; +use std::fmt::Formatter; +use std::str::FromStr; +use MaxImagePreviewSetting::*; + +/// The maximum size of an image preview for this page in a search results. 
+/// If omitted, search engines may show an image preview of the default size. +/// If you don't want search engines to use larger thumbnail images, +/// specify a max-image-preview value of standard or none. [^source] +/// +/// # Values +/// +/// - `none` +/// - No image preview is to be shown. +/// - `standard` +/// - A default image preview may be shown. +/// - `large` +/// - A larger image preview, up to the width of the viewport, may be shown. +/// +/// [^source]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag#max-image-preview_setting +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MaxImagePreviewSetting { + None, + Standard, + Large, +} + +impl MaxImagePreviewSetting { + fn as_str(&self) -> &'static str { + match self { + None => "none", + Standard => "standard", + Large => "large", + } + } +} + +impl std::fmt::Display for MaxImagePreviewSetting { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl FromStr for MaxImagePreviewSetting { + type Err = OpaqueError; + + fn from_str(s: &str) -> Result { + if s.eq_ignore_ascii_case(None.as_str()) { + Ok(None) + } else if s.eq_ignore_ascii_case(Standard.as_str()) { + Ok(Standard) + } else if s.eq_ignore_ascii_case(Large.as_str()) { + Ok(Large) + } else { + Err(OpaqueError::from_display( + "failed to parse MaxImagePreviewSetting", + )) + } + } +} diff --git a/rama-http/src/headers/x_robots_tag_components/mod.rs b/rama-http/src/headers/x_robots_tag_components/mod.rs new file mode 100644 index 00000000..df64d8ec --- /dev/null +++ b/rama-http/src/headers/x_robots_tag_components/mod.rs @@ -0,0 +1,13 @@ +mod robots_tag; +pub use robots_tag::RobotsTag; + +mod max_image_preview_setting; +pub use max_image_preview_setting::MaxImagePreviewSetting; + +mod custom_rule; +use custom_rule::CustomRule; + +mod valid_date; +use valid_date::ValidDate; + +pub mod robots_tag_components; diff --git 
a/rama-http/src/headers/x_robots_tag_components/robots_tag.rs b/rama-http/src/headers/x_robots_tag_components/robots_tag.rs new file mode 100644 index 00000000..0ab180c7 --- /dev/null +++ b/rama-http/src/headers/x_robots_tag_components/robots_tag.rs @@ -0,0 +1,171 @@ +use crate::headers::util::value_string::HeaderValueString; +use crate::headers::x_robots_tag_components::robots_tag_components::{Builder, NoTag}; +use crate::headers::x_robots_tag_components::{CustomRule, MaxImagePreviewSetting, ValidDate}; +use chrono::{DateTime, Utc}; +use std::fmt::{Display, Formatter}; + +macro_rules! getter { + ($field:ident, $type:ty) => { + paste::paste! { + pub fn [<$field>](&self) -> $type { + self.[<$field>] + } + } + }; + + ($field:ident, $type:ty, optional) => { + paste::paste! { + pub fn [<$field>](&self) -> Option<&$type> { + self.[<$field>].as_ref() + } + } + }; +} + +/// A single element of [`XRobotsTag`] corresponding to the valid values for one `bot_name` +/// +/// [List of directives](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag#directives) +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RobotsTag { + pub(super) bot_name: Option, + pub(super) all: bool, + pub(super) no_index: bool, + pub(super) no_follow: bool, + pub(super) none: bool, + pub(super) no_snippet: bool, + pub(super) index_if_embedded: bool, + pub(super) max_snippet: u32, + pub(super) max_image_preview: Option, + pub(super) max_video_preview: Option, + pub(super) no_translate: bool, + pub(super) no_image_index: bool, + pub(super) unavailable_after: Option, + pub(super) no_ai: bool, + pub(super) no_image_ai: bool, + pub(super) spc: bool, + pub(super) custom_rules: Vec, +} + +impl RobotsTag { + pub(super) fn new_with_bot_name(bot_name: Option) -> Self { + Self { + bot_name, + all: false, + no_index: false, + no_follow: false, + none: false, + no_snippet: false, + index_if_embedded: false, + max_snippet: 0, + max_image_preview: None, + max_video_preview: None, + no_translate: 
false, + no_image_index: false, + unavailable_after: None, + no_ai: false, + no_image_ai: false, + spc: false, + custom_rules: vec![], + } + } + + pub fn builder() -> Builder { + Builder::new() + } + + pub fn custom_rules( + &self, + ) -> impl Iterator)> { + self.custom_rules.iter().map(|x| x.as_tuple()) + } + + getter!(bot_name, HeaderValueString, optional); + getter!(all, bool); + getter!(no_index, bool); + getter!(no_follow, bool); + getter!(none, bool); + getter!(no_snippet, bool); + getter!(index_if_embedded, bool); + getter!(max_snippet, u32); + getter!(max_image_preview, MaxImagePreviewSetting, optional); + getter!(max_video_preview, u32, optional); + getter!(no_translate, bool); + getter!(no_image_index, bool); + getter!(no_ai, bool); + getter!(no_image_ai, bool); + getter!(spc, bool); + + pub fn unavailable_after(&self) -> Option<&DateTime> { + self.unavailable_after.as_deref() + } + + pub(super) fn is_valid_field_name(field_name: &str) -> bool { + field_name.trim().eq_ignore_ascii_case("all") + || field_name.eq_ignore_ascii_case("noindex") + || field_name.eq_ignore_ascii_case("nofollow") + || field_name.eq_ignore_ascii_case("none") + || field_name.eq_ignore_ascii_case("nosnippet") + || field_name.eq_ignore_ascii_case("indexifembedded") + || field_name.eq_ignore_ascii_case("max-snippet") + || field_name.eq_ignore_ascii_case("max-image-preview") + || field_name.eq_ignore_ascii_case("max-video-preview") + || field_name.eq_ignore_ascii_case("notranslate") + || field_name.eq_ignore_ascii_case("noimageindex") + || field_name.eq_ignore_ascii_case("unavailable_after") + || field_name.eq_ignore_ascii_case("noai") + || field_name.eq_ignore_ascii_case("noimageai") + || field_name.eq_ignore_ascii_case("spc") + } +} + +impl Display for RobotsTag { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(bot_name) = self.bot_name() { + write!(f, "{bot_name}: ")?; + } + + let mut _first = true; + + macro_rules! 
write_field { + ($cond:expr, $fmt:expr) => { + if $cond { + if !_first { + write!(f, ", ")?; + } + write!(f, "{}", $fmt)?; + _first = false; + } + }; + ($cond:expr, $fmt:expr, optional) => { + if let Some(value) = $cond { + if !_first { + write!(f, ", ")?; + } + write!(f, "{}: {}", $fmt, value)?; + _first = false; + } + }; + } + + write_field!(self.all(), "all"); + write_field!(self.no_index(), "noindex"); + write_field!(self.no_follow(), "nofollow"); + write_field!(self.none(), "none"); + write_field!(self.no_snippet(), "nosnippet"); + write_field!(self.index_if_embedded(), "indexifembedded"); + write_field!( + self.max_snippet() != 0, + format!("max-snippet: {}", self.max_snippet()) + ); + write_field!(self.max_image_preview(), "max-image-preview", optional); + write_field!(self.max_video_preview(), "max-video-preview", optional); + write_field!(self.no_translate(), "notranslate"); + write_field!(self.no_image_index(), "noimageindex"); + write_field!(self.unavailable_after(), "unavailable_after", optional); + write_field!(self.no_ai(), "noai"); + write_field!(self.no_image_ai(), "noimageai"); + write_field!(self.spc(), "spc"); + + Ok(()) + } +} diff --git a/rama-http/src/headers/x_robots_tag_components/robots_tag_components/builder.rs b/rama-http/src/headers/x_robots_tag_components/robots_tag_components/builder.rs new file mode 100644 index 00000000..8f6d5297 --- /dev/null +++ b/rama-http/src/headers/x_robots_tag_components/robots_tag_components/builder.rs @@ -0,0 +1,260 @@ +use crate::headers::util::value_string::HeaderValueString; +use crate::headers::x_robots_tag_components::{MaxImagePreviewSetting, RobotsTag, ValidDate}; +use chrono::{DateTime, Utc}; +use headers::Error; +use rama_core::error::OpaqueError; + +macro_rules! robots_tag_builder_field { + ($field:ident, bool) => { + paste::paste! 
{ + pub fn [<$field>](mut self) -> Self { + self.0.[<$field>] = true; + self + } + + pub fn [](&mut self) -> &mut Self { + self.0.[<$field>] = true; + self + } + } + }; + + ($field:ident, $type:ty) => { + paste::paste! { + pub fn [<$field>](mut self, [<$field>]: $type) -> Self { + self.0.[<$field>] = [<$field>]; + self + } + + pub fn [](&mut self, [<$field>]: $type) -> &mut Self { + self.0.[<$field>] = [<$field>]; + self + } + } + }; + + ($field:ident, $type:ty, optional) => { + paste::paste! { + pub fn [<$field>](mut self, [<$field>]: $type) -> Self { + self.0.[<$field>] = Some([<$field>]); + self + } + + pub fn [](&mut self, [<$field>]: $type) -> &mut Self { + self.0.[<$field>] = Some([<$field>]); + self + } + } + }; +} + +macro_rules! no_tag_builder_field { + ($field:ident, bool) => { + paste::paste! { + pub fn [<$field>](self) -> Builder { + Builder(RobotsTag::new_with_bot_name(self.0.bot_name)).[<$field>]() + } + } + }; + + ($field:ident, $type:ty) => { + paste::paste! { + pub fn [<$field>](self, [<$field>]: $type) -> Builder { + Builder(RobotsTag::new_with_bot_name(self.0.bot_name)).[<$field>]([<$field>]) + } + } + }; +} + +/// Generic structure used for building a [`RobotsTag`] with compile-time validation +/// +/// # States +/// +/// - `Builder<()>` +/// - a new builder without any values +/// - can transform to `Builder` using the [`Builder::bot_name()`] function +/// - `Builder` +/// - holds a `bot_name` field, but still isn't a valid [`RobotsTag`] +/// - can transform to `Builder` by specifying a valid [`RobotsTag`] field +/// - `Builder` +/// - holds a valid [`RobotsTag`] struct, which can be further modified +/// - can be built into a [`RobotsTag`] using the [`Builder::::build()`] function +/// +/// # Examples +/// +/// ``` +/// # use rama_http::headers::x_robots_tag_components::RobotsTag; +/// let robots_tag = RobotsTag::builder() +/// .no_follow() +/// .build(); +/// assert_eq!(robots_tag.no_follow(), true); +/// ``` +#[derive(Clone, Debug, Eq, 
PartialEq)] +pub struct Builder(T); + +impl Builder<()> {} + +pub struct NoTag { + bot_name: Option, +} + +impl Builder { + pub fn new() -> Self { + Self(NoTag { bot_name: None }) + } + + pub fn bot_name(mut self, bot_name: HeaderValueString) -> Self { + self.0.bot_name = Some(bot_name); + self + } + + pub fn set_bot_name(&mut self, bot_name: HeaderValueString) -> &mut Self { + self.0.bot_name = Some(bot_name); + self + } + + no_tag_builder_field!(all, bool); + no_tag_builder_field!(no_index, bool); + no_tag_builder_field!(no_follow, bool); + no_tag_builder_field!(none, bool); + no_tag_builder_field!(no_snippet, bool); + no_tag_builder_field!(index_if_embedded, bool); + no_tag_builder_field!(max_snippet, u32); + no_tag_builder_field!(max_image_preview, MaxImagePreviewSetting); + no_tag_builder_field!(max_video_preview, u32); + no_tag_builder_field!(no_translate, bool); + no_tag_builder_field!(no_image_index, bool); + no_tag_builder_field!(unavailable_after, DateTime); + no_tag_builder_field!(no_ai, bool); + no_tag_builder_field!(no_image_ai, bool); + no_tag_builder_field!(spc, bool); + + pub fn add_field(self, s: &str) -> Result, OpaqueError> { + let mut builder = Builder(RobotsTag::new_with_bot_name(self.0.bot_name)); + builder.add_field(s)?; + Ok(builder) + } +} + +impl Builder { + pub fn build(self) -> RobotsTag { + self.0 + } + + pub fn add_custom_rule_simple(&mut self, key: HeaderValueString) -> &mut Self { + self.0.custom_rules.push(key.into()); + self + } + + pub fn add_custom_rule_composite( + &mut self, + key: HeaderValueString, + value: HeaderValueString, + ) -> &mut Self { + self.0.custom_rules.push((key, value).into()); + self + } + + pub fn set_unavailable_after(&mut self, unavailable_after: DateTime) -> &mut Self { + self.0.unavailable_after = Some(unavailable_after.into()); + self + } + + pub fn unavailable_after(mut self, unavailable_after: DateTime) -> Self { + self.0.unavailable_after = Some(unavailable_after.into()); + self + } + + 
// (continuation of `impl Builder<RobotsTag>` in builder.rs)
    robots_tag_builder_field!(bot_name, HeaderValueString, optional);
    robots_tag_builder_field!(all, bool);
    robots_tag_builder_field!(no_index, bool);
    robots_tag_builder_field!(no_follow, bool);
    robots_tag_builder_field!(none, bool);
    robots_tag_builder_field!(no_snippet, bool);
    robots_tag_builder_field!(index_if_embedded, bool);
    robots_tag_builder_field!(max_snippet, u32);
    robots_tag_builder_field!(max_image_preview, MaxImagePreviewSetting, optional);
    robots_tag_builder_field!(max_video_preview, u32, optional);
    robots_tag_builder_field!(no_translate, bool);
    robots_tag_builder_field!(no_image_index, bool);
    robots_tag_builder_field!(no_ai, bool);
    robots_tag_builder_field!(no_image_ai, bool);
    robots_tag_builder_field!(spc, bool);

    /// Adds a field based on its `&str` representation
    ///
    /// Accepts either a simple directive (`"noindex"`) or a composite one
    /// (`"max-snippet: 8"`). Names are matched ignoring ASCII case and
    /// surrounding whitespace.
    ///
    /// # Returns and Errors
    ///
    /// - `Result<&mut Self, OpaqueError>`
    ///     - `Ok(&mut Self)`
    ///         - when the field was valid and successfully added
    ///         - returns `&mut Self` wrapped inside for easier chaining of functions
    ///     - `Err(OpaqueError)`
    ///         - is of type [`headers::Error`] when the field name is not valid
    ///         - for composite rules (key + value), wraps the conversion error for the value
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::num::ParseIntError;
    /// # use rama_http::headers::x_robots_tag_components::RobotsTag;
    /// let mut builder = RobotsTag::builder().no_follow();
    ///
    /// assert!(builder.add_field("nosnippet").is_ok());
    /// assert!(builder.add_field("max-snippet: 8").is_ok());
    /// assert!(builder.add_field("nonexistent").is_err_and(|e| e.is::<rama_http::headers::Error>()));
    /// assert!(builder.add_field("max-video-preview: not_a_number").is_err_and(|e| e.is::<ParseIntError>()));
    ///
    /// let robots_tag = builder.build();
    ///
    /// assert_eq!(robots_tag.no_snippet(), true);
    /// assert_eq!(robots_tag.max_snippet(), 8);
    /// ```
    pub fn add_field(&mut self, s: &str) -> Result<&mut Self, OpaqueError> {
        let s = s.trim();
        let Some((key, value)) = s.split_once(':') else {
            return self.add_simple_field(s);
        };
        let (key, value) = (key.trim(), value.trim());

        if key.eq_ignore_ascii_case("max-snippet") {
            Ok(self.set_max_snippet(value.parse().map_err(OpaqueError::from_std)?))
        } else if key.eq_ignore_ascii_case("max-image-preview") {
            Ok(self.set_max_image_preview(value.parse()?))
        } else if key.eq_ignore_ascii_case("max-video-preview") {
            Ok(self.set_max_video_preview(value.parse().map_err(OpaqueError::from_std)?))
        } else if key.eq_ignore_ascii_case("unavailable_after") {
            // `key` is already split and trimmed, so it must be compared with
            // the bare directive name (no trailing ": ").
            Ok(self.set_unavailable_after(value.parse::<ValidDate>()?.into()))
        } else {
            Err(OpaqueError::from_std(Error::invalid()))
        }
    }

    /// Sets a value-less directive; unknown names yield a [`headers::Error`].
    fn add_simple_field(&mut self, s: &str) -> Result<&mut Self, OpaqueError> {
        match s.trim().to_ascii_lowercase().as_str() {
            "all" => Ok(self.set_all()),
            "noindex" => Ok(self.set_no_index()),
            "nofollow" => Ok(self.set_no_follow()),
            "none" => Ok(self.set_none()),
            "nosnippet" => Ok(self.set_no_snippet()),
            "indexifembedded" => Ok(self.set_index_if_embedded()),
            "notranslate" => Ok(self.set_no_translate()),
            "noimageindex" => Ok(self.set_no_image_index()),
            "noai" => Ok(self.set_no_ai()),
            "noimageai" => Ok(self.set_no_image_ai()),
            "spc" => Ok(self.set_spc()),
            _ => Err(OpaqueError::from_std(Error::invalid())),
        }
    }
}

// ===== rama-http/src/headers/x_robots_tag_components/robots_tag_components/mod.rs =====
mod builder;
pub use builder::Builder;
pub(super) use
builder::NoTag;

mod parser;
pub(crate) use parser::Parser;

// ===== rama-http/src/headers/x_robots_tag_components/robots_tag_components/parser.rs =====
use crate::headers::util::value_string::HeaderValueString;
use crate::headers::x_robots_tag_components::RobotsTag;
use http::HeaderValue;
use rama_core::error::OpaqueError;
use std::str::FromStr;

/// Iterator that splits one raw `X-Robots-Tag` value into [`RobotsTag`] elements.
///
/// Expected shape: `[bot_name:] directive(, directive)*`. A directive with an
/// unknown name ends the current element and is re-read as the start of the
/// next one (`other_bot: directive, ...`).
///
/// NOTE(review): directives are split on `,`, so `unavailable_after` dates that
/// themselves contain a comma (RFC 822 / RFC 850 style) cannot be parsed here —
/// confirm whether that needs support.
pub(crate) struct Parser<'a> {
    /// Unparsed rest of the value; `None` once the iterator is exhausted.
    remaining: Option<&'a str>,
}

impl<'a> Parser<'a> {
    pub(crate) fn new(remaining: &'a str) -> Self {
        let remaining = match remaining.trim() {
            "" => None,
            text => Some(text),
        };

        Self { remaining }
    }
}

impl Iterator for Parser<'_> {
    type Item = Result<RobotsTag, OpaqueError>;

    fn next(&mut self) -> Option<Self::Item> {
        // `take` guarantees termination: unless more input is explicitly put
        // back below, the following call returns `None`.
        let mut remaining = self.remaining.take()?.trim();
        if remaining.is_empty() {
            return None;
        }

        let bot_name = match Self::parse_bot_name(&mut remaining) {
            Ok(bot_name) => bot_name,
            Err(e) => return Some(Err(e)),
        };

        let mut builder = RobotsTag::builder();
        if let Some(bot_name) = bot_name {
            builder.set_bot_name(bot_name);
        }

        // Every element needs at least one valid directive.
        let (field, rest) = Self::split_field(remaining);
        let mut builder = match builder.add_field(field) {
            Ok(builder) => builder,
            Err(e) => return Some(Err(e)),
        };
        remaining = rest;

        while !remaining.is_empty() {
            let (field, rest) = Self::split_field(remaining);
            if field.is_empty() {
                // Tolerate `a,,b` and trailing commas, but always advance.
                remaining = rest;
                continue;
            }

            match builder.add_field(field) {
                Ok(_) => remaining = rest,
                Err(e) if e.is::<headers::Error>() => {
                    // Unknown directive name: treat it as the start of the
                    // next element and hand it to the following `next` call.
                    self.remaining = Some(remaining);
                    return Some(Ok(builder.build()));
                }
                Err(e) => return Some(Err(e)),
            }
        }

        Some(Ok(builder.build()))
    }
}

impl Parser<'_> {
    /// Splits off the first comma-separated field; both halves are trimmed and
    /// the rest is empty when there is no further comma.
    fn split_field(input: &str) -> (&str, &str) {
        match input.split_once(',') {
            Some((field, rest)) => (field.trim(), rest.trim()),
            None => (input.trim(), ""),
        }
    }

    /// Consumes a leading `bot_name:` prefix from `remaining`, if there is one.
    ///
    /// The text before the first `:` is a bot name only when it is non-empty,
    /// contains no `,` (otherwise the colon belongs to a later directive such
    /// as `noindex, max-snippet: 5`) and is not itself a directive name.
    fn parse_bot_name(remaining: &mut &str) -> Result<Option<HeaderValueString>, OpaqueError> {
        let Some((candidate, rest)) = remaining.split_once(':') else {
            return Ok(None);
        };

        let candidate = candidate.trim();
        if candidate.is_empty()
            || candidate.contains(',')
            || RobotsTag::is_valid_field_name(candidate)
        {
            return Ok(None);
        }

        let bot_name = HeaderValueString::from_str(candidate).map_err(OpaqueError::from_std)?;
        *remaining = rest.trim();
        Ok(Some(bot_name))
    }

    /// Parses a whole header value into its elements, failing on the first error.
    pub(crate) fn parse_value(value: &HeaderValue) -> Result<Vec<RobotsTag>, OpaqueError> {
        Parser::new(value.to_str().map_err(OpaqueError::from_std)?)
            .collect::<Result<Vec<_>, _>>()
    }
}

// ===== rama-http/src/headers/x_robots_tag_components/valid_date.rs =====
use chrono::{DateTime, FixedOffset, NaiveDateTime, Utc};
use rama_core::error::{ErrorContext, OpaqueError};
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::ops::Deref;
use std::str::FromStr;
use std::sync::OnceLock;

/// A UTC timestamp parsed from one of the date formats accepted by `from_str`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(super) struct ValidDate(DateTime<Utc>);

impl Deref for ValidDate {
    type Target = DateTime<Utc>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl From<ValidDate> for DateTime<Utc> {
    fn from(value: ValidDate) -> Self {
        value.0
    }
}

impl From<DateTime<Utc>> for ValidDate {
    fn from(value: DateTime<Utc>) -> Self {
        Self(value)
    }
}

impl AsRef<DateTime<Utc>> for ValidDate {
    fn as_ref(&self) -> &DateTime<Utc> {
        &self.0
    }
}

impl AsMut<DateTime<Utc>> for ValidDate {
    fn as_mut(&mut self) -> &mut DateTime<Utc> {
        &mut self.0
    }
}

impl FromStr for ValidDate {
    type Err = OpaqueError;

    /// Tries, in order: RFC 3339 (ISO 8601), RFC 2822 (RFC 822), RFC 850.
    /// The result is normalised to UTC.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.trim();
        let date = DateTime::parse_from_rfc3339(s)
            .or_else(|_| DateTime::parse_from_rfc2822(s))
            .or_else(|_| datetime_from_rfc_850(s))
            .with_context(|| "Failed to parse date")?;

        Ok(ValidDate(date.with_timezone(&Utc)))
    }
}

impl Display for ValidDate {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // `write!` rather than `writeln!`: a trailing newline is never valid
        // inside a header value.
        write!(f, "{}", self.0)
    }
}

/// Parses an RFC 850 date such as `Sunday, 06-Nov-94 08:49:37 GMT`.
///
/// The time-zone abbreviation following the time is resolved through
/// `offset_from_abbreviation`.
fn datetime_from_rfc_850(s: &str) -> Result<DateTime<FixedOffset>, OpaqueError> {
    let (naive_date_time, remainder) = NaiveDateTime::parse_and_remainder(s, "%A, %d-%b-%y %T")
        .with_context(|| "failed to parse naive datetime")?;

    let fixed_offset = offset_from_abbreviation(remainder)?;

    // The parsed wall-clock time is local to `fixed_offset`; it must not be
    // reinterpreted as a UTC time.
    naive_date_time
        .and_local_timezone(fixed_offset)
        .single()
        .ok_or_else(|| OpaqueError::from_display("invalid local datetime for offset"))
}

/// Resolves a time-zone abbreviation (surrounding whitespace ignored) to its offset.
///
/// NOTE(review): negative offsets in the table are spelled with U+2212 (MINUS
/// SIGN); this relies on chrono's `FixedOffset` parser accepting it — confirm.
fn offset_from_abbreviation(remainder: &str) -> Result<FixedOffset, OpaqueError> {
    get_timezone_map()
        .get(remainder.trim())
        .ok_or_else(|| OpaqueError::from_display(format!("invalid abbreviation: {}", remainder)))?
        .parse()
        .with_context(|| "failed to parse timezone abbreviation")
}

static TIMEZONE_MAP: OnceLock<HashMap<&'static str, &'static str>> = OnceLock::new();

fn get_timezone_map() -> &'static HashMap<&'static str, &'static str> {
    TIMEZONE_MAP.get_or_init(|| {
        let mut map = HashMap::new();
        map.insert("ACDT", "+1030");
        map.insert("ACST", "+0930");
        map.insert("ACT", "−0500");
        map.insert("ACWST", "+0845");
        map.insert("ADT", "−0300");
        map.insert("AEDT", "+1100");
        map.insert("AEST", "+1000");
        map.insert("AFT", "+0430");
        map.insert("AKDT", "−0800");
        map.insert("AKST", "−0900");
        map.insert("ALMT", "+0600");
        map.insert("AMST", "−0300");
        map.insert("AMT", "+0400");
        map.insert("ANAT", "+1200");
        map.insert("AQTT", "+0500");
        map.insert("ART", "−0300");
        map.insert("AST", "−0400");
        map.insert("AWST", "+0800");
        map.insert("AZOST", "+0000");
        map.insert("AZOT", "−0100");
        map.insert("AZT", "+0400");
        map.insert("BIOT", "+0600");
        map.insert("BIT", "−1200");
        map.insert("BNT", "+0800");
        map.insert("BOT", "−0400");
        map.insert("BRST", "−0200");
        map.insert("BRT", "−0300");
        map.insert("BST", "+0600");
        map.insert("BTT", "+0600");
        map.insert("CAT", "+0200");
        map.insert("CCT", "+0630");
        map.insert("CDT", "−0500");
        map.insert("CEST", "+0200");
map.insert("CET", "+0100"); + map.insert("CHADT", "+1345"); + map.insert("CHAST", "+1245"); + map.insert("CHOST", "+0900"); + map.insert("CHOT", "+0800"); + map.insert("CHST", "+1000"); + map.insert("CHUT", "+1000"); + map.insert("CIST", "−0800"); + map.insert("CKT", "−1000"); + map.insert("CLST", "−0300"); + map.insert("CLT", "−0400"); + map.insert("COST", "−0400"); + map.insert("COT", "−0500"); + map.insert("CST", "−0600"); + map.insert("CVT", "−0100"); + map.insert("CWST", "+0845"); + map.insert("CXT", "+0700"); + map.insert("DAVT", "+0700"); + map.insert("DDUT", "+1000"); + map.insert("DFT", "+0100"); + map.insert("EASST", "−0500"); + map.insert("EAST", "−0600"); + map.insert("EAT", "+0300"); + map.insert("ECT", "−0500"); + map.insert("EDT", "−0400"); + map.insert("EEST", "+0300"); + map.insert("EET", "+0200"); + map.insert("EGST", "+0000"); + map.insert("EGT", "−0100"); + map.insert("EST", "−0500"); + map.insert("FET", "+0300"); + map.insert("FJT", "+1200"); + map.insert("FKST", "−0300"); + map.insert("FKT", "−0400"); + map.insert("FNT", "−0200"); + map.insert("GALT", "−0600"); + map.insert("GAMT", "−0900"); + map.insert("GET", "+0400"); + map.insert("GFT", "−0300"); + map.insert("GILT", "+1200"); + map.insert("GIT", "−0900"); + map.insert("GMT", "+0000"); + map.insert("GST", "+0400"); + map.insert("GYT", "−0400"); + map.insert("HAEC", "+0200"); + map.insert("HDT", "−0900"); + map.insert("HKT", "+0800"); + map.insert("HMT", "+0500"); + map.insert("HOVST", "+0800"); + map.insert("HOVT", "+0700"); + map.insert("HST", "−1000"); + map.insert("ICT", "+0700"); + map.insert("IDLW", "−1200"); + map.insert("IDT", "+0300"); + map.insert("IOT", "+0600"); + map.insert("IRDT", "+0430"); + map.insert("IRKT", "+0800"); + map.insert("IRST", "+0330"); + map.insert("IST", "+0530"); + map.insert("JST", "+0900"); + map.insert("KALT", "+0200"); + map.insert("KGT", "+0600"); + map.insert("KOST", "+1100"); + map.insert("KRAT", "+0700"); + map.insert("KST", "+0900"); + 
map.insert("LHST", "+1030"); + map.insert("LINT", "+1400"); + map.insert("MAGT", "+1200"); + map.insert("MART", "−0930"); + map.insert("MAWT", "+0500"); + map.insert("MDT", "−0600"); + map.insert("MEST", "+0200"); + map.insert("MET", "+0100"); + map.insert("MHT", "+1200"); + map.insert("MIST", "+1100"); + map.insert("MIT", "−0930"); + map.insert("MMT", "+0630"); + map.insert("MSK", "+0300"); + map.insert("MST", "+0800"); + map.insert("MUT", "+0400"); + map.insert("MVT", "+0500"); + map.insert("MYT", "+0800"); + map.insert("NCT", "+1100"); + map.insert("NDT", "−0230"); + map.insert("NFT", "+1100"); + map.insert("NOVT", "+0700"); + map.insert("NPT", "+0545"); + map.insert("NST", "−0330"); + map.insert("NT", "−0330"); + map.insert("NUT", "−1100"); + map.insert("NZDST", "+1300"); + map.insert("NZDT", "+1300"); + map.insert("NZST", "+1200"); + map.insert("OMST", "+0600"); + map.insert("ORAT", "+0500"); + map.insert("PDT", "−0700"); + map.insert("PET", "−0500"); + map.insert("PETT", "+1200"); + map.insert("PGT", "+1000"); + map.insert("PHOT", "+1300"); + map.insert("PHST", "+0800"); + map.insert("PHT", "+0800"); + map.insert("PKT", "+0500"); + map.insert("PMDT", "−0200"); + map.insert("PMST", "−0300"); + map.insert("PONT", "+1100"); + map.insert("PST", "−0800"); + map.insert("PWT", "+0900"); + map.insert("PYST", "−0300"); + map.insert("PYT", "−0400"); + map.insert("RET", "+0400"); + map.insert("ROTT", "−0300"); + map.insert("SAKT", "+1100"); + map.insert("SAMT", "+0400"); + map.insert("SAST", "+0200"); + map.insert("SBT", "+1100"); + map.insert("SCT", "+0400"); + map.insert("SDT", "−1000"); + map.insert("SGT", "+0800"); + map.insert("SLST", "+0530"); + map.insert("SRET", "+1100"); + map.insert("SRT", "−0300"); + map.insert("SST", "−1100"); + map.insert("SYOT", "+0300"); + map.insert("TAHT", "−1000"); + map.insert("TFT", "+0500"); + map.insert("THA", "+0700"); + map.insert("TJT", "+0500"); + map.insert("TKT", "+1300"); + map.insert("TLT", "+0900"); + map.insert("TMT", 
"+0500"); + map.insert("TOT", "+1300"); + map.insert("TRT", "+0300"); + map.insert("TST", "+0800"); + map.insert("TVT", "+1200"); + map.insert("ULAST", "+0900"); + map.insert("ULAT", "+0800"); + map.insert("UTC", "+0000"); + map.insert("UYST", "−0200"); + map.insert("UYT", "−0300"); + map.insert("UZT", "+0500"); + map.insert("VET", "−0400"); + map.insert("VLAT", "+1000"); + map.insert("VOLT", "+0300"); + map.insert("VOST", "+0600"); + map.insert("VUT", "+1100"); + map.insert("WAKT", "+1200"); + map.insert("WAST", "+0200"); + map.insert("WAT", "+0100"); + map.insert("WEST", "+0100"); + map.insert("WET", "+0000"); + map.insert("WGST", "−0200"); + map.insert("WGT", "−0300"); + map.insert("WIB", "+0700"); + map.insert("WIT", "+0900"); + map.insert("WITA", "+0800"); + map.insert("WST", "+0800"); + map.insert("YAKT", "+0900"); + map.insert("YEKT", "+0500"); + map + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! test_valid_date_strings { + ($($str:literal),+) => { + $(assert!(ValidDate::from_str($str).is_ok(), + "'{}': {:?}", + $str, ValidDate::from_str($str).err());)+ + }; + } + + macro_rules! 
test_invalid_date_strings { + ($($str:literal),+) => { + $(assert!(ValidDate::from_str($str).is_err());)+ + }; + } + + #[test] + fn test_valid_rfc_822() { + test_valid_date_strings!( + "Wed, 02 Oct 2002 08:00:00 EST", + "Wed, 02 Oct 2002 13:00:00 GMT", + "Wed, 02 Oct 2002 15:00:00 +0200", + "Mon, 11 Mar 2019 01:57:00 EST", + "11 Mar 2019 01:57:23 EDT", + "Mon, 11 Mar 2019 01:57:00 -0500", + "Mon, 11 Mar 2019 01:57 A", + "11 Mar 2019 01:00 N", + "11 Mar 2019 01:59 A", + "Mon, 11 Mar 2019 02:00 Z", + "Mon, 11 Mar 2019 02:00:34 Z", + "11 Mar 2019 02:00 PST" + ); + } + + #[test] + fn test_valid_rfc_850() { + test_valid_date_strings!( + "Sunday, 04-Feb-24 23:59:59 GMT", + "Monday, 29-Feb-88 12:34:56 UTC", + "Tuesday, 01-Jan-80 00:00:00 EST", + "Friday, 31-Dec-99 23:59:59 CST", + "Thursday, 24-Feb-00 23:59:59 MST", + "Friday, 01-Mar-19 00:00:01 PST", + "Saturday, 31-Oct-20 13:45:30 EDT", + "Wednesday, 27-Jun-12 23:59:60 CDT", + "Monday, 03-Sep-01 01:02:03 CET", + "Tuesday, 15-Aug-95 18:00:00 PDT" + ); + } + + #[test] + fn test_valid_iso_8601() { + test_valid_date_strings!( + "2025-02-02T14:30:00+00:00", + "2023-06-15T23:59:59-05:00", + "2019-12-31T12:00:00+08:45", + "2020-02-29T00:00:00Z", + "2024-10-10T10:10:10+02:00", + "2022-07-01T16:45:30-07:00", + "2018-01-01T09:00:00+09:30", + "2030-05-20T05:05:05+05:30", + "1999-12-31T23:59:59-03:00", + "2045-11-11T11:11:11+14:00" + ); + } + + #[test] + fn test_invalid_date_times() { + test_invalid_date_strings!( + "2025-02-30T14:30:00+00:00", + "2023-06-15T25:00:00-05:00", + "2019-12-31T12:60:00+08:45", + "2020-02-29T00:00:00", + "Thu, 32 Dec 2023 10:00:00 +0200", + "Mon, 15 Jan 2023 23:59:60 -0500", + "2024-10-10T10:10:10", + "2022-07-01T16:45:30 UTC", + "2018-01-01T09:00:00+09:75", + "2030-05-20T05:05:05+24:00", + "1999-12-31 23:59:59 -03:00", + "2045-11-11T11:11:11 EST" + ); + } +}