From 517fbfeff15e7471151bcf4519cfc857a4adb310 Mon Sep 17 00:00:00 2001
From: Delan Azabani
Date: Sun, 6 Oct 2024 16:27:03 +0800
Subject: [PATCH] remove reliance on <base href> in post content (#17)

---
review notes (text in this section is not part of the commit message):
- src/path.rs: parse_path_relative_scheme_less_url_string now strips every leading/trailing
  C0 control or space (not just one), restarts from the first code point on “start over”
  (previously `a/b` was rejected and `aé` panicked), handles the EOF code point (previously
  `foo` was rejected), accepts ASCII digits in the scheme state, and no longer calls dbg!.
- markup in angle brackets is missing from this copy of the patch. apart from the subject and
  the helper's return type, affected lines (the `filter` signature, templates/threads.html)
  are reproduced as found, and the blob ids on the `index` lines predate this revision.

 src/lib.rs             |  50 ++++++++++++++--
 src/path.rs            | 130 +++++++++++++++++++++++++++++++++++++++++
 templates/threads.html |   1 -
 3 files changed, 175 insertions(+), 6 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1f2621e..cad5c54 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,15 +1,22 @@
-use std::{cmp::Ordering, collections::BTreeSet, fs::File, io::Read, sync::LazyLock};
+use std::{
+    cmp::Ordering,
+    collections::{BTreeMap, BTreeSet},
+    fs::File,
+    io::Read,
+    sync::LazyLock,
+};
 
 use askama::Template;
 use jane_eyre::eyre::{self, Context, OptionExt};
-use markup5ever_rcdom::RcDom;
+use markup5ever_rcdom::{NodeData, RcDom};
 use serde::Deserialize;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};
+use xml5ever::QualName;
 
 use crate::{
-    dom::serialize,
+    dom::{serialize, QualNameExt, TendrilExt, Transform},
     meta::extract_metadata,
-    path::{PostsPath, SitePath},
+    path::{parse_path_relative_scheme_less_url_string, PostsPath, SitePath},
     settings::Settings,
 };
 
@@ -298,9 +305,42 @@ impl TemplatedPost {
     pub fn filter(unsafe_html: &str, path: Option) -> eyre::Result {
         // reader step: extract metadata.
         let post = extract_metadata(unsafe_html)?;
-        let extracted_html = serialize(post.dom)?;
+
+        // reader step: fix relative urls.
+        let affected_attrs = BTreeMap::from([
+            (
+                QualName::html("a"),
+                BTreeSet::from([QualName::attribute("href")]),
+            ),
+            (
+                QualName::html("img"),
+                BTreeSet::from([QualName::attribute("src")]),
+            ),
+        ]);
+        let mut transform = Transform::new(post.dom.document.clone());
+        while transform.next(|kids, new_kids| {
+            for kid in kids {
+                if let NodeData::Element { name, attrs, .. } = &kid.data {
+                    if let Some(attr_names) = affected_attrs.get(name) {
+                        for attr in attrs.borrow_mut().iter_mut() {
+                            if attr_names.contains(&attr.name) {
+                                if let Some(url) =
+                                    parse_path_relative_scheme_less_url_string(attr.value.to_str())
+                                {
+                                    // TODO: clean this up and move logic into path module
+                                    attr.value = format!("{}{}", SETTINGS.base_url, url).into();
+                                }
+                            }
+                        }
+                    }
+                }
+                new_kids.push(kid.clone());
+            }
+            Ok(())
+        })? {}
 
         // reader step: filter html.
+        let extracted_html = serialize(post.dom)?;
         let safe_html = ammonia::Builder::default()
             .add_generic_attributes(["style", "id"])
             .add_generic_attributes(["data-cohost-href", "data-cohost-src"]) // cohost2autost
diff --git a/src/path.rs b/src/path.rs
index bb5cb53..31e5688 100644
--- a/src/path.rs
+++ b/src/path.rs
@@ -6,6 +6,7 @@ use std::{
 };
 
 use jane_eyre::eyre::{self, bail, Context};
+use url::Url;
 
 use crate::SETTINGS;
 
@@ -346,3 +347,132 @@ pub fn hard_link_if_not_exists(
 
     Ok(())
 }
+
+/// if the given string is a “path-relative-scheme-less-URL string” (as defined by the WHATWG
+/// URL Standard), returns that string after the initial C0/space/tab/newline stripping,
+/// otherwise returns None.
+///
+/// - `foo/bar` → Some("foo/bar")
+/// - `foo` → Some("foo")
+/// - `/foo/bar` → None
+/// - `//host/bar` → None
+/// - `foo:/bar` → None
+pub fn parse_path_relative_scheme_less_url_string(url: &str) -> Option<String> {
+    // is it a “relative-URL string”? (case “Otherwise”)
+    if Url::parse(url) != Err(url::ParseError::RelativeUrlWithoutBase) {
+        return None;
+    }
+
+    // if so, it may be a “scheme-relative-URL string” or “path-absolute-URL string”, but we can
+    // only check for that by running the first few steps of the “basic URL parser” on `url`
+    // with an imaginary non-null “*base*”, but no “*encoding*”, “*url*”, or “*state override*”.
+    //
+    // the imaginary “*base*” in our case has the http or https scheme, so “*base*” does not
+    // “have an [opaque path]”, and the scheme is a “special scheme”.
+
+    // “Remove any leading and trailing [C0 control or space] from *input*.”
+    // note: strip_prefix/strip_suffix would remove at most one code point from each end.
+    let url = url.trim_matches(|c: char| c <= '\x20');
+
+    // “Remove all [ASCII tab or newline] from *input*.”
+    let url = url.replace(['\x09', '\x0A', '\x0D'], "");
+
+    // “Let *state* be *state override* if given, or [scheme start state] otherwise.”
+    enum State {
+        SchemeStart,
+        Scheme,
+        NoScheme,
+        Relative,
+    }
+    let mut state = State::SchemeStart;
+
+    // “Let *pointer* be a [pointer] for *input*.”
+    let mut pointer = url.chars();
+
+    // “Keep running the following state machine by switching on state. If after a run
+    // pointer points to the EOF code point, go to the next step.”
+    loop {
+        // “When a pointer is used, c references the code point the pointer points to as long
+        // as it does not point nowhere.” None stands for the EOF code point, which every state
+        // has to handle too (e.g. `foo` hits EOF while still in scheme state).
+        let c = pointer.clone().next();
+
+        match state {
+            State::SchemeStart => {
+                if c.is_some_and(|c| c.is_ascii_alphabetic()) {
+                    state = State::Scheme;
+                } else {
+                    state = State::NoScheme;
+                    continue; // skip pointer increase
+                }
+            }
+            State::Scheme => match c {
+                Some(c) if c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.') => {
+                    // still inside a potential scheme: do nothing
+                }
+                Some(':') => {
+                    // “Set url’s scheme to buffer.”
+                    // we have an “absolute-URL string”.
+                    return None;
+                }
+                _ => {
+                    // “Otherwise, if state override is not given, set buffer to the empty
+                    // string, state to no scheme state, and start over (from the first code
+                    // point in input).”
+                    state = State::NoScheme;
+                    pointer = url.chars();
+                    continue; // starting over: the pointer must not be increased
+                }
+            },
+            State::NoScheme => {
+                // “Otherwise, if base’s scheme is not "file", set state to relative state
+                // and decrease pointer by 1.”
+                state = State::Relative;
+                continue; // skip pointer increase
+            }
+            State::Relative => {
+                return match c {
+                    // the next state would be relative slash state, so we have a
+                    // “scheme-relative-URL string” or “path-absolute-URL string”.
+                    Some('/' | '\\') => None,
+                    // “Set [...], url’s path to a clone of base’s path, [...].”
+                    // we have a “path-relative-scheme-less-URL string”.
+                    _ => Some(url),
+                };
+            }
+        }
+        // “Otherwise, increase pointer by 1 and continue with the state machine.”
+        pointer.next();
+    }
+}
+
+#[test]
+fn test_is_path_relative_scheme_less_url_string() {
+    assert_eq!(
+        parse_path_relative_scheme_less_url_string(" http://host/absolute?query#fragment"),
+        None
+    );
+    assert_eq!(
+        parse_path_relative_scheme_less_url_string(" //host/absolute?query#fragment"),
+        None
+    );
+    assert_eq!(
+        parse_path_relative_scheme_less_url_string(" /absolute?query#fragment"),
+        None
+    );
+    assert_eq!(
+        parse_path_relative_scheme_less_url_string(" relative?query#fragment").as_deref(),
+        Some("relative?query#fragment")
+    );
+    // regression: strip every leading/trailing C0 control or space, not just one.
+    assert_eq!(
+        parse_path_relative_scheme_less_url_string(" \t relative/path \n").as_deref(),
+        Some("relative/path")
+    );
+    // regression: “start over” restarts from the first code point.
+    assert_eq!(parse_path_relative_scheme_less_url_string("a/b").as_deref(), Some("a/b"));
+    // regression: no panic when a multi-byte code point ends a scheme-like prefix.
+    assert_eq!(parse_path_relative_scheme_less_url_string("aé").as_deref(), Some("aé"));
+    // regression: EOF reached while still in scheme state.
+    assert_eq!(parse_path_relative_scheme_less_url_string("foo").as_deref(), Some("foo"));
+}
diff --git a/templates/threads.html b/templates/threads.html
index f22bbb9..49a91f5 100644
--- a/templates/threads.html
+++ b/templates/threads.html
@@ -1,5 +1,4 @@
-{# you can change your base url without rerunning cohost2autost #}
 {%~ if let Some(feed_href) = feed_href ~%}{%~ endif ~%}