Skip to content

Commit

Permalink
remove reliance on <base href> in post content (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
delan committed Oct 6, 2024
1 parent aea0158 commit 517fbfe
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 6 deletions.
50 changes: 45 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
use std::{cmp::Ordering, collections::BTreeSet, fs::File, io::Read, sync::LazyLock};
use std::{
cmp::Ordering,
collections::{BTreeMap, BTreeSet},
fs::File,
io::Read,
sync::LazyLock,
};

use askama::Template;
use jane_eyre::eyre::{self, Context, OptionExt};
use markup5ever_rcdom::RcDom;
use markup5ever_rcdom::{NodeData, RcDom};
use serde::Deserialize;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};
use xml5ever::QualName;

use crate::{
dom::serialize,
dom::{serialize, QualNameExt, TendrilExt, Transform},
meta::extract_metadata,
path::{PostsPath, SitePath},
path::{parse_path_relative_scheme_less_url_string, PostsPath, SitePath},
settings::Settings,
};

Expand Down Expand Up @@ -298,9 +305,42 @@ impl TemplatedPost {
pub fn filter(unsafe_html: &str, path: Option<PostsPath>) -> eyre::Result<Self> {
// reader step: extract metadata.
let post = extract_metadata(unsafe_html)?;
let extracted_html = serialize(post.dom)?;

// reader step: fix relative urls.
let affected_attrs = BTreeMap::from([
(
QualName::html("a"),
BTreeSet::from([QualName::attribute("href")]),
),
(
QualName::html("img"),
BTreeSet::from([QualName::attribute("src")]),
),
]);
let mut transform = Transform::new(post.dom.document.clone());
while transform.next(|kids, new_kids| {
for kid in kids {
if let NodeData::Element { name, attrs, .. } = &kid.data {
if let Some(attr_names) = affected_attrs.get(name) {
for attr in attrs.borrow_mut().iter_mut() {
if attr_names.contains(&attr.name) {
if let Some(url) =
parse_path_relative_scheme_less_url_string(attr.value.to_str())
{
// TODO: clean this up and move logic into path module
attr.value = format!("{}{}", SETTINGS.base_url, url).into();
}
}
}
}
}
new_kids.push(kid.clone());
}
Ok(())
})? {}

// reader step: filter html.
let extracted_html = serialize(post.dom)?;
let safe_html = ammonia::Builder::default()
.add_generic_attributes(["style", "id"])
.add_generic_attributes(["data-cohost-href", "data-cohost-src"]) // cohost2autost
Expand Down
127 changes: 127 additions & 0 deletions src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::{
};

use jane_eyre::eyre::{self, bail, Context};
use url::Url;

use crate::SETTINGS;

Expand Down Expand Up @@ -346,3 +347,129 @@ pub fn hard_link_if_not_exists(

Ok(())
}

/// if the given string is a “path-relative-scheme-less-URL string”, returns that string after
/// the initial C0/space/tab/newline stripping, otherwise returns None.
///
/// any query or fragment is left attached to the returned string.
///
/// - `foo/bar` → `Some("foo/bar")`
/// - `foo.png` → `Some("foo.png")`
/// - `/foo/bar` → `None` (path-absolute)
/// - `//host/foo` → `None` (scheme-relative)
/// - `foo:/bar` → `None` (absolute)
/// - empty (after stripping) → `None`
///
/// <https://url.spec.whatwg.org/#path-relative-scheme-less-url-string>
pub fn parse_path_relative_scheme_less_url_string(url: &str) -> Option<String> {
    // is it a “relative-URL string”? (case “Otherwise”)
    // <https://url.spec.whatwg.org/#relative-url-string>
    if Url::parse(url) != Err(url::ParseError::RelativeUrlWithoutBase) {
        return None;
    }

    // if so, it may be a “scheme-relative-URL string” or “path-absolute-URL string”, but we can
    // only check for that by running the first few steps of the “basic URL parser” on `url`
    // with an imaginary non-null “*base*”, but no “*encoding*”, “*url*”, or “*state override*”.
    //
    // the imaginary “*base*” in our case has the http or https scheme, so “*base*” does not
    // “have an [opaque path]”, and the scheme is a “special scheme” (which is why `\` is
    // treated like `/` below).
    //
    // <https://url.spec.whatwg.org/#scheme-relative-url-string>
    // <https://url.spec.whatwg.org/#path-absolute-url-string>
    // <https://url.spec.whatwg.org/#concept-basic-url-parser>

    // “Remove any leading and trailing [C0 control or space] from *input*.”
    // note: `trim_matches` removes *all* of them, unlike `strip_prefix`/`strip_suffix`, which
    // only remove a single character.
    let url = url.trim_matches(|c: char| c <= '\x20');

    // “Remove all [ASCII tab or newline] from *input*.”
    let url = url.replace(|c: char| matches!(c, '\x09' | '\x0A' | '\x0D'), "");

    // “Let *state* be *state override* if given, or [scheme start state] otherwise.”
    enum State {
        SchemeStart,
        Scheme,
        NoScheme,
        Relative,
    }
    let mut state = State::SchemeStart;

    // “Let *pointer* be a [pointer] for *input*.”
    let mut pointer = url.chars().peekable();

    // “Keep running the following state machine by switching on state. If after a run
    // pointer points to the EOF code point, go to the next step.”
    // the state machine must also run once *on* the EOF code point, so we loop unconditionally
    // and let every state handle `c == None` (EOF). every path below either advances the
    // pointer, moves to a later state, or returns, so the loop terminates.
    loop {
        // “When a pointer is used, c references the code point the pointer points to as long
        // as it does not point nowhere.” `None` stands for the EOF code point.
        let c = pointer.peek().copied();

        match state {
            State::SchemeStart => {
                if c.is_some_and(|c| c.is_ascii_alphabetic()) {
                    state = State::Scheme;
                } else {
                    state = State::NoScheme;
                    continue; // “decrease pointer by 1”: skip pointer increase
                }
            }
            State::Scheme => match c {
                // “If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.)”
                Some(c) if c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.') => {}
                // “Set url’s scheme to buffer.”
                // we have an “absolute-URL string”.
                Some(':') => return None,
                // “Otherwise, if state override is not given, set buffer to the empty
                // string, state to no scheme state, and start over (from the first code
                // point in input).” this includes reaching EOF without seeing a `:`.
                _ => {
                    state = State::NoScheme;
                    pointer = url.chars().peekable();
                    continue; // stay on the first code point: skip pointer increase
                }
            },
            State::NoScheme => {
                // “Otherwise, if base’s scheme is not "file", set state to relative state
                // and decrease pointer by 1.”
                state = State::Relative;
                continue; // skip pointer increase
            }
            State::Relative => {
                return match c {
                    // the relative slash state can only lead to a “scheme-relative-URL
                    // string” or “path-absolute-URL string”, so there is no need to look
                    // any further than the first slash.
                    Some('/' | '\\') => None,
                    // empty input: there is no path to speak of. returning None here keeps
                    // the long-standing behaviour of leaving empty attribute values alone.
                    None => None,
                    // “Set [...], url’s path to a clone of base’s path, [...].”
                    // we have a “path-relative-scheme-less-URL string”.
                    Some(_) => Some(url),
                };
            }
        }

        // “Otherwise, increase pointer by 1 and continue with the state machine.”
        pointer.next();
    }
}

#[test]
fn test_is_path_relative_scheme_less_url_string() {
    let cases: [(&str, Option<&str>); 14] = [
        // absolute, scheme-relative, and path-absolute urls are rejected.
        (" http://host/absolute?query#fragment", None),
        (" //host/absolute?query#fragment", None),
        (" /absolute?query#fragment", None),
        ("foo:/bar", None),
        ("\\backslash", None),
        // empty input (before or after stripping) is rejected.
        ("", None),
        (" \t\n ", None),
        // path-relative urls are returned after stripping.
        (" relative?query#fragment", Some("relative?query#fragment")),
        ("foo/bar", Some("foo/bar")),
        // regression: input consisting only of scheme characters, so the scheme state hits EOF.
        ("image.png", Some("image.png")),
        // regression: “start over” must not skip the first code point.
        ("a/b", Some("a/b")),
        // regression: “start over” on a multi-byte code point must not panic.
        ("aé", Some("aé")),
        // regression: *all* leading and trailing C0 controls and spaces are stripped.
        ("  \x01 relative \x02  ", Some("relative")),
        // tabs and newlines are removed from the middle of the input.
        ("foo\n/\tbar", Some("foo/bar")),
    ];
    for (input, expected) in cases {
        assert_eq!(
            parse_path_relative_scheme_less_url_string(input).as_deref(),
            expected,
            "input: {input:?}"
        );
    }
}
1 change: 0 additions & 1 deletion templates/threads.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<!doctype html><meta charset="utf-8">
<base href="{{ SETTINGS.base_url }}">{# you can change your base url without rerunning cohost2autost #}
{%~ if let Some(feed_href) = feed_href ~%}<link rel="alternate" type="application/atom+xml" href="{{ feed_href.internal_url() }}">{%~ endif ~%}
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="{{ SETTINGS.base_url }}style.css">
Expand Down

0 comments on commit 517fbfe

Please sign in to comment.