Skip to content

Commit

Permalink
feat(page): add optional page base passing
Browse files (browse the repository at this point in the history)
  • Loading branch information
j-mendez committed Jan 11, 2025
1 parent 09688a1 commit f2c4e9b
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 46 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.24.15"
version = "2.25.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
95 changes: 68 additions & 27 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ pub struct Page {
}

/// Validate link and push into the map
pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
pub(crate) fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
base: &Option<&Url>,
href: &str,
map: &mut HashSet<A>,
Expand Down Expand Up @@ -307,7 +307,7 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
}

/// get the clean domain name
pub fn domain_name(domain: &Url) -> &str {
pub(crate) fn domain_name(domain: &Url) -> &str {
domain.host_str().unwrap_or_default()
}

Expand Down Expand Up @@ -335,7 +335,7 @@ fn is_subdomain(subdomain: &str, domain: &str) -> bool {
}

/// Validation to match a domain to the parent host and the top-level redirect for the crawl, with 'parent_host' and 'base_host' being the input start domain.
pub fn parent_host_match(
pub(crate) fn parent_host_match(
host_name: Option<&str>,
base_domain: &str, // the base domain input
parent_host: &CompactString, // the main parent host
Expand All @@ -357,7 +357,11 @@ pub fn parent_host_match(
}

/// HTML selector for valid web pages for the domain.
pub fn get_page_selectors_base(u: &Url, subdomains: bool, tld: bool) -> Option<RelativeSelectors> {
pub(crate) fn get_page_selectors_base(
u: &Url,
subdomains: bool,
tld: bool,
) -> Option<RelativeSelectors> {
let u = convert_abs_url_base(u);

let b = match u.host_str() {
Expand Down Expand Up @@ -1334,6 +1338,7 @@ impl Page {
selectors: &RelativeSelectors,
xml: &str,
map: &mut HashSet<A>,
base: &Option<Box<Url>>,
) {
use quick_xml::events::Event;
use quick_xml::reader::NsReader;
Expand All @@ -1356,8 +1361,13 @@ impl Page {
None
};

self.set_url_parsed_direct_empty();
let base = self.get_url_parsed_ref().as_ref();
let base = if base.is_some() {
base.as_deref()
} else {
self.set_url_parsed_direct_empty();
let base = self.get_url_parsed_ref().as_ref();
base
};

loop {
match reader.read_event_into_async(&mut buf).await {
Expand Down Expand Up @@ -1423,6 +1433,7 @@ impl Page {
&mut self,
selectors: &RelativeSelectors,
html: &str,
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map: HashSet<A> = HashSet::new();
let mut links_pages = if self.page_links.is_some() {
Expand All @@ -1433,15 +1444,19 @@ impl Page {

if !html.is_empty() {
if html.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, html, &mut map)
self.links_stream_xml_links_stream_base(selectors, html, &mut map, base)
.await;
} else {
let parent_host = &selectors.1[0];
// the host schemes
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2; // the domain after redirects
let sub_matcher = &selectors.0;
let base = self.get_url_parsed_ref().as_ref();
let base = if base.is_some() {
base.as_deref()
} else {
self.get_url_parsed_ref().as_ref()
};

let rewriter_settings = lol_html::Settings {
element_content_handlers: vec![lol_html::element!("a[href]", |el| {
Expand Down Expand Up @@ -1513,6 +1528,7 @@ impl Page {
selectors: &RelativeSelectors,
html: &str,
client: &Client,
base: &Option<Box<Url>>,
) -> HashSet<A> {
use auto_encoder::auto_encode_bytes;

Expand All @@ -1526,7 +1542,7 @@ impl Page {

if !html.is_empty() {
if html.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, html, &mut map)
self.links_stream_xml_links_stream_base(selectors, html, &mut map, base)
.await;
} else {
let cell = tokio::sync::OnceCell::new();
Expand All @@ -1538,8 +1554,11 @@ impl Page {
let base_input_domain = &selectors.2; // the domain after redirects
let sub_matcher = &selectors.0;

self.set_url_parsed_direct_empty();
let base = self.get_url_parsed_ref().as_ref();
let base = if base.is_some() {
base.as_deref()
} else {
self.get_url_parsed_ref().as_ref()
};

let rewriter_settings = lol_html::Settings {
element_content_handlers: vec![
Expand Down Expand Up @@ -1655,11 +1674,12 @@ impl Page {
&mut self,
selectors: &RelativeSelectors,
client: &Client,
prior_domain: &Option<Box<Url>>,
) -> HashSet<A> {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
Default::default()
} else {
self.links_stream_base_ssg(selectors, &Box::new(self.get_html()), client)
self.links_stream_base_ssg(selectors, &Box::new(self.get_html()), client, prior_domain)
.await
}
}
Expand All @@ -1671,11 +1691,12 @@ impl Page {
&mut self,
selectors: &RelativeSelectors,
client: &Client,
prior_domain: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
true => {
self.links_stream_ssg::<CaseInsensitiveString>(selectors, client)
self.links_stream_ssg::<CaseInsensitiveString>(selectors, client, prior_domain)
.await
}
}
Expand All @@ -1689,11 +1710,12 @@ impl Page {
>(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<A> {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
Default::default()
} else {
self.links_stream_base(selectors, &Box::new(self.get_html()))
self.links_stream_base(selectors, &Box::new(self.get_html()), base)
.await
}
}
Expand All @@ -1714,6 +1736,7 @@ impl Page {
browser: &std::sync::Arc<chromiumoxide::Browser>,
configuration: &crate::configuration::Configuration,
context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
base: &Option<Box<Url>>,
) -> HashSet<A> {
use crate::utils::spawn_task;
use auto_encoder::auto_encode_bytes;
Expand All @@ -1732,7 +1755,7 @@ impl Page {
let html_resource = Box::new(self.get_html());

if html_resource.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, &html_resource, &mut map)
self.links_stream_xml_links_stream_base(selectors, &html_resource, &mut map, &base)
.await;
} else {
let (tx, rx) = tokio::sync::oneshot::channel();
Expand All @@ -1745,8 +1768,6 @@ impl Page {

let external_domains_caseless = self.external_domains_caseless.clone();

self.set_url_parsed_direct_empty();
let base = self.get_url_parsed_ref();
let base1 = base.clone();

let rerender = AtomicBool::new(false);
Expand Down Expand Up @@ -1788,7 +1809,7 @@ impl Page {
element!("a[href]", |el| {
if let Some(href) = el.get_attribute("href") {
push_link(
&base.as_ref(),
&base.as_deref(),
&href,
&mut inner_map,
&selectors.0,
Expand Down Expand Up @@ -1930,6 +1951,7 @@ impl Page {
Some(h) => auto_encode_bytes(&h),
_ => Default::default(),
},
&base,
)
.await;
map.extend(extended_map)
Expand Down Expand Up @@ -1968,6 +1990,7 @@ impl Page {
>(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<A> {
let mut map = HashSet::new();
let mut links_pages = if self.page_links.is_some() {
Expand All @@ -1980,7 +2003,7 @@ impl Page {
let html = Box::new(self.get_html());

if html.starts_with("<?xml") {
self.links_stream_xml_links_stream_base(selectors, &html, &mut map)
self.links_stream_xml_links_stream_base(selectors, &html, &mut map, base)
.await;
} else {
// let base_domain = &selectors.0;
Expand All @@ -1990,8 +2013,14 @@ impl Page {
let base_input_domain = &selectors.2; // the domain after redirects
let sub_matcher = &selectors.0;

self.set_url_parsed_direct_empty();
let base = self.get_url_parsed_ref();
let base = if base.is_some() {
base.as_deref()
} else {
self.set_url_parsed_direct_empty();
let base = self.get_url_parsed_ref().as_ref();
base
};

let external_domains_caseless = self.external_domains_caseless.clone();

let base_links_settings =
Expand All @@ -2003,7 +2032,7 @@ impl Page {
};
if let Some(href) = el.get_attribute(attribute) {
push_link(
&base.as_ref(),
&base,
&href,
&mut map,
&selectors.0,
Expand Down Expand Up @@ -2060,11 +2089,12 @@ impl Page {
>(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<A> {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
Default::default()
} else {
self.links_stream_full_resource(selectors).await
self.links_stream_full_resource(selectors, base).await
}
}

Expand All @@ -2083,10 +2113,17 @@ impl Page {
/// Find all href links and return them using CSS selectors.
#[cfg(not(feature = "decentralized"))]
#[inline(always)]
pub async fn links(&mut self, selectors: &RelativeSelectors) -> HashSet<CaseInsensitiveString> {
pub async fn links(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
true => self.links_stream::<CaseInsensitiveString>(selectors).await,
true => {
self.links_stream::<CaseInsensitiveString>(selectors, base)
.await
}
}
}

Expand All @@ -2096,14 +2133,15 @@ impl Page {
pub async fn links_full(
&mut self,
selectors: &RelativeSelectors,
base: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
true => {
if auto_encoder::is_binary_file(self.get_html_bytes_u8()) {
return Default::default();
}
self.links_stream_full_resource::<CaseInsensitiveString>(selectors)
self.links_stream_full_resource::<CaseInsensitiveString>(selectors, base)
.await
}
}
Expand All @@ -2118,6 +2156,7 @@ impl Page {
page: &std::sync::Arc<chromiumoxide::Browser>,
configuration: &crate::configuration::Configuration,
context_id: &Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
base: &Option<Box<Url>>,
) -> HashSet<CaseInsensitiveString> {
match self.html.is_some() {
false => Default::default(),
Expand All @@ -2130,6 +2169,7 @@ impl Page {
page,
configuration,
context_id,
base,
)
.await
}
Expand Down Expand Up @@ -2177,6 +2217,7 @@ pub fn get_html_encoded(html: &Option<Bytes>, _label: &str) -> String {
}

#[cfg(test)]
/// The test user agent.
pub const TEST_AGENT_NAME: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));

#[cfg(all(
Expand Down Expand Up @@ -2229,7 +2270,7 @@ async fn parse_links() {
let link_result = "https://choosealicense.com/";
let mut page = Page::new(link_result, &client).await;
let selector = get_page_selectors(link_result, false, false);
let links = page.links(&selector.unwrap()).await;
let links = page.links(&selector.unwrap(), &None).await;

assert!(
links.contains::<CaseInsensitiveString>(&"https://choosealicense.com/about/".into()),
Expand Down
Loading

0 comments on commit f2c4e9b

Please sign in to comment.