chore(page): fix crawling initial redirect
j-mendez committed Jan 5, 2025
1 parent 5abc185 commit 2734b93
Showing 8 changed files with 19 additions and 18 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.24.4"
version = "2.24.5"
authors = [
"j-mendez <[email protected]>"
]
13 changes: 7 additions & 6 deletions spider/src/page.rs
@@ -171,13 +171,13 @@ pub struct Page {
pub extra_ai_data: Option<Vec<AIResults>>,
/// The links found on the page. This includes all links that have an href url.
pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>,
-/// The request should retry
+/// The request should retry.
pub should_retry: bool,
/// A WAF was found on the page.
pub waf_check: bool,
/// The total byte transferred for the page. Mainly used for chrome events. Inspect the content for bytes when using http instead.
pub bytes_transferred: Option<f64>,
-/// The page was blocked from crawling usual from using website::on_should_crawl_callback
+/// The page was blocked from crawling usual from using website::on_should_crawl_callback.
pub blocked_crawl: bool,
}

@@ -214,11 +214,11 @@ pub struct Page {
pub extra_ai_data: Option<Vec<AIResults>>,
/// The links found on the page. Unused until we can structure the buffers to match.
pub page_links: Option<Box<HashSet<CaseInsensitiveString>>>,
-/// The request should retry
+/// The request should retry.
pub should_retry: bool,
/// A WAF was found on the page.
pub waf_check: bool,
-/// The page was blocked from crawling usual from using website::on_should_crawl_callback
+/// The page was blocked from crawling usual from using website::on_should_crawl_callback.
pub blocked_crawl: bool,
}
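For orientation, the flags documented above are plain status fields on a crawled page. The snippet below is only an illustrative sketch against a simplified local mirror of those fields, not the spider crate's actual API, showing how a caller might branch on them:

```rust
// Simplified stand-in for the fields documented above; the real
// `spider::page::Page` struct carries many more fields.
struct PageStatus {
    should_retry: bool,
    waf_check: bool,
    blocked_crawl: bool,
    bytes_transferred: Option<f64>,
}

// Decide what to do with a fetched page based on its status flags.
fn handle_page(page: &PageStatus) -> &'static str {
    if page.blocked_crawl {
        "skipped: blocked by on_should_crawl_callback"
    } else if page.waf_check {
        "blocked: a WAF was detected on the page"
    } else if page.should_retry {
        "retry: the request should be re-attempted"
    } else {
        "ok"
    }
}

fn main() {
    let page = PageStatus {
        should_retry: false,
        waf_check: false,
        blocked_crawl: true,
        bytes_transferred: Some(12_288.0),
    };
    println!(
        "{} ({:?} bytes transferred)",
        handle_page(&page),
        page.bytes_transferred
    );
}
```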

@@ -657,9 +657,10 @@ impl Page {

let target_url = res.url().as_str();

-// handle redirects
-if url != target_url && !exact_url_match(&url, &target_url) {
+// handle initial redirects
+if ssg_map.is_some() && url != target_url && !exact_url_match(&url, &target_url) {
let mut url = Box::new(CaseInsensitiveString::new(&url));

modify_selectors(
prior_domain,
target_url,
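The functional change in this file is the added `ssg_map.is_some()` guard: the redirect handling (selector modification) now only runs when an SSG map is present for the crawl, instead of on every URL mismatch. Below is a minimal, self-contained sketch of the new condition; `exact_url_match` here is a simplified stand-in for the crate's helper and `ssg_map_present` models the `is_some()` check, so treat the details as assumptions rather than the crate's internals.

```rust
// Stand-in comparison: treat a trailing-slash difference as a match.
fn exact_url_match(url: &str, target_url: &str) -> bool {
    url.trim_end_matches('/') == target_url.trim_end_matches('/')
}

/// Returns true when the initial-redirect handling (selector modification)
/// should run for a response that landed on `target_url` after requesting `url`.
fn should_handle_initial_redirect(
    ssg_map_present: bool,
    url: &str,
    target_url: &str,
) -> bool {
    // New behavior: only act on the redirect when an SSG map is in play.
    ssg_map_present && url != target_url && !exact_url_match(url, target_url)
}

fn main() {
    // Redirect with an SSG map present: selectors get modified.
    assert!(should_handle_initial_redirect(
        true,
        "https://example.com",
        "https://example.com/home"
    ));
    // Same redirect without an SSG map: previously handled, now skipped.
    assert!(!should_handle_initial_redirect(
        false,
        "https://example.com",
        "https://example.com/home"
    ));
    println!("guard behaves as expected");
}
```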
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.24.4"
version = "2.24.5"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.24.4"
version = "2.24.5"
authors = [
"j-mendez <[email protected]>"
]
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.24.4"
version = "2.24.5"
authors = [
"j-mendez <[email protected]>"
]
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.24.4"
version = "2.24.5"
authors = [
"j-mendez <[email protected]>"
]
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.24.4"
version = "2.24.5"
authors = [
"j-mendez <[email protected]>"
]
