From 9678ffa85e92d2a88f331e1abff1ed20ef4b3d38 Mon Sep 17 00:00:00 2001 From: Sergey Brovko Date: Wed, 23 Sep 2020 16:29:43 +0300 Subject: [PATCH 1/4] UserAgent in parameters --- README.md | 11 +++++++++++ bin/wayback_machine_downloader | 4 ++++ lib/wayback_machine_downloader.rb | 5 +++-- lib/wayback_machine_downloader/archive_api.rb | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2bb9ea6..a6553a3 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ It will download the last version of every file present on Wayback Machine to `. -p, --maximum-snapshot NUMBER Maximum snapshot pages to consider (Default is 100) Count an average of 150,000 snapshots per page -l, --list Only list file urls in a JSON format with the archived timestamps, won't download anything + -u, --user-agent STRING UserAgent for connection (Default is WayBack Machine Downloader) ## Specify directory to save files to @@ -175,6 +176,16 @@ Example: wayback_machine_downloader http://example.com --concurrency 20 +## Specify UserAgent for connection + + -u, --user-agent STRING + +UserAgent for connection (Default is WayBack Machine Downloader) + +Example: + + wayback_machine_downloader http://example.com --user-agent "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0" + ## Using the Docker image As an alternative installation way, we have a Docker image! 
Retrieve the wayback-machine-downloader Docker image this way: diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 8b9f2fd..05b9c89 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -58,6 +58,10 @@ option_parser = OptionParser.new do |opts| options[:list] = true end + opts.on("-u", "--user-agent STRING", String, "UserAgent for connection (Default is WayBack Machine Downloader)") do |t| + options[:user_agent] = t + end + opts.on("-v", "--version", "Display version") do |t| options[:version] = t end diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 04005c8..26bdff6 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -18,7 +18,7 @@ class WaybackMachineDownloader attr_accessor :base_url, :exact_url, :directory, :all_timestamps, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, - :all, :maximum_pages, :threads_count + :all, :maximum_pages, :threads_count, :user_agent def initialize params @base_url = params[:base_url] @@ -32,6 +32,7 @@ def initialize params @all = params[:all] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i + @user_agent = params[:user_agent] ? 
params[:user_agent] : "WayBack Machine Downloader" end def backup_name @@ -268,7 +269,7 @@ def download_file file_remote_info structure_dir_path dir_path open(file_path, "wb") do |file| begin - URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri| + URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain", "User-Agent" => @user_agent) do |uri| file.write(uri.read) end rescue OpenURI::HTTPError => e diff --git a/lib/wayback_machine_downloader/archive_api.rb b/lib/wayback_machine_downloader/archive_api.rb index 903f42b..e641d44 100644 --- a/lib/wayback_machine_downloader/archive_api.rb +++ b/lib/wayback_machine_downloader/archive_api.rb @@ -5,7 +5,7 @@ def get_raw_list_from_api url, page_index request_url += url request_url += parameters_for_api page_index - URI.open(request_url).read + URI.open(request_url, "User-Agent" => @user_agent).read end def parameters_for_api page_index From 7f0b707c2bf36ba38455f711d3a9a597871178aa Mon Sep 17 00:00:00 2001 From: Sergey Brovko Date: Wed, 23 Sep 2020 16:35:28 +0300 Subject: [PATCH 2/4] https link (to avoid 403 error on http scheme) --- lib/wayback_machine_downloader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 26bdff6..613ce96 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -269,7 +269,7 @@ def download_file file_remote_info structure_dir_path dir_path open(file_path, "wb") do |file| begin - URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain", "User-Agent" => @user_agent) do |uri| + URI.open("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain", "User-Agent" => @user_agent) do |uri| file.write(uri.read) end rescue OpenURI::HTTPError => e From e8c78ccb1f8b1b3dfb082716745ed92a9cf2a9d9 Mon Sep 17 00:00:00 2001 From: 
Sergey Brovko Date: Thu, 5 Nov 2020 17:58:29 +0300 Subject: [PATCH 3/4] change default useragent to Firefox 80 / Windows 10 --- lib/wayback_machine_downloader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wayback_machine_downloader.rb b/lib/wayback_machine_downloader.rb index 613ce96..226a90f 100644 --- a/lib/wayback_machine_downloader.rb +++ b/lib/wayback_machine_downloader.rb @@ -32,7 +32,7 @@ def initialize params @all = params[:all] @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = params[:threads_count].to_i - @user_agent = params[:user_agent] ? params[:user_agent] : "WayBack Machine Downloader" + @user_agent = params[:user_agent] ? params[:user_agent] : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0" end def backup_name From 8ed636feb5d3412f3e6e7e3fb17fb1a91c6aa4eb Mon Sep 17 00:00:00 2001 From: Sergey Brovko Date: Fri, 6 Nov 2020 16:58:37 +0300 Subject: [PATCH 4/4] chrome UA in example usage / fix UA in readme and bin --- README.md | 6 +++--- bin/wayback_machine_downloader | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a6553a3..302677b 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ It will download the last version of every file present on Wayback Machine to `. 
-p, --maximum-snapshot NUMBER Maximum snapshot pages to consider (Default is 100) Count an average of 150,000 snapshots per page -l, --list Only list file urls in a JSON format with the archived timestamps, won't download anything - -u, --user-agent STRING UserAgent for connection (Default is WayBack Machine Downloader) + -u, --user-agent STRING UserAgent for connection (Default is Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0) ## Specify directory to save files to @@ -180,11 +180,11 @@ Example: -u, --user-agent STRING -UserAgent for connection (Default is WayBack Machine Downloader) +UserAgent for connection (Default is Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0) Example: - wayback_machine_downloader http://example.com --user-agent "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0" + wayback_machine_downloader http://example.com --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36" ## Using the Docker image diff --git a/bin/wayback_machine_downloader b/bin/wayback_machine_downloader index 05b9c89..e1e53a4 100755 --- a/bin/wayback_machine_downloader +++ b/bin/wayback_machine_downloader @@ -58,7 +58,7 @@ option_parser = OptionParser.new do |opts| options[:list] = true end - opts.on("-u", "--user-agent STRING", String, "UserAgent for connection (Default is WayBack Machine Downloader)") do |t| + opts.on("-u", "--user-agent STRING", String, "UserAgent for connection (Default is Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0)") do |t| options[:user_agent] = t end