-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwget-mirror-website.sh
executable file
·122 lines (104 loc) · 3.08 KB
/
wget-mirror-website.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash -eu
# A script to mirror a website using wget
# Print the command-line help text to stdout.
#
# The script name shown in the first line is taken from $0 with any leading
# directory part removed.
usage() {
  cat << EOF
Usage: ${0##*/} [-w website] [-u user_agent] [-b]
Where:
-w The website to mirror. The website will be like "https://example.com/"
-u The User agent to use. If no User agent is specified,
a Chrome (Linux desktop) default one is used.
-b Backup the website. If it's not added, it will not back up.
-h This help
EOF
}
# Print the error and the usage to stderr, then exit with status 1
#
# Both go to stderr so that diagnostics never mix with regular output.
#
# $1: The error to print
print_error_exit() {
  local error_print="${1:-}"
  # printf instead of echo: a message such as "-n" is printed literally
  printf '%s\n' "${error_print}" >&2
  usage >&2
  exit 1
}
# Backup a directory
#
# The format of the backup file will be
# "Directory name to back up"_Backup-"The size of the directory"-"Current date in format %Y%m%d-%H%M%S".tar.xz
# For example, if the directory is "example.com" the backup file will look like the following
# example.com_Backup-12K-20210116-173838.tar.xz
#
# The archive is written next to the directory (its name is the directory
# path with the suffix above appended).
#
# $1: The directory to backup
# Returns: 0 on success; non-zero if $1 is not a directory or if tar/xz fail
backup_dir() {
  local directory_to_backup="${1:-}"
  local current_date dir_size filename_of_backup_dir

  # Refuse early: otherwise tar would leave a useless archive behind
  if [[ ! -d "${directory_to_backup}" ]]; then
    printf 'Cannot back up "%s": not a directory\n' "${directory_to_backup}" >&2
    return 1
  fi

  # The time is in UTC/GMT
  current_date=$(date -u +%Y%m%d-%H%M%S)
  # The size of the directory before compressed
  dir_size=$(du -sh -- "${directory_to_backup}" | cut -f 1)
  filename_of_backup_dir="${directory_to_backup}_Backup-${dir_size}-${current_date}.tar"

  # Create back up tar file of the dir; do not compress a broken archive
  tar -cvf "${filename_of_backup_dir}" -- "${directory_to_backup}" || return
  # Compress tar file (xz replaces it with "<name>.tar.xz")
  xz -- "${filename_of_backup_dir}"
}
# Check if wget is present in the system.
# The lookup is guarded with "|| true": the shebang enables -e, and without
# the guard a missing wget would abort the script at the assignment itself,
# before the error message below could be printed.
IS_WGET_INSTALLED="$(command -v wget || true)"
if [[ -z "${IS_WGET_INSTALLED}" ]]; then
  print_error_exit "wget command is missing!!!"
fi
# Parse the command line options.
# Every option variable starts out empty: the shebang enables -u, so a later
# expansion of a variable whose option was not given would otherwise abort
# the script with an "unbound variable" error.
website=""
user_agent=""
backup_dir=""
while getopts "w:u:bh" opt; do
  case "${opt}" in
    w)
      website="${OPTARG}"
      ;;
    u)
      user_agent="${OPTARG}"
      ;;
    b)
      # Any non-empty value enables the backup step
      backup_dir=1
      ;;
    h)
      usage
      exit 0
      ;;
    *)
      # Unknown option or missing argument (getopts already reported it)
      usage >&2
      exit 1
      ;;
  esac
done
# Check if any options are used.
# OPTIND is still 1 when getopts did not consume a single argument.
if [[ "${OPTIND}" -eq 1 ]]; then
  print_error_exit "No options specified!"
fi
# The website is a mandatory argument.
# "${website:-}" keeps this check working under the shebang's -u when -w was
# not given (e.g. only -b was passed), so the message below is shown instead
# of an "unbound variable" abort.
if [[ -z "${website:-}" ]]; then
  print_error_exit "The [-w website] is required."
fi
# The user_agent is a *NOT* mandatory argument: when -u was not given (or was
# empty) fall back to a Chrome-on-Linux identification string.
default_user_agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
user_agent="${user_agent:-${default_user_agent}}"

# Derive the host part of the website: drop an optional "scheme://" (or a bare
# "//") prefix, then everything from the first "/" on. This also covers a
# website given without a scheme, e.g. "example.com/".
url_prefix_re='^([A-Za-z][A-Za-z0-9+.-]*:)?//'
website_host="${website:-}"
if [[ "${website_host}" =~ ${url_prefix_re} ]]; then
  website_host="${website_host#*//}"
fi
website_host="${website_host%%/*}"
# An empty host would make wget download into "/" and log to ".log"
if [[ -z "${website_host}" ]]; then
  print_error_exit "Cannot determine the host name from \"${website:-}\"."
fi

# The main wget command.
# Output goes below "${website_host}/"; the run log and the rejected-URL log
# are written to the current directory.
# NOTE(review): the shebang enables -e, so a non-zero wget exit status stops
# the script here, before the backup step - confirm that is intended.
wget \
  --debug \
  --mirror \
  --timestamping \
  --convert-links \
  --backup-converted \
  --adjust-extension \
  --page-requisites \
  --wait 2 \
  --random-wait \
  --continue \
  --limit-rate=2k \
  --no-if-modified-since \
  --append-output="${website_host}.log" \
  --rejected-log="${website_host}-rejected.log" \
  --user-agent="${user_agent}" \
  --directory-prefix="${website_host}/" \
  "${website}"
# Backup the directory of mirrored website if backup is enabled.
# "${backup_dir:-}" is used because the variable is only assigned when -b is
# passed; with the shebang's -u a plain expansion would end the script with
# an "unbound variable" error whenever no backup was requested.
if [[ -n "${backup_dir:-}" ]]; then
  backup_dir "${website_host}"
fi