-
-
Notifications
You must be signed in to change notification settings - Fork 117
/
Copy pathxurls.go
205 lines (186 loc) · 7.83 KB
/
xurls.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
// Copyright (c) 2015, Daniel Martí <[email protected]>
// See LICENSE for licensing information
// Package xurls extracts urls from plain text using regular expressions.
package xurls
import (
"regexp"
"strings"
"sync"
"unicode/utf8"
)
//go:generate go run ./generate/tldsgen
//go:generate go run ./generate/schemesgen
//go:generate go run ./generate/unicodegen
const (
// The constants in this block are raw regular expression fragments that are
// concatenated into the final patterns. Those whose names end in "Char" are
// meant to be placed inside a `[...]` character class.
//
// pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2
// but does not match separators anywhere or most punctuation in final position,
// to avoid creating asymmetries like
// `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?`
// from `Did you know that **https://example.com/** is reserved for documentation?`.
//
// Fragments prefixed with "mid" list characters accepted before the final
// character of a match; fragments prefixed with "end" list characters
// accepted as the final character.

// unreservedChar is ASCII letters, digits, and `-`, `.`, `_`, `~`.
unreservedChar = `a-zA-Z0-9\-._~`
// endUnreservedChar is unreservedChar without `.`.
endUnreservedChar = `a-zA-Z0-9\-_~`
// midSubDelimChar lists the delimiters allowed mid-match. Parentheses are not
// listed here; they are only matched in balanced form via wellParen.
midSubDelimChar = `!$&'*+,;=`
// endSubDelimChar is the part of midSubDelimChar allowed in final position.
endSubDelimChar = `$&+=`
// allowedUcsChar and allowedUcsCharMinusPunc are declared outside this block
// (presumably in generated code; see the go:generate directives).
midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc
// iPrivateChar covers the code point ranges U+E000-U+F8FF, U+F0000-U+FFFFD
// and U+100000-U+10FFFD.
iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
// midIChar additionally admits `/`, `?`, `#` and a literal backslash.
midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar
// endIChar additionally admits only `/` and `#`.
endIChar = `/#` + endIPathSegmentChar + iPrivateChar
// wellParen, wellBrack and wellBrace each match one balanced pair of
// delimiters whose content is made of midIChar characters and, one level
// deep only, nested pairs of the same delimiter.
wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
wellBrack = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
// pathCont matches one or more runs of midIChar characters, each run
// terminated by a balanced group or by a single endIChar character, so a
// match always ends on one of those two.
pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`
// letter, mark and number are the Unicode general categories L, M and N.
letter = `\p{L}`
mark = `\p{M}`
number = `\p{N}`
iriChar = letter + mark + number
// iri matches a single host name label: it starts and ends with an iriChar
// character, and may contain `-` only in between.
iri = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?`
// subdomain matches one or more labels, each followed by a dot.
subdomain = `(?:` + iri + `\.)+`
// octet matches a decimal number from 0 to 255 without leading zeros.
octet = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
// ipv4Addr matches four dot-separated octets.
ipv4Addr = octet + `\.` + octet + `\.` + octet + `\.` + octet
// ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
// with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps"
// that have not been replaced with a `::` elision.
h4 = `[0-9a-fA-F]{1,4}`
ipv6AddrMinusEmpty = `(?:` +
// 7 colon-terminated chomps, followed by a final chomp or the rest of an elision.
`(?:` + h4 + `:){7}(?:` + h4 + `|:)|` +
// 6 chomps, followed by an IPv4 address or elision with final chomp or final elision.
`(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` +
// 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps.
`(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` +
// 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or
// up to 3 final chomps.
`(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` +
// 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or
// up to 4 final chomps.
`(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` +
// 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or
// up to 5 final chomps.
`(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` +
// 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or
// up to 6 final chomps.
`(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` +
// elision, followed by optional IPv4 (preceded by up to 5 chomps) or
// up to 7 final chomps.
// `:` is an intentionally omitted alternative, to avoid matching `::`.
`:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` +
`)`
// ipv6Addr additionally accepts the bare elision `::`.
ipv6Addr = `(?:` + ipv6AddrMinusEmpty + `|::)`
// ipAddrMinusEmpty matches an IPv6 address other than `::`, or an IPv4
// address delimited by word boundaries.
ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)`
// port matches an optional `:` followed by one or more decimal digits.
port = `(?::[0-9]+)?`
)
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
// scheme, and not just the known ones.
//
// A scheme followed by "://" must follow the grammar in
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1: a letter followed by any
// number of letters, digits, ".", "-" or "+". Schemes followed by a bare ":"
// are limited to those listed in SchemesNoAuthority.
var AnyScheme = `(?:[a-zA-Z][a-zA-Z0-9.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
// followed by ":" instead of "://". The list includes both officially
// registered and unofficial schemes.
//
// Both AnyScheme and the pattern behind Strict accept these schemes when
// they are followed by ":".
var SchemesNoAuthority = []string{
`bitcoin`, // Bitcoin
`cid`, // Content-ID
`file`, // Files
`geo`, // Geographic location
`magnet`, // Torrent magnets
`mailto`, // Mail
`matrix`, // Matrix
`mid`, // Message-ID
`sms`, // SMS
`tel`, // Telephone
`xmpp`, // XMPP
}
// SchemesUnofficial is a sorted list of some well-known url schemes which
// aren't officially registered just yet. They tend to correspond to software.
//
// The pattern behind Strict accepts them, followed by "://", alongside the
// entries of Schemes.
//
// Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes.
var SchemesUnofficial = []string{
`gemini`, // Gemini
`jdbc`, // Java Database Connectivity
`moz-extension`, // Firefox extension
`postgres`, // PostgreSQL (short form)
`postgresql`, // PostgreSQL
`slack`, // Slack
`zoommtg`, // Zoom (desktop)
`zoomus`, // Zoom (mobile)
}
// The regular expressions are compiled when the API is first called.
// Any subsequent calls will use the same regular expression pointers.
//
// We do not need to make a copy of them for each API call,
// as (*regexp.Regexp).Copy is now only useful if one copy calls Longest but
// not another, and we always call Longest right after compiling the regular
// expression.
var (
// strictRe is the regexp returned by Strict; strictInit ensures it is
// compiled only once.
strictRe *regexp.Regexp
strictInit sync.Once
// relaxedRe is the regexp returned by Relaxed; relaxedInit ensures it is
// compiled only once.
relaxedRe *regexp.Regexp
relaxedInit sync.Once
)
// anyOf returns the source of a non-capturing group that matches any one of
// strs literally: every element is escaped with regexp.QuoteMeta and the
// results are separated by "|". With no arguments it returns "(?:)".
func anyOf(strs ...string) string {
	quoted := make([]string, len(strs))
	for i, s := range strs {
		quoted[i] = regexp.QuoteMeta(s)
	}
	return "(?:" + strings.Join(quoted, "|") + ")"
}
// strictExp returns the source of the strict pattern: a known scheme followed
// by pathCont. Entries of Schemes and SchemesUnofficial must be followed by
// "://", while entries of SchemesNoAuthority must be followed by ":". The
// (?i) flag is set at the start of the scheme group.
func strictExp() string {
	withAuthority := `(?:` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://`
	withoutAuthority := anyOf(SchemesNoAuthority...) + `:`
	return `(?:(?i)` + withAuthority + `|` + withoutAuthority + `)` + pathCont
}
// relaxedExp returns the source of the relaxed pattern, which is an
// alternation of, in order: the strict pattern, a scheme-less web URL, a
// scheme-less email address (captured as `relaxedEmail`), and a bare IPv6
// address other than `::`.
func relaxedExp() string {
	// Split TLDs at the first entry whose leading byte is not ASCII.
	// The ASCII half is capped at its own length, so that appending
	// PseudoTLDs to it below cannot overwrite the shared backing array.
	var asciiTLDs, unicodeTLDs []string
	for i := range TLDs {
		if TLDs[i][0] < utf8.RuneSelf {
			continue
		}
		asciiTLDs, unicodeTLDs = TLDs[:i:i], TLDs[i:]
		break
	}

	const punycode = `xn--[a-z0-9-]+`

	// Use \b to make sure ASCII TLDs are immediately followed by a word break.
	// We can't do that with unicode TLDs, as they don't see following
	// whitespace as a word break.
	asciiAlt := anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b`
	unicodeAlt := anyOf(unicodeTLDs...)
	tlds := `(?:(?i)` + punycode + `|` + asciiAlt + `|` + unicodeAlt + `)`
	domain := subdomain + tlds

	// A host is a domain name, a bracketed IPv6 address, or an IPv4 address
	// delimited by word boundaries.
	hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)`
	webURL := hostName + port + `(?:/` + pathCont + `|/)?`
	email := `(?P<relaxedEmail>[a-zA-Z0-9._%\-+]+@` + domain + `)`

	return strictExp() + `|` + webURL + `|` + email + `|` + ipv6AddrMinusEmpty
}
// Strict produces a regexp that matches any URL with a scheme in the
// Schemes, SchemesUnofficial or SchemesNoAuthority lists.
//
// The regexp is compiled on the first call, with leftmost-longest matching
// enabled, and the same pointer is returned by every later call.
func Strict() *regexp.Regexp {
	strictInit.Do(func() {
		re := regexp.MustCompile(strictExp())
		re.Longest()
		strictRe = re
	})
	return strictRe
}
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
// URL or email address with no scheme.
//
// Email addresses without a scheme match the `relaxedEmail` subexpression,
// which can be used to filter them as needed.
//
// The regexp is compiled on the first call, with leftmost-longest matching
// enabled, and the same pointer is returned by every later call.
func Relaxed() *regexp.Regexp {
	relaxedInit.Do(func() {
		re := regexp.MustCompile(relaxedExp())
		re.Longest()
		relaxedRe = re
	})
	return relaxedRe
}
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
// the scheme match the given regular expression. See AnyScheme too.
//
// exp is wrapped in a non-capturing group and matched case-insensitively;
// case sensitivity is switched back on for the rest of the URL. A new regexp
// with leftmost-longest matching enabled is compiled on every call. If exp
// makes the combined pattern invalid, the compile error is returned as is.
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
	pattern := `(?i)(?:` + exp + `)(?-i)` + pathCont
	compiled, err := regexp.Compile(pattern)
	if err != nil {
		return nil, err
	}
	compiled.Longest()
	return compiled, nil
}