xurls.go (7810B)
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information

// Package xurls extracts urls from plain text using regular expressions.
package xurls

import (
	"regexp"
	"strings"
	"sync"
	"unicode/utf8"
)

//go:generate go run ./generate/tldsgen
//go:generate go run ./generate/schemesgen
//go:generate go run ./generate/unicodegen

// The constants below are fragments of regular expression source.
// Those ending in "Char" are the contents of a character class and are only
// meaningful once wrapped in square brackets; the others are complete
// sub-expressions.
const (
	// pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2
	// but does not match separators anywhere or most punctuation in final position,
	// to avoid creating asymmetries like
	// `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?`
	// from `Did you know that **https://example.com/** is reserved for documentation?`.
	//
	// The "mid" classes are accepted anywhere in a path, while the "end"
	// classes are what a path may finish with: every repetition in pathCont
	// must end in either an "end" character or a balanced bracket group.
	//
	// wellParen, wellBrack, and wellBrace each match a balanced pair of
	// brackets whose contents may hold one further nested level of that same
	// kind of bracket.
	//
	// allowedUcsChar and allowedUcsCharMinusPunc are not declared in this file.
	// NOTE(review): presumably produced by the unicodegen generator above — confirm.
	unreservedChar      = `a-zA-Z0-9\-._~`
	endUnreservedChar   = `a-zA-Z0-9\-_~`
	midSubDelimChar     = `!$&'*+,;=`
	endSubDelimChar     = `$&+=`
	midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
	endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc
	iPrivateChar        = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
	midIChar            = `/?#\\` + midIPathSegmentChar + iPrivateChar
	endIChar            = `/#` + endIPathSegmentChar + iPrivateChar
	wellParen           = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
	wellBrack           = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
	wellBrace           = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
	wellAll             = wellParen + `|` + wellBrack + `|` + wellBrace
	pathCont            = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`

	// iri matches a single domain label: it starts and ends with a Unicode
	// letter, mark, or number, and may contain hyphens in between.
	// subdomain matches one or more such labels, each followed by a dot.
	// octet matches a decimal number from 0 to 255 without leading zeros,
	// and ipv4Addr matches four of them separated by dots.
	letter    = `\p{L}`
	mark      = `\p{M}`
	number    = `\p{N}`
	iriChar   = letter + mark + number
	iri       = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?`
	subdomain = `(?:` + iri + `\.)+`
	octet     = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
	ipv4Addr  = octet + `\.` + octet + `\.` + octet + `\.` + octet

	// ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
	// with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps"
	// that have not been replaced with a `::` elision.
	h4                 = `[0-9a-fA-F]{1,4}`
	ipv6AddrMinusEmpty = `(?:` +
		// 7 colon-terminated chomps, followed by a final chomp or the rest of an elision.
		`(?:` + h4 + `:){7}(?:` + h4 + `|:)|` +
		// 6 chomps, followed by an IPv4 address or elision with final chomp or final elision.
		`(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` +
		// 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps.
		`(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` +
		// 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or
		// up to 3 final chomps.
		`(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` +
		// 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or
		// up to 4 final chomps.
		`(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` +
		// 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or
		// up to 5 final chomps.
		`(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` +
		// 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or
		// up to 6 final chomps.
		`(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` +
		// elision, followed by optional IPv4 (preceded by up to 5 chomps) or
		// up to 7 final chomps.
		// `:` is an intentionally omitted alternative, to avoid matching `::`.
		`:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` +
		`)`
	// ipv6Addr additionally accepts the all-elided address `::`.
	ipv6Addr = `(?:` + ipv6AddrMinusEmpty + `|::)`
	// ipAddrMinusEmpty matches an IPv6 address other than `::`,
	// or an IPv4 address delimited by word boundaries.
	ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)`
	// port matches an optional colon followed by zero or more decimal digits.
	port = `(?::[0-9]*)?`
)

// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
// scheme, and not just the known ones.
//
// It accepts either an ASCII letter followed by any number of ASCII letters,
// dots, hyphens, or plus signs and then "://", or one of the
// SchemesNoAuthority entries followed by ":".
var AnyScheme = `(?:[a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`

// SchemesNoAuthority is a sorted list of some well-known url schemes that are
// followed by ":" instead of "://". The list includes both officially
// registered and unofficial schemes.
var SchemesNoAuthority = []string{
	`bitcoin`, // Bitcoin
	`cid`,     // Content-ID
	`file`,    // Files
	`magnet`,  // Torrent magnets
	`mailto`,  // Mail
	`mid`,     // Message-ID
	`sms`,     // SMS
	`tel`,     // Telephone
	`xmpp`,    // XMPP
}

// SchemesUnofficial is a sorted list of some well-known url schemes which
// aren't officially registered just yet. They tend to correspond to software.
//
// Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes.
var SchemesUnofficial = []string{
	`gemini`,        // gemini
	`jdbc`,          // Java database Connectivity
	`moz-extension`, // Firefox extension
	`postgres`,      // PostgreSQL (short form)
	`postgresql`,    // PostgreSQL
	`slack`,         // Slack
	`zoommtg`,       // Zoom (desktop)
	`zoomus`,        // Zoom (mobile)
}

// The regular expressions are compiled when the API is first called.
// Any subsequent calls will use the same regular expression pointers.
//
// We do not need to make a copy of them for each API call,
// as Copy is now only useful if one copy calls Longest but not another,
// and we always call Longest after compiling the regular expression.
121 var ( 122 strictRe *regexp.Regexp 123 strictInit sync.Once 124 125 relaxedRe *regexp.Regexp 126 relaxedInit sync.Once 127 ) 128 129 func anyOf(strs ...string) string { 130 var b strings.Builder 131 b.WriteString("(?:") 132 for i, s := range strs { 133 if i != 0 { 134 b.WriteByte('|') 135 } 136 b.WriteString(regexp.QuoteMeta(s)) 137 } 138 b.WriteByte(')') 139 return b.String() 140 } 141 142 func strictExp() string { 143 schemes := `(?:(?i)(?:` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://|` + anyOf(SchemesNoAuthority...) + `:)` 144 return schemes + pathCont 145 } 146 147 func relaxedExp() string { 148 var asciiTLDs, unicodeTLDs []string 149 for i, tld := range TLDs { 150 if tld[0] >= utf8.RuneSelf { 151 asciiTLDs = TLDs[:i:i] 152 unicodeTLDs = TLDs[i:] 153 break 154 } 155 } 156 punycode := `xn--[a-z0-9-]+` 157 158 // Use \b to make sure ASCII TLDs are immediately followed by a word break. 159 // We can't do that with unicode TLDs, as they don't see following 160 // whitespace as a word break. 161 tlds := `(?:(?i)` + punycode + `|` + anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b|` + anyOf(unicodeTLDs...) + `)` 162 domain := subdomain + tlds 163 164 hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)` 165 webURL := hostName + port + `(?:/` + pathCont + `|/)?` 166 email := `[a-zA-Z0-9._%\-+]+@` + domain 167 return strictExp() + `|` + webURL + `|` + email + `|` + ipv6AddrMinusEmpty 168 } 169 170 // Strict produces a regexp that matches any URL with a scheme in either the 171 // Schemes or SchemesNoAuthority lists. 172 func Strict() *regexp.Regexp { 173 strictInit.Do(func() { 174 strictRe = regexp.MustCompile(strictExp()) 175 strictRe.Longest() 176 }) 177 return strictRe 178 } 179 180 // Relaxed produces a regexp that matches any URL matched by Strict, plus any 181 // URL with no scheme or email address. 
182 func Relaxed() *regexp.Regexp { 183 relaxedInit.Do(func() { 184 relaxedRe = regexp.MustCompile(relaxedExp()) 185 relaxedRe.Longest() 186 }) 187 return relaxedRe 188 } 189 190 // StrictMatchingScheme produces a regexp similar to Strict, but requiring that 191 // the scheme match the given regular expression. See AnyScheme too. 192 func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { 193 strictMatching := `(?i)(?:` + exp + `)(?-i)` + pathCont 194 re, err := regexp.Compile(strictMatching) 195 if err != nil { 196 return nil, err 197 } 198 re.Longest() 199 return re, nil 200 }