gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

xurls.go (7810B)


      1 // Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
      2 // See LICENSE for licensing information
      3 
      4 // Package xurls extracts urls from plain text using regular expressions.
      5 package xurls
      6 
      7 import (
      8 	"regexp"
      9 	"strings"
     10 	"sync"
     11 	"unicode/utf8"
     12 )
     13 
     14 //go:generate go run ./generate/tldsgen
     15 //go:generate go run ./generate/schemesgen
     16 //go:generate go run ./generate/unicodegen
     17 
const (
	// pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2
	// but does not match separators anywhere or most punctuation in final position,
	// to avoid creating asymmetries like
	// `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?`
	// from `Did you know that **https://example.com/** is reserved for documentation?`.
	//
	// The fragments below are the contents of regular expression character
	// classes (they are always used inside `[...]`). The "mid" variants may
	// appear anywhere in a path; the "end" variants are the narrower sets
	// accepted as the final character of a match.
	// allowedUcsChar and allowedUcsCharMinusPunc are declared elsewhere in the
	// package (presumably generated by unicodegen; confirm against the generator).
	unreservedChar      = `a-zA-Z0-9\-._~`
	endUnreservedChar   = `a-zA-Z0-9\-_~`
	midSubDelimChar     = `!$&'*+,;=`
	endSubDelimChar     = `$&+=`
	midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
	endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc
	iPrivateChar        = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
	midIChar            = `/?#\\` + midIPathSegmentChar + iPrivateChar
	endIChar            = `/#` + endIPathSegmentChar + iPrivateChar
	// wellParen, wellBrack and wellBrace each match a balanced pair of
	// (), [] or {} whose contents are mid characters and, at most one level
	// deeper, another balanced pair of the same kind.
	wellParen           = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
	wellBrack           = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
	wellBrace           = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
	wellAll             = wellParen + `|` + wellBrack + `|` + wellBrace
	// pathCont is one or more runs of mid characters, each run terminated by
	// either a balanced bracket group or a single end character.
	pathCont            = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`

	// iri matches a single host name label: it starts and ends with a Unicode
	// letter, mark or number, and may contain hyphens in between.
	// subdomain is one or more such labels, each followed by a dot.
	letter    = `\p{L}`
	mark      = `\p{M}`
	number    = `\p{N}`
	iriChar   = letter + mark + number
	iri       = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?`
	subdomain = `(?:` + iri + `\.)+`
	// octet matches a decimal number from 0 to 255 without leading zeros;
	// ipv4Addr is four octets separated by dots.
	octet     = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
	ipv4Addr  = octet + `\.` + octet + `\.` + octet + `\.` + octet

	// ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
	// with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps"
	// that have not been replaced with a `::` elision.
	//
	// h4 is a single chomp: one to four hexadecimal digits.
	h4                 = `[0-9a-fA-F]{1,4}`
	ipv6AddrMinusEmpty = `(?:` +
		// 7 colon-terminated chomps, followed by a final chomp or the rest of an elision.
		`(?:` + h4 + `:){7}(?:` + h4 + `|:)|` +
		// 6 chomps, followed by an IPv4 address or elision with final chomp or final elision.
		`(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` +
		// 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps.
		`(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` +
		// 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or
		// up to 3 final chomps.
		`(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` +
		// 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or
		// up to 4 final chomps.
		`(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` +
		// 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or
		// up to 5 final chomps.
		`(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` +
		// 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or
		// up to 6 final chomps.
		`(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` +
		// elision, followed by optional IPv4 (preceded by up to 5 chomps) or
		// up to 7 final chomps.
		// `:` is an intentionally omitted alternative, to avoid matching `::`.
		`:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` +
		`)`
	// ipv6Addr additionally accepts the all-elided address `::`.
	ipv6Addr         = `(?:` + ipv6AddrMinusEmpty + `|::)`
	// ipAddrMinusEmpty is any IPv6 address other than `::`, or an IPv4 address
	// delimited by word boundaries.
	ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)`
	// port is an optional colon followed by zero or more decimal digits.
	port             = `(?::[0-9]*)?`
)
     80 
     81 // AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
     82 // scheme, and not just the known ones.
     83 var AnyScheme = `(?:[a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
     84 
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
// followed by ":" instead of "://". The list includes both officially
// registered and unofficial schemes.
//
// Each entry is a literal scheme name; the entries are quoted and joined into
// a regular expression alternation wherever this list is used.
var SchemesNoAuthority = []string{
	`bitcoin`, // Bitcoin
	`cid`,     // Content-ID
	`file`,    // Files
	`magnet`,  // Torrent magnets
	`mailto`,  // Mail
	`mid`,     // Message-ID
	`sms`,     // SMS
	`tel`,     // Telephone
	`xmpp`,    // XMPP
}
     99 
// SchemesUnofficial is a sorted list of some well-known url schemes which
// aren't officially registered just yet. They tend to correspond to software.
//
// Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes.
//
// The strict expression accepts these schemes only when followed by "://".
var SchemesUnofficial = []string{
	`gemini`,        // Gemini
	`jdbc`,          // Java Database Connectivity
	`moz-extension`, // Firefox extension
	`postgres`,      // PostgreSQL (short form)
	`postgresql`,    // PostgreSQL
	`slack`,         // Slack
	`zoommtg`,       // Zoom (desktop)
	`zoomus`,        // Zoom (mobile)
}
    114 
// The regular expressions are compiled when the API is first called.
// Any subsequent calls will use the same regular expression pointers.
//
// We do not need to make a copy of them for each API call,
// as Copy is now only useful if one copy calls Longest but not another,
// and we always call Longest after compiling the regular expression.
var (
	// strictRe is the compiled strict expression; it is assigned exactly once,
	// guarded by strictInit, the first time Strict is called.
	strictRe   *regexp.Regexp
	strictInit sync.Once

	// relaxedRe is the compiled relaxed expression; it is assigned exactly
	// once, guarded by relaxedInit, the first time Relaxed is called.
	relaxedRe   *regexp.Regexp
	relaxedInit sync.Once
)
    128 
    129 func anyOf(strs ...string) string {
    130 	var b strings.Builder
    131 	b.WriteString("(?:")
    132 	for i, s := range strs {
    133 		if i != 0 {
    134 			b.WriteByte('|')
    135 		}
    136 		b.WriteString(regexp.QuoteMeta(s))
    137 	}
    138 	b.WriteByte(')')
    139 	return b.String()
    140 }
    141 
    142 func strictExp() string {
    143 	schemes := `(?:(?i)(?:` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://|` + anyOf(SchemesNoAuthority...) + `:)`
    144 	return schemes + pathCont
    145 }
    146 
    147 func relaxedExp() string {
    148 	var asciiTLDs, unicodeTLDs []string
    149 	for i, tld := range TLDs {
    150 		if tld[0] >= utf8.RuneSelf {
    151 			asciiTLDs = TLDs[:i:i]
    152 			unicodeTLDs = TLDs[i:]
    153 			break
    154 		}
    155 	}
    156 	punycode := `xn--[a-z0-9-]+`
    157 
    158 	// Use \b to make sure ASCII TLDs are immediately followed by a word break.
    159 	// We can't do that with unicode TLDs, as they don't see following
    160 	// whitespace as a word break.
    161 	tlds := `(?:(?i)` + punycode + `|` + anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b|` + anyOf(unicodeTLDs...) + `)`
    162 	domain := subdomain + tlds
    163 
    164 	hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)`
    165 	webURL := hostName + port + `(?:/` + pathCont + `|/)?`
    166 	email := `[a-zA-Z0-9._%\-+]+@` + domain
    167 	return strictExp() + `|` + webURL + `|` + email + `|` + ipv6AddrMinusEmpty
    168 }
    169 
    170 // Strict produces a regexp that matches any URL with a scheme in either the
    171 // Schemes or SchemesNoAuthority lists.
    172 func Strict() *regexp.Regexp {
    173 	strictInit.Do(func() {
    174 		strictRe = regexp.MustCompile(strictExp())
    175 		strictRe.Longest()
    176 	})
    177 	return strictRe
    178 }
    179 
    180 // Relaxed produces a regexp that matches any URL matched by Strict, plus any
    181 // URL with no scheme or email address.
    182 func Relaxed() *regexp.Regexp {
    183 	relaxedInit.Do(func() {
    184 		relaxedRe = regexp.MustCompile(relaxedExp())
    185 		relaxedRe.Longest()
    186 	})
    187 	return relaxedRe
    188 }
    189 
    190 // StrictMatchingScheme produces a regexp similar to Strict, but requiring that
    191 // the scheme match the given regular expression. See AnyScheme too.
    192 func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
    193 	strictMatching := `(?i)(?:` + exp + `)(?-i)` + pathCont
    194 	re, err := regexp.Compile(strictMatching)
    195 	if err != nil {
    196 		return nil, err
    197 	}
    198 	re.Longest()
    199 	return re, nil
    200 }