gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

util.go (15156B)


      1 package parse
      2 
      3 import (
      4 	"bytes"
      5 	"fmt"
      6 	"strconv"
      7 	"unicode"
      8 )
      9 
     10 // Copy returns a copy of the given byte slice.
     11 func Copy(src []byte) (dst []byte) {
     12 	dst = make([]byte, len(src))
     13 	copy(dst, src)
     14 	return
     15 }
     16 
     17 // ToLower converts all characters in the byte slice from A-Z to a-z.
     18 func ToLower(src []byte) []byte {
     19 	for i, c := range src {
     20 		if c >= 'A' && c <= 'Z' {
     21 			src[i] = c + ('a' - 'A')
     22 		}
     23 	}
     24 	return src
     25 }
     26 
     27 // EqualFold returns true when s matches case-insensitively the targetLower (which must be lowercase).
     28 func EqualFold(s, targetLower []byte) bool {
     29 	if len(s) != len(targetLower) {
     30 		return false
     31 	}
     32 	for i, c := range targetLower {
     33 		d := s[i]
     34 		if d != c && (d < 'A' || d > 'Z' || d+('a'-'A') != c) {
     35 			return false
     36 		}
     37 	}
     38 	return true
     39 }
     40 
     41 // Printable returns a printable string for given rune
     42 func Printable(r rune) string {
     43 	if unicode.IsGraphic(r) {
     44 		return fmt.Sprintf("%c", r)
     45 	} else if r < 128 {
     46 		return fmt.Sprintf("0x%02X", r)
     47 	}
     48 	return fmt.Sprintf("%U", r)
     49 }
     50 
     51 var whitespaceTable = [256]bool{
     52 	// ASCII
     53 	false, false, false, false, false, false, false, false,
     54 	false, true, true, false, true, true, false, false, // tab, new line, form feed, carriage return
     55 	false, false, false, false, false, false, false, false,
     56 	false, false, false, false, false, false, false, false,
     57 
     58 	true, false, false, false, false, false, false, false, // space
     59 	false, false, false, false, false, false, false, false,
     60 	false, false, false, false, false, false, false, false,
     61 	false, false, false, false, false, false, false, false,
     62 
     63 	false, false, false, false, false, false, false, false,
     64 	false, false, false, false, false, false, false, false,
     65 	false, false, false, false, false, false, false, false,
     66 	false, false, false, false, false, false, false, false,
     67 
     68 	false, false, false, false, false, false, false, false,
     69 	false, false, false, false, false, false, false, false,
     70 	false, false, false, false, false, false, false, false,
     71 	false, false, false, false, false, false, false, false,
     72 
     73 	// non-ASCII
     74 	false, false, false, false, false, false, false, false,
     75 	false, false, false, false, false, false, false, false,
     76 	false, false, false, false, false, false, false, false,
     77 	false, false, false, false, false, false, false, false,
     78 
     79 	false, false, false, false, false, false, false, false,
     80 	false, false, false, false, false, false, false, false,
     81 	false, false, false, false, false, false, false, false,
     82 	false, false, false, false, false, false, false, false,
     83 
     84 	false, false, false, false, false, false, false, false,
     85 	false, false, false, false, false, false, false, false,
     86 	false, false, false, false, false, false, false, false,
     87 	false, false, false, false, false, false, false, false,
     88 
     89 	false, false, false, false, false, false, false, false,
     90 	false, false, false, false, false, false, false, false,
     91 	false, false, false, false, false, false, false, false,
     92 	false, false, false, false, false, false, false, false,
     93 }
     94 
     95 // IsWhitespace returns true for space, \n, \r, \t, \f.
     96 func IsWhitespace(c byte) bool {
     97 	return whitespaceTable[c]
     98 }
     99 
    100 var newlineTable = [256]bool{
    101 	// ASCII
    102 	false, false, false, false, false, false, false, false,
    103 	false, false, true, false, false, true, false, false, // new line, carriage return
    104 	false, false, false, false, false, false, false, false,
    105 	false, false, false, false, false, false, false, false,
    106 
    107 	false, false, false, false, false, false, false, false,
    108 	false, false, false, false, false, false, false, false,
    109 	false, false, false, false, false, false, false, false,
    110 	false, false, false, false, false, false, false, false,
    111 
    112 	false, false, false, false, false, false, false, false,
    113 	false, false, false, false, false, false, false, false,
    114 	false, false, false, false, false, false, false, false,
    115 	false, false, false, false, false, false, false, false,
    116 
    117 	false, false, false, false, false, false, false, false,
    118 	false, false, false, false, false, false, false, false,
    119 	false, false, false, false, false, false, false, false,
    120 	false, false, false, false, false, false, false, false,
    121 
    122 	// non-ASCII
    123 	false, false, false, false, false, false, false, false,
    124 	false, false, false, false, false, false, false, false,
    125 	false, false, false, false, false, false, false, false,
    126 	false, false, false, false, false, false, false, false,
    127 
    128 	false, false, false, false, false, false, false, false,
    129 	false, false, false, false, false, false, false, false,
    130 	false, false, false, false, false, false, false, false,
    131 	false, false, false, false, false, false, false, false,
    132 
    133 	false, false, false, false, false, false, false, false,
    134 	false, false, false, false, false, false, false, false,
    135 	false, false, false, false, false, false, false, false,
    136 	false, false, false, false, false, false, false, false,
    137 
    138 	false, false, false, false, false, false, false, false,
    139 	false, false, false, false, false, false, false, false,
    140 	false, false, false, false, false, false, false, false,
    141 	false, false, false, false, false, false, false, false,
    142 }
    143 
    144 // IsNewline returns true for \n, \r.
    145 func IsNewline(c byte) bool {
    146 	return newlineTable[c]
    147 }
    148 
    149 // IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
    150 func IsAllWhitespace(b []byte) bool {
    151 	for _, c := range b {
    152 		if !IsWhitespace(c) {
    153 			return false
    154 		}
    155 	}
    156 	return true
    157 }
    158 
    159 // TrimWhitespace removes any leading and trailing whitespace characters.
    160 func TrimWhitespace(b []byte) []byte {
    161 	n := len(b)
    162 	start := n
    163 	for i := 0; i < n; i++ {
    164 		if !IsWhitespace(b[i]) {
    165 			start = i
    166 			break
    167 		}
    168 	}
    169 	end := n
    170 	for i := n - 1; i >= start; i-- {
    171 		if !IsWhitespace(b[i]) {
    172 			end = i + 1
    173 			break
    174 		}
    175 	}
    176 	return b[start:end]
    177 }
    178 
    179 // ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r).
    180 func ReplaceMultipleWhitespace(b []byte) []byte {
    181 	j, k := 0, 0 // j is write position, k is start of next text section
    182 	for i := 0; i < len(b); i++ {
    183 		if IsWhitespace(b[i]) {
    184 			start := i
    185 			newline := IsNewline(b[i])
    186 			i++
    187 			for ; i < len(b) && IsWhitespace(b[i]); i++ {
    188 				if IsNewline(b[i]) {
    189 					newline = true
    190 				}
    191 			}
    192 			if newline {
    193 				b[start] = '\n'
    194 			} else {
    195 				b[start] = ' '
    196 			}
    197 			if 1 < i-start { // more than one whitespace
    198 				if j == 0 {
    199 					j = start + 1
    200 				} else {
    201 					j += copy(b[j:], b[k:start+1])
    202 				}
    203 				k = i
    204 			}
    205 		}
    206 	}
    207 	if j == 0 {
    208 		return b
    209 	} else if j == 1 { // only if starts with whitespace
    210 		b[k-1] = b[0]
    211 		return b[k-1:]
    212 	} else if k < len(b) {
    213 		j += copy(b[j:], b[k:])
    214 	}
    215 	return b[:j]
    216 }
    217 
    218 // replaceEntities will replace in b at index i, assuming that b[i] == '&' and that i+3<len(b). The returned int will be the last character of the entity, so that the next iteration can safely do i++ to continue and not miss any entitites.
    219 func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
    220 	const MaxEntityLength = 31 // longest HTML entity: CounterClockwiseContourIntegral
    221 	var r []byte
    222 	j := i + 1
    223 	if b[j] == '#' {
    224 		j++
    225 		if b[j] == 'x' {
    226 			j++
    227 			c := 0
    228 			for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
    229 				if b[j] <= '9' {
    230 					c = c<<4 + int(b[j]-'0')
    231 				} else if b[j] <= 'F' {
    232 					c = c<<4 + int(b[j]-'A') + 10
    233 				} else if b[j] <= 'f' {
    234 					c = c<<4 + int(b[j]-'a') + 10
    235 				}
    236 			}
    237 			if j <= i+3 || 10000 <= c {
    238 				return b, j - 1
    239 			}
    240 			if c < 128 {
    241 				r = []byte{byte(c)}
    242 			} else {
    243 				r = append(r, '&', '#')
    244 				r = strconv.AppendInt(r, int64(c), 10)
    245 				r = append(r, ';')
    246 			}
    247 		} else {
    248 			c := 0
    249 			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
    250 				c = c*10 + int(b[j]-'0')
    251 			}
    252 			if j <= i+2 || 128 <= c {
    253 				return b, j - 1
    254 			}
    255 			r = []byte{byte(c)}
    256 		}
    257 	} else {
    258 		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
    259 		}
    260 		if j <= i+1 || len(b) <= j {
    261 			return b, j - 1
    262 		}
    263 
    264 		var ok bool
    265 		r, ok = entitiesMap[string(b[i+1:j])]
    266 		if !ok {
    267 			return b, j
    268 		}
    269 	}
    270 
    271 	// j is at semicolon
    272 	n := j + 1 - i
    273 	if j < len(b) && b[j] == ';' && 2 < n {
    274 		if len(r) == 1 {
    275 			if q, ok := revEntitiesMap[r[0]]; ok {
    276 				if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) {
    277 					return b, j
    278 				}
    279 				r = q
    280 			} else if r[0] == '&' {
    281 				// check if for example &amp; is followed by something that could potentially be an entity
    282 				k := j + 1
    283 				if k < len(b) && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z' || b[k] == '#') {
    284 					return b, k
    285 				}
    286 			}
    287 		}
    288 
    289 		copy(b[i:], r)
    290 		copy(b[i+len(r):], b[j+1:])
    291 		b = b[:len(b)-n+len(r)]
    292 		return b, i + len(r) - 1
    293 	}
    294 	return b, i
    295 }
    296 
    297 // ReplaceEntities replaces all occurrences of entites (such as &quot;) to their respective unencoded bytes.
    298 func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
    299 	for i := 0; i < len(b); i++ {
    300 		if b[i] == '&' && i+3 < len(b) {
    301 			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
    302 		}
    303 	}
    304 	return b
    305 }
    306 
    307 // ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
    308 func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
    309 	j, k := 0, 0 // j is write position, k is start of next text section
    310 	for i := 0; i < len(b); i++ {
    311 		if IsWhitespace(b[i]) {
    312 			start := i
    313 			newline := IsNewline(b[i])
    314 			i++
    315 			for ; i < len(b) && IsWhitespace(b[i]); i++ {
    316 				if IsNewline(b[i]) {
    317 					newline = true
    318 				}
    319 			}
    320 			if newline {
    321 				b[start] = '\n'
    322 			} else {
    323 				b[start] = ' '
    324 			}
    325 			if 1 < i-start { // more than one whitespace
    326 				if j == 0 {
    327 					j = start + 1
    328 				} else {
    329 					j += copy(b[j:], b[k:start+1])
    330 				}
    331 				k = i
    332 			}
    333 		}
    334 		if i+3 < len(b) && b[i] == '&' {
    335 			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
    336 		}
    337 	}
    338 	if j == 0 {
    339 		return b
    340 	} else if j == 1 { // only if starts with whitespace
    341 		b[k-1] = b[0]
    342 		return b[k-1:]
    343 	} else if k < len(b) {
    344 		j += copy(b[j:], b[k:])
    345 	}
    346 	return b[:j]
    347 }
    348 
    349 // URLEncodingTable is a charmap for which characters need escaping in the URL encoding scheme
    350 var URLEncodingTable = [256]bool{
    351 	// ASCII
    352 	true, true, true, true, true, true, true, true,
    353 	true, true, true, true, true, true, true, true,
    354 	true, true, true, true, true, true, true, true,
    355 	true, true, true, true, true, true, true, true,
    356 
    357 	true, false, true, true, true, true, true, false, // space, ", #, $, %, &
    358 	false, false, false, true, true, false, false, true, // +, comma, /
    359 	false, false, false, false, false, false, false, false,
    360 	false, false, true, true, true, true, true, true, // :, ;, <, =, >, ?
    361 
    362 	true, false, false, false, false, false, false, false, // @
    363 	false, false, false, false, false, false, false, false,
    364 	false, false, false, false, false, false, false, false,
    365 	false, false, false, true, true, true, true, false, // [, \, ], ^
    366 
    367 	true, false, false, false, false, false, false, false, // `
    368 	false, false, false, false, false, false, false, false,
    369 	false, false, false, false, false, false, false, false,
    370 	false, false, false, true, true, true, false, true, // {, |, }, DEL
    371 
    372 	// non-ASCII
    373 	true, true, true, true, true, true, true, true,
    374 	true, true, true, true, true, true, true, true,
    375 	true, true, true, true, true, true, true, true,
    376 	true, true, true, true, true, true, true, true,
    377 
    378 	true, true, true, true, true, true, true, true,
    379 	true, true, true, true, true, true, true, true,
    380 	true, true, true, true, true, true, true, true,
    381 	true, true, true, true, true, true, true, true,
    382 
    383 	true, true, true, true, true, true, true, true,
    384 	true, true, true, true, true, true, true, true,
    385 	true, true, true, true, true, true, true, true,
    386 	true, true, true, true, true, true, true, true,
    387 
    388 	true, true, true, true, true, true, true, true,
    389 	true, true, true, true, true, true, true, true,
    390 	true, true, true, true, true, true, true, true,
    391 	true, true, true, true, true, true, true, true,
    392 }
    393 
    394 // DataURIEncodingTable is a charmap for which characters need escaping in the Data URI encoding scheme
    395 // Escape only non-printable characters, unicode and %, #, &.
    396 // IE11 additionally requires encoding of \, [, ], ", <, >, `, {, }, |, ^ which is not required by Chrome, Firefox, Opera, Edge, Safari, Yandex
    397 // To pass the HTML validator, restricted URL characters must be escaped: non-printable characters, space, <, >, #, %, "
    398 var DataURIEncodingTable = [256]bool{
    399 	// ASCII
    400 	true, true, true, true, true, true, true, true,
    401 	true, true, true, true, true, true, true, true,
    402 	true, true, true, true, true, true, true, true,
    403 	true, true, true, true, true, true, true, true,
    404 
    405 	true, false, true, true, false, true, true, false, // space, ", #, %, &
    406 	false, false, false, false, false, false, false, false,
    407 	false, false, false, false, false, false, false, false,
    408 	false, false, false, false, true, false, true, false, // <, >
    409 
    410 	false, false, false, false, false, false, false, false,
    411 	false, false, false, false, false, false, false, false,
    412 	false, false, false, false, false, false, false, false,
    413 	false, false, false, true, true, true, true, false, // [, \, ], ^
    414 
    415 	true, false, false, false, false, false, false, false, // `
    416 	false, false, false, false, false, false, false, false,
    417 	false, false, false, false, false, false, false, false,
    418 	false, false, false, true, true, true, false, true, // {, |, }, DEL
    419 
    420 	// non-ASCII
    421 	true, true, true, true, true, true, true, true,
    422 	true, true, true, true, true, true, true, true,
    423 	true, true, true, true, true, true, true, true,
    424 	true, true, true, true, true, true, true, true,
    425 
    426 	true, true, true, true, true, true, true, true,
    427 	true, true, true, true, true, true, true, true,
    428 	true, true, true, true, true, true, true, true,
    429 	true, true, true, true, true, true, true, true,
    430 
    431 	true, true, true, true, true, true, true, true,
    432 	true, true, true, true, true, true, true, true,
    433 	true, true, true, true, true, true, true, true,
    434 	true, true, true, true, true, true, true, true,
    435 
    436 	true, true, true, true, true, true, true, true,
    437 	true, true, true, true, true, true, true, true,
    438 	true, true, true, true, true, true, true, true,
    439 	true, true, true, true, true, true, true, true,
    440 }
    441 
    442 // EncodeURL encodes bytes using the URL encoding scheme
    443 func EncodeURL(b []byte, table [256]bool) []byte {
    444 	for i := 0; i < len(b); i++ {
    445 		c := b[i]
    446 		if table[c] {
    447 			b = append(b, 0, 0)
    448 			copy(b[i+3:], b[i+1:])
    449 			b[i+0] = '%'
    450 			b[i+1] = "0123456789ABCDEF"[c>>4]
    451 			b[i+2] = "0123456789ABCDEF"[c&15]
    452 		}
    453 	}
    454 	return b
    455 }
    456 
    457 // DecodeURL decodes an URL encoded using the URL encoding scheme
    458 func DecodeURL(b []byte) []byte {
    459 	for i := 0; i < len(b); i++ {
    460 		if b[i] == '%' && i+2 < len(b) {
    461 			j := i + 1
    462 			c := 0
    463 			for ; j < i+3 && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
    464 				if b[j] <= '9' {
    465 					c = c<<4 + int(b[j]-'0')
    466 				} else if b[j] <= 'F' {
    467 					c = c<<4 + int(b[j]-'A') + 10
    468 				} else if b[j] <= 'f' {
    469 					c = c<<4 + int(b[j]-'a') + 10
    470 				}
    471 			}
    472 			if j == i+3 && c < 128 {
    473 				b[i] = byte(c)
    474 				b = append(b[:i+1], b[i+3:]...)
    475 			}
    476 		} else if b[i] == '+' {
    477 			b[i] = ' '
    478 		}
    479 	}
    480 	return b
    481 }