gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

string.go (10601B)


      1 package encoder
      2 
      3 import (
      4 	"math/bits"
      5 	"reflect"
      6 	"unsafe"
      7 )
      8 
const (
	// lsb has the lowest bit of each of its eight bytes set. Multiplying it
	// by a single byte value replicates that value into every byte of a
	// 64-bit word, which the escaping routines use to compare eight input
	// bytes at once.
	lsb = 0x0101010101010101
	// msb has the highest bit of each of its eight bytes set. It is used to
	// extract the per-byte "hit" flags from the combined comparison mask.
	msb = 0x8080808080808080
)

// hex holds the lowercase hexadecimal digits, indexed by nibble value. It is
// used to spell out the two low digits of `\u00XX` escapes.
var hex = "0123456789abcdef"
     15 
     16 //nolint:govet
     17 func stringToUint64Slice(s string) []uint64 {
     18 	return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{
     19 		Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data,
     20 		Len:  len(s) / 8,
     21 		Cap:  len(s) / 8,
     22 	}))
     23 }
     24 
     25 func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
     26 	if ctx.Option.Flag&HTMLEscapeOption != 0 {
     27 		if ctx.Option.Flag&NormalizeUTF8Option != 0 {
     28 			return appendNormalizedHTMLString(buf, s)
     29 		}
     30 		return appendHTMLString(buf, s)
     31 	}
     32 	if ctx.Option.Flag&NormalizeUTF8Option != 0 {
     33 		return appendNormalizedString(buf, s)
     34 	}
     35 	return appendString(buf, s)
     36 }
     37 
     38 func appendNormalizedHTMLString(buf []byte, s string) []byte {
     39 	valLen := len(s)
     40 	if valLen == 0 {
     41 		return append(buf, `""`...)
     42 	}
     43 	buf = append(buf, '"')
     44 	var (
     45 		i, j int
     46 	)
     47 	if valLen >= 8 {
     48 		chunks := stringToUint64Slice(s)
     49 		for _, n := range chunks {
     50 			// combine masks before checking for the MSB of each byte. We include
     51 			// `n` in the mask to check whether any of the *input* byte MSBs were
     52 			// set (i.e. the byte was outside the ASCII range).
     53 			mask := n | (n - (lsb * 0x20)) |
     54 				((n ^ (lsb * '"')) - lsb) |
     55 				((n ^ (lsb * '\\')) - lsb) |
     56 				((n ^ (lsb * '<')) - lsb) |
     57 				((n ^ (lsb * '>')) - lsb) |
     58 				((n ^ (lsb * '&')) - lsb)
     59 			if (mask & msb) != 0 {
     60 				j = bits.TrailingZeros64(mask&msb) / 8
     61 				goto ESCAPE_END
     62 			}
     63 		}
     64 		for i := len(chunks) * 8; i < valLen; i++ {
     65 			if needEscapeHTMLNormalizeUTF8[s[i]] {
     66 				j = i
     67 				goto ESCAPE_END
     68 			}
     69 		}
     70 		// no found any escape characters.
     71 		return append(append(buf, s...), '"')
     72 	}
     73 ESCAPE_END:
     74 	for j < valLen {
     75 		c := s[j]
     76 
     77 		if !needEscapeHTMLNormalizeUTF8[c] {
     78 			// fast path: most of the time, printable ascii characters are used
     79 			j++
     80 			continue
     81 		}
     82 
     83 		switch c {
     84 		case '\\', '"':
     85 			buf = append(buf, s[i:j]...)
     86 			buf = append(buf, '\\', c)
     87 			i = j + 1
     88 			j = j + 1
     89 			continue
     90 
     91 		case '\n':
     92 			buf = append(buf, s[i:j]...)
     93 			buf = append(buf, '\\', 'n')
     94 			i = j + 1
     95 			j = j + 1
     96 			continue
     97 
     98 		case '\r':
     99 			buf = append(buf, s[i:j]...)
    100 			buf = append(buf, '\\', 'r')
    101 			i = j + 1
    102 			j = j + 1
    103 			continue
    104 
    105 		case '\t':
    106 			buf = append(buf, s[i:j]...)
    107 			buf = append(buf, '\\', 't')
    108 			i = j + 1
    109 			j = j + 1
    110 			continue
    111 
    112 		case '<', '>', '&':
    113 			buf = append(buf, s[i:j]...)
    114 			buf = append(buf, `\u00`...)
    115 			buf = append(buf, hex[c>>4], hex[c&0xF])
    116 			i = j + 1
    117 			j = j + 1
    118 			continue
    119 
    120 		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
    121 			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
    122 			buf = append(buf, s[i:j]...)
    123 			buf = append(buf, `\u00`...)
    124 			buf = append(buf, hex[c>>4], hex[c&0xF])
    125 			i = j + 1
    126 			j = j + 1
    127 			continue
    128 		}
    129 		state, size := decodeRuneInString(s[j:])
    130 		switch state {
    131 		case runeErrorState:
    132 			buf = append(buf, s[i:j]...)
    133 			buf = append(buf, `\ufffd`...)
    134 			i = j + 1
    135 			j = j + 1
    136 			continue
    137 			// U+2028 is LINE SEPARATOR.
    138 			// U+2029 is PARAGRAPH SEPARATOR.
    139 			// They are both technically valid characters in JSON strings,
    140 			// but don't work in JSONP, which has to be evaluated as JavaScript,
    141 			// and can lead to security holes there. It is valid JSON to
    142 			// escape them, so we do so unconditionally.
    143 			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
    144 		case lineSepState:
    145 			buf = append(buf, s[i:j]...)
    146 			buf = append(buf, `\u2028`...)
    147 			i = j + 3
    148 			j = j + 3
    149 			continue
    150 		case paragraphSepState:
    151 			buf = append(buf, s[i:j]...)
    152 			buf = append(buf, `\u2029`...)
    153 			i = j + 3
    154 			j = j + 3
    155 			continue
    156 		}
    157 		j += size
    158 	}
    159 
    160 	return append(append(buf, s[i:]...), '"')
    161 }
    162 
    163 func appendHTMLString(buf []byte, s string) []byte {
    164 	valLen := len(s)
    165 	if valLen == 0 {
    166 		return append(buf, `""`...)
    167 	}
    168 	buf = append(buf, '"')
    169 	var (
    170 		i, j int
    171 	)
    172 	if valLen >= 8 {
    173 		chunks := stringToUint64Slice(s)
    174 		for _, n := range chunks {
    175 			// combine masks before checking for the MSB of each byte. We include
    176 			// `n` in the mask to check whether any of the *input* byte MSBs were
    177 			// set (i.e. the byte was outside the ASCII range).
    178 			mask := n | (n - (lsb * 0x20)) |
    179 				((n ^ (lsb * '"')) - lsb) |
    180 				((n ^ (lsb * '\\')) - lsb) |
    181 				((n ^ (lsb * '<')) - lsb) |
    182 				((n ^ (lsb * '>')) - lsb) |
    183 				((n ^ (lsb * '&')) - lsb)
    184 			if (mask & msb) != 0 {
    185 				j = bits.TrailingZeros64(mask&msb) / 8
    186 				goto ESCAPE_END
    187 			}
    188 		}
    189 		for i := len(chunks) * 8; i < valLen; i++ {
    190 			if needEscapeHTML[s[i]] {
    191 				j = i
    192 				goto ESCAPE_END
    193 			}
    194 		}
    195 		// no found any escape characters.
    196 		return append(append(buf, s...), '"')
    197 	}
    198 ESCAPE_END:
    199 	for j < valLen {
    200 		c := s[j]
    201 
    202 		if !needEscapeHTML[c] {
    203 			// fast path: most of the time, printable ascii characters are used
    204 			j++
    205 			continue
    206 		}
    207 
    208 		switch c {
    209 		case '\\', '"':
    210 			buf = append(buf, s[i:j]...)
    211 			buf = append(buf, '\\', c)
    212 			i = j + 1
    213 			j = j + 1
    214 			continue
    215 
    216 		case '\n':
    217 			buf = append(buf, s[i:j]...)
    218 			buf = append(buf, '\\', 'n')
    219 			i = j + 1
    220 			j = j + 1
    221 			continue
    222 
    223 		case '\r':
    224 			buf = append(buf, s[i:j]...)
    225 			buf = append(buf, '\\', 'r')
    226 			i = j + 1
    227 			j = j + 1
    228 			continue
    229 
    230 		case '\t':
    231 			buf = append(buf, s[i:j]...)
    232 			buf = append(buf, '\\', 't')
    233 			i = j + 1
    234 			j = j + 1
    235 			continue
    236 
    237 		case '<', '>', '&':
    238 			buf = append(buf, s[i:j]...)
    239 			buf = append(buf, `\u00`...)
    240 			buf = append(buf, hex[c>>4], hex[c&0xF])
    241 			i = j + 1
    242 			j = j + 1
    243 			continue
    244 
    245 		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
    246 			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
    247 			buf = append(buf, s[i:j]...)
    248 			buf = append(buf, `\u00`...)
    249 			buf = append(buf, hex[c>>4], hex[c&0xF])
    250 			i = j + 1
    251 			j = j + 1
    252 			continue
    253 		}
    254 		j++
    255 	}
    256 
    257 	return append(append(buf, s[i:]...), '"')
    258 }
    259 
    260 func appendNormalizedString(buf []byte, s string) []byte {
    261 	valLen := len(s)
    262 	if valLen == 0 {
    263 		return append(buf, `""`...)
    264 	}
    265 	buf = append(buf, '"')
    266 	var (
    267 		i, j int
    268 	)
    269 	if valLen >= 8 {
    270 		chunks := stringToUint64Slice(s)
    271 		for _, n := range chunks {
    272 			// combine masks before checking for the MSB of each byte. We include
    273 			// `n` in the mask to check whether any of the *input* byte MSBs were
    274 			// set (i.e. the byte was outside the ASCII range).
    275 			mask := n | (n - (lsb * 0x20)) |
    276 				((n ^ (lsb * '"')) - lsb) |
    277 				((n ^ (lsb * '\\')) - lsb)
    278 			if (mask & msb) != 0 {
    279 				j = bits.TrailingZeros64(mask&msb) / 8
    280 				goto ESCAPE_END
    281 			}
    282 		}
    283 		valLen := len(s)
    284 		for i := len(chunks) * 8; i < valLen; i++ {
    285 			if needEscapeNormalizeUTF8[s[i]] {
    286 				j = i
    287 				goto ESCAPE_END
    288 			}
    289 		}
    290 		return append(append(buf, s...), '"')
    291 	}
    292 ESCAPE_END:
    293 	for j < valLen {
    294 		c := s[j]
    295 
    296 		if !needEscapeNormalizeUTF8[c] {
    297 			// fast path: most of the time, printable ascii characters are used
    298 			j++
    299 			continue
    300 		}
    301 
    302 		switch c {
    303 		case '\\', '"':
    304 			buf = append(buf, s[i:j]...)
    305 			buf = append(buf, '\\', c)
    306 			i = j + 1
    307 			j = j + 1
    308 			continue
    309 
    310 		case '\n':
    311 			buf = append(buf, s[i:j]...)
    312 			buf = append(buf, '\\', 'n')
    313 			i = j + 1
    314 			j = j + 1
    315 			continue
    316 
    317 		case '\r':
    318 			buf = append(buf, s[i:j]...)
    319 			buf = append(buf, '\\', 'r')
    320 			i = j + 1
    321 			j = j + 1
    322 			continue
    323 
    324 		case '\t':
    325 			buf = append(buf, s[i:j]...)
    326 			buf = append(buf, '\\', 't')
    327 			i = j + 1
    328 			j = j + 1
    329 			continue
    330 
    331 		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
    332 			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
    333 			buf = append(buf, s[i:j]...)
    334 			buf = append(buf, `\u00`...)
    335 			buf = append(buf, hex[c>>4], hex[c&0xF])
    336 			i = j + 1
    337 			j = j + 1
    338 			continue
    339 		}
    340 
    341 		state, size := decodeRuneInString(s[j:])
    342 		switch state {
    343 		case runeErrorState:
    344 			buf = append(buf, s[i:j]...)
    345 			buf = append(buf, `\ufffd`...)
    346 			i = j + 1
    347 			j = j + 1
    348 			continue
    349 			// U+2028 is LINE SEPARATOR.
    350 			// U+2029 is PARAGRAPH SEPARATOR.
    351 			// They are both technically valid characters in JSON strings,
    352 			// but don't work in JSONP, which has to be evaluated as JavaScript,
    353 			// and can lead to security holes there. It is valid JSON to
    354 			// escape them, so we do so unconditionally.
    355 			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
    356 		case lineSepState:
    357 			buf = append(buf, s[i:j]...)
    358 			buf = append(buf, `\u2028`...)
    359 			i = j + 3
    360 			j = j + 3
    361 			continue
    362 		case paragraphSepState:
    363 			buf = append(buf, s[i:j]...)
    364 			buf = append(buf, `\u2029`...)
    365 			i = j + 3
    366 			j = j + 3
    367 			continue
    368 		}
    369 		j += size
    370 	}
    371 
    372 	return append(append(buf, s[i:]...), '"')
    373 }
    374 
    375 func appendString(buf []byte, s string) []byte {
    376 	valLen := len(s)
    377 	if valLen == 0 {
    378 		return append(buf, `""`...)
    379 	}
    380 	buf = append(buf, '"')
    381 	var (
    382 		i, j int
    383 	)
    384 	if valLen >= 8 {
    385 		chunks := stringToUint64Slice(s)
    386 		for _, n := range chunks {
    387 			// combine masks before checking for the MSB of each byte. We include
    388 			// `n` in the mask to check whether any of the *input* byte MSBs were
    389 			// set (i.e. the byte was outside the ASCII range).
    390 			mask := n | (n - (lsb * 0x20)) |
    391 				((n ^ (lsb * '"')) - lsb) |
    392 				((n ^ (lsb * '\\')) - lsb)
    393 			if (mask & msb) != 0 {
    394 				j = bits.TrailingZeros64(mask&msb) / 8
    395 				goto ESCAPE_END
    396 			}
    397 		}
    398 		valLen := len(s)
    399 		for i := len(chunks) * 8; i < valLen; i++ {
    400 			if needEscape[s[i]] {
    401 				j = i
    402 				goto ESCAPE_END
    403 			}
    404 		}
    405 		return append(append(buf, s...), '"')
    406 	}
    407 ESCAPE_END:
    408 	for j < valLen {
    409 		c := s[j]
    410 
    411 		if !needEscape[c] {
    412 			// fast path: most of the time, printable ascii characters are used
    413 			j++
    414 			continue
    415 		}
    416 
    417 		switch c {
    418 		case '\\', '"':
    419 			buf = append(buf, s[i:j]...)
    420 			buf = append(buf, '\\', c)
    421 			i = j + 1
    422 			j = j + 1
    423 			continue
    424 
    425 		case '\n':
    426 			buf = append(buf, s[i:j]...)
    427 			buf = append(buf, '\\', 'n')
    428 			i = j + 1
    429 			j = j + 1
    430 			continue
    431 
    432 		case '\r':
    433 			buf = append(buf, s[i:j]...)
    434 			buf = append(buf, '\\', 'r')
    435 			i = j + 1
    436 			j = j + 1
    437 			continue
    438 
    439 		case '\t':
    440 			buf = append(buf, s[i:j]...)
    441 			buf = append(buf, '\\', 't')
    442 			i = j + 1
    443 			j = j + 1
    444 			continue
    445 
    446 		case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F
    447 			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F
    448 			buf = append(buf, s[i:j]...)
    449 			buf = append(buf, `\u00`...)
    450 			buf = append(buf, hex[c>>4], hex[c&0xF])
    451 			i = j + 1
    452 			j = j + 1
    453 			continue
    454 		}
    455 		j++
    456 	}
    457 
    458 	return append(append(buf, s[i:]...), '"')
    459 }