string.go (10601B)
1 package encoder 2 3 import ( 4 "math/bits" 5 "reflect" 6 "unsafe" 7 ) 8 9 const ( 10 lsb = 0x0101010101010101 11 msb = 0x8080808080808080 12 ) 13 14 var hex = "0123456789abcdef" 15 16 //nolint:govet 17 func stringToUint64Slice(s string) []uint64 { 18 return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{ 19 Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data, 20 Len: len(s) / 8, 21 Cap: len(s) / 8, 22 })) 23 } 24 25 func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte { 26 if ctx.Option.Flag&HTMLEscapeOption != 0 { 27 if ctx.Option.Flag&NormalizeUTF8Option != 0 { 28 return appendNormalizedHTMLString(buf, s) 29 } 30 return appendHTMLString(buf, s) 31 } 32 if ctx.Option.Flag&NormalizeUTF8Option != 0 { 33 return appendNormalizedString(buf, s) 34 } 35 return appendString(buf, s) 36 } 37 38 func appendNormalizedHTMLString(buf []byte, s string) []byte { 39 valLen := len(s) 40 if valLen == 0 { 41 return append(buf, `""`...) 42 } 43 buf = append(buf, '"') 44 var ( 45 i, j int 46 ) 47 if valLen >= 8 { 48 chunks := stringToUint64Slice(s) 49 for _, n := range chunks { 50 // combine masks before checking for the MSB of each byte. We include 51 // `n` in the mask to check whether any of the *input* byte MSBs were 52 // set (i.e. the byte was outside the ASCII range). 53 mask := n | (n - (lsb * 0x20)) | 54 ((n ^ (lsb * '"')) - lsb) | 55 ((n ^ (lsb * '\\')) - lsb) | 56 ((n ^ (lsb * '<')) - lsb) | 57 ((n ^ (lsb * '>')) - lsb) | 58 ((n ^ (lsb * '&')) - lsb) 59 if (mask & msb) != 0 { 60 j = bits.TrailingZeros64(mask&msb) / 8 61 goto ESCAPE_END 62 } 63 } 64 for i := len(chunks) * 8; i < valLen; i++ { 65 if needEscapeHTMLNormalizeUTF8[s[i]] { 66 j = i 67 goto ESCAPE_END 68 } 69 } 70 // no found any escape characters. 71 return append(append(buf, s...), '"') 72 } 73 ESCAPE_END: 74 for j < valLen { 75 c := s[j] 76 77 if !needEscapeHTMLNormalizeUTF8[c] { 78 // fast path: most of the time, printable ascii characters are used 79 j++ 80 continue 81 } 82 83 switch c { 84 case '\\', '"': 85 buf = append(buf, s[i:j]...) 86 buf = append(buf, '\\', c) 87 i = j + 1 88 j = j + 1 89 continue 90 91 case '\n': 92 buf = append(buf, s[i:j]...) 93 buf = append(buf, '\\', 'n') 94 i = j + 1 95 j = j + 1 96 continue 97 98 case '\r': 99 buf = append(buf, s[i:j]...) 100 buf = append(buf, '\\', 'r') 101 i = j + 1 102 j = j + 1 103 continue 104 105 case '\t': 106 buf = append(buf, s[i:j]...) 107 buf = append(buf, '\\', 't') 108 i = j + 1 109 j = j + 1 110 continue 111 112 case '<', '>', '&': 113 buf = append(buf, s[i:j]...) 114 buf = append(buf, `\u00`...) 115 buf = append(buf, hex[c>>4], hex[c&0xF]) 116 i = j + 1 117 j = j + 1 118 continue 119 120 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 121 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 122 buf = append(buf, s[i:j]...) 123 buf = append(buf, `\u00`...) 124 buf = append(buf, hex[c>>4], hex[c&0xF]) 125 i = j + 1 126 j = j + 1 127 continue 128 } 129 state, size := decodeRuneInString(s[j:]) 130 switch state { 131 case runeErrorState: 132 buf = append(buf, s[i:j]...) 133 buf = append(buf, `\ufffd`...) 134 i = j + 1 135 j = j + 1 136 continue 137 // U+2028 is LINE SEPARATOR. 138 // U+2029 is PARAGRAPH SEPARATOR. 139 // They are both technically valid characters in JSON strings, 140 // but don't work in JSONP, which has to be evaluated as JavaScript, 141 // and can lead to security holes there. It is valid JSON to 142 // escape them, so we do so unconditionally. 143 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. 144 case lineSepState: 145 buf = append(buf, s[i:j]...) 146 buf = append(buf, `\u2028`...) 147 i = j + 3 148 j = j + 3 149 continue 150 case paragraphSepState: 151 buf = append(buf, s[i:j]...) 152 buf = append(buf, `\u2029`...) 153 i = j + 3 154 j = j + 3 155 continue 156 } 157 j += size 158 } 159 160 return append(append(buf, s[i:]...), '"') 161 } 162 163 func appendHTMLString(buf []byte, s string) []byte { 164 valLen := len(s) 165 if valLen == 0 { 166 return append(buf, `""`...) 167 } 168 buf = append(buf, '"') 169 var ( 170 i, j int 171 ) 172 if valLen >= 8 { 173 chunks := stringToUint64Slice(s) 174 for _, n := range chunks { 175 // combine masks before checking for the MSB of each byte. We include 176 // `n` in the mask to check whether any of the *input* byte MSBs were 177 // set (i.e. the byte was outside the ASCII range). 178 mask := n | (n - (lsb * 0x20)) | 179 ((n ^ (lsb * '"')) - lsb) | 180 ((n ^ (lsb * '\\')) - lsb) | 181 ((n ^ (lsb * '<')) - lsb) | 182 ((n ^ (lsb * '>')) - lsb) | 183 ((n ^ (lsb * '&')) - lsb) 184 if (mask & msb) != 0 { 185 j = bits.TrailingZeros64(mask&msb) / 8 186 goto ESCAPE_END 187 } 188 } 189 for i := len(chunks) * 8; i < valLen; i++ { 190 if needEscapeHTML[s[i]] { 191 j = i 192 goto ESCAPE_END 193 } 194 } 195 // no found any escape characters. 196 return append(append(buf, s...), '"') 197 } 198 ESCAPE_END: 199 for j < valLen { 200 c := s[j] 201 202 if !needEscapeHTML[c] { 203 // fast path: most of the time, printable ascii characters are used 204 j++ 205 continue 206 } 207 208 switch c { 209 case '\\', '"': 210 buf = append(buf, s[i:j]...) 211 buf = append(buf, '\\', c) 212 i = j + 1 213 j = j + 1 214 continue 215 216 case '\n': 217 buf = append(buf, s[i:j]...) 218 buf = append(buf, '\\', 'n') 219 i = j + 1 220 j = j + 1 221 continue 222 223 case '\r': 224 buf = append(buf, s[i:j]...) 225 buf = append(buf, '\\', 'r') 226 i = j + 1 227 j = j + 1 228 continue 229 230 case '\t': 231 buf = append(buf, s[i:j]...) 232 buf = append(buf, '\\', 't') 233 i = j + 1 234 j = j + 1 235 continue 236 237 case '<', '>', '&': 238 buf = append(buf, s[i:j]...) 239 buf = append(buf, `\u00`...) 240 buf = append(buf, hex[c>>4], hex[c&0xF]) 241 i = j + 1 242 j = j + 1 243 continue 244 245 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 246 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 247 buf = append(buf, s[i:j]...) 248 buf = append(buf, `\u00`...) 249 buf = append(buf, hex[c>>4], hex[c&0xF]) 250 i = j + 1 251 j = j + 1 252 continue 253 } 254 j++ 255 } 256 257 return append(append(buf, s[i:]...), '"') 258 } 259 260 func appendNormalizedString(buf []byte, s string) []byte { 261 valLen := len(s) 262 if valLen == 0 { 263 return append(buf, `""`...) 264 } 265 buf = append(buf, '"') 266 var ( 267 i, j int 268 ) 269 if valLen >= 8 { 270 chunks := stringToUint64Slice(s) 271 for _, n := range chunks { 272 // combine masks before checking for the MSB of each byte. We include 273 // `n` in the mask to check whether any of the *input* byte MSBs were 274 // set (i.e. the byte was outside the ASCII range). 275 mask := n | (n - (lsb * 0x20)) | 276 ((n ^ (lsb * '"')) - lsb) | 277 ((n ^ (lsb * '\\')) - lsb) 278 if (mask & msb) != 0 { 279 j = bits.TrailingZeros64(mask&msb) / 8 280 goto ESCAPE_END 281 } 282 } 283 valLen := len(s) 284 for i := len(chunks) * 8; i < valLen; i++ { 285 if needEscapeNormalizeUTF8[s[i]] { 286 j = i 287 goto ESCAPE_END 288 } 289 } 290 return append(append(buf, s...), '"') 291 } 292 ESCAPE_END: 293 for j < valLen { 294 c := s[j] 295 296 if !needEscapeNormalizeUTF8[c] { 297 // fast path: most of the time, printable ascii characters are used 298 j++ 299 continue 300 } 301 302 switch c { 303 case '\\', '"': 304 buf = append(buf, s[i:j]...) 305 buf = append(buf, '\\', c) 306 i = j + 1 307 j = j + 1 308 continue 309 310 case '\n': 311 buf = append(buf, s[i:j]...) 312 buf = append(buf, '\\', 'n') 313 i = j + 1 314 j = j + 1 315 continue 316 317 case '\r': 318 buf = append(buf, s[i:j]...) 319 buf = append(buf, '\\', 'r') 320 i = j + 1 321 j = j + 1 322 continue 323 324 case '\t': 325 buf = append(buf, s[i:j]...) 326 buf = append(buf, '\\', 't') 327 i = j + 1 328 j = j + 1 329 continue 330 331 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 332 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 333 buf = append(buf, s[i:j]...) 334 buf = append(buf, `\u00`...) 335 buf = append(buf, hex[c>>4], hex[c&0xF]) 336 i = j + 1 337 j = j + 1 338 continue 339 } 340 341 state, size := decodeRuneInString(s[j:]) 342 switch state { 343 case runeErrorState: 344 buf = append(buf, s[i:j]...) 345 buf = append(buf, `\ufffd`...) 346 i = j + 1 347 j = j + 1 348 continue 349 // U+2028 is LINE SEPARATOR. 350 // U+2029 is PARAGRAPH SEPARATOR. 351 // They are both technically valid characters in JSON strings, 352 // but don't work in JSONP, which has to be evaluated as JavaScript, 353 // and can lead to security holes there. It is valid JSON to 354 // escape them, so we do so unconditionally. 355 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. 356 case lineSepState: 357 buf = append(buf, s[i:j]...) 358 buf = append(buf, `\u2028`...) 359 i = j + 3 360 j = j + 3 361 continue 362 case paragraphSepState: 363 buf = append(buf, s[i:j]...) 364 buf = append(buf, `\u2029`...) 365 i = j + 3 366 j = j + 3 367 continue 368 } 369 j += size 370 } 371 372 return append(append(buf, s[i:]...), '"') 373 } 374 375 func appendString(buf []byte, s string) []byte { 376 valLen := len(s) 377 if valLen == 0 { 378 return append(buf, `""`...) 379 } 380 buf = append(buf, '"') 381 var ( 382 i, j int 383 ) 384 if valLen >= 8 { 385 chunks := stringToUint64Slice(s) 386 for _, n := range chunks { 387 // combine masks before checking for the MSB of each byte. We include 388 // `n` in the mask to check whether any of the *input* byte MSBs were 389 // set (i.e. the byte was outside the ASCII range). 390 mask := n | (n - (lsb * 0x20)) | 391 ((n ^ (lsb * '"')) - lsb) | 392 ((n ^ (lsb * '\\')) - lsb) 393 if (mask & msb) != 0 { 394 j = bits.TrailingZeros64(mask&msb) / 8 395 goto ESCAPE_END 396 } 397 } 398 valLen := len(s) 399 for i := len(chunks) * 8; i < valLen; i++ { 400 if needEscape[s[i]] { 401 j = i 402 goto ESCAPE_END 403 } 404 } 405 return append(append(buf, s...), '"') 406 } 407 ESCAPE_END: 408 for j < valLen { 409 c := s[j] 410 411 if !needEscape[c] { 412 // fast path: most of the time, printable ascii characters are used 413 j++ 414 continue 415 } 416 417 switch c { 418 case '\\', '"': 419 buf = append(buf, s[i:j]...) 420 buf = append(buf, '\\', c) 421 i = j + 1 422 j = j + 1 423 continue 424 425 case '\n': 426 buf = append(buf, s[i:j]...) 427 buf = append(buf, '\\', 'n') 428 i = j + 1 429 j = j + 1 430 continue 431 432 case '\r': 433 buf = append(buf, s[i:j]...) 434 buf = append(buf, '\\', 'r') 435 i = j + 1 436 j = j + 1 437 continue 438 439 case '\t': 440 buf = append(buf, s[i:j]...) 441 buf = append(buf, '\\', 't') 442 i = j + 1 443 j = j + 1 444 continue 445 446 case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 447 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F 448 buf = append(buf, s[i:j]...) 449 buf = append(buf, `\u00`...) 450 buf = append(buf, hex[c>>4], hex[c&0xF]) 451 i = j + 1 452 j = j + 1 453 continue 454 } 455 j++ 456 } 457 458 return append(append(buf, s[i:]...), '"') 459 }