util.go (15156B)
1 package parse 2 3 import ( 4 "bytes" 5 "fmt" 6 "strconv" 7 "unicode" 8 ) 9 10 // Copy returns a copy of the given byte slice. 11 func Copy(src []byte) (dst []byte) { 12 dst = make([]byte, len(src)) 13 copy(dst, src) 14 return 15 } 16 17 // ToLower converts all characters in the byte slice from A-Z to a-z. 18 func ToLower(src []byte) []byte { 19 for i, c := range src { 20 if c >= 'A' && c <= 'Z' { 21 src[i] = c + ('a' - 'A') 22 } 23 } 24 return src 25 } 26 27 // EqualFold returns true when s matches case-insensitively the targetLower (which must be lowercase). 28 func EqualFold(s, targetLower []byte) bool { 29 if len(s) != len(targetLower) { 30 return false 31 } 32 for i, c := range targetLower { 33 d := s[i] 34 if d != c && (d < 'A' || d > 'Z' || d+('a'-'A') != c) { 35 return false 36 } 37 } 38 return true 39 } 40 41 // Printable returns a printable string for given rune 42 func Printable(r rune) string { 43 if unicode.IsGraphic(r) { 44 return fmt.Sprintf("%c", r) 45 } else if r < 128 { 46 return fmt.Sprintf("0x%02X", r) 47 } 48 return fmt.Sprintf("%U", r) 49 } 50 51 var whitespaceTable = [256]bool{ 52 // ASCII 53 false, false, false, false, false, false, false, false, 54 false, true, true, false, true, true, false, false, // tab, new line, form feed, carriage return 55 false, false, false, false, false, false, false, false, 56 false, false, false, false, false, false, false, false, 57 58 true, false, false, false, false, false, false, false, // space 59 false, false, false, false, false, false, false, false, 60 false, false, false, false, false, false, false, false, 61 false, false, false, false, false, false, false, false, 62 63 false, false, false, false, false, false, false, false, 64 false, false, false, false, false, false, false, false, 65 false, false, false, false, false, false, false, false, 66 false, false, false, false, false, false, false, false, 67 68 false, false, false, false, false, false, false, false, 69 false, false, false, false, false, false, false, false, 70 false, false, false, false, false, false, false, false, 71 false, false, false, false, false, false, false, false, 72 73 // non-ASCII 74 false, false, false, false, false, false, false, false, 75 false, false, false, false, false, false, false, false, 76 false, false, false, false, false, false, false, false, 77 false, false, false, false, false, false, false, false, 78 79 false, false, false, false, false, false, false, false, 80 false, false, false, false, false, false, false, false, 81 false, false, false, false, false, false, false, false, 82 false, false, false, false, false, false, false, false, 83 84 false, false, false, false, false, false, false, false, 85 false, false, false, false, false, false, false, false, 86 false, false, false, false, false, false, false, false, 87 false, false, false, false, false, false, false, false, 88 89 false, false, false, false, false, false, false, false, 90 false, false, false, false, false, false, false, false, 91 false, false, false, false, false, false, false, false, 92 false, false, false, false, false, false, false, false, 93 } 94 95 // IsWhitespace returns true for space, \n, \r, \t, \f. 96 func IsWhitespace(c byte) bool { 97 return whitespaceTable[c] 98 } 99 100 var newlineTable = [256]bool{ 101 // ASCII 102 false, false, false, false, false, false, false, false, 103 false, false, true, false, false, true, false, false, // new line, carriage return 104 false, false, false, false, false, false, false, false, 105 false, false, false, false, false, false, false, false, 106 107 false, false, false, false, false, false, false, false, 108 false, false, false, false, false, false, false, false, 109 false, false, false, false, false, false, false, false, 110 false, false, false, false, false, false, false, false, 111 112 false, false, false, false, false, false, false, false, 113 false, false, false, false, false, false, false, false, 114 false, false, false, false, false, false, false, false, 115 false, false, false, false, false, false, false, false, 116 117 false, false, false, false, false, false, false, false, 118 false, false, false, false, false, false, false, false, 119 false, false, false, false, false, false, false, false, 120 false, false, false, false, false, false, false, false, 121 122 // non-ASCII 123 false, false, false, false, false, false, false, false, 124 false, false, false, false, false, false, false, false, 125 false, false, false, false, false, false, false, false, 126 false, false, false, false, false, false, false, false, 127 128 false, false, false, false, false, false, false, false, 129 false, false, false, false, false, false, false, false, 130 false, false, false, false, false, false, false, false, 131 false, false, false, false, false, false, false, false, 132 133 false, false, false, false, false, false, false, false, 134 false, false, false, false, false, false, false, false, 135 false, false, false, false, false, false, false, false, 136 false, false, false, false, false, false, false, false, 137 138 false, false, false, false, false, false, false, false, 139 false, false, false, false, false, false, false, false, 140 false, false, false, false, false, false, false, false, 141 false, false, false, false, false, false, false, false, 142 } 143 144 // IsNewline returns true for \n, \r. 145 func IsNewline(c byte) bool { 146 return newlineTable[c] 147 } 148 149 // IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f. 150 func IsAllWhitespace(b []byte) bool { 151 for _, c := range b { 152 if !IsWhitespace(c) { 153 return false 154 } 155 } 156 return true 157 } 158 159 // TrimWhitespace removes any leading and trailing whitespace characters. 160 func TrimWhitespace(b []byte) []byte { 161 n := len(b) 162 start := n 163 for i := 0; i < n; i++ { 164 if !IsWhitespace(b[i]) { 165 start = i 166 break 167 } 168 } 169 end := n 170 for i := n - 1; i >= start; i-- { 171 if !IsWhitespace(b[i]) { 172 end = i + 1 173 break 174 } 175 } 176 return b[start:end] 177 } 178 179 // ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r). 180 func ReplaceMultipleWhitespace(b []byte) []byte { 181 j, k := 0, 0 // j is write position, k is start of next text section 182 for i := 0; i < len(b); i++ { 183 if IsWhitespace(b[i]) { 184 start := i 185 newline := IsNewline(b[i]) 186 i++ 187 for ; i < len(b) && IsWhitespace(b[i]); i++ { 188 if IsNewline(b[i]) { 189 newline = true 190 } 191 } 192 if newline { 193 b[start] = '\n' 194 } else { 195 b[start] = ' ' 196 } 197 if 1 < i-start { // more than one whitespace 198 if j == 0 { 199 j = start + 1 200 } else { 201 j += copy(b[j:], b[k:start+1]) 202 } 203 k = i 204 } 205 } 206 } 207 if j == 0 { 208 return b 209 } else if j == 1 { // only if starts with whitespace 210 b[k-1] = b[0] 211 return b[k-1:] 212 } else if k < len(b) { 213 j += copy(b[j:], b[k:]) 214 } 215 return b[:j] 216 } 217 218 // replaceEntities will replace in b at index i, assuming that b[i] == '&' and that i+3<len(b). The returned int will be the last character of the entity, so that the next iteration can safely do i++ to continue and not miss any entitites. 219 func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) { 220 const MaxEntityLength = 31 // longest HTML entity: CounterClockwiseContourIntegral 221 var r []byte 222 j := i + 1 223 if b[j] == '#' { 224 j++ 225 if b[j] == 'x' { 226 j++ 227 c := 0 228 for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ { 229 if b[j] <= '9' { 230 c = c<<4 + int(b[j]-'0') 231 } else if b[j] <= 'F' { 232 c = c<<4 + int(b[j]-'A') + 10 233 } else if b[j] <= 'f' { 234 c = c<<4 + int(b[j]-'a') + 10 235 } 236 } 237 if j <= i+3 || 10000 <= c { 238 return b, j - 1 239 } 240 if c < 128 { 241 r = []byte{byte(c)} 242 } else { 243 r = append(r, '&', '#') 244 r = strconv.AppendInt(r, int64(c), 10) 245 r = append(r, ';') 246 } 247 } else { 248 c := 0 249 for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ { 250 c = c*10 + int(b[j]-'0') 251 } 252 if j <= i+2 || 128 <= c { 253 return b, j - 1 254 } 255 r = []byte{byte(c)} 256 } 257 } else { 258 for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ { 259 } 260 if j <= i+1 || len(b) <= j { 261 return b, j - 1 262 } 263 264 var ok bool 265 r, ok = entitiesMap[string(b[i+1:j])] 266 if !ok { 267 return b, j 268 } 269 } 270 271 // j is at semicolon 272 n := j + 1 - i 273 if j < len(b) && b[j] == ';' && 2 < n { 274 if len(r) == 1 { 275 if q, ok := revEntitiesMap[r[0]]; ok { 276 if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) { 277 return b, j 278 } 279 r = q 280 } else if r[0] == '&' { 281 // check if for example & is followed by something that could potentially be an entity 282 k := j + 1 283 if k < len(b) && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z' || b[k] == '#') { 284 return b, k 285 } 286 } 287 } 288 289 copy(b[i:], r) 290 copy(b[i+len(r):], b[j+1:]) 291 b = b[:len(b)-n+len(r)] 292 return b, i + len(r) - 1 293 } 294 return b, i 295 } 296 297 // ReplaceEntities replaces all occurrences of entites (such as ") to their respective unencoded bytes. 298 func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte { 299 for i := 0; i < len(b); i++ { 300 if b[i] == '&' && i+3 < len(b) { 301 b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap) 302 } 303 } 304 return b 305 } 306 307 // ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially. 308 func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte { 309 j, k := 0, 0 // j is write position, k is start of next text section 310 for i := 0; i < len(b); i++ { 311 if IsWhitespace(b[i]) { 312 start := i 313 newline := IsNewline(b[i]) 314 i++ 315 for ; i < len(b) && IsWhitespace(b[i]); i++ { 316 if IsNewline(b[i]) { 317 newline = true 318 } 319 } 320 if newline { 321 b[start] = '\n' 322 } else { 323 b[start] = ' ' 324 } 325 if 1 < i-start { // more than one whitespace 326 if j == 0 { 327 j = start + 1 328 } else { 329 j += copy(b[j:], b[k:start+1]) 330 } 331 k = i 332 } 333 } 334 if i+3 < len(b) && b[i] == '&' { 335 b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap) 336 } 337 } 338 if j == 0 { 339 return b 340 } else if j == 1 { // only if starts with whitespace 341 b[k-1] = b[0] 342 return b[k-1:] 343 } else if k < len(b) { 344 j += copy(b[j:], b[k:]) 345 } 346 return b[:j] 347 } 348 349 // URLEncodingTable is a charmap for which characters need escaping in the URL encoding scheme 350 var URLEncodingTable = [256]bool{ 351 // ASCII 352 true, true, true, true, true, true, true, true, 353 true, true, true, true, true, true, true, true, 354 true, true, true, true, true, true, true, true, 355 true, true, true, true, true, true, true, true, 356 357 true, false, true, true, true, true, true, false, // space, ", #, $, %, & 358 false, false, false, true, true, false, false, true, // +, comma, / 359 false, false, false, false, false, false, false, false, 360 false, false, true, true, true, true, true, true, // :, ;, <, =, >, ? 361 362 true, false, false, false, false, false, false, false, // @ 363 false, false, false, false, false, false, false, false, 364 false, false, false, false, false, false, false, false, 365 false, false, false, true, true, true, true, false, // [, \, ], ^ 366 367 true, false, false, false, false, false, false, false, // ` 368 false, false, false, false, false, false, false, false, 369 false, false, false, false, false, false, false, false, 370 false, false, false, true, true, true, false, true, // {, |, }, DEL 371 372 // non-ASCII 373 true, true, true, true, true, true, true, true, 374 true, true, true, true, true, true, true, true, 375 true, true, true, true, true, true, true, true, 376 true, true, true, true, true, true, true, true, 377 378 true, true, true, true, true, true, true, true, 379 true, true, true, true, true, true, true, true, 380 true, true, true, true, true, true, true, true, 381 true, true, true, true, true, true, true, true, 382 383 true, true, true, true, true, true, true, true, 384 true, true, true, true, true, true, true, true, 385 true, true, true, true, true, true, true, true, 386 true, true, true, true, true, true, true, true, 387 388 true, true, true, true, true, true, true, true, 389 true, true, true, true, true, true, true, true, 390 true, true, true, true, true, true, true, true, 391 true, true, true, true, true, true, true, true, 392 } 393 394 // DataURIEncodingTable is a charmap for which characters need escaping in the Data URI encoding scheme 395 // Escape only non-printable characters, unicode and %, #, &. 396 // IE11 additionally requires encoding of \, [, ], ", <, >, `, {, }, |, ^ which is not required by Chrome, Firefox, Opera, Edge, Safari, Yandex 397 // To pass the HTML validator, restricted URL characters must be escaped: non-printable characters, space, <, >, #, %, " 398 var DataURIEncodingTable = [256]bool{ 399 // ASCII 400 true, true, true, true, true, true, true, true, 401 true, true, true, true, true, true, true, true, 402 true, true, true, true, true, true, true, true, 403 true, true, true, true, true, true, true, true, 404 405 true, false, true, true, false, true, true, false, // space, ", #, %, & 406 false, false, false, false, false, false, false, false, 407 false, false, false, false, false, false, false, false, 408 false, false, false, false, true, false, true, false, // <, > 409 410 false, false, false, false, false, false, false, false, 411 false, false, false, false, false, false, false, false, 412 false, false, false, false, false, false, false, false, 413 false, false, false, true, true, true, true, false, // [, \, ], ^ 414 415 true, false, false, false, false, false, false, false, // ` 416 false, false, false, false, false, false, false, false, 417 false, false, false, false, false, false, false, false, 418 false, false, false, true, true, true, false, true, // {, |, }, DEL 419 420 // non-ASCII 421 true, true, true, true, true, true, true, true, 422 true, true, true, true, true, true, true, true, 423 true, true, true, true, true, true, true, true, 424 true, true, true, true, true, true, true, true, 425 426 true, true, true, true, true, true, true, true, 427 true, true, true, true, true, true, true, true, 428 true, true, true, true, true, true, true, true, 429 true, true, true, true, true, true, true, true, 430 431 true, true, true, true, true, true, true, true, 432 true, true, true, true, true, true, true, true, 433 true, true, true, true, true, true, true, true, 434 true, true, true, true, true, true, true, true, 435 436 true, true, true, true, true, true, true, true, 437 true, true, true, true, true, true, true, true, 438 true, true, true, true, true, true, true, true, 439 true, true, true, true, true, true, true, true, 440 } 441 442 // EncodeURL encodes bytes using the URL encoding scheme 443 func EncodeURL(b []byte, table [256]bool) []byte { 444 for i := 0; i < len(b); i++ { 445 c := b[i] 446 if table[c] { 447 b = append(b, 0, 0) 448 copy(b[i+3:], b[i+1:]) 449 b[i+0] = '%' 450 b[i+1] = "0123456789ABCDEF"[c>>4] 451 b[i+2] = "0123456789ABCDEF"[c&15] 452 } 453 } 454 return b 455 } 456 457 // DecodeURL decodes an URL encoded using the URL encoding scheme 458 func DecodeURL(b []byte) []byte { 459 for i := 0; i < len(b); i++ { 460 if b[i] == '%' && i+2 < len(b) { 461 j := i + 1 462 c := 0 463 for ; j < i+3 && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ { 464 if b[j] <= '9' { 465 c = c<<4 + int(b[j]-'0') 466 } else if b[j] <= 'F' { 467 c = c<<4 + int(b[j]-'A') + 10 468 } else if b[j] <= 'f' { 469 c = c<<4 + int(b[j]-'a') + 10 470 } 471 } 472 if j == i+3 && c < 128 { 473 b[i] = byte(c) 474 b = append(b[:i+1], b[i+3:]...) 475 } 476 } else if b[i] == '+' { 477 b[i] = ' ' 478 } 479 } 480 return b 481 }