util.go (28789B)
1 // Package util provides utility functions for the goldmark. 2 package util 3 4 import ( 5 "bytes" 6 "io" 7 "net/url" 8 "regexp" 9 "sort" 10 "strconv" 11 "unicode" 12 "unicode/utf8" 13 ) 14 15 // A CopyOnWriteBuffer is a byte buffer that copies buffer when 16 // it need to be changed. 17 type CopyOnWriteBuffer struct { 18 buffer []byte 19 copied bool 20 } 21 22 // NewCopyOnWriteBuffer returns a new CopyOnWriteBuffer. 23 func NewCopyOnWriteBuffer(buffer []byte) CopyOnWriteBuffer { 24 return CopyOnWriteBuffer{ 25 buffer: buffer, 26 copied: false, 27 } 28 } 29 30 // Write writes given bytes to the buffer. 31 // Write allocate new buffer and clears it at the first time. 32 func (b *CopyOnWriteBuffer) Write(value []byte) { 33 if !b.copied { 34 b.buffer = make([]byte, 0, len(b.buffer)+20) 35 b.copied = true 36 } 37 b.buffer = append(b.buffer, value...) 38 } 39 40 // WriteString writes given string to the buffer. 41 // WriteString allocate new buffer and clears it at the first time. 42 func (b *CopyOnWriteBuffer) WriteString(value string) { 43 b.Write(StringToReadOnlyBytes(value)) 44 } 45 46 // Append appends given bytes to the buffer. 47 // Append copy buffer at the first time. 48 func (b *CopyOnWriteBuffer) Append(value []byte) { 49 if !b.copied { 50 tmp := make([]byte, len(b.buffer), len(b.buffer)+20) 51 copy(tmp, b.buffer) 52 b.buffer = tmp 53 b.copied = true 54 } 55 b.buffer = append(b.buffer, value...) 56 } 57 58 // AppendString appends given string to the buffer. 59 // AppendString copy buffer at the first time. 60 func (b *CopyOnWriteBuffer) AppendString(value string) { 61 b.Append(StringToReadOnlyBytes(value)) 62 } 63 64 // WriteByte writes the given byte to the buffer. 65 // WriteByte allocate new buffer and clears it at the first time. 
66 func (b *CopyOnWriteBuffer) WriteByte(c byte) { 67 if !b.copied { 68 b.buffer = make([]byte, 0, len(b.buffer)+20) 69 b.copied = true 70 } 71 b.buffer = append(b.buffer, c) 72 } 73 74 // AppendByte appends given bytes to the buffer. 75 // AppendByte copy buffer at the first time. 76 func (b *CopyOnWriteBuffer) AppendByte(c byte) { 77 if !b.copied { 78 tmp := make([]byte, len(b.buffer), len(b.buffer)+20) 79 copy(tmp, b.buffer) 80 b.buffer = tmp 81 b.copied = true 82 } 83 b.buffer = append(b.buffer, c) 84 } 85 86 // Bytes returns bytes of this buffer. 87 func (b *CopyOnWriteBuffer) Bytes() []byte { 88 return b.buffer 89 } 90 91 // IsCopied returns true if buffer has been copied, otherwise false. 92 func (b *CopyOnWriteBuffer) IsCopied() bool { 93 return b.copied 94 } 95 96 // IsEscapedPunctuation returns true if character at a given index i 97 // is an escaped punctuation, otherwise false. 98 func IsEscapedPunctuation(source []byte, i int) bool { 99 return source[i] == '\\' && i < len(source)-1 && IsPunct(source[i+1]) 100 } 101 102 // ReadWhile read the given source while pred is true. 103 func ReadWhile(source []byte, index [2]int, pred func(byte) bool) (int, bool) { 104 j := index[0] 105 ok := false 106 for ; j < index[1]; j++ { 107 c1 := source[j] 108 if pred(c1) { 109 ok = true 110 continue 111 } 112 break 113 } 114 return j, ok 115 } 116 117 // IsBlank returns true if the given string is all space characters. 118 func IsBlank(bs []byte) bool { 119 for _, b := range bs { 120 if !IsSpace(b) { 121 return false 122 } 123 } 124 return true 125 } 126 127 // VisualizeSpaces visualize invisible space characters. 
func VisualizeSpaces(bs []byte) []byte {
	// Applied in order; each entry replaces every occurrence of the first
	// string with the second one.
	markers := [...][2]string{
		{" ", "[SPACE]"},
		{"\t", "[TAB]"},
		{"\n", "[NEWLINE]\n"},
		{"\r", "[CR]"},
		{"\v", "[VTAB]"},
		{"\x00", "[NUL]"},
		{"\ufffd", "[U+FFFD]"},
	}
	for _, m := range markers {
		bs = bytes.Replace(bs, []byte(m[0]), []byte(m[1]), -1)
	}
	return bs
}

// TabWidth calculates the actual width of a tab at the given position,
// i.e. the number of columns up to the next multiple of 4.
func TabWidth(currentPos int) int {
	const tabStop = 4
	return tabStop - currentPos%tabStop
}

// IndentPosition searches an indent position with the given width for the given line.
// If the line contains tab characters, paddings may be not zero.
// currentPos==0 and width==2:
//
//	position: 0    1
//	          [TAB]aaaa
//	width:    1234 5678
//
// width=2 is in the tab character. In this case, IndentPosition returns
// (pos=1, padding=2)
func IndentPosition(bs []byte, currentPos, width int) (pos, padding int) {
	return IndentPositionPadding(bs, currentPos, 0, width)
}

// IndentPositionPadding searches an indent position with the given width for the given line.
// It works like IndentPosition but additionally takes paddingv into account:
// paddingv is subtracted from the returned position, and it is returned as
// the padding when width is 0. It returns (-1, -1) if the line is indented
// by less than width.
func IndentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
	if width == 0 {
		return 0, paddingv
	}
	consumed := 0 // indent width covered so far
	i := 0
	// Stop as soon as the requested width is covered or a non-indent byte shows up.
	for i < len(bs) && consumed < width {
		if c := bs[i]; c == '\t' {
			consumed += TabWidth(currentPos + consumed)
		} else if c == ' ' {
			consumed++
		} else {
			break
		}
		i++
	}
	if consumed < width {
		return -1, -1
	}
	return i - paddingv, consumed - width
}

// DedentPosition dedents lines by the given width.
// It returns the position just after the leading spaces and tabs, and the
// indent width that exceeds the given width (0 if the line is indented less).
//
// Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
func DedentPosition(bs []byte, currentPos, width int) (pos, padding int) {
	if width == 0 {
		return 0, 0
	}
	indent, end := IndentWidth(bs, currentPos)
	if indent < width {
		return end, 0
	}
	return end, indent - width
}

// DedentPositionPadding dedents lines by the given width.
// This function is mostly same as DedentPosition except this function
// takes account into additional paddings: paddingv is subtracted from the
// returned position.
//
// Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
func DedentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
	if width == 0 {
		return 0, paddingv
	}
	indent, end := IndentWidth(bs, currentPos)
	if indent < width {
		return end - paddingv, 0
	}
	return end - paddingv, indent - width
}

// IndentWidth calculates the indent width of the given line.
// width is the visual width of the leading spaces and tabs (tabs are
// expanded with TabWidth relative to currentPos) and pos is the number of
// bytes they occupy.
func IndentWidth(bs []byte, currentPos int) (width, pos int) {
	for _, b := range bs {
		switch b {
		case ' ':
			width++
		case '\t':
			width += TabWidth(currentPos + width)
		default:
			return width, pos
		}
		pos++
	}
	return width, pos
}

// FirstNonSpacePosition returns the position of the first character in the
// line that is neither a space nor a tab. It returns -1 if that character
// is a newline or if the line contains only spaces and tabs.
func FirstNonSpacePosition(bs []byte) int {
	for i, c := range bs {
		switch c {
		case ' ', '\t':
			continue
		case '\n':
			return -1
		default:
			return i
		}
	}
	return -1
}

// FindClosure returns a position that closes the given opener.
272 // If codeSpan is set true, it ignores characters in code spans. 273 // If allowNesting is set true, closures correspond to nested opener will be 274 // ignored. 275 // 276 // Deprecated: This function can not handle newlines. Many elements 277 // can be existed over multiple lines(e.g. link labels). 278 // Use text.Reader.FindClosure. 279 func FindClosure(bs []byte, opener, closure byte, codeSpan, allowNesting bool) int { 280 i := 0 281 opened := 1 282 codeSpanOpener := 0 283 for i < len(bs) { 284 c := bs[i] 285 if codeSpan && codeSpanOpener != 0 && c == '`' { 286 codeSpanCloser := 0 287 for ; i < len(bs); i++ { 288 if bs[i] == '`' { 289 codeSpanCloser++ 290 } else { 291 i-- 292 break 293 } 294 } 295 if codeSpanCloser == codeSpanOpener { 296 codeSpanOpener = 0 297 } 298 } else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && IsPunct(bs[i+1]) { 299 i += 2 300 continue 301 } else if codeSpan && codeSpanOpener == 0 && c == '`' { 302 for ; i < len(bs); i++ { 303 if bs[i] == '`' { 304 codeSpanOpener++ 305 } else { 306 i-- 307 break 308 } 309 } 310 } else if (codeSpan && codeSpanOpener == 0) || !codeSpan { 311 if c == closure { 312 opened-- 313 if opened == 0 { 314 return i 315 } 316 } else if c == opener { 317 if !allowNesting { 318 return -1 319 } 320 opened++ 321 } 322 } 323 i++ 324 } 325 return -1 326 } 327 328 // TrimLeft trims characters in the given s from head of the source. 329 // bytes.TrimLeft offers same functionalities, but bytes.TrimLeft 330 // allocates new buffer for the result. 331 func TrimLeft(source, b []byte) []byte { 332 i := 0 333 for ; i < len(source); i++ { 334 c := source[i] 335 found := false 336 for j := 0; j < len(b); j++ { 337 if c == b[j] { 338 found = true 339 break 340 } 341 } 342 if !found { 343 break 344 } 345 } 346 return source[i:] 347 } 348 349 // TrimRight trims characters in the given s from tail of the source. 
350 func TrimRight(source, b []byte) []byte { 351 i := len(source) - 1 352 for ; i >= 0; i-- { 353 c := source[i] 354 found := false 355 for j := 0; j < len(b); j++ { 356 if c == b[j] { 357 found = true 358 break 359 } 360 } 361 if !found { 362 break 363 } 364 } 365 return source[:i+1] 366 } 367 368 // TrimLeftLength returns a length of leading specified characters. 369 func TrimLeftLength(source, s []byte) int { 370 return len(source) - len(TrimLeft(source, s)) 371 } 372 373 // TrimRightLength returns a length of trailing specified characters. 374 func TrimRightLength(source, s []byte) int { 375 return len(source) - len(TrimRight(source, s)) 376 } 377 378 // TrimLeftSpaceLength returns a length of leading space characters. 379 func TrimLeftSpaceLength(source []byte) int { 380 i := 0 381 for ; i < len(source); i++ { 382 if !IsSpace(source[i]) { 383 break 384 } 385 } 386 return i 387 } 388 389 // TrimRightSpaceLength returns a length of trailing space characters. 390 func TrimRightSpaceLength(source []byte) int { 391 l := len(source) 392 i := l - 1 393 for ; i >= 0; i-- { 394 if !IsSpace(source[i]) { 395 break 396 } 397 } 398 if i < 0 { 399 return l 400 } 401 return l - 1 - i 402 } 403 404 // TrimLeftSpace returns a subslice of the given string by slicing off all leading 405 // space characters. 406 func TrimLeftSpace(source []byte) []byte { 407 return TrimLeft(source, spaces) 408 } 409 410 // TrimRightSpace returns a subslice of the given string by slicing off all trailing 411 // space characters. 412 func TrimRightSpace(source []byte) []byte { 413 return TrimRight(source, spaces) 414 } 415 416 // DoFullUnicodeCaseFolding performs full unicode case folding to given bytes. 
417 func DoFullUnicodeCaseFolding(v []byte) []byte { 418 var rbuf []byte 419 cob := NewCopyOnWriteBuffer(v) 420 n := 0 421 for i := 0; i < len(v); i++ { 422 c := v[i] 423 if c < 0xb5 { 424 if c >= 0x41 && c <= 0x5a { 425 // A-Z to a-z 426 cob.Write(v[n:i]) 427 cob.WriteByte(c + 32) 428 n = i + 1 429 } 430 continue 431 } 432 433 if !utf8.RuneStart(c) { 434 continue 435 } 436 r, length := utf8.DecodeRune(v[i:]) 437 if r == utf8.RuneError { 438 continue 439 } 440 folded, ok := unicodeCaseFoldings[r] 441 if !ok { 442 continue 443 } 444 445 cob.Write(v[n:i]) 446 if rbuf == nil { 447 rbuf = make([]byte, 4) 448 } 449 for _, f := range folded { 450 l := utf8.EncodeRune(rbuf, f) 451 cob.Write(rbuf[:l]) 452 } 453 i += length - 1 454 n = i + 1 455 } 456 if cob.IsCopied() { 457 cob.Write(v[n:]) 458 } 459 return cob.Bytes() 460 } 461 462 // ReplaceSpaces replaces sequence of spaces with the given repl. 463 func ReplaceSpaces(source []byte, repl byte) []byte { 464 var ret []byte 465 start := -1 466 for i, c := range source { 467 iss := IsSpace(c) 468 if start < 0 && iss { 469 start = i 470 continue 471 } else if start >= 0 && iss { 472 continue 473 } else if start >= 0 { 474 if ret == nil { 475 ret = make([]byte, 0, len(source)) 476 ret = append(ret, source[:start]...) 477 } 478 ret = append(ret, repl) 479 start = -1 480 } 481 if ret != nil { 482 ret = append(ret, c) 483 } 484 } 485 if start >= 0 && ret != nil { 486 ret = append(ret, repl) 487 } 488 if ret == nil { 489 return source 490 } 491 return ret 492 } 493 494 // ToRune decode given bytes start at pos and returns a rune. 495 func ToRune(source []byte, pos int) rune { 496 i := pos 497 for ; i >= 0; i-- { 498 if utf8.RuneStart(source[i]) { 499 break 500 } 501 } 502 r, _ := utf8.DecodeRune(source[i:]) 503 return r 504 } 505 506 // ToValidRune returns 0xFFFD if the given rune is invalid, otherwise v. 
507 func ToValidRune(v rune) rune { 508 if v == 0 || !utf8.ValidRune(v) { 509 return rune(0xFFFD) 510 } 511 return v 512 } 513 514 // ToLinkReference converts given bytes into a valid link reference string. 515 // ToLinkReference performs unicode case folding, trims leading and trailing spaces, converts into lower 516 // case and replace spaces with a single space character. 517 func ToLinkReference(v []byte) string { 518 v = TrimLeftSpace(v) 519 v = TrimRightSpace(v) 520 v = DoFullUnicodeCaseFolding(v) 521 return string(ReplaceSpaces(v, ' ')) 522 } 523 524 var htmlEscapeTable = [256][]byte{nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("""), nil, nil, nil, []byte("&"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("<"), nil, []byte(">"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil} 525 526 // EscapeHTMLByte returns HTML escaped bytes if the given byte should be escaped, 
527 // otherwise nil. 528 func EscapeHTMLByte(b byte) []byte { 529 return htmlEscapeTable[b] 530 } 531 532 // EscapeHTML escapes characters that should be escaped in HTML text. 533 func EscapeHTML(v []byte) []byte { 534 cob := NewCopyOnWriteBuffer(v) 535 n := 0 536 for i := 0; i < len(v); i++ { 537 c := v[i] 538 escaped := htmlEscapeTable[c] 539 if escaped != nil { 540 cob.Write(v[n:i]) 541 cob.Write(escaped) 542 n = i + 1 543 } 544 } 545 if cob.IsCopied() { 546 cob.Write(v[n:]) 547 } 548 return cob.Bytes() 549 } 550 551 // UnescapePunctuations unescapes blackslash escaped punctuations. 552 func UnescapePunctuations(source []byte) []byte { 553 cob := NewCopyOnWriteBuffer(source) 554 limit := len(source) 555 n := 0 556 for i := 0; i < limit; { 557 c := source[i] 558 if i < limit-1 && c == '\\' && IsPunct(source[i+1]) { 559 cob.Write(source[n:i]) 560 cob.WriteByte(source[i+1]) 561 i += 2 562 n = i 563 continue 564 } 565 i++ 566 } 567 if cob.IsCopied() { 568 cob.Write(source[n:]) 569 } 570 return cob.Bytes() 571 } 572 573 // ResolveNumericReferences resolve numeric references like 'Ӓ" . 
574 func ResolveNumericReferences(source []byte) []byte { 575 cob := NewCopyOnWriteBuffer(source) 576 buf := make([]byte, 6, 6) 577 limit := len(source) 578 ok := false 579 n := 0 580 for i := 0; i < limit; i++ { 581 if source[i] == '&' { 582 pos := i 583 next := i + 1 584 if next < limit && source[next] == '#' { 585 nnext := next + 1 586 if nnext < limit { 587 nc := source[nnext] 588 // code point like #x22; 589 if nnext < limit && nc == 'x' || nc == 'X' { 590 start := nnext + 1 591 i, ok = ReadWhile(source, [2]int{start, limit}, IsHexDecimal) 592 if ok && i < limit && source[i] == ';' { 593 v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 16, 32) 594 cob.Write(source[n:pos]) 595 n = i + 1 596 runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v))) 597 cob.Write(buf[:runeSize]) 598 continue 599 } 600 // code point like #1234; 601 } else if nc >= '0' && nc <= '9' { 602 start := nnext 603 i, ok = ReadWhile(source, [2]int{start, limit}, IsNumeric) 604 if ok && i < limit && i-start < 8 && source[i] == ';' { 605 v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 0, 32) 606 cob.Write(source[n:pos]) 607 n = i + 1 608 runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v))) 609 cob.Write(buf[:runeSize]) 610 continue 611 } 612 } 613 } 614 } 615 i = next - 1 616 } 617 } 618 if cob.IsCopied() { 619 cob.Write(source[n:]) 620 } 621 return cob.Bytes() 622 } 623 624 // ResolveEntityNames resolve entity references like 'ö" . 
625 func ResolveEntityNames(source []byte) []byte { 626 cob := NewCopyOnWriteBuffer(source) 627 limit := len(source) 628 ok := false 629 n := 0 630 for i := 0; i < limit; i++ { 631 if source[i] == '&' { 632 pos := i 633 next := i + 1 634 if !(next < limit && source[next] == '#') { 635 start := next 636 i, ok = ReadWhile(source, [2]int{start, limit}, IsAlphaNumeric) 637 if ok && i < limit && source[i] == ';' { 638 name := BytesToReadOnlyString(source[start:i]) 639 entity, ok := LookUpHTML5EntityByName(name) 640 if ok { 641 cob.Write(source[n:pos]) 642 n = i + 1 643 cob.Write(entity.Characters) 644 continue 645 } 646 } 647 } 648 i = next - 1 649 } 650 } 651 if cob.IsCopied() { 652 cob.Write(source[n:]) 653 } 654 return cob.Bytes() 655 } 656 657 var htmlSpace = []byte("%20") 658 659 // URLEscape escape the given URL. 660 // If resolveReference is set true: 661 // 1. unescape punctuations 662 // 2. resolve numeric references 663 // 3. resolve entity references 664 // 665 // URL encoded values (%xx) are kept as is. 
666 func URLEscape(v []byte, resolveReference bool) []byte { 667 if resolveReference { 668 v = UnescapePunctuations(v) 669 v = ResolveNumericReferences(v) 670 v = ResolveEntityNames(v) 671 } 672 cob := NewCopyOnWriteBuffer(v) 673 limit := len(v) 674 n := 0 675 676 for i := 0; i < limit; { 677 c := v[i] 678 if urlEscapeTable[c] == 1 { 679 i++ 680 continue 681 } 682 if c == '%' && i+2 < limit && IsHexDecimal(v[i+1]) && IsHexDecimal(v[i+1]) { 683 i += 3 684 continue 685 } 686 u8len := utf8lenTable[c] 687 if u8len == 99 { // invalid utf8 leading byte, skip it 688 i++ 689 continue 690 } 691 if c == ' ' { 692 cob.Write(v[n:i]) 693 cob.Write(htmlSpace) 694 i++ 695 n = i 696 continue 697 } 698 if int(u8len) > len(v) { 699 u8len = int8(len(v) - 1) 700 } 701 if u8len == 0 { 702 i++ 703 n = i 704 continue 705 } 706 cob.Write(v[n:i]) 707 stop := i + int(u8len) 708 if stop > len(v) { 709 i++ 710 n = i 711 continue 712 } 713 cob.Write(StringToReadOnlyBytes(url.QueryEscape(string(v[i:stop])))) 714 i += int(u8len) 715 n = i 716 } 717 if cob.IsCopied() && n < limit { 718 cob.Write(v[n:]) 719 } 720 return cob.Bytes() 721 } 722 723 // FindURLIndex returns a stop index value if the given bytes seem an URL. 724 // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* . 725 func FindURLIndex(b []byte) int { 726 i := 0 727 if !(len(b) > 0 && urlTable[b[i]]&7 == 7) { 728 return -1 729 } 730 i++ 731 for ; i < len(b); i++ { 732 c := b[i] 733 if urlTable[c]&4 != 4 { 734 break 735 } 736 } 737 if i == 1 || i > 33 || i >= len(b) { 738 return -1 739 } 740 if b[i] != ':' { 741 return -1 742 } 743 i++ 744 for ; i < len(b); i++ { 745 c := b[i] 746 if urlTable[c]&1 != 1 { 747 break 748 } 749 } 750 return i 751 } 752 753 var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`) 754 755 // FindEmailIndex returns a stop index value if the given bytes seem an email address. 
756 func FindEmailIndex(b []byte) int { 757 // TODO: eliminate regexps 758 i := 0 759 for ; i < len(b); i++ { 760 c := b[i] 761 if emailTable[c]&1 != 1 { 762 break 763 } 764 } 765 if i == 0 { 766 return -1 767 } 768 if i >= len(b) || b[i] != '@' { 769 return -1 770 } 771 i++ 772 if i >= len(b) { 773 return -1 774 } 775 match := emailDomainRegexp.FindSubmatchIndex(b[i:]) 776 if match == nil { 777 return -1 778 } 779 return i + match[1] 780 } 781 782 var spaces = []byte(" \t\n\x0b\x0c\x0d") 783 784 var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 785 786 var punctTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 787 788 // a-zA-Z0-9, ;/?:@&=+$,-_.!~*'()# 789 var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 790 791 var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99} 792 793 var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 
5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} 794 795 var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} 796 797 // UTF8Len returns a byte length of the utf-8 character. 798 func UTF8Len(b byte) int8 { 799 return utf8lenTable[b] 800 } 801 802 // IsPunct returns true if the given character is a punctuation, otherwise false. 803 func IsPunct(c byte) bool { 804 return punctTable[c] == 1 805 } 806 807 // IsPunctRune returns true if the given rune is a punctuation, otherwise false. 
808 func IsPunctRune(r rune) bool { 809 return int32(r) <= 256 && IsPunct(byte(r)) || unicode.IsPunct(r) 810 } 811 812 // IsSpace returns true if the given character is a space, otherwise false. 813 func IsSpace(c byte) bool { 814 return spaceTable[c] == 1 815 } 816 817 // IsSpaceRune returns true if the given rune is a space, otherwise false. 818 func IsSpaceRune(r rune) bool { 819 return int32(r) <= 256 && IsSpace(byte(r)) || unicode.IsSpace(r) 820 } 821 822 // IsNumeric returns true if the given character is a numeric, otherwise false. 823 func IsNumeric(c byte) bool { 824 return c >= '0' && c <= '9' 825 } 826 827 // IsHexDecimal returns true if the given character is a hexdecimal, otherwise false. 828 func IsHexDecimal(c byte) bool { 829 return c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' 830 } 831 832 // IsAlphaNumeric returns true if the given character is a alphabet or a numeric, otherwise false. 833 func IsAlphaNumeric(c byte) bool { 834 return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' 835 } 836 837 // IsEastAsianWideRune returns trhe if the given rune is an east asian wide character, otherwise false. 838 func IsEastAsianWideRune(r rune) bool { 839 return unicode.Is(unicode.Hiragana, r) || 840 unicode.Is(unicode.Katakana, r) || 841 unicode.Is(unicode.Han, r) || 842 unicode.Is(unicode.Lm, r) || 843 unicode.Is(unicode.Hangul, r) 844 } 845 846 // A BufWriter is a subset of the bufio.Writer . 847 type BufWriter interface { 848 io.Writer 849 Available() int 850 Buffered() int 851 Flush() error 852 WriteByte(c byte) error 853 WriteRune(r rune) (size int, err error) 854 WriteString(s string) (int, error) 855 } 856 857 // A PrioritizedValue struct holds pair of an arbitrary value and a priority. 858 type PrioritizedValue struct { 859 // Value is an arbitrary value that you want to prioritize. 860 Value interface{} 861 // Priority is a priority of the value. 
862 Priority int 863 } 864 865 // PrioritizedSlice is a slice of the PrioritizedValues 866 type PrioritizedSlice []PrioritizedValue 867 868 // Sort sorts the PrioritizedSlice in ascending order. 869 func (s PrioritizedSlice) Sort() { 870 sort.Slice(s, func(i, j int) bool { 871 return s[i].Priority < s[j].Priority 872 }) 873 } 874 875 // Remove removes the given value from this slice. 876 func (s PrioritizedSlice) Remove(v interface{}) PrioritizedSlice { 877 i := 0 878 found := false 879 for ; i < len(s); i++ { 880 if s[i].Value == v { 881 found = true 882 break 883 } 884 } 885 if !found { 886 return s 887 } 888 return append(s[:i], s[i+1:]...) 889 } 890 891 // Prioritized returns a new PrioritizedValue. 892 func Prioritized(v interface{}, priority int) PrioritizedValue { 893 return PrioritizedValue{v, priority} 894 } 895 896 func bytesHash(b []byte) uint64 { 897 var hash uint64 = 5381 898 for _, c := range b { 899 hash = ((hash << 5) + hash) + uint64(c) 900 } 901 return hash 902 } 903 904 // BytesFilter is a efficient data structure for checking whether bytes exist or not. 905 // BytesFilter is thread-safe. 906 type BytesFilter interface { 907 // Add adds given bytes to this set. 908 Add([]byte) 909 910 // Contains return true if this set contains given bytes, otherwise false. 911 Contains([]byte) bool 912 913 // Extend copies this filter and adds given bytes to new filter. 914 Extend(...[]byte) BytesFilter 915 } 916 917 type bytesFilter struct { 918 chars [256]uint8 919 threshold int 920 slots [][][]byte 921 } 922 923 // NewBytesFilter returns a new BytesFilter. 
924 func NewBytesFilter(elements ...[]byte) BytesFilter { 925 s := &bytesFilter{ 926 threshold: 3, 927 slots: make([][][]byte, 64), 928 } 929 for _, element := range elements { 930 s.Add(element) 931 } 932 return s 933 } 934 935 func (s *bytesFilter) Add(b []byte) { 936 l := len(b) 937 m := s.threshold 938 if l < s.threshold { 939 m = l 940 } 941 for i := 0; i < m; i++ { 942 s.chars[b[i]] |= 1 << uint8(i) 943 } 944 h := bytesHash(b) % uint64(len(s.slots)) 945 slot := s.slots[h] 946 if slot == nil { 947 slot = [][]byte{} 948 } 949 s.slots[h] = append(slot, b) 950 } 951 952 func (s *bytesFilter) Extend(bs ...[]byte) BytesFilter { 953 newFilter := NewBytesFilter().(*bytesFilter) 954 newFilter.chars = s.chars 955 newFilter.threshold = s.threshold 956 for k, v := range s.slots { 957 newSlot := make([][]byte, len(v)) 958 copy(newSlot, v) 959 newFilter.slots[k] = v 960 } 961 for _, b := range bs { 962 newFilter.Add(b) 963 } 964 return newFilter 965 } 966 967 func (s *bytesFilter) Contains(b []byte) bool { 968 l := len(b) 969 m := s.threshold 970 if l < s.threshold { 971 m = l 972 } 973 for i := 0; i < m; i++ { 974 if (s.chars[b[i]] & (1 << uint8(i))) == 0 { 975 return false 976 } 977 } 978 h := bytesHash(b) % uint64(len(s.slots)) 979 slot := s.slots[h] 980 if slot == nil || len(slot) == 0 { 981 return false 982 } 983 for _, element := range slot { 984 if bytes.Equal(element, b) { 985 return true 986 } 987 } 988 return false 989 }