// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package bluemonday

import (
	"bytes"
	"fmt"
	"io"
	"net/url"
	"regexp"
	"strconv"
	"strings"

	"golang.org/x/net/html"

	"github.com/aymerick/douceur/parser"
)

var (
	dataAttribute             = regexp.MustCompile("^data-.+")
	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
	dataURIbase64Prefix       = regexp.MustCompile(`^data:[^,]*;base64,`)
)

// Sanitize takes a string that contains an HTML fragment or document and
// applies the given policy allowlist.
//
// It returns an HTML string that has been sanitized by the policy, or an
// empty string if an error occurred (most likely as a consequence of
// extremely malformed input).
func (p *Policy) Sanitize(s string) string {
	if strings.TrimSpace(s) == "" {
		return s
	}

	return p.sanitizeWithBuff(strings.NewReader(s)).String()
}

// SanitizeBytes takes a []byte that contains an HTML fragment or document and
// applies the given policy allowlist.
//
// It returns a []byte containing the HTML that has been sanitized by the
// policy, or an empty []byte if an error occurred (most likely as a
// consequence of extremely malformed input).
func (p *Policy) SanitizeBytes(b []byte) []byte {
	if len(bytes.TrimSpace(b)) == 0 {
		return b
	}

	return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
}
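// A minimal usage sketch (UGCPolicy is one of this package's provided
// constructors; the input string is purely illustrative):
//
//	p := bluemonday.UGCPolicy()
//	out := p.Sanitize(`<a onblur="alert(1)" href="http://example.com/">link</a>`)
//	// expected: <a href="http://example.com/" rel="nofollow">link</a>
//	// (the onblur handler is stripped; UGCPolicy adds rel="nofollow")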
// SanitizeReader takes an io.Reader that contains an HTML fragment or document
// and applies the given policy allowlist.
//
// It returns a bytes.Buffer containing the HTML that has been sanitized by the
// policy. Errors during sanitization merely result in an empty buffer.
func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
	return p.sanitizeWithBuff(r)
}

// SanitizeReaderToWriter takes an io.Reader that contains an HTML fragment or
// document, applies the given policy allowlist, and writes the result to the
// provided io.Writer, returning an error if one occurs.
func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
	return p.sanitize(r, w)
}

const escapedURLChars = "'<>\"\r"

func escapeUrlComponent(w stringWriterWriter, val string) error {
	i := strings.IndexAny(val, escapedURLChars)
	for i != -1 {
		if _, err := w.WriteString(val[:i]); err != nil {
			return err
		}
		var esc string
		switch val[i] {
		case '\'':
			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
			esc = "&#39;"
		case '<':
			esc = "&lt;"
		case '>':
			esc = "&gt;"
		case '"':
			// "&#34;" is shorter than "&quot;".
			esc = "&#34;"
		case '\r':
			esc = "&#13;"
		default:
			panic("unrecognized escape character")
		}
		val = val[i+1:]
		if _, err := w.WriteString(esc); err != nil {
			return err
		}
		i = strings.IndexAny(val, escapedURLChars)
	}
	_, err := w.WriteString(val)
	return err
}

// Query represents a single part of the query string, a query param
type Query struct {
	Key      string
	Value    string
	HasValue bool
}

func parseQuery(query string) (values []Query, err error) {
	// This is essentially a copy of parseQuery from
	// https://golang.org/src/net/url/url.go but adjusted to build our values
	// based on our Query type, because we need to preserve the ordering of
	// the query string
	for query != "" {
		key := query
		if i := strings.IndexAny(key, "&;"); i >= 0 {
			key, query = key[:i], key[i+1:]
		} else {
			query = ""
		}
		if key == "" {
			continue
		}
		value := ""
		hasValue := false
		if i := strings.Index(key, "="); i >= 0 {
			key, value = key[:i], key[i+1:]
			hasValue = true
		}
		key, err1 := url.QueryUnescape(key)
		if err1 != nil {
			if err == nil {
				err = err1
			}
			continue
		}
		value, err1 = url.QueryUnescape(value)
		if err1 != nil {
			if err == nil {
				err = err1
			}
			continue
		}
		values = append(values, Query{
			Key:      key,
			Value:    value,
			HasValue: hasValue,
		})
	}
	return values, err
}

func encodeQueries(queries []Query) string {
	var buff bytes.Buffer
	for i, query := range queries {
		buff.WriteString(url.QueryEscape(query.Key))
		if query.HasValue {
			buff.WriteString("=")
			buff.WriteString(url.QueryEscape(query.Value))
		}
		if i < len(queries)-1 {
			buff.WriteString("&")
		}
	}
	return buff.String()
}

func sanitizedURL(val string) (string, error) {
	u, err := url.Parse(val)
	if err != nil {
		return "", err
	}

	// We use parseQuery rather than u.Query() to preserve the parameter
	// ordering: url.Values is a map, and map iteration order is randomized.
	queryValues, err := parseQuery(u.RawQuery)
	if err != nil {
		return "", err
	}
	// sanitize the url query params
	for i, query := range queryValues {
		queryValues[i].Key = html.EscapeString(query.Key)
	}
	u.RawQuery = encodeQueries(queryValues)
	// u.String() will also sanitize host/scheme/user/pass
	return u.String(), nil
}
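// For example (values are illustrative), parameter order survives a round
// trip, where url.Values.Encode() would have sorted the keys:
//
//	u, _ := sanitizedURL("/search?z=1&a=2")
//	// u == "/search?z=1&a=2"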
// Performs the actual sanitization process.
func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
	var buff bytes.Buffer
	if err := p.sanitize(r, &buff); err != nil {
		return &bytes.Buffer{}
	}
	return &buff
}

type asStringWriter struct {
	io.Writer
}

func (a *asStringWriter) WriteString(s string) (int, error) {
	return a.Write([]byte(s))
}

func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
	// It is possible that the developer has created the policy via:
	//   p := bluemonday.Policy{}
	// rather than:
	//   p := bluemonday.NewPolicy()
	// If this is the case, and if they haven't yet triggered an action that
	// would initialize the maps, then we need to do that.
	p.init()

	buff, ok := w.(stringWriterWriter)
	if !ok {
		buff = &asStringWriter{w}
	}

	var (
		skipElementContent       bool
		skippingElementsCount    int64
		skipClosingTag           bool
		closingTagToSkipStack    []string
		mostRecentlyStartedToken string
	)

	tokenizer := html.NewTokenizer(r)
	for {
		if tokenizer.Next() == html.ErrorToken {
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input means end of processing
				return nil
			}

			// Raw tokenizer error
			return err
		}

		token := tokenizer.Token()
		switch token.Type {
		case html.DoctypeToken:

			// DocType is not handled as there is no safe parsing mechanism
			// provided by golang.org/x/net/html for the content, and this can
			// be misused to insert HTML tags that are not then sanitized
			//
			// One might wish to recursively sanitize here using the same policy
			// but I will need to do some further testing before considering
			// this.

		case html.CommentToken:

			// Comments are ignored by default
			if p.allowComments {
				// But if allowed then write the comment out as-is
				buff.WriteString(token.String())
			}

		case html.StartTagToken:

			mostRecentlyStartedToken = normaliseElementName(token.Data)

			switch mostRecentlyStartedToken {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}
			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 {
				if !p.allowNoAttrs(token.Data) {
					skipClosingTag = true
					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.EndTagToken:

			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
				mostRecentlyStartedToken = ""
			}

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
				if len(closingTagToSkipStack) == 0 {
					skipClosingTag = false
				}
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if _, ok := p.elsAndAttrs[token.Data]; !ok {
				match := false
				for regex := range p.elsMatchingAndAttrs {
					if regex.MatchString(token.Data) {
						skipElementContent = false
						match = true
						break
					}
				}
				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
					skippingElementsCount--
					if skippingElementsCount == 0 {
						skipElementContent = false
					}
				}
				if !match {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
			}

			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.SelfClosingTagToken:

			switch normaliseElementName(token.Data) {
			case `script`:
				if !p.allowUnsafe {
					continue
				}
			case `style`:
				if !p.allowUnsafe {
					continue
				}
			}

			aps, ok := p.elsAndAttrs[token.Data]
			if !ok {
				aa, matched := p.matchRegex(token.Data)
				if !matched {
					if p.addSpaces {
						if _, err := buff.WriteString(" "); err != nil {
							return err
						}
					}
					break
				}
				aps = aa
			}

			if len(token.Attr) != 0 {
				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
			}

			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
				if p.addSpaces {
					if _, err := buff.WriteString(" "); err != nil {
						return err
					}
				}
				break
			}
			if !skipElementContent {
				if _, err := buff.WriteString(token.String()); err != nil {
					return err
				}
			}

		case html.TextToken:

			if !skipElementContent {
				switch mostRecentlyStartedToken {
				case `script`:
					// not encouraged, but if a policy allows JavaScript we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				case `style`:
					// not encouraged, but if a policy allows CSS styles we
					// should not HTML escape it as that would break the output
					//
					// requires p.AllowUnsafe()
					if p.allowUnsafe {
						if _, err := buff.WriteString(token.Data); err != nil {
							return err
						}
					}
				default:
					// HTML escape the text
					if _, err := buff.WriteString(token.String()); err != nil {
						return err
					}
				}
			}

		default:
			// A token that didn't exist in the html package when we wrote this
			return fmt.Errorf("unknown token: %v", token)
		}
	}
}
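// The tokenizer loop above also backs the streaming entry point; a sketch of
// driving it via SanitizeReaderToWriter (the file name is hypothetical):
//
//	f, err := os.Open("untrusted.html")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//	if err := p.SanitizeReaderToWriter(f, os.Stdout); err != nil {
//		log.Fatal(err)
//	}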
// sanitizeAttrs takes a set of element attribute policies and the global
// attribute policies and applies them to the []html.Attribute returning a set
// of html.Attributes that match the policies
func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string][]attrPolicy,
) []html.Attribute {

	if len(attrs) == 0 {
		return attrs
	}

	hasStylePolicies := false
	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
		hasStylePolicies = true
	}
	// no specific element policy found, look for a pattern match
	if !hasStylePolicies {
		for k, v := range p.elsMatchingAndStyles {
			if k.MatchString(elementName) {
				if len(v) > 0 {
					hasStylePolicies = true
					break
				}
			}
		}
	}

	// Build a new attribute slice based on whether the attribute has been
	// allowed explicitly or globally.
	cleanAttrs := []html.Attribute{}
attrsLoop:
	for _, htmlAttr := range attrs {
		if p.allowDataAttributes {
			// If we see a data attribute, let it through.
			if isDataAttribute(htmlAttr.Key) {
				cleanAttrs = append(cleanAttrs, htmlAttr)
				continue
			}
		}
		// Is this a "style" attribute, and if so, do we need to sanitize it?
		if htmlAttr.Key == "style" && hasStylePolicies {
			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
			if htmlAttr.Val == "" {
				// We've sanitized away any and all styles; don't bother to
				// output the style attribute (even if it's allowed)
				continue
			}
			cleanAttrs = append(cleanAttrs, htmlAttr)
			continue
		}

		// Is there an element specific attribute policy that applies?
		if apl, ok := aps[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
						continue attrsLoop
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
					continue attrsLoop
				}
			}
		}

		// Is there a global attribute policy that applies?
		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
			for _, ap := range apl {
				if ap.regexp != nil {
					if ap.regexp.MatchString(htmlAttr.Val) {
						cleanAttrs = append(cleanAttrs, htmlAttr)
					}
				} else {
					cleanAttrs = append(cleanAttrs, htmlAttr)
				}
			}
		}
	}

	if len(cleanAttrs) == 0 {
		// If nothing was allowed, let's get out of here
		return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
		if p.requireParseableURLs {
			// Ensure URLs are parseable:
			// - a.href
			// - area.href
			// - link.href
			// - blockquote.cite
			// - q.cite
			// - img.src
			// - script.src
			tmpAttrs := []html.Attribute{}
			for _, htmlAttr := range cleanAttrs {
				switch elementName {
				case "a", "area", "base", "link":
					if htmlAttr.Key == "href" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "blockquote", "del", "ins", "q":
					if htmlAttr.Key == "cite" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
					if htmlAttr.Key == "src" {
						if u, ok := p.validURL(htmlAttr.Val); ok {
							htmlAttr.Val = u
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
						break
					}
					tmpAttrs = append(tmpAttrs, htmlAttr)
				default:
					tmpAttrs = append(tmpAttrs, htmlAttr)
				}
			}
			cleanAttrs = tmpAttrs
		}

		if (p.requireNoFollow ||
			p.requireNoFollowFullyQualifiedLinks ||
			p.requireNoReferrer ||
			p.requireNoReferrerFullyQualifiedLinks ||
			p.addTargetBlankToFullyQualifiedLinks) &&
			len(cleanAttrs) > 0 {

			// Add rel="nofollow" if a "href" exists
			switch elementName {
			case "a", "area", "base", "link":
				var hrefFound bool
				var externalLink bool
				for _, htmlAttr := range cleanAttrs {
					if htmlAttr.Key == "href" {
						hrefFound = true

						u, err := url.Parse(htmlAttr.Val)
						if err != nil {
							continue
						}
						if u.Host != "" {
							externalLink = true
						}

						continue
					}
				}

				if hrefFound {
					var (
						noFollowFound    bool
						noReferrerFound  bool
						targetBlankFound bool
					)

					addNoFollow := (p.requireNoFollow ||
						externalLink && p.requireNoFollowFullyQualifiedLinks)

					addNoReferrer := (p.requireNoReferrer ||
						externalLink && p.requireNoReferrerFullyQualifiedLinks)

					addTargetBlank := (externalLink &&
						p.addTargetBlankToFullyQualifiedLinks)

					tmpAttrs := []html.Attribute{}
					for _, htmlAttr := range cleanAttrs {

						var appended bool
						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {

							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
								htmlAttr.Val += " nofollow"
							}
							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
								htmlAttr.Val += " noreferrer"
							}
							noFollowFound = addNoFollow
							noReferrerFound = addNoReferrer
							tmpAttrs = append(tmpAttrs, htmlAttr)
							appended = true
						}

						if elementName == "a" && htmlAttr.Key == "target" {
							if htmlAttr.Val == "_blank" {
								targetBlankFound = true
							}
							if addTargetBlank && !targetBlankFound {
								htmlAttr.Val = "_blank"
								targetBlankFound = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
						}

						if !appended {
							tmpAttrs = append(tmpAttrs, htmlAttr)
						}
					}
					if noFollowFound || noReferrerFound || targetBlankFound {
						cleanAttrs = tmpAttrs
					}

					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
						rel := html.Attribute{}
						rel.Key = "rel"
						if addNoFollow {
							rel.Val = "nofollow"
						}
						if addNoReferrer {
							if rel.Val != "" {
								rel.Val += " "
							}
							rel.Val += "noreferrer"
						}
						cleanAttrs = append(cleanAttrs, rel)
					}

					if elementName == "a" && addTargetBlank && !targetBlankFound {
						rel := html.Attribute{}
						rel.Key = "target"
						rel.Val = "_blank"
						targetBlankFound = true
						cleanAttrs = append(cleanAttrs, rel)
					}

					if targetBlankFound {
						// target="_blank" has a security risk that allows the
						// opened window/tab to issue JavaScript calls against
						// window.opener, which in effect allows the destination
						// of the link to control the source:
						// https://dev.to/ben/the-targetblank-vulnerability-by-example
						//
						// To mitigate this risk, we need to add a specific rel
						// attribute if it is not already present:
						// rel="noopener"
						//
						// Unfortunately this processes the rel twice (we
						// already looked at it earlier ^^) as we cannot be sure
						// of the ordering of the href and rel, and whether we
						// have fully satisfied that we need to do this. This
						// double processing only happens *if* target="_blank"
						// is true.
						var noOpenerAdded bool
						tmpAttrs := []html.Attribute{}
						for _, htmlAttr := range cleanAttrs {
							var appended bool
							if htmlAttr.Key == "rel" {
								// Append noopener if it is not already present
								if !strings.Contains(htmlAttr.Val, "noopener") {
									htmlAttr.Val += " noopener"
								}
								noOpenerAdded = true
								tmpAttrs = append(tmpAttrs, htmlAttr)
								appended = true
							}
							if !appended {
								tmpAttrs = append(tmpAttrs, htmlAttr)
							}
						}
						if noOpenerAdded {
							cleanAttrs = tmpAttrs
						} else {
							// rel attr was not found, or else noopener would
							// have been added already
							rel := html.Attribute{}
							rel.Key = "rel"
							rel.Val = "noopener"
							cleanAttrs = append(cleanAttrs, rel)
						}

					}
				}
			default:
			}
		}
	}

	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
		switch elementName {
		case "audio", "img", "link", "script", "video":
			var crossOriginFound bool
			for i, htmlAttr := range cleanAttrs {
				if htmlAttr.Key == "crossorigin" {
					crossOriginFound = true
					// assign through the index so the slice is actually
					// updated (a range copy would be discarded)
					cleanAttrs[i].Val = "anonymous"
				}
			}

			if !crossOriginFound {
				crossOrigin := html.Attribute{}
				crossOrigin.Key = "crossorigin"
				crossOrigin.Val = "anonymous"
				cleanAttrs = append(cleanAttrs, crossOrigin)
			}
		}
	}

	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
		var sandboxFound bool
		for i, htmlAttr := range cleanAttrs {
			if htmlAttr.Key == "sandbox" {
				sandboxFound = true
				var cleanVals []string
				cleanValsSet := make(map[string]bool)
				for _, val := range strings.Fields(htmlAttr.Val) {
					if p.requireSandboxOnIFrame[val] {
						if !cleanValsSet[val] {
							cleanVals = append(cleanVals, val)
							cleanValsSet[val] = true
						}
					}
				}
				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
			}
		}

		if !sandboxFound {
			sandbox := html.Attribute{}
			sandbox.Key = "sandbox"
			sandbox.Val = ""
			cleanAttrs = append(cleanAttrs, sandbox)
		}
	}

	return cleanAttrs
}
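// As an illustration of the rel/target handling above (the policy methods
// shown are this package's public API; input and output are illustrative):
//
//	p := bluemonday.NewPolicy()
//	p.AllowAttrs("href").OnElements("a")
//	p.AllowURLSchemes("https")
//	p.RequireNoFollowOnLinks(true)
//	p.AddTargetBlankToFullyQualifiedLinks(true)
//	out := p.Sanitize(`<a href="https://example.org/">x</a>`)
//	// out carries rel="nofollow noopener" and target="_blank"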
func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
	sps := p.elsAndStyles[elementName]
	if len(sps) == 0 {
		sps = map[string][]stylePolicy{}
		// Check for any matching elements, if we don't already have a policy
		// found. If multiple matches are found they will be overwritten; it's
		// best not to have overlapping matchers.
		for regex, policies := range p.elsMatchingAndStyles {
			if regex.MatchString(elementName) {
				for k, v := range policies {
					sps[k] = append(sps[k], v...)
				}
			}
		}
	}

	// Add a trailing semi-colon to fix a parsing issue
	attr.Val = strings.TrimRight(attr.Val, " ")
	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
		attr.Val = attr.Val + ";"
	}
	decs, err := parser.ParseDeclarations(attr.Val)
	if err != nil {
		attr.Val = ""
		return attr
	}
	clean := []string{}
	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}

decLoop:
	for _, dec := range decs {
		tempProperty := strings.ToLower(dec.Property)
		tempValue := removeUnicode(strings.ToLower(dec.Value))
		for _, i := range prefixes {
			tempProperty = strings.TrimPrefix(tempProperty, i)
		}
		if spl, ok := sps[tempProperty]; ok {
			for _, sp := range spl {
				if sp.handler != nil {
					if sp.handler(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if len(sp.enum) > 0 {
					if stringInSlice(tempValue, sp.enum) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if sp.regexp != nil {
					if sp.regexp.MatchString(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				}
			}
		}
		if spl, ok := p.globalStyles[tempProperty]; ok {
			for _, sp := range spl {
				if sp.handler != nil {
					if sp.handler(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if len(sp.enum) > 0 {
					if stringInSlice(tempValue, sp.enum) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				} else if sp.regexp != nil {
					if sp.regexp.MatchString(tempValue) {
						clean = append(clean, dec.Property+": "+dec.Value)
						continue decLoop
					}
				}
			}
		}
	}
	if len(clean) > 0 {
		attr.Val = strings.Join(clean, "; ")
	} else {
		attr.Val = ""
	}
	return attr
}

func (p *Policy) allowNoAttrs(elementName string) bool {
	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
	if !ok {
		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
			if r.MatchString(elementName) {
				ok = true
				break
			}
		}
	}
	return ok
}

func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
		// URLs are valid if, when space is trimmed, the URL is valid
		rawurl = strings.TrimSpace(rawurl)

		// URLs cannot contain whitespace, unless it is a data-uri
		if strings.Contains(rawurl, " ") ||
			strings.Contains(rawurl, "\t") ||
			strings.Contains(rawurl, "\n") {
			if !strings.HasPrefix(rawurl, `data:`) {
				return "", false
			}

			// Remove \r and \n from base64 encoded data to pass url.Parse.
			matched := dataURIbase64Prefix.FindString(rawurl)
			if matched != "" {
				rawurl = matched + strings.NewReplacer(
					"\r", "",
					"\n", "",
				).Replace(rawurl[len(matched):])
			}
		}

		// URLs are valid if they parse
		u, err := url.Parse(rawurl)
		if err != nil {
			return "", false
		}

		if u.Scheme != "" {
			for _, r := range p.allowURLSchemeRegexps {
				if r.MatchString(u.Scheme) {
					return u.String(), true
				}
			}

			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
			if !ok {
				return "", false
			}

			if len(urlPolicies) == 0 {
				return u.String(), true
			}

			for _, urlPolicy := range urlPolicies {
				if urlPolicy(u) {
					return u.String(), true
				}
			}

			return "", false
		}

		if p.allowRelativeURLs {
			if u.String() != "" {
				return u.String(), true
			}
		}

		return "", false
	}

	return rawurl, true
}
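// For example, with a policy that only allows the https scheme
// (AllowURLSchemes is this package's public API; values are illustrative):
//
//	p := bluemonday.NewPolicy()
//	p.AllowURLSchemes("https")
//	p.validURL("https://example.com/") // "https://example.com/", true
//	p.validURL("javascript:alert(1)")  // "", false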
func linkable(elementName string) bool {
	switch elementName {
	case "a", "area", "base", "link":
		// elements that allow .href
		return true
	case "blockquote", "del", "ins", "q":
		// elements that allow .cite
		return true
	case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
		// elements that allow .src
		return true
	default:
		return false
	}
}

// stringInSlice returns true if needle exists in haystack
func stringInSlice(needle string, haystack []string) bool {
	for _, straw := range haystack {
		if strings.EqualFold(straw, needle) {
			return true
		}
	}
	return false
}

func isDataAttribute(val string) bool {
	if !dataAttribute.MatchString(val) {
		return false
	}
	rest := strings.Split(val, "data-")
	if len(rest) == 1 {
		return false
	}
	// data-xml* is invalid.
	if dataAttributeXMLPrefix.MatchString(rest[1]) {
		return false
	}
	// no uppercase or semi-colons allowed.
	if dataAttributeInvalidChars.MatchString(rest[1]) {
		return false
	}
	return true
}

func removeUnicode(value string) string {
	substitutedValue := value
	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
	for currentLoc != nil {

		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
		character = strings.TrimSpace(character)
		if len(character) < 4 {
			character = strings.Repeat("0", 4-len(character)) + character
		} else {
			for len(character) > 4 {
				if character[0] != '0' {
					character = ""
					break
				}
				character = character[1:]
			}
		}
		character = "\\u" + character
		translatedChar, err := strconv.Unquote(`"` + character + `"`)
		translatedChar = strings.TrimSpace(translatedChar)
		if err != nil {
			return ""
		}
		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
	}
	return substitutedValue
}
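// For example (the caller has already lower-cased the value):
//
//	removeUnicode(`\75 rl(javascript:alert(1))`)
//	// returns "url(javascript:alert(1))": the CSS unicode escape `\75 `
//	// is decoded to "u" so the style policies match the real value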
func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
	aps := make(map[string][]attrPolicy)
	matched := false
	for regex, attrs := range p.elsMatchingAndAttrs {
		if regex.MatchString(elementName) {
			matched = true
			for k, v := range attrs {
				aps[k] = append(aps[k], v...)
			}
		}
	}
	return aps, matched
}

// normaliseElementName takes an HTML element name like <script> that is user
// input and returns a lower case version of it that is immune to UTF-8 to
// ASCII conversion tricks (like the use of the Turkish dotted capital I in
// scrİpt, which strings.ToLower would convert to script). Instead this func
// preserves all non-ASCII runes as their escaped equivalent, i.e. \u0130,
// which reveals such characters when lower cased.
func normaliseElementName(str string) string {
	// QuoteToASCII helpfully puts quote marks at the start and end,
	// so those are trimmed off
	return strings.TrimSuffix(
		strings.TrimPrefix(
			strings.ToLower(
				strconv.QuoteToASCII(str),
			),
			`"`),
		`"`,
	)
}
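// For example:
//
//	normaliseElementName("scrİpt") // `scr\u0130pt`, safely distinct from "script"
//	normaliseElementName("SCRIPT") // "script"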