gtsocial-umbx


sanitize.go (29145B)


      1 // Copyright (c) 2014, David Kitchen <david@buro9.com>
      2 //
      3 // All rights reserved.
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are met:
      7 //
      8 // * Redistributions of source code must retain the above copyright notice, this
      9 //   list of conditions and the following disclaimer.
     10 //
     11 // * Redistributions in binary form must reproduce the above copyright notice,
     12 //   this list of conditions and the following disclaimer in the documentation
     13 //   and/or other materials provided with the distribution.
     14 //
     15 // * Neither the name of the organisation (Microcosm) nor the names of its
     16 //   contributors may be used to endorse or promote products derived from
     17 //   this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     22 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
     23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     27 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 package bluemonday
     31 
     32 import (
     33 	"bytes"
     34 	"fmt"
     35 	"io"
     36 	"net/url"
     37 	"regexp"
     38 	"strconv"
     39 	"strings"
     40 
     41 	"golang.org/x/net/html"
     42 
     43 	"github.com/aymerick/douceur/parser"
     44 )
     45 
     46 var (
     47 	dataAttribute             = regexp.MustCompile("^data-.+")
     48 	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
     49 	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
     50 	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
     51 	dataURIbase64Prefix       = regexp.MustCompile(`^data:[^,]*;base64,`)
     52 )
     53 
      54 // Sanitize takes a string that contains an HTML fragment or document and applies
     55 // the given policy allowlist.
     56 //
      57 // It returns an HTML string that has been sanitized by the policy or an empty
     58 // string if an error has occurred (most likely as a consequence of extremely
     59 // malformed input)
     60 func (p *Policy) Sanitize(s string) string {
     61 	if strings.TrimSpace(s) == "" {
     62 		return s
     63 	}
     64 
     65 	return p.sanitizeWithBuff(strings.NewReader(s)).String()
     66 }
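
// A minimal usage sketch for Sanitize. Outside this package the constructor
// would be package-qualified (bluemonday.UGCPolicy()); the helper name below
// is hypothetical and the expected output is indicative only.
func exampleSanitizeUsage() string {
	p := UGCPolicy()
	// Expected, roughly:
	//   <a href="http://www.google.com" rel="nofollow">Google</a>
	return p.Sanitize(`<a onblur="alert(secret)" href="http://www.google.com">Google</a>`)
}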
     67 
      68 // SanitizeBytes takes a []byte that contains an HTML fragment or document and applies
     69 // the given policy allowlist.
     70 //
     71 // It returns a []byte containing the HTML that has been sanitized by the policy
     72 // or an empty []byte if an error has occurred (most likely as a consequence of
     73 // extremely malformed input)
     74 func (p *Policy) SanitizeBytes(b []byte) []byte {
     75 	if len(bytes.TrimSpace(b)) == 0 {
     76 		return b
     77 	}
     78 
     79 	return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
     80 }
     81 
      82 // SanitizeReader takes an io.Reader that contains an HTML fragment or document
     83 // and applies the given policy allowlist.
     84 //
     85 // It returns a bytes.Buffer containing the HTML that has been sanitized by the
     86 // policy. Errors during sanitization will merely return an empty result.
     87 func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
     88 	return p.sanitizeWithBuff(r)
     89 }
     90 
      91 // SanitizeReaderToWriter takes an io.Reader that contains an HTML fragment or document,
      92 // applies the given policy allowlist, and writes to the provided writer, returning
     93 // an error if there is one.
     94 func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
     95 	return p.sanitize(r, w)
     96 }
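
// A sketch of the streaming variant: sanitize from an io.Reader into an
// io.Writer without buffering the whole result first. The helper name is
// hypothetical; bytes and strings are already imported by this file.
func exampleSanitizeStream(p *Policy, dirty string) (string, error) {
	var out bytes.Buffer
	if err := p.SanitizeReaderToWriter(strings.NewReader(dirty), &out); err != nil {
		return "", err
	}
	return out.String(), nil
}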
     97 
     98 const escapedURLChars = "'<>\"\r"
     99 
    100 func escapeUrlComponent(w stringWriterWriter, val string) error {
    101 	i := strings.IndexAny(val, escapedURLChars)
    102 	for i != -1 {
    103 		if _, err := w.WriteString(val[:i]); err != nil {
    104 			return err
    105 		}
    106 		var esc string
    107 		switch val[i] {
    108 		case '\'':
    109 			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
    110 			esc = "&#39;"
    111 		case '<':
    112 			esc = "&lt;"
    113 		case '>':
    114 			esc = "&gt;"
    115 		case '"':
    116 			// "&#34;" is shorter than "&quot;".
    117 			esc = "&#34;"
    118 		case '\r':
    119 			esc = "&#13;"
    120 		default:
    121 			panic("unrecognized escape character")
    122 		}
    123 		val = val[i+1:]
    124 		if _, err := w.WriteString(esc); err != nil {
    125 			return err
    126 		}
    127 		i = strings.IndexAny(val, escapedURLChars)
    128 	}
    129 	_, err := w.WriteString(val)
    130 	return err
    131 }
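
// A sketch of what escapeUrlComponent produces. A *bytes.Buffer should satisfy
// the package's stringWriterWriter interface since it implements both Write
// and WriteString; the helper name is hypothetical.
func exampleEscapeURLComponent() string {
	var buf bytes.Buffer
	// Expected, roughly: it&#39;s &lt;here&gt; &#34;now&#34;
	_ = escapeUrlComponent(&buf, `it's <here> "now"`)
	return buf.String()
}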
    132 
    133 // Query represents a single part of the query string, a query param
    134 type Query struct {
    135 	Key      string
    136 	Value    string
    137 	HasValue bool
    138 }
    139 
    140 func parseQuery(query string) (values []Query, err error) {
     141 	// This is essentially a copy of parseQuery from
     142 	// https://golang.org/src/net/url/url.go, adjusted to build the values
     143 	// using our Query type so that the ordering of the query string is
     144 	// preserved
    145 	for query != "" {
    146 		key := query
    147 		if i := strings.IndexAny(key, "&;"); i >= 0 {
    148 			key, query = key[:i], key[i+1:]
    149 		} else {
    150 			query = ""
    151 		}
    152 		if key == "" {
    153 			continue
    154 		}
    155 		value := ""
    156 		hasValue := false
    157 		if i := strings.Index(key, "="); i >= 0 {
    158 			key, value = key[:i], key[i+1:]
    159 			hasValue = true
    160 		}
    161 		key, err1 := url.QueryUnescape(key)
    162 		if err1 != nil {
    163 			if err == nil {
    164 				err = err1
    165 			}
    166 			continue
    167 		}
    168 		value, err1 = url.QueryUnescape(value)
    169 		if err1 != nil {
    170 			if err == nil {
    171 				err = err1
    172 			}
    173 			continue
    174 		}
    175 		values = append(values, Query{
    176 			Key:      key,
    177 			Value:    value,
    178 			HasValue: hasValue,
    179 		})
    180 	}
    181 	return values, err
    182 }
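
// A sketch showing why parseQuery exists: unlike url.Values (a map), the
// returned slice preserves the order in which the parameters appeared. The
// helper name is hypothetical.
func exampleParseQueryOrder() []Query {
	// Yields, in order: {b 2 true}, {a 1 true}, {flag  false}
	qs, _ := parseQuery("b=2&a=1&flag")
	return qs
}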
    183 
    184 func encodeQueries(queries []Query) string {
    185 	var buff bytes.Buffer
    186 	for i, query := range queries {
    187 		buff.WriteString(url.QueryEscape(query.Key))
    188 		if query.HasValue {
    189 			buff.WriteString("=")
    190 			buff.WriteString(url.QueryEscape(query.Value))
    191 		}
    192 		if i < len(queries)-1 {
    193 			buff.WriteString("&")
    194 		}
    195 	}
    196 	return buff.String()
    197 }
    198 
    199 func sanitizedURL(val string) (string, error) {
    200 	u, err := url.Parse(val)
    201 	if err != nil {
    202 		return "", err
    203 	}
    204 
     205 	// We use parseQuery rather than u.Query() so that the parameter order is
     206 	// preserved; url.Values is a map and has no defined order.
    207 	queryValues, err := parseQuery(u.RawQuery)
    208 	if err != nil {
    209 		return "", err
    210 	}
    211 	// sanitize the url query params
    212 	for i, query := range queryValues {
    213 		queryValues[i].Key = html.EscapeString(query.Key)
    214 	}
    215 	u.RawQuery = encodeQueries(queryValues)
    216 	// u.String() will also sanitize host/scheme/user/pass
    217 	return u.String(), nil
    218 }
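
// A sketch of sanitizedURL: query keys are HTML-escaped, the query is
// re-encoded, and the original parameter order is kept. The helper name is
// hypothetical and the expected output is indicative only.
func exampleSanitizedURL() string {
	// Expected, roughly: http://example.org/?b=2&a=%3Cscript%3E
	u, _ := sanitizedURL("http://example.org/?b=2&a=<script>")
	return u
}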
    219 
    220 // Performs the actual sanitization process.
    221 func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
    222 	var buff bytes.Buffer
    223 	if err := p.sanitize(r, &buff); err != nil {
    224 		return &bytes.Buffer{}
    225 	}
    226 	return &buff
    227 }
    228 
    229 type asStringWriter struct {
    230 	io.Writer
    231 }
    232 
    233 func (a *asStringWriter) WriteString(s string) (int, error) {
    234 	return a.Write([]byte(s))
    235 }
    236 
    237 func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
    238 	// It is possible that the developer has created the policy via:
    239 	//   p := bluemonday.Policy{}
    240 	// rather than:
    241 	//   p := bluemonday.NewPolicy()
    242 	// If this is the case, and if they haven't yet triggered an action that
    243 	// would initialize the maps, then we need to do that.
    244 	p.init()
    245 
    246 	buff, ok := w.(stringWriterWriter)
    247 	if !ok {
    248 		buff = &asStringWriter{w}
    249 	}
    250 
    251 	var (
    252 		skipElementContent       bool
    253 		skippingElementsCount    int64
    254 		skipClosingTag           bool
    255 		closingTagToSkipStack    []string
    256 		mostRecentlyStartedToken string
    257 	)
    258 
    259 	tokenizer := html.NewTokenizer(r)
    260 	for {
    261 		if tokenizer.Next() == html.ErrorToken {
    262 			err := tokenizer.Err()
    263 			if err == io.EOF {
    264 				// End of input means end of processing
    265 				return nil
    266 			}
    267 
    268 			// Raw tokenizer error
    269 			return err
    270 		}
    271 
    272 		token := tokenizer.Token()
    273 		switch token.Type {
    274 		case html.DoctypeToken:
    275 
    276 			// DocType is not handled as there is no safe parsing mechanism
    277 			// provided by golang.org/x/net/html for the content, and this can
    278 			// be misused to insert HTML tags that are not then sanitized
    279 			//
    280 			// One might wish to recursively sanitize here using the same policy
    281 			// but I will need to do some further testing before considering
    282 			// this.
    283 
    284 		case html.CommentToken:
    285 
    286 			// Comments are ignored by default
    287 			if p.allowComments {
    288 				// But if allowed then write the comment out as-is
    289 				buff.WriteString(token.String())
    290 			}
    291 
    292 		case html.StartTagToken:
    293 
    294 			mostRecentlyStartedToken = normaliseElementName(token.Data)
    295 
    296 			switch normaliseElementName(token.Data) {
    297 			case `script`:
    298 				if !p.allowUnsafe {
    299 					continue
    300 				}
    301 			case `style`:
    302 				if !p.allowUnsafe {
    303 					continue
    304 				}
    305 			}
    306 
    307 			aps, ok := p.elsAndAttrs[token.Data]
    308 			if !ok {
    309 				aa, matched := p.matchRegex(token.Data)
    310 				if !matched {
    311 					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
    312 						skipElementContent = true
    313 						skippingElementsCount++
    314 					}
    315 					if p.addSpaces {
    316 						if _, err := buff.WriteString(" "); err != nil {
    317 							return err
    318 						}
    319 					}
    320 					break
    321 				}
    322 				aps = aa
    323 			}
    324 			if len(token.Attr) != 0 {
    325 				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
    326 			}
    327 
    328 			if len(token.Attr) == 0 {
    329 				if !p.allowNoAttrs(token.Data) {
    330 					skipClosingTag = true
    331 					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
    332 					if p.addSpaces {
    333 						if _, err := buff.WriteString(" "); err != nil {
    334 							return err
    335 						}
    336 					}
    337 					break
    338 				}
    339 			}
    340 
    341 			if !skipElementContent {
    342 				if _, err := buff.WriteString(token.String()); err != nil {
    343 					return err
    344 				}
    345 			}
    346 
    347 		case html.EndTagToken:
    348 
    349 			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
    350 				mostRecentlyStartedToken = ""
    351 			}
    352 
    353 			switch normaliseElementName(token.Data) {
    354 			case `script`:
    355 				if !p.allowUnsafe {
    356 					continue
    357 				}
    358 			case `style`:
    359 				if !p.allowUnsafe {
    360 					continue
    361 				}
    362 			}
    363 
    364 			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
    365 				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
    366 				if len(closingTagToSkipStack) == 0 {
    367 					skipClosingTag = false
    368 				}
    369 				if p.addSpaces {
    370 					if _, err := buff.WriteString(" "); err != nil {
    371 						return err
    372 					}
    373 				}
    374 				break
    375 			}
    376 			if _, ok := p.elsAndAttrs[token.Data]; !ok {
    377 				match := false
    378 				for regex := range p.elsMatchingAndAttrs {
    379 					if regex.MatchString(token.Data) {
    380 						skipElementContent = false
    381 						match = true
    382 						break
    383 					}
    384 				}
    385 				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
    386 					skippingElementsCount--
    387 					if skippingElementsCount == 0 {
    388 						skipElementContent = false
    389 					}
    390 				}
    391 				if !match {
    392 					if p.addSpaces {
    393 						if _, err := buff.WriteString(" "); err != nil {
    394 							return err
    395 						}
    396 					}
    397 					break
    398 				}
    399 			}
    400 
    401 			if !skipElementContent {
    402 				if _, err := buff.WriteString(token.String()); err != nil {
    403 					return err
    404 				}
    405 			}
    406 
    407 		case html.SelfClosingTagToken:
    408 
    409 			switch normaliseElementName(token.Data) {
    410 			case `script`:
    411 				if !p.allowUnsafe {
    412 					continue
    413 				}
    414 			case `style`:
    415 				if !p.allowUnsafe {
    416 					continue
    417 				}
    418 			}
    419 
    420 			aps, ok := p.elsAndAttrs[token.Data]
    421 			if !ok {
    422 				aa, matched := p.matchRegex(token.Data)
    423 				if !matched {
     424 					if p.addSpaces {
    425 						if _, err := buff.WriteString(" "); err != nil {
    426 							return err
    427 						}
    428 					}
    429 					break
    430 				}
    431 				aps = aa
    432 			}
    433 
    434 			if len(token.Attr) != 0 {
    435 				token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
    436 			}
    437 
    438 			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
    439 				if p.addSpaces {
    440 					if _, err := buff.WriteString(" "); err != nil {
    441 						return err
    442 					}
    443 				}
    444 				break
    445 			}
    446 			if !skipElementContent {
    447 				if _, err := buff.WriteString(token.String()); err != nil {
    448 					return err
    449 				}
    450 			}
    451 
    452 		case html.TextToken:
    453 
    454 			if !skipElementContent {
    455 				switch mostRecentlyStartedToken {
    456 				case `script`:
    457 					// not encouraged, but if a policy allows JavaScript we
    458 					// should not HTML escape it as that would break the output
    459 					//
    460 					// requires p.AllowUnsafe()
    461 					if p.allowUnsafe {
    462 						if _, err := buff.WriteString(token.Data); err != nil {
    463 							return err
    464 						}
    465 					}
    466 				case "style":
    467 					// not encouraged, but if a policy allows CSS styles we
    468 					// should not HTML escape it as that would break the output
    469 					//
    470 					// requires p.AllowUnsafe()
    471 					if p.allowUnsafe {
    472 						if _, err := buff.WriteString(token.Data); err != nil {
    473 							return err
    474 						}
    475 					}
    476 				default:
    477 					// HTML escape the text
    478 					if _, err := buff.WriteString(token.String()); err != nil {
    479 						return err
    480 					}
    481 				}
    482 			}
    483 
    484 		default:
    485 			// A token that didn't exist in the html package when we wrote this
    486 			return fmt.Errorf("unknown token: %v", token)
    487 		}
    488 	}
    489 }
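
// A sketch of the net effect of the tokenizer loop above under two stock
// policies from this package. The helper name is hypothetical and the
// expected outputs are indicative only.
func exampleTokenizerEffect() (string, string) {
	// StrictPolicy allows no elements: tags are dropped, text is escaped, and
	// the script body is skipped entirely. Roughly: Hello &amp; goodbye
	strict := StrictPolicy().Sanitize("<b>Hello</b> & goodbye<script>alert(1)</script>")

	// UGCPolicy keeps the allowed formatting element but still skips the
	// script content. Roughly: <b>Hello</b> &amp; goodbye
	ugc := UGCPolicy().Sanitize("<b>Hello</b> & goodbye<script>alert(1)</script>")
	return strict, ugc
}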
    490 
    491 // sanitizeAttrs takes a set of element attribute policies and the global
    492 // attribute policies and applies them to the []html.Attribute returning a set
    493 // of html.Attributes that match the policies
    494 func (p *Policy) sanitizeAttrs(
    495 	elementName string,
    496 	attrs []html.Attribute,
    497 	aps map[string][]attrPolicy,
    498 ) []html.Attribute {
    499 
    500 	if len(attrs) == 0 {
    501 		return attrs
    502 	}
    503 
    504 	hasStylePolicies := false
    505 	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
    506 	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
    507 		hasStylePolicies = true
    508 	}
    509 	// no specific element policy found, look for a pattern match
    510 	if !hasStylePolicies {
    511 		for k, v := range p.elsMatchingAndStyles {
    512 			if k.MatchString(elementName) {
    513 				if len(v) > 0 {
    514 					hasStylePolicies = true
    515 					break
    516 				}
    517 			}
    518 		}
    519 	}
    520 
     521 	// Builds a new attribute slice based on whether the attribute has been
    522 	// allowed explicitly or globally.
    523 	cleanAttrs := []html.Attribute{}
    524 attrsLoop:
    525 	for _, htmlAttr := range attrs {
    526 		if p.allowDataAttributes {
    527 			// If we see a data attribute, let it through.
    528 			if isDataAttribute(htmlAttr.Key) {
    529 				cleanAttrs = append(cleanAttrs, htmlAttr)
    530 				continue
    531 			}
    532 		}
    533 		// Is this a "style" attribute, and if so, do we need to sanitize it?
    534 		if htmlAttr.Key == "style" && hasStylePolicies {
    535 			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
    536 			if htmlAttr.Val == "" {
    537 				// We've sanitized away any and all styles; don't bother to
    538 				// output the style attribute (even if it's allowed)
    539 				continue
    540 			} else {
    541 				cleanAttrs = append(cleanAttrs, htmlAttr)
    542 				continue
    543 			}
    544 		}
    545 
    546 		// Is there an element specific attribute policy that applies?
    547 		if apl, ok := aps[htmlAttr.Key]; ok {
    548 			for _, ap := range apl {
    549 				if ap.regexp != nil {
    550 					if ap.regexp.MatchString(htmlAttr.Val) {
    551 						cleanAttrs = append(cleanAttrs, htmlAttr)
    552 						continue attrsLoop
    553 					}
    554 				} else {
    555 					cleanAttrs = append(cleanAttrs, htmlAttr)
    556 					continue attrsLoop
    557 				}
    558 			}
    559 		}
    560 
    561 		// Is there a global attribute policy that applies?
    562 		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
    563 			for _, ap := range apl {
    564 				if ap.regexp != nil {
    565 					if ap.regexp.MatchString(htmlAttr.Val) {
    566 						cleanAttrs = append(cleanAttrs, htmlAttr)
    567 					}
    568 				} else {
    569 					cleanAttrs = append(cleanAttrs, htmlAttr)
    570 				}
    571 			}
    572 		}
    573 	}
    574 
    575 	if len(cleanAttrs) == 0 {
    576 		// If nothing was allowed, let's get out of here
    577 		return cleanAttrs
    578 	}
    579 	// cleanAttrs now contains the attributes that are permitted
    580 
    581 	if linkable(elementName) {
    582 		if p.requireParseableURLs {
    583 			// Ensure URLs are parseable:
    584 			// - a.href
    585 			// - area.href
    586 			// - link.href
    587 			// - blockquote.cite
    588 			// - q.cite
    589 			// - img.src
    590 			// - script.src
    591 			tmpAttrs := []html.Attribute{}
    592 			for _, htmlAttr := range cleanAttrs {
    593 				switch elementName {
    594 				case "a", "area", "base", "link":
    595 					if htmlAttr.Key == "href" {
    596 						if u, ok := p.validURL(htmlAttr.Val); ok {
    597 							htmlAttr.Val = u
    598 							tmpAttrs = append(tmpAttrs, htmlAttr)
    599 						}
    600 						break
    601 					}
    602 					tmpAttrs = append(tmpAttrs, htmlAttr)
    603 				case "blockquote", "del", "ins", "q":
    604 					if htmlAttr.Key == "cite" {
    605 						if u, ok := p.validURL(htmlAttr.Val); ok {
    606 							htmlAttr.Val = u
    607 							tmpAttrs = append(tmpAttrs, htmlAttr)
    608 						}
    609 						break
    610 					}
    611 					tmpAttrs = append(tmpAttrs, htmlAttr)
    612 				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
    613 					if htmlAttr.Key == "src" {
    614 						if u, ok := p.validURL(htmlAttr.Val); ok {
    615 							htmlAttr.Val = u
    616 							tmpAttrs = append(tmpAttrs, htmlAttr)
    617 						}
    618 						break
    619 					}
    620 					tmpAttrs = append(tmpAttrs, htmlAttr)
    621 				default:
    622 					tmpAttrs = append(tmpAttrs, htmlAttr)
    623 				}
    624 			}
    625 			cleanAttrs = tmpAttrs
    626 		}
    627 
    628 		if (p.requireNoFollow ||
    629 			p.requireNoFollowFullyQualifiedLinks ||
    630 			p.requireNoReferrer ||
    631 			p.requireNoReferrerFullyQualifiedLinks ||
    632 			p.addTargetBlankToFullyQualifiedLinks) &&
    633 			len(cleanAttrs) > 0 {
    634 
    635 			// Add rel="nofollow" if a "href" exists
    636 			switch elementName {
    637 			case "a", "area", "base", "link":
    638 				var hrefFound bool
    639 				var externalLink bool
    640 				for _, htmlAttr := range cleanAttrs {
    641 					if htmlAttr.Key == "href" {
    642 						hrefFound = true
    643 
    644 						u, err := url.Parse(htmlAttr.Val)
    645 						if err != nil {
    646 							continue
    647 						}
    648 						if u.Host != "" {
    649 							externalLink = true
    650 						}
    651 
    652 						continue
    653 					}
    654 				}
    655 
    656 				if hrefFound {
    657 					var (
    658 						noFollowFound    bool
    659 						noReferrerFound  bool
    660 						targetBlankFound bool
    661 					)
    662 
    663 					addNoFollow := (p.requireNoFollow ||
    664 						externalLink && p.requireNoFollowFullyQualifiedLinks)
    665 
    666 					addNoReferrer := (p.requireNoReferrer ||
    667 						externalLink && p.requireNoReferrerFullyQualifiedLinks)
    668 
    669 					addTargetBlank := (externalLink &&
    670 						p.addTargetBlankToFullyQualifiedLinks)
    671 
    672 					tmpAttrs := []html.Attribute{}
    673 					for _, htmlAttr := range cleanAttrs {
    674 
    675 						var appended bool
    676 						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {
    677 
    678 							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
    679 								htmlAttr.Val += " nofollow"
    680 							}
    681 							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
    682 								htmlAttr.Val += " noreferrer"
    683 							}
    684 							noFollowFound = addNoFollow
    685 							noReferrerFound = addNoReferrer
    686 							tmpAttrs = append(tmpAttrs, htmlAttr)
    687 							appended = true
    688 						}
    689 
    690 						if elementName == "a" && htmlAttr.Key == "target" {
    691 							if htmlAttr.Val == "_blank" {
    692 								targetBlankFound = true
    693 							}
    694 							if addTargetBlank && !targetBlankFound {
    695 								htmlAttr.Val = "_blank"
    696 								targetBlankFound = true
    697 								tmpAttrs = append(tmpAttrs, htmlAttr)
    698 								appended = true
    699 							}
    700 						}
    701 
    702 						if !appended {
    703 							tmpAttrs = append(tmpAttrs, htmlAttr)
    704 						}
    705 					}
    706 					if noFollowFound || noReferrerFound || targetBlankFound {
    707 						cleanAttrs = tmpAttrs
    708 					}
    709 
    710 					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
    711 						rel := html.Attribute{}
    712 						rel.Key = "rel"
    713 						if addNoFollow {
    714 							rel.Val = "nofollow"
    715 						}
    716 						if addNoReferrer {
    717 							if rel.Val != "" {
    718 								rel.Val += " "
    719 							}
    720 							rel.Val += "noreferrer"
    721 						}
    722 						cleanAttrs = append(cleanAttrs, rel)
    723 					}
    724 
    725 					if elementName == "a" && addTargetBlank && !targetBlankFound {
    726 						rel := html.Attribute{}
    727 						rel.Key = "target"
    728 						rel.Val = "_blank"
    729 						targetBlankFound = true
    730 						cleanAttrs = append(cleanAttrs, rel)
    731 					}
    732 
    733 					if targetBlankFound {
    734 						// target="_blank" has a security risk that allows the
    735 						// opened window/tab to issue JavaScript calls against
    736 						// window.opener, which in effect allow the destination
    737 						// of the link to control the source:
    738 						// https://dev.to/ben/the-targetblank-vulnerability-by-example
    739 						//
    740 						// To mitigate this risk, we need to add a specific rel
    741 						// attribute if it is not already present.
    742 						// rel="noopener"
    743 						//
    744 						// Unfortunately this is processing the rel twice (we
    745 						// already looked at it earlier ^^) as we cannot be sure
    746 						// of the ordering of the href and rel, and whether we
    747 						// have fully satisfied that we need to do this. This
    748 						// double processing only happens *if* target="_blank"
    749 						// is true.
    750 						var noOpenerAdded bool
    751 						tmpAttrs := []html.Attribute{}
    752 						for _, htmlAttr := range cleanAttrs {
    753 							var appended bool
    754 							if htmlAttr.Key == "rel" {
    755 								if strings.Contains(htmlAttr.Val, "noopener") {
    756 									noOpenerAdded = true
    757 									tmpAttrs = append(tmpAttrs, htmlAttr)
    758 								} else {
    759 									htmlAttr.Val += " noopener"
    760 									noOpenerAdded = true
    761 									tmpAttrs = append(tmpAttrs, htmlAttr)
    762 								}
    763 
    764 								appended = true
    765 							}
    766 							if !appended {
    767 								tmpAttrs = append(tmpAttrs, htmlAttr)
    768 							}
    769 						}
    770 						if noOpenerAdded {
    771 							cleanAttrs = tmpAttrs
    772 						} else {
    773 							// rel attr was not found, or else noopener would
    774 							// have been added already
    775 							rel := html.Attribute{}
    776 							rel.Key = "rel"
    777 							rel.Val = "noopener"
    778 							cleanAttrs = append(cleanAttrs, rel)
    779 						}
    780 
    781 					}
    782 				}
    783 			default:
    784 			}
    785 		}
    786 	}
    787 
    788 	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
    789 		switch elementName {
    790 		case "audio", "img", "link", "script", "video":
    791 			var crossOriginFound bool
    792 			for _, htmlAttr := range cleanAttrs {
    793 				if htmlAttr.Key == "crossorigin" {
    794 					crossOriginFound = true
    795 					htmlAttr.Val = "anonymous"
    796 				}
    797 			}
    798 
    799 			if !crossOriginFound {
    800 				crossOrigin := html.Attribute{}
    801 				crossOrigin.Key = "crossorigin"
    802 				crossOrigin.Val = "anonymous"
    803 				cleanAttrs = append(cleanAttrs, crossOrigin)
    804 			}
    805 		}
    806 	}
    807 
    808 	if p.requireSandboxOnIFrame != nil && elementName == "iframe" {
    809 		var sandboxFound bool
    810 		for i, htmlAttr := range cleanAttrs {
    811 			if htmlAttr.Key == "sandbox" {
    812 				sandboxFound = true
    813 				var cleanVals []string
    814 				cleanValsSet := make(map[string]bool)
    815 				for _, val := range strings.Fields(htmlAttr.Val) {
    816 					if p.requireSandboxOnIFrame[val] {
    817 						if !cleanValsSet[val] {
    818 							cleanVals = append(cleanVals, val)
    819 							cleanValsSet[val] = true
    820 						}
    821 					}
    822 				}
    823 				cleanAttrs[i].Val = strings.Join(cleanVals, " ")
    824 			}
    825 		}
    826 
    827 		if !sandboxFound {
    828 			sandbox := html.Attribute{}
    829 			sandbox.Key = "sandbox"
    830 			sandbox.Val = ""
    831 			cleanAttrs = append(cleanAttrs, sandbox)
    832 		}
    833 	}
    834 
    835 	return cleanAttrs
    836 }
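
// A sketch of the link handling implemented above: with the relevant
// requirements switched on, rel and target attributes are injected or
// extended on qualifying anchors. The helper name is hypothetical and the
// expected output is indicative only.
func exampleLinkAttrPolicy() string {
	p := NewPolicy()
	p.AllowAttrs("href").OnElements("a")
	p.AllowStandardURLs()
	p.RequireNoFollowOnLinks(true)
	p.AddTargetBlankToFullyQualifiedLinks(true)
	// Expected, roughly:
	//   <a href="https://example.org/" rel="nofollow noopener" target="_blank">x</a>
	return p.Sanitize(`<a href="https://example.org/" onclick="evil()">x</a>`)
}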
    837 
    838 func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
    839 	sps := p.elsAndStyles[elementName]
    840 	if len(sps) == 0 {
    841 		sps = map[string][]stylePolicy{}
     842 	// Check for any matching elements if we don't already have a policy.
     843 	// If multiple matchers match, their policies are combined, so it is best
     844 	// not to have overlapping matchers.
    845 		for regex, policies := range p.elsMatchingAndStyles {
    846 			if regex.MatchString(elementName) {
    847 				for k, v := range policies {
    848 					sps[k] = append(sps[k], v...)
    849 				}
    850 			}
    851 		}
    852 	}
    853 
     854 	// Add a semi-colon to the end to fix a parsing issue
    855 	attr.Val = strings.TrimRight(attr.Val, " ")
    856 	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
    857 		attr.Val = attr.Val + ";"
    858 	}
    859 	decs, err := parser.ParseDeclarations(attr.Val)
    860 	if err != nil {
    861 		attr.Val = ""
    862 		return attr
    863 	}
    864 	clean := []string{}
    865 	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
    866 
    867 decLoop:
    868 	for _, dec := range decs {
    869 		tempProperty := strings.ToLower(dec.Property)
    870 		tempValue := removeUnicode(strings.ToLower(dec.Value))
    871 		for _, i := range prefixes {
    872 			tempProperty = strings.TrimPrefix(tempProperty, i)
    873 		}
    874 		if spl, ok := sps[tempProperty]; ok {
    875 			for _, sp := range spl {
    876 				if sp.handler != nil {
    877 					if sp.handler(tempValue) {
    878 						clean = append(clean, dec.Property+": "+dec.Value)
    879 						continue decLoop
    880 					}
    881 				} else if len(sp.enum) > 0 {
    882 					if stringInSlice(tempValue, sp.enum) {
    883 						clean = append(clean, dec.Property+": "+dec.Value)
    884 						continue decLoop
    885 					}
    886 				} else if sp.regexp != nil {
    887 					if sp.regexp.MatchString(tempValue) {
    888 						clean = append(clean, dec.Property+": "+dec.Value)
    889 						continue decLoop
    890 					}
    891 				}
    892 			}
    893 		}
    894 		if spl, ok := p.globalStyles[tempProperty]; ok {
    895 			for _, sp := range spl {
    896 				if sp.handler != nil {
    897 					if sp.handler(tempValue) {
    898 						clean = append(clean, dec.Property+": "+dec.Value)
    899 						continue decLoop
    900 					}
    901 				} else if len(sp.enum) > 0 {
    902 					if stringInSlice(tempValue, sp.enum) {
    903 						clean = append(clean, dec.Property+": "+dec.Value)
    904 						continue decLoop
    905 					}
    906 				} else if sp.regexp != nil {
    907 					if sp.regexp.MatchString(tempValue) {
    908 						clean = append(clean, dec.Property+": "+dec.Value)
    909 						continue decLoop
    910 					}
    911 				}
    912 			}
    913 		}
    914 	}
    915 	if len(clean) > 0 {
    916 		attr.Val = strings.Join(clean, "; ")
    917 	} else {
    918 		attr.Val = ""
    919 	}
    920 	return attr
    921 }
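
// A sketch of a style policy feeding sanitizeStyles: only declared properties
// survive, matched case-insensitively and with vendor prefixes stripped. The
// helper name is hypothetical and the expected output is indicative only.
func exampleStylePolicy() string {
	p := NewPolicy()
	p.AllowAttrs("style").OnElements("span")
	p.AllowStyles("color").MatchingEnum("red", "green", "blue").OnElements("span")
	// Expected, roughly: <span style="color: red">hi</span>
	return p.Sanitize(`<span style="color: red; position: fixed">hi</span>`)
}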
    922 
    923 func (p *Policy) allowNoAttrs(elementName string) bool {
    924 	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
    925 	if !ok {
    926 		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
    927 			if r.MatchString(elementName) {
    928 				ok = true
    929 				break
    930 			}
    931 		}
    932 	}
    933 	return ok
    934 }
    935 
    936 func (p *Policy) validURL(rawurl string) (string, bool) {
    937 	if p.requireParseableURLs {
     938 		// URLs are valid if they are still valid once surrounding space is trimmed
    939 		rawurl = strings.TrimSpace(rawurl)
    940 
    941 		// URLs cannot contain whitespace, unless it is a data-uri
    942 		if strings.Contains(rawurl, " ") ||
    943 			strings.Contains(rawurl, "\t") ||
    944 			strings.Contains(rawurl, "\n") {
    945 			if !strings.HasPrefix(rawurl, `data:`) {
    946 				return "", false
    947 			}
    948 
    949 			// Remove \r and \n from base64 encoded data to pass url.Parse.
    950 			matched := dataURIbase64Prefix.FindString(rawurl)
    951 			if matched != "" {
    952 				rawurl = matched + strings.Replace(
    953 					strings.Replace(
    954 						rawurl[len(matched):],
    955 						"\r",
    956 						"",
    957 						-1,
    958 					),
    959 					"\n",
    960 					"",
    961 					-1,
    962 				)
    963 			}
    964 		}
    965 
    966 		// URLs are valid if they parse
    967 		u, err := url.Parse(rawurl)
    968 		if err != nil {
    969 			return "", false
    970 		}
    971 
    972 		if u.Scheme != "" {
    973 			for _, r := range p.allowURLSchemeRegexps {
    974 				if r.MatchString(u.Scheme) {
    975 					return u.String(), true
    976 				}
    977 			}
    978 
    979 			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
    980 			if !ok {
    981 				return "", false
    982 			}
    983 
    984 			if len(urlPolicies) == 0 {
    985 				return u.String(), true
    986 			}
    987 
    988 			for _, urlPolicy := range urlPolicies {
     989 				if urlPolicy(u) {
    990 					return u.String(), true
    991 				}
    992 			}
    993 
    994 			return "", false
    995 		}
    996 
    997 		if p.allowRelativeURLs {
    998 			if u.String() != "" {
    999 				return u.String(), true
   1000 			}
   1001 		}
   1002 
   1003 		return "", false
   1004 	}
   1005 
   1006 	return rawurl, true
   1007 }
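
// A sketch of the URL policies that validURL consults: allowed schemes,
// per-scheme callbacks, and (optionally) relative URLs. The helper name is
// hypothetical.
func exampleURLPolicies() string {
	p := NewPolicy()
	p.AllowAttrs("href").OnElements("a")
	p.RequireParseableURLs(true)
	p.AllowURLSchemes("https")
	p.AllowURLSchemeWithCustomPolicy("mailto", func(u *url.URL) bool {
		return strings.HasSuffix(u.Opaque, "@example.org")
	})
	// The javascript: href is rejected and dropped; the https: href is kept.
	return p.Sanitize(`<a href="javascript:alert(1)">x</a> <a href="https://example.org/">y</a>`)
}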
   1008 
   1009 func linkable(elementName string) bool {
   1010 	switch elementName {
   1011 	case "a", "area", "base", "link":
   1012 		// elements that allow .href
   1013 		return true
   1014 	case "blockquote", "del", "ins", "q":
   1015 		// elements that allow .cite
   1016 		return true
   1017 	case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
   1018 		// elements that allow .src
   1019 		return true
   1020 	default:
   1021 		return false
   1022 	}
   1023 }
   1024 
   1025 // stringInSlice returns true if needle exists in haystack
   1026 func stringInSlice(needle string, haystack []string) bool {
   1027 	for _, straw := range haystack {
   1028 		if strings.ToLower(straw) == strings.ToLower(needle) {
   1029 			return true
   1030 		}
   1031 	}
   1032 	return false
   1033 }
   1034 
   1035 func isDataAttribute(val string) bool {
   1036 	if !dataAttribute.MatchString(val) {
   1037 		return false
   1038 	}
   1039 	rest := strings.Split(val, "data-")
   1040 	if len(rest) == 1 {
   1041 		return false
   1042 	}
   1043 	// data-xml* is invalid.
   1044 	if dataAttributeXMLPrefix.MatchString(rest[1]) {
   1045 		return false
   1046 	}
   1047 	// no uppercase or semi-colons allowed.
   1048 	if dataAttributeInvalidChars.MatchString(rest[1]) {
   1049 		return false
   1050 	}
   1051 	return true
   1052 }
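
// A sketch of which keys isDataAttribute accepts once AllowDataAttributes has
// been enabled on a policy. The helper name is hypothetical.
func exampleDataAttributes() []bool {
	return []bool{
		isDataAttribute("data-widget-id"), // true: lower case, no xml prefix
		isDataAttribute("data-xmlish"),    // false: data-xml* is rejected
		isDataAttribute("data-Value"),     // false: upper case is rejected
		isDataAttribute("id"),             // false: not a data-* attribute
	}
}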
   1053 
   1054 func removeUnicode(value string) string {
   1055 	substitutedValue := value
   1056 	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
   1057 	for currentLoc != nil {
   1058 
   1059 		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
   1060 		character = strings.TrimSpace(character)
   1061 		if len(character) < 4 {
   1062 			character = strings.Repeat("0", 4-len(character)) + character
   1063 		} else {
   1064 			for len(character) > 4 {
   1065 				if character[0] != '0' {
   1066 					character = ""
   1067 					break
   1068 				} else {
   1069 					character = character[1:]
   1070 				}
   1071 			}
   1072 		}
   1073 		character = "\\u" + character
   1074 		translatedChar, err := strconv.Unquote(`"` + character + `"`)
   1075 		translatedChar = strings.TrimSpace(translatedChar)
   1076 		if err != nil {
   1077 			return ""
   1078 		}
   1079 		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
   1080 		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
   1081 	}
   1082 	return substitutedValue
   1083 }
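
// A sketch of removeUnicode: CSS unicode escapes are resolved so that
// obfuscated values can still be checked against the style policies. The
// helper name is hypothetical.
func exampleRemoveUnicode() string {
	// Expected, roughly: "red"
	return removeUnicode(`\72 ed`)
}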
   1084 
   1085 func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
   1086 	aps := make(map[string][]attrPolicy, 0)
   1087 	matched := false
   1088 	for regex, attrs := range p.elsMatchingAndAttrs {
   1089 		if regex.MatchString(elementName) {
   1090 			matched = true
   1091 			for k, v := range attrs {
   1092 				aps[k] = append(aps[k], v...)
   1093 			}
   1094 		}
   1095 	}
   1096 	return aps, matched
   1097 }
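
// A sketch of the regex-based element policies that matchRegex serves, letting
// one rule cover a family of element names. The helper name is hypothetical
// and the expected output is indicative only.
func exampleRegexElementPolicy() string {
	p := NewPolicy()
	p.AllowElementsMatching(regexp.MustCompile(`^h[1-6]$`))
	// Expected, roughly: <h2>Title</h2>section
	return p.Sanitize(`<h2>Title</h2><section>section</section>`)
}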
   1098 
    1099 // normaliseElementName takes an HTML element name like <script>, which is user
    1100 // input, and returns a lower case version of it that is immune to UTF-8 to
    1101 // ASCII conversion tricks (like the use of the dotted capital İ in scrİpt,
    1102 // which strings.ToLower would convert to script). Instead this func preserves
    1103 // all non-ASCII characters as their escaped equivalent, i.e. \u0130, which
    1104 // reveals the character when lower cased
   1105 func normaliseElementName(str string) string {
    1106 	// QuoteToASCII helpfully puts quote marks at the start and end,
    1107 	// so those are trimmed off
   1108 	return strings.TrimSuffix(
   1109 		strings.TrimPrefix(
   1110 			strings.ToLower(
   1111 				strconv.QuoteToASCII(str),
   1112 			),
   1113 			`"`),
   1114 		`"`,
   1115 	)
   1116 }
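
// A sketch of normaliseElementName: plain ASCII names simply lower-case, while
// non-ASCII look-alikes keep an escaped marker and so can never match a case
// such as `script` above. The helper name is hypothetical.
func exampleNormaliseElementName() (string, string) {
	ascii := normaliseElementName("SCRIPT")  // "script"
	tricky := normaliseElementName("SCRİPT") // roughly: `scr\u0130pt`, not "script"
	return ascii, tricky
}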