escape.go - gtsocial-umbx - Unnamed repository; edit this file 'description' to name the repository.

escape.go (8411B)
      1 // Copyright 2010 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package html
      6 
      7 import (
      8 	"bytes"
      9 	"strings"
     10 	"unicode/utf8"
     11 )
     12 
     13 // These replacements permit compatibility with old numeric entities that
     14 // assumed Windows-1252 encoding.
     15 // https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
     16 var replacementTable = [...]rune{
     17 	'\u20AC', // First entry is what 0x80 should be replaced with.
     18 	'\u0081',
     19 	'\u201A',
     20 	'\u0192',
     21 	'\u201E',
     22 	'\u2026',
     23 	'\u2020',
     24 	'\u2021',
     25 	'\u02C6',
     26 	'\u2030',
     27 	'\u0160',
     28 	'\u2039',
     29 	'\u0152',
     30 	'\u008D',
     31 	'\u017D',
     32 	'\u008F',
     33 	'\u0090',
     34 	'\u2018',
     35 	'\u2019',
     36 	'\u201C',
     37 	'\u201D',
     38 	'\u2022',
     39 	'\u2013',
     40 	'\u2014',
     41 	'\u02DC',
     42 	'\u2122',
     43 	'\u0161',
     44 	'\u203A',
     45 	'\u0153',
     46 	'\u009D',
     47 	'\u017E',
     48 	'\u0178', // Last entry is 0x9F.
     49 	// 0x00->'\uFFFD' is handled programmatically.
     50 	// 0x0D->'\u000D' is a no-op.
     51 }
     52 
     53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
     54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
     55 // Precondition: b[src] == '&' && dst <= src.
     56 // attribute should be true if parsing an attribute value.
     57 func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
     58 	// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
     59 
     60 	// i starts at 1 because we already know that s[0] == '&'.
     61 	i, s := 1, b[src:]
     62 
     63 	if len(s) <= 1 {
     64 		b[dst] = b[src]
     65 		return dst + 1, src + 1
     66 	}
     67 
     68 	if s[i] == '#' {
     69 		if len(s) <= 3 { // We need to have at least "&#.".
     70 			b[dst] = b[src]
     71 			return dst + 1, src + 1
     72 		}
     73 		i++
     74 		c := s[i]
     75 		hex := false
     76 		if c == 'x' || c == 'X' {
     77 			hex = true
     78 			i++
     79 		}
     80 
     81 		x := '\x00'
     82 		for i < len(s) {
     83 			c = s[i]
     84 			i++
     85 			if hex {
     86 				if '0' <= c && c <= '9' {
     87 					x = 16*x + rune(c) - '0'
     88 					continue
     89 				} else if 'a' <= c && c <= 'f' {
     90 					x = 16*x + rune(c) - 'a' + 10
     91 					continue
     92 				} else if 'A' <= c && c <= 'F' {
     93 					x = 16*x + rune(c) - 'A' + 10
     94 					continue
     95 				}
     96 			} else if '0' <= c && c <= '9' {
     97 				x = 10*x + rune(c) - '0'
     98 				continue
     99 			}
    100 			if c != ';' {
    101 				i--
    102 			}
    103 			break
    104 		}
    105 
    106 		if i <= 3 { // No characters matched.
    107 			b[dst] = b[src]
    108 			return dst + 1, src + 1
    109 		}
    110 
    111 		if 0x80 <= x && x <= 0x9F {
    112 			// Replace characters from Windows-1252 with UTF-8 equivalents.
    113 			x = replacementTable[x-0x80]
    114 		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
    115 			// Replace invalid characters with the replacement character.
    116 			x = '\uFFFD'
    117 		}
    118 
    119 		return dst + utf8.EncodeRune(b[dst:], x), src + i
    120 	}
    121 
    122 	// Consume the maximum number of characters possible, with the
    123 	// consumed characters matching one of the named references.
    124 
    125 	for i < len(s) {
    126 		c := s[i]
    127 		i++
    128 		// Lower-cased characters are more common in entities, so we check for them first.
    129 		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
    130 			continue
    131 		}
    132 		if c != ';' {
    133 			i--
    134 		}
    135 		break
    136 	}
    137 
    138 	entityName := string(s[1:i])
    139 	if entityName == "" {
    140 		// No-op.
    141 	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
    142 		// No-op.
    143 	} else if x := entity[entityName]; x != 0 {
    144 		return dst + utf8.EncodeRune(b[dst:], x), src + i
    145 	} else if x := entity2[entityName]; x[0] != 0 {
    146 		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
    147 		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
    148 	} else if !attribute {
    149 		maxLen := len(entityName) - 1
    150 		if maxLen > longestEntityWithoutSemicolon {
    151 			maxLen = longestEntityWithoutSemicolon
    152 		}
    153 		for j := maxLen; j > 1; j-- {
    154 			if x := entity[entityName[:j]]; x != 0 {
    155 				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
    156 			}
    157 		}
    158 	}
    159 
    160 	dst1, src1 = dst+i, src+i
    161 	copy(b[dst:dst1], b[src:src1])
    162 	return dst1, src1
    163 }
    164 
    165 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
    166 // attribute should be true if parsing an attribute value.
    167 func unescape(b []byte, attribute bool) []byte {
    168 	for i, c := range b {
    169 		if c == '&' {
    170 			dst, src := unescapeEntity(b, i, i, attribute)
    171 			for src < len(b) {
    172 				c := b[src]
    173 				if c == '&' {
    174 					dst, src = unescapeEntity(b, dst, src, attribute)
    175 				} else {
    176 					b[dst] = c
    177 					dst, src = dst+1, src+1
    178 				}
    179 			}
    180 			return b[0:dst]
    181 		}
    182 	}
    183 	return b
    184 }
    185 
    186 // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
    187 func lower(b []byte) []byte {
    188 	for i, c := range b {
    189 		if 'A' <= c && c <= 'Z' {
    190 			b[i] = c + 'a' - 'A'
    191 		}
    192 	}
    193 	return b
    194 }
    195 
    196 // escapeComment is like func escape but escapes its input bytes less often.
    197 // Per https://github.com/golang/go/issues/58246 some HTML comments are (1)
    198 // meaningful and (2) contain angle brackets that we'd like to avoid escaping
    199 // unless we have to.
    200 //
    201 // "We have to" includes the '&' byte, since that introduces other escapes.
    202 //
    203 // It also includes those bytes (not including EOF) that would otherwise end
    204 // the comment. Per the summary table at the bottom of comment_test.go, this is
    205 // the '>' byte that, per above, we'd like to avoid escaping unless we have to.
    206 //
    207 // Studying the summary table (and T actions in its '>' column) closely, we
    208 // only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the
    209 // start of the comment data. State 52 is after a '!'. The other three states
    210 // are after a '-'.
    211 //
    212 // Our algorithm is thus to escape every '&' and to escape '>' if and only if:
    213 //   - The '>' is after a '!' or '-' (in the unescaped data) or
    214 //   - The '>' is at the start of the comment data (after the opening "<!--").
    215 func escapeComment(w writer, s string) error {
    216 	// When modifying this function, consider manually increasing the
    217 	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
    218 	// That increase should only be temporary, not committed, as it
    219 	// exponentially affects the test running time.
    220 
    221 	if len(s) == 0 {
    222 		return nil
    223 	}
    224 
    225 	// Loop:
    226 	//   - Grow j such that s[i:j] does not need escaping.
    227 	//   - If s[j] does need escaping, output s[i:j] and an escaped s[j],
    228 	//     resetting i and j to point past that s[j] byte.
    229 	i := 0
    230 	for j := 0; j < len(s); j++ {
    231 		escaped := ""
    232 		switch s[j] {
    233 		case '&':
    234 			escaped = "&amp;"
    235 
    236 		case '>':
    237 			if j > 0 {
    238 				if prev := s[j-1]; (prev != '!') && (prev != '-') {
    239 					continue
    240 				}
    241 			}
    242 			escaped = "&gt;"
    243 
    244 		default:
    245 			continue
    246 		}
    247 
    248 		if i < j {
    249 			if _, err := w.WriteString(s[i:j]); err != nil {
    250 				return err
    251 			}
    252 		}
    253 		if _, err := w.WriteString(escaped); err != nil {
    254 			return err
    255 		}
    256 		i = j + 1
    257 	}
    258 
    259 	if i < len(s) {
    260 		if _, err := w.WriteString(s[i:]); err != nil {
    261 			return err
    262 		}
    263 	}
    264 	return nil
    265 }
    266 
    267 // escapeCommentString is to EscapeString as escapeComment is to escape.
    268 func escapeCommentString(s string) string {
    269 	if strings.IndexAny(s, "&>") == -1 {
    270 		return s
    271 	}
    272 	var buf bytes.Buffer
    273 	escapeComment(&buf, s)
    274 	return buf.String()
    275 }
    276 
    277 const escapedChars = "&'<>\"\r"
    278 
    279 func escape(w writer, s string) error {
    280 	i := strings.IndexAny(s, escapedChars)
    281 	for i != -1 {
    282 		if _, err := w.WriteString(s[:i]); err != nil {
    283 			return err
    284 		}
    285 		var esc string
    286 		switch s[i] {
    287 		case '&':
    288 			esc = "&amp;"
    289 		case '\'':
    290 			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
    291 			esc = "&#39;"
    292 		case '<':
    293 			esc = "&lt;"
    294 		case '>':
    295 			esc = "&gt;"
    296 		case '"':
    297 			// "&#34;" is shorter than "&quot;".
    298 			esc = "&#34;"
    299 		case '\r':
    300 			esc = "&#13;"
    301 		default:
    302 			panic("unrecognized escape character")
    303 		}
    304 		s = s[i+1:]
    305 		if _, err := w.WriteString(esc); err != nil {
    306 			return err
    307 		}
    308 		i = strings.IndexAny(s, escapedChars)
    309 	}
    310 	_, err := w.WriteString(s)
    311 	return err
    312 }
    313 
    314 // EscapeString escapes special characters like "<" to become "&lt;". It
    315 // escapes only five such characters: <, >, &, ' and ".
    316 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
    317 // always true.
    318 func EscapeString(s string) string {
    319 	if strings.IndexAny(s, escapedChars) == -1 {
    320 		return s
    321 	}
    322 	var buf bytes.Buffer
    323 	escape(&buf, s)
    324 	return buf.String()
    325 }
    326 
    327 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
    328 // larger range of entities than EscapeString escapes. For example, "&aacute;"
    329 // unescapes to "á", as does "&#225;" and "&xE1;".
    330 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
    331 // always true.
    332 func UnescapeString(s string) string {
    333 	for _, c := range s {
    334 		if c == '&' {
    335 			return string(unescape([]byte(s), false))
    336 		}
    337 	}
    338 	return s
    339 }
	gtsocial-umbx Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE