gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

html.go (17552B)


      1 // Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html.
      2 package html
      3 
      4 import (
      5 	"bytes"
      6 	"io"
      7 
      8 	"github.com/tdewolff/minify/v2"
      9 	"github.com/tdewolff/parse/v2"
     10 	"github.com/tdewolff/parse/v2/buffer"
     11 	"github.com/tdewolff/parse/v2/html"
     12 )
     13 
     14 var (
     15 	gtBytes         = []byte(">")
     16 	isBytes         = []byte("=")
     17 	spaceBytes      = []byte(" ")
     18 	doctypeBytes    = []byte("<!doctype html>")
     19 	jsMimeBytes     = []byte("application/javascript")
     20 	cssMimeBytes    = []byte("text/css")
     21 	htmlMimeBytes   = []byte("text/html")
     22 	svgMimeBytes    = []byte("image/svg+xml")
     23 	formMimeBytes   = []byte("application/x-www-form-urlencoded")
     24 	mathMimeBytes   = []byte("application/mathml+xml")
     25 	dataSchemeBytes = []byte("data:")
     26 	jsSchemeBytes   = []byte("javascript:")
     27 	httpBytes       = []byte("http")
     28 	radioBytes      = []byte("radio")
     29 	onBytes         = []byte("on")
     30 	textBytes       = []byte("text")
     31 	noneBytes       = []byte("none")
     32 	submitBytes     = []byte("submit")
     33 	allBytes        = []byte("all")
     34 	rectBytes       = []byte("rect")
     35 	dataBytes       = []byte("data")
     36 	getBytes        = []byte("get")
     37 	autoBytes       = []byte("auto")
     38 	oneBytes        = []byte("one")
     39 	inlineParams    = map[string]string{"inline": "1"}
     40 )
     41 
     42 ////////////////////////////////////////////////////////////////
     43 
// Minifier is an HTML minifier. Each field switches off one of the
// reductions performed by Minify; the zero value applies all of them.
type Minifier struct {
	// KeepComments writes every comment token unchanged. When false,
	// comments are dropped except for conditional comments (see
	// KeepConditionalComments) and comments whose text starts with '#'
	// (treated as SSI tags).
	KeepComments            bool
	// KeepConditionalComments preserves "[if ...]" / "[endif]" comments;
	// the content of downlevel-hidden ones is itself minified.
	KeepConditionalComments bool
	// KeepDefaultAttrVals keeps attributes whose value equals the default
	// (e.g. type=text on input, method=get) instead of omitting them.
	KeepDefaultAttrVals     bool
	// KeepDocumentTags keeps attribute-less html, head and body tags, which
	// are otherwise removed.
	KeepDocumentTags        bool
	// KeepEndTags keeps end tags that would otherwise be omitted (such as
	// tr, td, li, option and, depending on what follows, p and optgroup).
	KeepEndTags             bool
	// KeepQuotes is forwarded to html.EscapeAttrVal when writing attribute
	// values; presumably it preserves quotes around values (see that
	// function for the exact semantics).
	KeepQuotes              bool
	// KeepWhitespace disables most trimming of whitespace that is adjacent
	// to start and end tags.
	KeepWhitespace          bool
}
     54 
     55 // Minify minifies HTML data, it reads from r and writes to w.
     56 func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error {
     57 	return (&Minifier{}).Minify(m, w, r, params)
     58 }
     59 
     60 // Minify minifies HTML data, it reads from r and writes to w.
     61 func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error {
     62 	var rawTagHash Hash
     63 	var rawTagMediatype []byte
     64 
     65 	omitSpace := true // if true the next leading space is omitted
     66 	inPre := false
     67 
     68 	attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64))
     69 	attrByteBuffer := make([]byte, 0, 64)
     70 
     71 	z := parse.NewInput(r)
     72 	defer z.Restore()
     73 
     74 	l := html.NewLexer(z)
     75 	tb := NewTokenBuffer(z, l)
     76 	for {
     77 		t := *tb.Shift()
     78 		switch t.TokenType {
     79 		case html.ErrorToken:
     80 			if _, err := w.Write(nil); err != nil {
     81 				return err
     82 			}
     83 			if l.Err() == io.EOF {
     84 				return nil
     85 			}
     86 			return l.Err()
     87 		case html.DoctypeToken:
     88 			w.Write(doctypeBytes)
     89 		case html.CommentToken:
     90 			if o.KeepComments {
     91 				w.Write(t.Data)
     92 			} else if o.KeepConditionalComments && 6 < len(t.Text) && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.HasSuffix(t.Text, []byte("[endif]")) || bytes.HasSuffix(t.Text, []byte("[endif]--"))) {
     93 				// [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed
     94 				// see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax
     95 				if bytes.HasPrefix(t.Data, []byte("<!--[if ")) && bytes.HasSuffix(t.Data, []byte("<![endif]-->")) { // downlevel-hidden
     96 					begin := bytes.IndexByte(t.Data, '>') + 1
     97 					end := len(t.Data) - len("<![endif]-->")
     98 					w.Write(t.Data[:begin])
     99 					if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil {
    100 						return minify.UpdateErrorPosition(err, z, t.Offset)
    101 					}
    102 					w.Write(t.Data[end:])
    103 				} else {
    104 					w.Write(t.Data) // downlevel-revealed or short downlevel-hidden
    105 				}
    106 			} else if 1 < len(t.Text) && t.Text[0] == '#' {
    107 				// SSI tags
    108 				w.Write(t.Data)
    109 			}
    110 		case html.SvgToken:
    111 			if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
    112 				if err != minify.ErrNotExist {
    113 					return minify.UpdateErrorPosition(err, z, t.Offset)
    114 				}
    115 				w.Write(t.Data)
    116 			}
    117 		case html.MathToken:
    118 			if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
    119 				if err != minify.ErrNotExist {
    120 					return minify.UpdateErrorPosition(err, z, t.Offset)
    121 				}
    122 				w.Write(t.Data)
    123 			}
    124 		case html.TextToken:
    125 			// CSS and JS minifiers for inline code
    126 			if rawTagHash != 0 {
    127 				if rawTagHash == Style || rawTagHash == Script || rawTagHash == Iframe {
    128 					var mimetype []byte
    129 					var params map[string]string
    130 					if rawTagHash == Iframe {
    131 						mimetype = htmlMimeBytes
    132 					} else if len(rawTagMediatype) > 0 {
    133 						mimetype, params = parse.Mediatype(rawTagMediatype)
    134 					} else if rawTagHash == Script {
    135 						mimetype = jsMimeBytes
    136 					} else if rawTagHash == Style {
    137 						mimetype = cssMimeBytes
    138 					}
    139 					if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil {
    140 						if err != minify.ErrNotExist {
    141 							return minify.UpdateErrorPosition(err, z, t.Offset)
    142 						}
    143 						w.Write(t.Data)
    144 					}
    145 				} else {
    146 					w.Write(t.Data)
    147 				}
    148 			} else if inPre {
    149 				w.Write(t.Data)
    150 			} else {
    151 				t.Data = parse.ReplaceMultipleWhitespaceAndEntities(t.Data, EntitiesMap, TextRevEntitiesMap)
    152 
    153 				// whitespace removal; trim left
    154 				if omitSpace && parse.IsWhitespace(t.Data[0]) {
    155 					t.Data = t.Data[1:]
    156 				}
    157 
    158 				// whitespace removal; trim right
    159 				omitSpace = false
    160 				if len(t.Data) == 0 {
    161 					omitSpace = true
    162 				} else if parse.IsWhitespace(t.Data[len(t.Data)-1]) {
    163 					omitSpace = true
    164 					i := 0
    165 					for {
    166 						next := tb.Peek(i)
    167 						// trim if EOF, text token with leading whitespace or block token
    168 						if next.TokenType == html.ErrorToken {
    169 							t.Data = t.Data[:len(t.Data)-1]
    170 							omitSpace = false
    171 							break
    172 						} else if next.TokenType == html.TextToken {
    173 							// this only happens when a comment, doctype or phrasing end tag (only for !o.KeepWhitespace) was in between
    174 							// remove if the text token starts with a whitespace
    175 							if len(next.Data) > 0 && parse.IsWhitespace(next.Data[0]) {
    176 								t.Data = t.Data[:len(t.Data)-1]
    177 								omitSpace = false
    178 							}
    179 							break
    180 						} else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken {
    181 							if o.KeepWhitespace {
    182 								break
    183 							}
    184 							// remove when followed up by a block tag
    185 							if next.Traits&nonPhrasingTag != 0 {
    186 								t.Data = t.Data[:len(t.Data)-1]
    187 								omitSpace = false
    188 								break
    189 							} else if next.TokenType == html.StartTagToken {
    190 								break
    191 							}
    192 						}
    193 						i++
    194 					}
    195 				}
    196 
    197 				w.Write(t.Data)
    198 			}
    199 		case html.StartTagToken, html.EndTagToken:
    200 			rawTagHash = 0
    201 			hasAttributes := false
    202 			if t.TokenType == html.StartTagToken {
    203 				if next := tb.Peek(0); next.TokenType == html.AttributeToken {
    204 					hasAttributes = true
    205 				}
    206 				if t.Traits&rawTag != 0 {
    207 					// ignore empty script and style tags
    208 					if !hasAttributes && (t.Hash == Script || t.Hash == Style) {
    209 						if next := tb.Peek(1); next.TokenType == html.EndTagToken {
    210 							tb.Shift()
    211 							tb.Shift()
    212 							break
    213 						}
    214 					}
    215 					rawTagHash = t.Hash
    216 					rawTagMediatype = nil
    217 
    218 					// do not minify content of <style amp-boilerplate>
    219 					if hasAttributes && t.Hash == Style {
    220 						if attrs := tb.Attributes(Amp_Boilerplate); attrs[0] != nil {
    221 							rawTagHash = 0
    222 						}
    223 					}
    224 				}
    225 			} else if t.Hash == Template {
    226 				omitSpace = true // EndTagToken
    227 			}
    228 
    229 			if t.Hash == Pre {
    230 				inPre = t.TokenType == html.StartTagToken
    231 			}
    232 
    233 			// remove superfluous tags, except for html, head and body tags when KeepDocumentTags is set
    234 			if !hasAttributes && (!o.KeepDocumentTags && (t.Hash == Html || t.Hash == Head || t.Hash == Body) || t.Hash == Colgroup) {
    235 				break
    236 			} else if t.TokenType == html.EndTagToken {
    237 				omitEndTag := false
    238 				if !o.KeepEndTags {
    239 					if t.Hash == Thead || t.Hash == Tbody || t.Hash == Tfoot || t.Hash == Tr || t.Hash == Th ||
    240 						t.Hash == Td || t.Hash == Option || t.Hash == Dd || t.Hash == Dt || t.Hash == Li ||
    241 						t.Hash == Rb || t.Hash == Rt || t.Hash == Rtc || t.Hash == Rp {
    242 						omitEndTag = true // omit end tags
    243 					} else if t.Hash == P {
    244 						i := 0
    245 						for {
    246 							next := tb.Peek(i)
    247 							i++
    248 							// continue if text token is empty or whitespace
    249 							if next.TokenType == html.TextToken && parse.IsAllWhitespace(next.Data) {
    250 								continue
    251 							}
    252 							if next.TokenType == html.ErrorToken || next.TokenType == html.EndTagToken && next.Traits&keepPTag == 0 || next.TokenType == html.StartTagToken && next.Traits&omitPTag != 0 {
    253 								omitEndTag = true // omit p end tag
    254 							}
    255 							break
    256 						}
    257 					} else if t.Hash == Optgroup {
    258 						i := 0
    259 						for {
    260 							next := tb.Peek(i)
    261 							i++
    262 							// continue if text token
    263 							if next.TokenType == html.TextToken {
    264 								continue
    265 							}
    266 							if next.TokenType == html.ErrorToken || next.Hash != Option {
    267 								omitEndTag = true // omit optgroup end tag
    268 							}
    269 							break
    270 						}
    271 					}
    272 				}
    273 
    274 				if t.Traits&nonPhrasingTag != 0 {
    275 					omitSpace = true // omit spaces after block elements
    276 				} else if o.KeepWhitespace || t.Traits&objectTag != 0 {
    277 					omitSpace = false
    278 				}
    279 
    280 				if !omitEndTag {
    281 					if len(t.Data) > 3+len(t.Text) {
    282 						t.Data[2+len(t.Text)] = '>'
    283 						t.Data = t.Data[:3+len(t.Text)]
    284 					}
    285 					w.Write(t.Data)
    286 				}
    287 
    288 				// skip text in select and optgroup tags
    289 				if t.Hash == Option || t.Hash == Optgroup {
    290 					if next := tb.Peek(0); next.TokenType == html.TextToken {
    291 						tb.Shift()
    292 					}
    293 				}
    294 				break
    295 			}
    296 
    297 			if o.KeepWhitespace || t.Traits&objectTag != 0 {
    298 				omitSpace = false
    299 			} else if t.Traits&nonPhrasingTag != 0 {
    300 				omitSpace = true // omit spaces after block elements
    301 			}
    302 
    303 			w.Write(t.Data)
    304 
    305 			if hasAttributes {
    306 				if t.Hash == Meta {
    307 					attrs := tb.Attributes(Content, Http_Equiv, Charset, Name)
    308 					if content := attrs[0]; content != nil {
    309 						if httpEquiv := attrs[1]; httpEquiv != nil {
    310 							httpEquiv.AttrVal = parse.TrimWhitespace(httpEquiv.AttrVal)
    311 							if charset := attrs[2]; charset == nil && parse.EqualFold(httpEquiv.AttrVal, []byte("content-type")) {
    312 								content.AttrVal = minify.Mediatype(content.AttrVal)
    313 								if bytes.Equal(content.AttrVal, []byte("text/html;charset=utf-8")) {
    314 									httpEquiv.Text = nil
    315 									content.Text = []byte("charset")
    316 									content.Hash = Charset
    317 									content.AttrVal = []byte("utf-8")
    318 								}
    319 							}
    320 						}
    321 						if name := attrs[3]; name != nil {
    322 							name.AttrVal = parse.TrimWhitespace(name.AttrVal)
    323 							if parse.EqualFold(name.AttrVal, []byte("keywords")) {
    324 								content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(", "), []byte(","))
    325 							} else if parse.EqualFold(name.AttrVal, []byte("viewport")) {
    326 								content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(" "), []byte(""))
    327 								for i := 0; i < len(content.AttrVal); i++ {
    328 									if content.AttrVal[i] == '=' && i+2 < len(content.AttrVal) {
    329 										i++
    330 										if n := parse.Number(content.AttrVal[i:]); n > 0 {
    331 											minNum := minify.Number(content.AttrVal[i:i+n], -1)
    332 											if len(minNum) < n {
    333 												copy(content.AttrVal[i:i+len(minNum)], minNum)
    334 												copy(content.AttrVal[i+len(minNum):], content.AttrVal[i+n:])
    335 												content.AttrVal = content.AttrVal[:len(content.AttrVal)+len(minNum)-n]
    336 											}
    337 											i += len(minNum)
    338 										}
    339 										i-- // mitigate for-loop increase
    340 									}
    341 								}
    342 							}
    343 						}
    344 					}
    345 				} else if t.Hash == Script {
    346 					attrs := tb.Attributes(Src, Charset)
    347 					if attrs[0] != nil && attrs[1] != nil {
    348 						attrs[1].Text = nil
    349 					}
    350 				} else if t.Hash == Input {
    351 					attrs := tb.Attributes(Type, Value)
    352 					if t, value := attrs[0], attrs[1]; t != nil && value != nil {
    353 						isRadio := parse.EqualFold(t.AttrVal, radioBytes)
    354 						if !isRadio && len(value.AttrVal) == 0 {
    355 							value.Text = nil
    356 						} else if isRadio && parse.EqualFold(value.AttrVal, onBytes) {
    357 							value.Text = nil
    358 						}
    359 					}
    360 				} else if t.Hash == A {
    361 					attrs := tb.Attributes(Id, Name)
    362 					if id, name := attrs[0], attrs[1]; id != nil && name != nil {
    363 						if bytes.Equal(id.AttrVal, name.AttrVal) {
    364 							name.Text = nil
    365 						}
    366 					}
    367 				}
    368 
    369 				// write attributes
    370 				for {
    371 					attr := *tb.Shift()
    372 					if attr.TokenType != html.AttributeToken {
    373 						break
    374 					} else if attr.Text == nil {
    375 						continue // removed attribute
    376 					}
    377 
    378 					val := attr.AttrVal
    379 					if attr.Traits&trimAttr != 0 {
    380 						val = parse.ReplaceMultipleWhitespaceAndEntities(val, EntitiesMap, nil)
    381 						val = parse.TrimWhitespace(val)
    382 					} else {
    383 						val = parse.ReplaceEntities(val, EntitiesMap, nil)
    384 					}
    385 					if t.Traits != 0 {
    386 						if len(val) == 0 && (attr.Hash == Class ||
    387 							attr.Hash == Dir ||
    388 							attr.Hash == Id ||
    389 							attr.Hash == Name ||
    390 							attr.Hash == Action && t.Hash == Form) {
    391 							continue // omit empty attribute values
    392 						}
    393 						if attr.Traits&caselessAttr != 0 {
    394 							val = parse.ToLower(val)
    395 						}
    396 						if rawTagHash != 0 && attr.Hash == Type {
    397 							rawTagMediatype = parse.Copy(val)
    398 						}
    399 
    400 						if attr.Hash == Enctype || attr.Hash == Codetype || attr.Hash == Accept || attr.Hash == Type && (t.Hash == A || t.Hash == Link || t.Hash == Embed || t.Hash == Object || t.Hash == Source || t.Hash == Script || t.Hash == Style) {
    401 							val = minify.Mediatype(val)
    402 						}
    403 
    404 						// default attribute values can be omitted
    405 						if !o.KeepDefaultAttrVals && (attr.Hash == Type && (t.Hash == Script && jsMimetypes[string(val)] ||
    406 							t.Hash == Style && bytes.Equal(val, cssMimeBytes) ||
    407 							t.Hash == Link && bytes.Equal(val, cssMimeBytes) ||
    408 							t.Hash == Input && bytes.Equal(val, textBytes) ||
    409 							t.Hash == Button && bytes.Equal(val, submitBytes)) ||
    410 							attr.Hash == Language && t.Hash == Script ||
    411 							attr.Hash == Method && bytes.Equal(val, getBytes) ||
    412 							attr.Hash == Enctype && bytes.Equal(val, formMimeBytes) ||
    413 							attr.Hash == Colspan && bytes.Equal(val, oneBytes) ||
    414 							attr.Hash == Rowspan && bytes.Equal(val, oneBytes) ||
    415 							attr.Hash == Shape && bytes.Equal(val, rectBytes) ||
    416 							attr.Hash == Span && bytes.Equal(val, oneBytes) ||
    417 							attr.Hash == Clear && bytes.Equal(val, noneBytes) ||
    418 							attr.Hash == Frameborder && bytes.Equal(val, oneBytes) ||
    419 							attr.Hash == Scrolling && bytes.Equal(val, autoBytes) ||
    420 							attr.Hash == Valuetype && bytes.Equal(val, dataBytes) ||
    421 							attr.Hash == Media && t.Hash == Style && bytes.Equal(val, allBytes)) {
    422 							continue
    423 						}
    424 
    425 						if attr.Hash == Style {
    426 							// CSS minifier for attribute inline code
    427 							val = parse.TrimWhitespace(val)
    428 							attrMinifyBuffer.Reset()
    429 							if err := m.MinifyMimetype(cssMimeBytes, attrMinifyBuffer, buffer.NewReader(val), inlineParams); err == nil {
    430 								val = attrMinifyBuffer.Bytes()
    431 							} else if err != minify.ErrNotExist {
    432 								return minify.UpdateErrorPosition(err, z, attr.Offset)
    433 							}
    434 							if len(val) == 0 {
    435 								continue
    436 							}
    437 						} else if len(attr.Text) > 2 && attr.Text[0] == 'o' && attr.Text[1] == 'n' {
    438 							// JS minifier for attribute inline code
    439 							val = parse.TrimWhitespace(val)
    440 							if len(val) >= 11 && parse.EqualFold(val[:11], jsSchemeBytes) {
    441 								val = val[11:]
    442 							}
    443 							attrMinifyBuffer.Reset()
    444 							if err := m.MinifyMimetype(jsMimeBytes, attrMinifyBuffer, buffer.NewReader(val), nil); err == nil {
    445 								val = attrMinifyBuffer.Bytes()
    446 							} else if err != minify.ErrNotExist {
    447 								return minify.UpdateErrorPosition(err, z, attr.Offset)
    448 							}
    449 							if len(val) == 0 {
    450 								continue
    451 							}
    452 						} else if attr.Traits&urlAttr != 0 { // anchors are already handled
    453 							val = parse.TrimWhitespace(val)
    454 							if 5 < len(val) {
    455 								if parse.EqualFold(val[:4], httpBytes) {
    456 									if val[4] == ':' {
    457 										if m.URL != nil && m.URL.Scheme == "http" {
    458 											val = val[5:]
    459 										} else {
    460 											parse.ToLower(val[:4])
    461 										}
    462 									} else if (val[4] == 's' || val[4] == 'S') && val[5] == ':' {
    463 										if m.URL != nil && m.URL.Scheme == "https" {
    464 											val = val[6:]
    465 										} else {
    466 											parse.ToLower(val[:5])
    467 										}
    468 									}
    469 								} else if parse.EqualFold(val[:5], dataSchemeBytes) {
    470 									val = minify.DataURI(m, val)
    471 								}
    472 							}
    473 						}
    474 					}
    475 
    476 					w.Write(spaceBytes)
    477 					w.Write(attr.Text)
    478 					if len(val) > 0 && attr.Traits&booleanAttr == 0 {
    479 						w.Write(isBytes)
    480 
    481 						// use double quotes for RDFa attributes
    482 						isXML := attr.Hash == Vocab || attr.Hash == Typeof || attr.Hash == Property || attr.Hash == Resource || attr.Hash == Prefix || attr.Hash == Content || attr.Hash == About || attr.Hash == Rev || attr.Hash == Datatype || attr.Hash == Inlist
    483 
    484 						// no quotes if possible, else prefer single or double depending on which occurs more often in value
    485 						var quote byte
    486 
    487 						if 0 < len(attr.Data) && (attr.Data[len(attr.Data)-1] == '\'' || attr.Data[len(attr.Data)-1] == '"') {
    488 							quote = attr.Data[len(attr.Data)-1]
    489 						}
    490 						val = html.EscapeAttrVal(&attrByteBuffer, val, quote, o.KeepQuotes, isXML)
    491 						w.Write(val)
    492 					}
    493 				}
    494 			} else {
    495 				_ = tb.Shift() // StartTagClose
    496 			}
    497 			w.Write(gtBytes)
    498 
    499 			// skip text in select and optgroup tags
    500 			if t.Hash == Select || t.Hash == Optgroup {
    501 				if next := tb.Peek(0); next.TokenType == html.TextToken {
    502 					tb.Shift()
    503 				}
    504 			}
    505 
    506 			// keep space after phrasing tags (<i>, <span>, ...) FontAwesome etc.
    507 			if t.TokenType == html.StartTagToken && t.Traits&nonPhrasingTag == 0 {
    508 				if next := tb.Peek(0); next.Hash == t.Hash && next.TokenType == html.EndTagToken {
    509 					omitSpace = false
    510 				}
    511 			}
    512 		}
    513 	}
    514 }