html.go (17552B)
1 // Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html. 2 package html 3 4 import ( 5 "bytes" 6 "io" 7 8 "github.com/tdewolff/minify/v2" 9 "github.com/tdewolff/parse/v2" 10 "github.com/tdewolff/parse/v2/buffer" 11 "github.com/tdewolff/parse/v2/html" 12 ) 13 14 var ( 15 gtBytes = []byte(">") 16 isBytes = []byte("=") 17 spaceBytes = []byte(" ") 18 doctypeBytes = []byte("<!doctype html>") 19 jsMimeBytes = []byte("application/javascript") 20 cssMimeBytes = []byte("text/css") 21 htmlMimeBytes = []byte("text/html") 22 svgMimeBytes = []byte("image/svg+xml") 23 formMimeBytes = []byte("application/x-www-form-urlencoded") 24 mathMimeBytes = []byte("application/mathml+xml") 25 dataSchemeBytes = []byte("data:") 26 jsSchemeBytes = []byte("javascript:") 27 httpBytes = []byte("http") 28 radioBytes = []byte("radio") 29 onBytes = []byte("on") 30 textBytes = []byte("text") 31 noneBytes = []byte("none") 32 submitBytes = []byte("submit") 33 allBytes = []byte("all") 34 rectBytes = []byte("rect") 35 dataBytes = []byte("data") 36 getBytes = []byte("get") 37 autoBytes = []byte("auto") 38 oneBytes = []byte("one") 39 inlineParams = map[string]string{"inline": "1"} 40 ) 41 42 //////////////////////////////////////////////////////////////// 43 44 // Minifier is an HTML minifier. 45 type Minifier struct { 46 KeepComments bool 47 KeepConditionalComments bool 48 KeepDefaultAttrVals bool 49 KeepDocumentTags bool 50 KeepEndTags bool 51 KeepQuotes bool 52 KeepWhitespace bool 53 } 54 55 // Minify minifies HTML data, it reads from r and writes to w. 56 func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error { 57 return (&Minifier{}).Minify(m, w, r, params) 58 } 59 60 // Minify minifies HTML data, it reads from r and writes to w. 61 func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error { 62 var rawTagHash Hash 63 var rawTagMediatype []byte 64 65 omitSpace := true // if true the next leading space is omitted 66 inPre := false 67 68 attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64)) 69 attrByteBuffer := make([]byte, 0, 64) 70 71 z := parse.NewInput(r) 72 defer z.Restore() 73 74 l := html.NewLexer(z) 75 tb := NewTokenBuffer(z, l) 76 for { 77 t := *tb.Shift() 78 switch t.TokenType { 79 case html.ErrorToken: 80 if _, err := w.Write(nil); err != nil { 81 return err 82 } 83 if l.Err() == io.EOF { 84 return nil 85 } 86 return l.Err() 87 case html.DoctypeToken: 88 w.Write(doctypeBytes) 89 case html.CommentToken: 90 if o.KeepComments { 91 w.Write(t.Data) 92 } else if o.KeepConditionalComments && 6 < len(t.Text) && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.HasSuffix(t.Text, []byte("[endif]")) || bytes.HasSuffix(t.Text, []byte("[endif]--"))) { 93 // [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed 94 // see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax 95 if bytes.HasPrefix(t.Data, []byte("<!--[if ")) && bytes.HasSuffix(t.Data, []byte("<![endif]-->")) { // downlevel-hidden 96 begin := bytes.IndexByte(t.Data, '>') + 1 97 end := len(t.Data) - len("<![endif]-->") 98 w.Write(t.Data[:begin]) 99 if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil { 100 return minify.UpdateErrorPosition(err, z, t.Offset) 101 } 102 w.Write(t.Data[end:]) 103 } else { 104 w.Write(t.Data) // downlevel-revealed or short downlevel-hidden 105 } 106 } else if 1 < len(t.Text) && t.Text[0] == '#' { 107 // SSI tags 108 w.Write(t.Data) 109 } 110 case html.SvgToken: 111 if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil { 112 if err != minify.ErrNotExist { 113 return minify.UpdateErrorPosition(err, z, t.Offset) 114 } 115 w.Write(t.Data) 116 } 117 case html.MathToken: 118 if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil { 119 if err != minify.ErrNotExist { 120 return minify.UpdateErrorPosition(err, z, t.Offset) 121 } 122 w.Write(t.Data) 123 } 124 case html.TextToken: 125 // CSS and JS minifiers for inline code 126 if rawTagHash != 0 { 127 if rawTagHash == Style || rawTagHash == Script || rawTagHash == Iframe { 128 var mimetype []byte 129 var params map[string]string 130 if rawTagHash == Iframe { 131 mimetype = htmlMimeBytes 132 } else if len(rawTagMediatype) > 0 { 133 mimetype, params = parse.Mediatype(rawTagMediatype) 134 } else if rawTagHash == Script { 135 mimetype = jsMimeBytes 136 } else if rawTagHash == Style { 137 mimetype = cssMimeBytes 138 } 139 if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil { 140 if err != minify.ErrNotExist { 141 return minify.UpdateErrorPosition(err, z, t.Offset) 142 } 143 w.Write(t.Data) 144 } 145 } else { 146 w.Write(t.Data) 147 } 148 } else if inPre { 149 w.Write(t.Data) 150 } else { 151 t.Data = parse.ReplaceMultipleWhitespaceAndEntities(t.Data, EntitiesMap, TextRevEntitiesMap) 152 153 // whitespace removal; trim left 154 if omitSpace && parse.IsWhitespace(t.Data[0]) { 155 t.Data = t.Data[1:] 156 } 157 158 // whitespace removal; trim right 159 omitSpace = false 160 if len(t.Data) == 0 { 161 omitSpace = true 162 } else if parse.IsWhitespace(t.Data[len(t.Data)-1]) { 163 omitSpace = true 164 i := 0 165 for { 166 next := tb.Peek(i) 167 // trim if EOF, text token with leading whitespace or block token 168 if next.TokenType == html.ErrorToken { 169 t.Data = t.Data[:len(t.Data)-1] 170 omitSpace = false 171 break 172 } else if next.TokenType == html.TextToken { 173 // this only happens when a comment, doctype or phrasing end tag (only for !o.KeepWhitespace) was in between 174 // remove if the text token starts with a whitespace 175 if len(next.Data) > 0 && parse.IsWhitespace(next.Data[0]) { 176 t.Data = t.Data[:len(t.Data)-1] 177 omitSpace = false 178 } 179 break 180 } else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken { 181 if o.KeepWhitespace { 182 break 183 } 184 // remove when followed up by a block tag 185 if next.Traits&nonPhrasingTag != 0 { 186 t.Data = t.Data[:len(t.Data)-1] 187 omitSpace = false 188 break 189 } else if next.TokenType == html.StartTagToken { 190 break 191 } 192 } 193 i++ 194 } 195 } 196 197 w.Write(t.Data) 198 } 199 case html.StartTagToken, html.EndTagToken: 200 rawTagHash = 0 201 hasAttributes := false 202 if t.TokenType == html.StartTagToken { 203 if next := tb.Peek(0); next.TokenType == html.AttributeToken { 204 hasAttributes = true 205 } 206 if t.Traits&rawTag != 0 { 207 // ignore empty script and style tags 208 if !hasAttributes && (t.Hash == Script || t.Hash == Style) { 209 if next := tb.Peek(1); next.TokenType == html.EndTagToken { 210 tb.Shift() 211 tb.Shift() 212 break 213 } 214 } 215 rawTagHash = t.Hash 216 rawTagMediatype = nil 217 218 // do not minify content of <style amp-boilerplate> 219 if hasAttributes && t.Hash == Style { 220 if attrs := tb.Attributes(Amp_Boilerplate); attrs[0] != nil { 221 rawTagHash = 0 222 } 223 } 224 } 225 } else if t.Hash == Template { 226 omitSpace = true // EndTagToken 227 } 228 229 if t.Hash == Pre { 230 inPre = t.TokenType == html.StartTagToken 231 } 232 233 // remove superfluous tags, except for html, head and body tags when KeepDocumentTags is set 234 if !hasAttributes && (!o.KeepDocumentTags && (t.Hash == Html || t.Hash == Head || t.Hash == Body) || t.Hash == Colgroup) { 235 break 236 } else if t.TokenType == html.EndTagToken { 237 omitEndTag := false 238 if !o.KeepEndTags { 239 if t.Hash == Thead || t.Hash == Tbody || t.Hash == Tfoot || t.Hash == Tr || t.Hash == Th || 240 t.Hash == Td || t.Hash == Option || t.Hash == Dd || t.Hash == Dt || t.Hash == Li || 241 t.Hash == Rb || t.Hash == Rt || t.Hash == Rtc || t.Hash == Rp { 242 omitEndTag = true // omit end tags 243 } else if t.Hash == P { 244 i := 0 245 for { 246 next := tb.Peek(i) 247 i++ 248 // continue if text token is empty or whitespace 249 if next.TokenType == html.TextToken && parse.IsAllWhitespace(next.Data) { 250 continue 251 } 252 if next.TokenType == html.ErrorToken || next.TokenType == html.EndTagToken && next.Traits&keepPTag == 0 || next.TokenType == html.StartTagToken && next.Traits&omitPTag != 0 { 253 omitEndTag = true // omit p end tag 254 } 255 break 256 } 257 } else if t.Hash == Optgroup { 258 i := 0 259 for { 260 next := tb.Peek(i) 261 i++ 262 // continue if text token 263 if next.TokenType == html.TextToken { 264 continue 265 } 266 if next.TokenType == html.ErrorToken || next.Hash != Option { 267 omitEndTag = true // omit optgroup end tag 268 } 269 break 270 } 271 } 272 } 273 274 if t.Traits&nonPhrasingTag != 0 { 275 omitSpace = true // omit spaces after block elements 276 } else if o.KeepWhitespace || t.Traits&objectTag != 0 { 277 omitSpace = false 278 } 279 280 if !omitEndTag { 281 if len(t.Data) > 3+len(t.Text) { 282 t.Data[2+len(t.Text)] = '>' 283 t.Data = t.Data[:3+len(t.Text)] 284 } 285 w.Write(t.Data) 286 } 287 288 // skip text in select and optgroup tags 289 if t.Hash == Option || t.Hash == Optgroup { 290 if next := tb.Peek(0); next.TokenType == html.TextToken { 291 tb.Shift() 292 } 293 } 294 break 295 } 296 297 if o.KeepWhitespace || t.Traits&objectTag != 0 { 298 omitSpace = false 299 } else if t.Traits&nonPhrasingTag != 0 { 300 omitSpace = true // omit spaces after block elements 301 } 302 303 w.Write(t.Data) 304 305 if hasAttributes { 306 if t.Hash == Meta { 307 attrs := tb.Attributes(Content, Http_Equiv, Charset, Name) 308 if content := attrs[0]; content != nil { 309 if httpEquiv := attrs[1]; httpEquiv != nil { 310 httpEquiv.AttrVal = parse.TrimWhitespace(httpEquiv.AttrVal) 311 if charset := attrs[2]; charset == nil && parse.EqualFold(httpEquiv.AttrVal, []byte("content-type")) { 312 content.AttrVal = minify.Mediatype(content.AttrVal) 313 if bytes.Equal(content.AttrVal, []byte("text/html;charset=utf-8")) { 314 httpEquiv.Text = nil 315 content.Text = []byte("charset") 316 content.Hash = Charset 317 content.AttrVal = []byte("utf-8") 318 } 319 } 320 } 321 if name := attrs[3]; name != nil { 322 name.AttrVal = parse.TrimWhitespace(name.AttrVal) 323 if parse.EqualFold(name.AttrVal, []byte("keywords")) { 324 content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(", "), []byte(",")) 325 } else if parse.EqualFold(name.AttrVal, []byte("viewport")) { 326 content.AttrVal = bytes.ReplaceAll(content.AttrVal, []byte(" "), []byte("")) 327 for i := 0; i < len(content.AttrVal); i++ { 328 if content.AttrVal[i] == '=' && i+2 < len(content.AttrVal) { 329 i++ 330 if n := parse.Number(content.AttrVal[i:]); n > 0 { 331 minNum := minify.Number(content.AttrVal[i:i+n], -1) 332 if len(minNum) < n { 333 copy(content.AttrVal[i:i+len(minNum)], minNum) 334 copy(content.AttrVal[i+len(minNum):], content.AttrVal[i+n:]) 335 content.AttrVal = content.AttrVal[:len(content.AttrVal)+len(minNum)-n] 336 } 337 i += len(minNum) 338 } 339 i-- // mitigate for-loop increase 340 } 341 } 342 } 343 } 344 } 345 } else if t.Hash == Script { 346 attrs := tb.Attributes(Src, Charset) 347 if attrs[0] != nil && attrs[1] != nil { 348 attrs[1].Text = nil 349 } 350 } else if t.Hash == Input { 351 attrs := tb.Attributes(Type, Value) 352 if t, value := attrs[0], attrs[1]; t != nil && value != nil { 353 isRadio := parse.EqualFold(t.AttrVal, radioBytes) 354 if !isRadio && len(value.AttrVal) == 0 { 355 value.Text = nil 356 } else if isRadio && parse.EqualFold(value.AttrVal, onBytes) { 357 value.Text = nil 358 } 359 } 360 } else if t.Hash == A { 361 attrs := tb.Attributes(Id, Name) 362 if id, name := attrs[0], attrs[1]; id != nil && name != nil { 363 if bytes.Equal(id.AttrVal, name.AttrVal) { 364 name.Text = nil 365 } 366 } 367 } 368 369 // write attributes 370 for { 371 attr := *tb.Shift() 372 if attr.TokenType != html.AttributeToken { 373 break 374 } else if attr.Text == nil { 375 continue // removed attribute 376 } 377 378 val := attr.AttrVal 379 if attr.Traits&trimAttr != 0 { 380 val = parse.ReplaceMultipleWhitespaceAndEntities(val, EntitiesMap, nil) 381 val = parse.TrimWhitespace(val) 382 } else { 383 val = parse.ReplaceEntities(val, EntitiesMap, nil) 384 } 385 if t.Traits != 0 { 386 if len(val) == 0 && (attr.Hash == Class || 387 attr.Hash == Dir || 388 attr.Hash == Id || 389 attr.Hash == Name || 390 attr.Hash == Action && t.Hash == Form) { 391 continue // omit empty attribute values 392 } 393 if attr.Traits&caselessAttr != 0 { 394 val = parse.ToLower(val) 395 } 396 if rawTagHash != 0 && attr.Hash == Type { 397 rawTagMediatype = parse.Copy(val) 398 } 399 400 if attr.Hash == Enctype || attr.Hash == Codetype || attr.Hash == Accept || attr.Hash == Type && (t.Hash == A || t.Hash == Link || t.Hash == Embed || t.Hash == Object || t.Hash == Source || t.Hash == Script || t.Hash == Style) { 401 val = minify.Mediatype(val) 402 } 403 404 // default attribute values can be omitted 405 if !o.KeepDefaultAttrVals && (attr.Hash == Type && (t.Hash == Script && jsMimetypes[string(val)] || 406 t.Hash == Style && bytes.Equal(val, cssMimeBytes) || 407 t.Hash == Link && bytes.Equal(val, cssMimeBytes) || 408 t.Hash == Input && bytes.Equal(val, textBytes) || 409 t.Hash == Button && bytes.Equal(val, submitBytes)) || 410 attr.Hash == Language && t.Hash == Script || 411 attr.Hash == Method && bytes.Equal(val, getBytes) || 412 attr.Hash == Enctype && bytes.Equal(val, formMimeBytes) || 413 attr.Hash == Colspan && bytes.Equal(val, oneBytes) || 414 attr.Hash == Rowspan && bytes.Equal(val, oneBytes) || 415 attr.Hash == Shape && bytes.Equal(val, rectBytes) || 416 attr.Hash == Span && bytes.Equal(val, oneBytes) || 417 attr.Hash == Clear && bytes.Equal(val, noneBytes) || 418 attr.Hash == Frameborder && bytes.Equal(val, oneBytes) || 419 attr.Hash == Scrolling && bytes.Equal(val, autoBytes) || 420 attr.Hash == Valuetype && bytes.Equal(val, dataBytes) || 421 attr.Hash == Media && t.Hash == Style && bytes.Equal(val, allBytes)) { 422 continue 423 } 424 425 if attr.Hash == Style { 426 // CSS minifier for attribute inline code 427 val = parse.TrimWhitespace(val) 428 attrMinifyBuffer.Reset() 429 if err := m.MinifyMimetype(cssMimeBytes, attrMinifyBuffer, buffer.NewReader(val), inlineParams); err == nil { 430 val = attrMinifyBuffer.Bytes() 431 } else if err != minify.ErrNotExist { 432 return minify.UpdateErrorPosition(err, z, attr.Offset) 433 } 434 if len(val) == 0 { 435 continue 436 } 437 } else if len(attr.Text) > 2 && attr.Text[0] == 'o' && attr.Text[1] == 'n' { 438 // JS minifier for attribute inline code 439 val = parse.TrimWhitespace(val) 440 if len(val) >= 11 && parse.EqualFold(val[:11], jsSchemeBytes) { 441 val = val[11:] 442 } 443 attrMinifyBuffer.Reset() 444 if err := m.MinifyMimetype(jsMimeBytes, attrMinifyBuffer, buffer.NewReader(val), nil); err == nil { 445 val = attrMinifyBuffer.Bytes() 446 } else if err != minify.ErrNotExist { 447 return minify.UpdateErrorPosition(err, z, attr.Offset) 448 } 449 if len(val) == 0 { 450 continue 451 } 452 } else if attr.Traits&urlAttr != 0 { // anchors are already handled 453 val = parse.TrimWhitespace(val) 454 if 5 < len(val) { 455 if parse.EqualFold(val[:4], httpBytes) { 456 if val[4] == ':' { 457 if m.URL != nil && m.URL.Scheme == "http" { 458 val = val[5:] 459 } else { 460 parse.ToLower(val[:4]) 461 } 462 } else if (val[4] == 's' || val[4] == 'S') && val[5] == ':' { 463 if m.URL != nil && m.URL.Scheme == "https" { 464 val = val[6:] 465 } else { 466 parse.ToLower(val[:5]) 467 } 468 } 469 } else if parse.EqualFold(val[:5], dataSchemeBytes) { 470 val = minify.DataURI(m, val) 471 } 472 } 473 } 474 } 475 476 w.Write(spaceBytes) 477 w.Write(attr.Text) 478 if len(val) > 0 && attr.Traits&booleanAttr == 0 { 479 w.Write(isBytes) 480 481 // use double quotes for RDFa attributes 482 isXML := attr.Hash == Vocab || attr.Hash == Typeof || attr.Hash == Property || attr.Hash == Resource || attr.Hash == Prefix || attr.Hash == Content || attr.Hash == About || attr.Hash == Rev || attr.Hash == Datatype || attr.Hash == Inlist 483 484 // no quotes if possible, else prefer single or double depending on which occurs more often in value 485 var quote byte 486 487 if 0 < len(attr.Data) && (attr.Data[len(attr.Data)-1] == '\'' || attr.Data[len(attr.Data)-1] == '"') { 488 quote = attr.Data[len(attr.Data)-1] 489 } 490 val = html.EscapeAttrVal(&attrByteBuffer, val, quote, o.KeepQuotes, isXML) 491 w.Write(val) 492 } 493 } 494 } else { 495 _ = tb.Shift() // StartTagClose 496 } 497 w.Write(gtBytes) 498 499 // skip text in select and optgroup tags 500 if t.Hash == Select || t.Hash == Optgroup { 501 if next := tb.Peek(0); next.TokenType == html.TextToken { 502 tb.Shift() 503 } 504 } 505 506 // keep space after phrasing tags (<i>, <span>, ...) FontAwesome etc. 507 if t.TokenType == html.StartTagToken && t.Traits&nonPhrasingTag == 0 { 508 if next := tb.Peek(0); next.Hash == t.Hash && next.TokenType == html.EndTagToken { 509 omitSpace = false 510 } 511 } 512 } 513 } 514 }