doc.go - gtsocial-umbx - Unnamed repository; edit this file 'description' to name the repository.

doc.go (4520B)
      1 // Copyright 2010 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 /*
      6 Package html implements an HTML5-compliant tokenizer and parser.
      7 
      8 Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
      9 caller's responsibility to ensure that r provides UTF-8 encoded HTML.
     10 
     11 	z := html.NewTokenizer(r)
     12 
     13 Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
     14 which parses the next token and returns its type, or an error:
     15 
     16 	for {
     17 		tt := z.Next()
     18 		if tt == html.ErrorToken {
     19 			// ...
     20 			return ...
     21 		}
     22 		// Process the current token.
     23 	}
     24 
     25 There are two APIs for retrieving the current token. The high-level API is to
     26 call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
     27 allow optionally calling Raw after Next but before Token, Text, TagName, or
     28 TagAttr. In EBNF notation, the valid call sequence per token is:
     29 
     30 	Next {Raw} [ Token | Text | TagName {TagAttr} ]
     31 
     32 Token returns an independent data structure that completely describes a token.
     33 Entities (such as "&lt;") are unescaped, tag names and attribute keys are
     34 lower-cased, and attributes are collected into a []Attribute. For example:
     35 
     36 	for {
     37 		if z.Next() == html.ErrorToken {
     38 			// Returning io.EOF indicates success.
     39 			return z.Err()
     40 		}
     41 		emitToken(z.Token())
     42 	}
     43 
     44 The low-level API performs fewer allocations and copies, but the contents of
     45 the []byte values returned by Text, TagName and TagAttr may change on the next
     46 call to Next. For example, to extract an HTML page's anchor text:
     47 
     48 	depth := 0
     49 	for {
     50 		tt := z.Next()
     51 		switch tt {
     52 		case html.ErrorToken:
     53 			return z.Err()
     54 		case html.TextToken:
     55 			if depth > 0 {
     56 				// emitBytes should copy the []byte it receives,
     57 				// if it doesn't process it immediately.
     58 				emitBytes(z.Text())
     59 			}
     60 		case html.StartTagToken, html.EndTagToken:
     61 			tn, _ := z.TagName()
     62 			if len(tn) == 1 && tn[0] == 'a' {
     63 				if tt == html.StartTagToken {
     64 					depth++
     65 				} else {
     66 					depth--
     67 				}
     68 			}
     69 		}
     70 	}
     71 
     72 Parsing is done by calling Parse with an io.Reader, which returns the root of
     73 the parse tree (the document element) as a *Node. It is the caller's
     74 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
     75 example, to process each anchor node in depth-first order:
     76 
     77 	doc, err := html.Parse(r)
     78 	if err != nil {
     79 		// ...
     80 	}
     81 	var f func(*html.Node)
     82 	f = func(n *html.Node) {
     83 		if n.Type == html.ElementNode && n.Data == "a" {
     84 			// Do something with n...
     85 		}
     86 		for c := n.FirstChild; c != nil; c = c.NextSibling {
     87 			f(c)
     88 		}
     89 	}
     90 	f(doc)
     91 
     92 The relevant specifications include:
     93 https://html.spec.whatwg.org/multipage/syntax.html and
     94 https://html.spec.whatwg.org/multipage/syntax.html#tokenization
     95 
     96 # Security Considerations
     97 
     98 Care should be taken when parsing and interpreting HTML, whether full documents
     99 or fragments, within the framework of the HTML specification, especially with
    100 regard to untrusted inputs.
    101 
    102 This package provides both a tokenizer and a parser. The tokenizer implements
    103 the tokenization stage of the WHATWG HTML parsing specification; the parser
    104 implements both tokenization and tree construction. While the tokenizer parses
    105 and normalizes individual HTML tokens, only the parser constructs the DOM tree
    106 from the tokenized HTML, as described in the tree construction stage of the
    107 specification, dynamically modifying or extending the document's DOM tree.
    108 
    109 If your use case requires semantically well-formed HTML documents, as defined by
    110 the WHATWG specification, the parser should be used rather than the tokenizer.
    111 
    112 In security contexts, if trust decisions are being made using the tokenized or
    113 parsed content, the input must be re-serialized (for instance by using Render or
    114 Token.String) in order for those trust decisions to hold, as the process of
    115 tokenization or parsing may alter the content.
    116 */
    117 package html // import "golang.org/x/net/html"
    118 
    119 // The tokenization algorithm implemented by this package is not a line-by-line
    120 // transliteration of the relatively verbose state-machine in the WHATWG
    121 // specification. A more direct approach is used instead, where the program
    122 // counter implies the state, such as whether it is tokenizing a tag or a text
    123 // node. Specification compliance is verified by checking expected and actual
    124 // outputs over a test suite rather than aiming for algorithmic fidelity.
    125 
    126 // TODO(nigeltao): Does a DOM API belong in this package or a separate one?
    127 // TODO(nigeltao): How does parsing interact with a JavaScript engine?
	gtsocial-umbx Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE