gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

html_block.go (6356B)


      1 package parser
      2 
      3 import (
      4 	"bytes"
      5 	"regexp"
      6 	"strings"
      7 
      8 	"github.com/yuin/goldmark/ast"
      9 	"github.com/yuin/goldmark/text"
     10 	"github.com/yuin/goldmark/util"
     11 )
     12 
// allowedBlockTags is the set of tag names that let a line open an
// ast.HTMLBlockType6 block. Keys are lowercase; Open lowercases the tag name
// it extracted before looking it up and only tests for key presence, so the
// values are always true.
var allowedBlockTags = map[string]bool{
	"address":    true,
	"article":    true,
	"aside":      true,
	"base":       true,
	"basefont":   true,
	"blockquote": true,
	"body":       true,
	"caption":    true,
	"center":     true,
	"col":        true,
	"colgroup":   true,
	"dd":         true,
	"details":    true,
	"dialog":     true,
	"dir":        true,
	"div":        true,
	"dl":         true,
	"dt":         true,
	"fieldset":   true,
	"figcaption": true,
	"figure":     true,
	"footer":     true,
	"form":       true,
	"frame":      true,
	"frameset":   true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"head":       true,
	"header":     true,
	"hr":         true,
	"html":       true,
	"iframe":     true,
	"legend":     true,
	"li":         true,
	"link":       true,
	"main":       true,
	"menu":       true,
	"menuitem":   true,
	"meta":       true,
	"nav":        true,
	"noframes":   true,
	"ol":         true,
	"optgroup":   true,
	"option":     true,
	"p":          true,
	"param":      true,
	"section":    true,
	"source":     true,
	"summary":    true,
	"table":      true,
	"tbody":      true,
	"td":         true,
	"tfoot":      true,
	"th":         true,
	"thead":      true,
	"title":      true,
	"tr":         true,
	"track":      true,
	"ul":         true,
}
     78 
     79 var htmlBlockType1OpenRegexp = regexp.MustCompile(`(?i)^[ ]{0,3}<(script|pre|style|textarea)(?:\s.*|>.*|/>.*|)(?:\r\n|\n)?$`)
     80 var htmlBlockType1CloseRegexp = regexp.MustCompile(`(?i)^.*</(?:script|pre|style|textarea)>.*`)
     81 
     82 var htmlBlockType2OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<!\-\-`)
     83 var htmlBlockType2Close = []byte{'-', '-', '>'}
     84 
     85 var htmlBlockType3OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\?`)
     86 var htmlBlockType3Close = []byte{'?', '>'}
     87 
     88 var htmlBlockType4OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<![A-Z]+.*(?:\r\n|\n)?$`)
     89 var htmlBlockType4Close = []byte{'>'}
     90 
     91 var htmlBlockType5OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\!\[CDATA\[`)
     92 var htmlBlockType5Close = []byte{']', ']', '>'}
     93 
     94 var htmlBlockType6Regexp = regexp.MustCompile(`^[ ]{0,3}<(?:/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(?:[ ].*|>.*|/>.*|)(?:\r\n|\n)?$`)
     95 
     96 var htmlBlockType7Regexp = regexp.MustCompile(`^[ ]{0,3}<(/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(` + attributePattern + `*)[ ]*(?:>|/>)[ ]*(?:\r\n|\n)?$`)
     97 
     98 type htmlBlockParser struct {
     99 }
    100 
    101 var defaultHTMLBlockParser = &htmlBlockParser{}
    102 
// NewHTMLBlockParser returns a BlockParser that parses HTML blocks.
//
// Every call returns the same package-level instance
// (defaultHTMLBlockParser) rather than allocating a new parser.
func NewHTMLBlockParser() BlockParser {
	return defaultHTMLBlockParser
}
    108 
// Trigger reports '<' as the only trigger byte for this parser. A fresh
// one-element slice is returned on every call.
func (b *htmlBlockParser) Trigger() []byte {
	return []byte{'<'}
}
    112 
    113 func (b *htmlBlockParser) Open(parent ast.Node, reader text.Reader, pc Context) (ast.Node, State) {
    114 	var node *ast.HTMLBlock
    115 	line, segment := reader.PeekLine()
    116 	last := pc.LastOpenedBlock().Node
    117 	if pos := pc.BlockOffset(); pos < 0 || line[pos] != '<' {
    118 		return nil, NoChildren
    119 	}
    120 
    121 	if m := htmlBlockType1OpenRegexp.FindSubmatchIndex(line); m != nil {
    122 		node = ast.NewHTMLBlock(ast.HTMLBlockType1)
    123 	} else if htmlBlockType2OpenRegexp.Match(line) {
    124 		node = ast.NewHTMLBlock(ast.HTMLBlockType2)
    125 	} else if htmlBlockType3OpenRegexp.Match(line) {
    126 		node = ast.NewHTMLBlock(ast.HTMLBlockType3)
    127 	} else if htmlBlockType4OpenRegexp.Match(line) {
    128 		node = ast.NewHTMLBlock(ast.HTMLBlockType4)
    129 	} else if htmlBlockType5OpenRegexp.Match(line) {
    130 		node = ast.NewHTMLBlock(ast.HTMLBlockType5)
    131 	} else if match := htmlBlockType7Regexp.FindSubmatchIndex(line); match != nil {
    132 		isCloseTag := match[2] > -1 && bytes.Equal(line[match[2]:match[3]], []byte("/"))
    133 		hasAttr := match[6] != match[7]
    134 		tagName := strings.ToLower(string(line[match[4]:match[5]]))
    135 		_, ok := allowedBlockTags[tagName]
    136 		if ok {
    137 			node = ast.NewHTMLBlock(ast.HTMLBlockType6)
    138 		} else if tagName != "script" && tagName != "style" && tagName != "pre" && !ast.IsParagraph(last) && !(isCloseTag && hasAttr) { // type 7 can not interrupt paragraph
    139 			node = ast.NewHTMLBlock(ast.HTMLBlockType7)
    140 		}
    141 	}
    142 	if node == nil {
    143 		if match := htmlBlockType6Regexp.FindSubmatchIndex(line); match != nil {
    144 			tagName := string(line[match[2]:match[3]])
    145 			_, ok := allowedBlockTags[strings.ToLower(tagName)]
    146 			if ok {
    147 				node = ast.NewHTMLBlock(ast.HTMLBlockType6)
    148 			}
    149 		}
    150 	}
    151 	if node != nil {
    152 		reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
    153 		node.Lines().Append(segment)
    154 		return node, NoChildren
    155 	}
    156 	return nil, NoChildren
    157 }
    158 
    159 func (b *htmlBlockParser) Continue(node ast.Node, reader text.Reader, pc Context) State {
    160 	htmlBlock := node.(*ast.HTMLBlock)
    161 	lines := htmlBlock.Lines()
    162 	line, segment := reader.PeekLine()
    163 	var closurePattern []byte
    164 
    165 	switch htmlBlock.HTMLBlockType {
    166 	case ast.HTMLBlockType1:
    167 		if lines.Len() == 1 {
    168 			firstLine := lines.At(0)
    169 			if htmlBlockType1CloseRegexp.Match(firstLine.Value(reader.Source())) {
    170 				return Close
    171 			}
    172 		}
    173 		if htmlBlockType1CloseRegexp.Match(line) {
    174 			htmlBlock.ClosureLine = segment
    175 			reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
    176 			return Close
    177 		}
    178 	case ast.HTMLBlockType2:
    179 		closurePattern = htmlBlockType2Close
    180 		fallthrough
    181 	case ast.HTMLBlockType3:
    182 		if closurePattern == nil {
    183 			closurePattern = htmlBlockType3Close
    184 		}
    185 		fallthrough
    186 	case ast.HTMLBlockType4:
    187 		if closurePattern == nil {
    188 			closurePattern = htmlBlockType4Close
    189 		}
    190 		fallthrough
    191 	case ast.HTMLBlockType5:
    192 		if closurePattern == nil {
    193 			closurePattern = htmlBlockType5Close
    194 		}
    195 
    196 		if lines.Len() == 1 {
    197 			firstLine := lines.At(0)
    198 			if bytes.Contains(firstLine.Value(reader.Source()), closurePattern) {
    199 				return Close
    200 			}
    201 		}
    202 		if bytes.Contains(line, closurePattern) {
    203 			htmlBlock.ClosureLine = segment
    204 			reader.Advance(segment.Len())
    205 			return Close
    206 		}
    207 
    208 	case ast.HTMLBlockType6, ast.HTMLBlockType7:
    209 		if util.IsBlank(line) {
    210 			return Close
    211 		}
    212 	}
    213 	node.Lines().Append(segment)
    214 	reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
    215 	return Continue | NoChildren
    216 }
    217 
// Close is a no-op: this parser performs no work when an HTML block is
// closed.
func (b *htmlBlockParser) Close(node ast.Node, reader text.Reader, pc Context) {
	// nothing to do
}
    221 
// CanInterruptParagraph always returns true. The one exception, type 7
// blocks, is enforced inside Open, which declines to open a type 7 block
// when the last opened block is a paragraph.
func (b *htmlBlockParser) CanInterruptParagraph() bool {
	return true
}
    225 
// CanAcceptIndentedLine always returns false for the HTML block parser.
func (b *htmlBlockParser) CanAcceptIndentedLine() bool {
	return false
}