gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

bidirule.go (9558B)


      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package bidirule implements the Bidi Rule defined by RFC 5893.
      6 //
      7 // This package is under development. The API may change without notice and
      8 // without preserving backward compatibility.
      9 package bidirule
     10 
     11 import (
     12 	"errors"
     13 	"unicode/utf8"
     14 
     15 	"golang.org/x/text/transform"
     16 	"golang.org/x/text/unicode/bidi"
     17 )
     18 
     19 // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
     20 // Internationalized Domain Names for Applications (IDNA)
     21 //
     22 // A label is an individual component of a domain name.  Labels are usually
     23 // shown separated by dots; for example, the domain name "www.example.com" is
     24 // composed of three labels: "www", "example", and "com".
     25 //
     26 // An RTL label is a label that contains at least one character of class R, AL,
     27 // or AN. An LTR label is any label that is not an RTL label.
     28 //
     29 // A "Bidi domain name" is a domain name that contains at least one RTL label.
     30 //
     31 //  The following guarantees can be made based on the above:
     32 //
     33 //  o  In a domain name consisting of only labels that satisfy the rule,
     34 //     the requirements of Section 3 are satisfied.  Note that even LTR
     35 //     labels and pure ASCII labels have to be tested.
     36 //
     37 //  o  In a domain name consisting of only LDH labels (as defined in the
     38 //     Definitions document [RFC5890]) and labels that satisfy the rule,
     39 //     the requirements of Section 3 are satisfied as long as a label
     40 //     that starts with an ASCII digit does not come after a
     41 //     right-to-left label.
     42 //
     43 //  No guarantee is given for other combinations.
     44 
// ErrInvalid indicates a label is invalid according to the Bidi Rule.
// It is the error that Span reports, and that Transform passes on, when the
// input violates the rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
     47 
// ruleState identifies a state of the state machine that checks a label
// against the Bidi Rule. The zero value is ruleInitial.
type ruleState uint8

const (
	// ruleInitial is the state before any character has been processed.
	ruleInitial ruleState = iota
	// ruleLTR is an LTR label whose most recent characters do not form a
	// valid label ending according to [2.6].
	ruleLTR
	// ruleLTRFinal is an LTR label that currently ends in L or EN, optionally
	// followed by NSM characters.
	ruleLTRFinal
	// ruleRTL is an RTL label whose most recent characters do not form a
	// valid label ending according to [2.3].
	ruleRTL
	// ruleRTLFinal is an RTL label that currently ends in R, AL, EN, or AN,
	// optionally followed by NSM characters.
	ruleRTLFinal
	// ruleInvalid is entered when no transition matches the current
	// character, or when both EN and AN have been seen. Its transitions have
	// empty masks, so the machine never leaves it.
	ruleInvalid
)
     58 
// ruleTransition describes one edge of the state machine: when the bit for
// the current character's Bidi class is set in mask, the machine moves to
// next.
type ruleTransition struct {
	// next is the state entered when the transition is taken.
	next ruleState
	// mask is a bit set of Bidi classes, with bit (1 << class) set for every
	// class that triggers this transition.
	mask uint16
}
     63 
     64 var transitions = [...][2]ruleTransition{
     65 	// [2.1] The first character must be a character with Bidi property L, R, or
     66 	// AL. If it has the R or AL property, it is an RTL label; if it has the L
     67 	// property, it is an LTR label.
     68 	ruleInitial: {
     69 		{ruleLTRFinal, 1 << bidi.L},
     70 		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
     71 	},
     72 	ruleRTL: {
     73 		// [2.3] In an RTL label, the end of the label must be a character with
     74 		// Bidi property R, AL, EN, or AN, followed by zero or more characters
     75 		// with Bidi property NSM.
     76 		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
     77 
     78 		// [2.2] In an RTL label, only characters with the Bidi properties R,
     79 		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
     80 		// We exclude the entries from [2.3]
     81 		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
     82 	},
     83 	ruleRTLFinal: {
     84 		// [2.3] In an RTL label, the end of the label must be a character with
     85 		// Bidi property R, AL, EN, or AN, followed by zero or more characters
     86 		// with Bidi property NSM.
     87 		{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
     88 
     89 		// [2.2] In an RTL label, only characters with the Bidi properties R,
     90 		// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
     91 		// We exclude the entries from [2.3] and NSM.
     92 		{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
     93 	},
     94 	ruleLTR: {
     95 		// [2.6] In an LTR label, the end of the label must be a character with
     96 		// Bidi property L or EN, followed by zero or more characters with Bidi
     97 		// property NSM.
     98 		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
     99 
    100 		// [2.5] In an LTR label, only characters with the Bidi properties L,
    101 		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
    102 		// We exclude the entries from [2.6].
    103 		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
    104 	},
    105 	ruleLTRFinal: {
    106 		// [2.6] In an LTR label, the end of the label must be a character with
    107 		// Bidi property L or EN, followed by zero or more characters with Bidi
    108 		// property NSM.
    109 		{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
    110 
    111 		// [2.5] In an LTR label, only characters with the Bidi properties L,
    112 		// EN, ES, CS, ET, ON, BN, or NSM are allowed.
    113 		// We exclude the entries from [2.6].
    114 		{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
    115 	},
    116 	ruleInvalid: {
    117 		{ruleInvalid, 0},
    118 		{ruleInvalid, 0},
    119 	},
    120 }
    121 
    122 // [2.4] In an RTL label, if an EN is present, no AN may be present, and
    123 // vice versa.
    124 const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
    125 
    126 // From RFC 5893
    127 // An RTL label is a label that contains at least one character of type
    128 // R, AL, or AN.
    129 //
    130 // An LTR label is any label that is not an RTL label.
    131 
    132 // Direction reports the direction of the given label as defined by RFC 5893.
    133 // The Bidi Rule does not have to be applied to labels of the category
    134 // LeftToRight.
    135 func Direction(b []byte) bidi.Direction {
    136 	for i := 0; i < len(b); {
    137 		e, sz := bidi.Lookup(b[i:])
    138 		if sz == 0 {
    139 			i++
    140 		}
    141 		c := e.Class()
    142 		if c == bidi.R || c == bidi.AL || c == bidi.AN {
    143 			return bidi.RightToLeft
    144 		}
    145 		i += sz
    146 	}
    147 	return bidi.LeftToRight
    148 }
    149 
    150 // DirectionString reports the direction of the given label as defined by RFC
    151 // 5893. The Bidi Rule does not have to be applied to labels of the category
    152 // LeftToRight.
    153 func DirectionString(s string) bidi.Direction {
    154 	for i := 0; i < len(s); {
    155 		e, sz := bidi.LookupString(s[i:])
    156 		if sz == 0 {
    157 			i++
    158 			continue
    159 		}
    160 		c := e.Class()
    161 		if c == bidi.R || c == bidi.AL || c == bidi.AN {
    162 			return bidi.RightToLeft
    163 		}
    164 		i += sz
    165 	}
    166 	return bidi.LeftToRight
    167 }
    168 
    169 // Valid reports whether b conforms to the BiDi rule.
    170 func Valid(b []byte) bool {
    171 	var t Transformer
    172 	if n, ok := t.advance(b); !ok || n < len(b) {
    173 		return false
    174 	}
    175 	return t.isFinal()
    176 }
    177 
    178 // ValidString reports whether s conforms to the BiDi rule.
    179 func ValidString(s string) bool {
    180 	var t Transformer
    181 	if n, ok := t.advanceString(s); !ok || n < len(s) {
    182 		return false
    183 	}
    184 	return t.isFinal()
    185 }
    186 
    187 // New returns a Transformer that verifies that input adheres to the Bidi Rule.
    188 func New() *Transformer {
    189 	return &Transformer{}
    190 }
    191 
// Transformer checks input against the Bidi Rule. It provides the Transform
// and Reset methods of transform.Transformer as well as Span. The zero value
// is a checker in its initial state.
type Transformer struct {
	// state is the current state of the rule state machine.
	state  ruleState
	// hasRTL is not referenced by the code visible in this file.
	// NOTE(review): possibly unused; confirm against the rest of the package.
	hasRTL bool
	// seen accumulates the class bit (1 << class) of every character
	// processed since the last Reset.
	seen   uint16
}
    198 
    199 // A rule can only be violated for "Bidi Domain names", meaning if one of the
    200 // following categories has been observed.
    201 func (t *Transformer) isRTL() bool {
    202 	const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
    203 	return t.seen&isRTL != 0
    204 }
    205 
    206 // Reset implements transform.Transformer.
    207 func (t *Transformer) Reset() { *t = Transformer{} }
    208 
    209 // Transform implements transform.Transformer. This Transformer has state and
    210 // needs to be reset between uses.
    211 func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    212 	if len(dst) < len(src) {
    213 		src = src[:len(dst)]
    214 		atEOF = false
    215 		err = transform.ErrShortDst
    216 	}
    217 	n, err1 := t.Span(src, atEOF)
    218 	copy(dst, src[:n])
    219 	if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
    220 		err = err1
    221 	}
    222 	return n, n, err
    223 }
    224 
    225 // Span returns the first n bytes of src that conform to the Bidi rule.
    226 func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
    227 	if t.state == ruleInvalid && t.isRTL() {
    228 		return 0, ErrInvalid
    229 	}
    230 	n, ok := t.advance(src)
    231 	switch {
    232 	case !ok:
    233 		err = ErrInvalid
    234 	case n < len(src):
    235 		if !atEOF {
    236 			err = transform.ErrShortSrc
    237 			break
    238 		}
    239 		err = ErrInvalid
    240 	case !t.isFinal():
    241 		err = ErrInvalid
    242 	}
    243 	return n, err
    244 }
    245 
    246 // Precomputing the ASCII values decreases running time for the ASCII fast path
    247 // by about 30%.
    248 var asciiTable [128]bidi.Properties
    249 
    250 func init() {
    251 	for i := range asciiTable {
    252 		p, _ := bidi.LookupRune(rune(i))
    253 		asciiTable[i] = p
    254 	}
    255 }
    256 
    257 func (t *Transformer) advance(s []byte) (n int, ok bool) {
    258 	var e bidi.Properties
    259 	var sz int
    260 	for n < len(s) {
    261 		if s[n] < utf8.RuneSelf {
    262 			e, sz = asciiTable[s[n]], 1
    263 		} else {
    264 			e, sz = bidi.Lookup(s[n:])
    265 			if sz <= 1 {
    266 				if sz == 1 {
    267 					// We always consider invalid UTF-8 to be invalid, even if
    268 					// the string has not yet been determined to be RTL.
    269 					// TODO: is this correct?
    270 					return n, false
    271 				}
    272 				return n, true // incomplete UTF-8 encoding
    273 			}
    274 		}
    275 		// TODO: using CompactClass would result in noticeable speedup.
    276 		// See unicode/bidi/prop.go:Properties.CompactClass.
    277 		c := uint16(1 << e.Class())
    278 		t.seen |= c
    279 		if t.seen&exclusiveRTL == exclusiveRTL {
    280 			t.state = ruleInvalid
    281 			return n, false
    282 		}
    283 		switch tr := transitions[t.state]; {
    284 		case tr[0].mask&c != 0:
    285 			t.state = tr[0].next
    286 		case tr[1].mask&c != 0:
    287 			t.state = tr[1].next
    288 		default:
    289 			t.state = ruleInvalid
    290 			if t.isRTL() {
    291 				return n, false
    292 			}
    293 		}
    294 		n += sz
    295 	}
    296 	return n, true
    297 }
    298 
    299 func (t *Transformer) advanceString(s string) (n int, ok bool) {
    300 	var e bidi.Properties
    301 	var sz int
    302 	for n < len(s) {
    303 		if s[n] < utf8.RuneSelf {
    304 			e, sz = asciiTable[s[n]], 1
    305 		} else {
    306 			e, sz = bidi.LookupString(s[n:])
    307 			if sz <= 1 {
    308 				if sz == 1 {
    309 					return n, false // invalid UTF-8
    310 				}
    311 				return n, true // incomplete UTF-8 encoding
    312 			}
    313 		}
    314 		// TODO: using CompactClass results in noticeable speedup.
    315 		// See unicode/bidi/prop.go:Properties.CompactClass.
    316 		c := uint16(1 << e.Class())
    317 		t.seen |= c
    318 		if t.seen&exclusiveRTL == exclusiveRTL {
    319 			t.state = ruleInvalid
    320 			return n, false
    321 		}
    322 		switch tr := transitions[t.state]; {
    323 		case tr[0].mask&c != 0:
    324 			t.state = tr[0].next
    325 		case tr[1].mask&c != 0:
    326 			t.state = tr[1].next
    327 		default:
    328 			t.state = ruleInvalid
    329 			if t.isRTL() {
    330 				return n, false
    331 			}
    332 		}
    333 		n += sz
    334 	}
    335 	return n, true
    336 }