gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

readerc.go (14089B)


      1 // 
      2 // Copyright (c) 2011-2019 Canonical Ltd
      3 // Copyright (c) 2006-2010 Kirill Simonov
      4 // 
      5 // Permission is hereby granted, free of charge, to any person obtaining a copy of
      6 // this software and associated documentation files (the "Software"), to deal in
      7 // the Software without restriction, including without limitation the rights to
      8 // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
      9 // of the Software, and to permit persons to whom the Software is furnished to do
     10 // so, subject to the following conditions:
     11 // 
     12 // The above copyright notice and this permission notice shall be included in all
     13 // copies or substantial portions of the Software.
     14 // 
     15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     21 // SOFTWARE.
     22 
     23 package yaml
     24 
     25 import (
     26 	"io"
     27 )
     28 
     29 // Set the reader error and return 0.
     30 func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string, offset int, value int) bool {
     31 	parser.error = yaml_READER_ERROR
     32 	parser.problem = problem
     33 	parser.problem_offset = offset
     34 	parser.problem_value = value
     35 	return false
     36 }
     37 
// Byte order marks, stored as the raw byte sequence of each mark.
// yaml_parser_determine_encoding compares the first unread bytes of the raw
// buffer against these to choose the input encoding.
const (
	bom_UTF8    = "\xef\xbb\xbf"
	bom_UTF16LE = "\xff\xfe"
	bom_UTF16BE = "\xfe\xff"
)
     44 
     45 // Determine the input stream encoding by checking the BOM symbol. If no BOM is
     46 // found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
     47 func yaml_parser_determine_encoding(parser *yaml_parser_t) bool {
     48 	// Ensure that we had enough bytes in the raw buffer.
     49 	for !parser.eof && len(parser.raw_buffer)-parser.raw_buffer_pos < 3 {
     50 		if !yaml_parser_update_raw_buffer(parser) {
     51 			return false
     52 		}
     53 	}
     54 
     55 	// Determine the encoding.
     56 	buf := parser.raw_buffer
     57 	pos := parser.raw_buffer_pos
     58 	avail := len(buf) - pos
     59 	if avail >= 2 && buf[pos] == bom_UTF16LE[0] && buf[pos+1] == bom_UTF16LE[1] {
     60 		parser.encoding = yaml_UTF16LE_ENCODING
     61 		parser.raw_buffer_pos += 2
     62 		parser.offset += 2
     63 	} else if avail >= 2 && buf[pos] == bom_UTF16BE[0] && buf[pos+1] == bom_UTF16BE[1] {
     64 		parser.encoding = yaml_UTF16BE_ENCODING
     65 		parser.raw_buffer_pos += 2
     66 		parser.offset += 2
     67 	} else if avail >= 3 && buf[pos] == bom_UTF8[0] && buf[pos+1] == bom_UTF8[1] && buf[pos+2] == bom_UTF8[2] {
     68 		parser.encoding = yaml_UTF8_ENCODING
     69 		parser.raw_buffer_pos += 3
     70 		parser.offset += 3
     71 	} else {
     72 		parser.encoding = yaml_UTF8_ENCODING
     73 	}
     74 	return true
     75 }
     76 
     77 // Update the raw buffer.
     78 func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool {
     79 	size_read := 0
     80 
     81 	// Return if the raw buffer is full.
     82 	if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) {
     83 		return true
     84 	}
     85 
     86 	// Return on EOF.
     87 	if parser.eof {
     88 		return true
     89 	}
     90 
     91 	// Move the remaining bytes in the raw buffer to the beginning.
     92 	if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) {
     93 		copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:])
     94 	}
     95 	parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos]
     96 	parser.raw_buffer_pos = 0
     97 
     98 	// Call the read handler to fill the buffer.
     99 	size_read, err := parser.read_handler(parser, parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)])
    100 	parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read]
    101 	if err == io.EOF {
    102 		parser.eof = true
    103 	} else if err != nil {
    104 		return yaml_parser_set_reader_error(parser, "input error: "+err.Error(), parser.offset, -1)
    105 	}
    106 	return true
    107 }
    108 
// Ensure that the buffer contains at least `length` characters.
// Return true on success, false on failure.
//
// The raw input is decoded from the detected encoding (UTF-8, UTF-16LE or
// UTF-16BE), validated, and stored in parser.buffer as UTF-8, while
// parser.unread counts the decoded characters. On failure a reader error has
// been recorded on the parser. The function panics if no read handler is set.
//
// The length is supposed to be significantly less than the buffer size.
func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool {
	if parser.read_handler == nil {
		panic("read handler must be set")
	}

	// [Go] This function was changed to guarantee the requested length size at EOF.
	// The fact we need to do this is pretty awful, but the description above implies
	// for that to be the case, and there are tests

	// If the EOF flag is set and the raw buffer is empty, do nothing.
	// NOTE: this branch is intentionally empty; the early return it used to
	// perform is disabled (see below), so execution always continues.
	if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) {
		// [Go] ACTUALLY! Read the documentation of this function above.
		// This is just broken. To return true, we need to have the
		// given length in the buffer. Not doing that means every single
		// check that calls this function to make sure the buffer has a
		// given length is Go) panicking; or C) accessing invalid memory.
		//return true
	}

	// Return if the buffer contains enough characters.
	if parser.unread >= length {
		return true
	}

	// Determine the input encoding if it is not known yet.
	if parser.encoding == yaml_ANY_ENCODING {
		if !yaml_parser_determine_encoding(parser) {
			return false
		}
	}

	// Move the unread characters to the beginning of the buffer.
	// buffer_len tracks the number of valid bytes in parser.buffer from here on.
	buffer_len := len(parser.buffer)
	if parser.buffer_pos > 0 && parser.buffer_pos < buffer_len {
		copy(parser.buffer, parser.buffer[parser.buffer_pos:])
		buffer_len -= parser.buffer_pos
		parser.buffer_pos = 0
	} else if parser.buffer_pos == buffer_len {
		// Everything has been consumed: restart with an empty buffer.
		buffer_len = 0
		parser.buffer_pos = 0
	}

	// Open the whole buffer for writing, and cut it before returning.
	parser.buffer = parser.buffer[:cap(parser.buffer)]

	// Fill the buffer until it has enough characters.
	first := true
	for parser.unread < length {

		// Fill the raw buffer if necessary. On the first pass the raw buffer
		// is only refilled when it is empty, so that bytes which are already
		// available get decoded first.
		if !first || parser.raw_buffer_pos == len(parser.raw_buffer) {
			if !yaml_parser_update_raw_buffer(parser) {
				// Cut the buffer back to its valid contents before failing.
				parser.buffer = parser.buffer[:buffer_len]
				return false
			}
		}
		first = false

		// Decode the raw buffer. `break inner` leaves this loop when the raw
		// buffer ends in the middle of a character and more input may follow.
	inner:
		for parser.raw_buffer_pos != len(parser.raw_buffer) {
			// value is the decoded code point, width the number of raw bytes
			// it occupies.
			var value rune
			var width int

			raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos

			// Decode the next character.
			switch parser.encoding {
			case yaml_UTF8_ENCODING:
				// Decode a UTF-8 character.  Check RFC 3629
				// (http://www.ietf.org/rfc/rfc3629.txt) for more details.
				//
				// The following table (taken from the RFC) is used for
				// decoding.
				//
				//    Char. number range |        UTF-8 octet sequence
				//      (hexadecimal)    |              (binary)
				//   --------------------+------------------------------------
				//   0000 0000-0000 007F | 0xxxxxxx
				//   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
				//   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
				//   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				//
				// Additionally, the characters in the range 0xD800-0xDFFF
				// are prohibited as they are reserved for use with UTF-16
				// surrogate pairs.

				// Determine the length of the UTF-8 sequence.
				octet := parser.raw_buffer[parser.raw_buffer_pos]
				switch {
				case octet&0x80 == 0x00:
					width = 1
				case octet&0xE0 == 0xC0:
					width = 2
				case octet&0xF0 == 0xE0:
					width = 3
				case octet&0xF8 == 0xF0:
					width = 4
				default:
					// The leading octet is invalid.
					return yaml_parser_set_reader_error(parser,
						"invalid leading UTF-8 octet",
						parser.offset, int(octet))
				}

				// Check if the raw buffer contains an incomplete character.
				// At EOF that is an error; otherwise stop decoding so that the
				// outer loop can read more input.
				if width > raw_unread {
					if parser.eof {
						return yaml_parser_set_reader_error(parser,
							"incomplete UTF-8 octet sequence",
							parser.offset, -1)
					}
					break inner
				}

				// Decode the leading octet.
				switch {
				case octet&0x80 == 0x00:
					value = rune(octet & 0x7F)
				case octet&0xE0 == 0xC0:
					value = rune(octet & 0x1F)
				case octet&0xF0 == 0xE0:
					value = rune(octet & 0x0F)
				case octet&0xF8 == 0xF0:
					value = rune(octet & 0x07)
				default:
					value = 0
				}

				// Check and decode the trailing octets.
				for k := 1; k < width; k++ {
					octet = parser.raw_buffer[parser.raw_buffer_pos+k]

					// Check if the octet is valid.
					if (octet & 0xC0) != 0x80 {
						return yaml_parser_set_reader_error(parser,
							"invalid trailing UTF-8 octet",
							parser.offset+k, int(octet))
					}

					// Decode the octet.
					value = (value << 6) + rune(octet&0x3F)
				}

				// Check the length of the sequence against the value.
				// A value that would have fit in a shorter sequence is rejected.
				switch {
				case width == 1:
				case width == 2 && value >= 0x80:
				case width == 3 && value >= 0x800:
				case width == 4 && value >= 0x10000:
				default:
					return yaml_parser_set_reader_error(parser,
						"invalid length of a UTF-8 sequence",
						parser.offset, -1)
				}

				// Check the range of the value.
				if value >= 0xD800 && value <= 0xDFFF || value > 0x10FFFF {
					return yaml_parser_set_reader_error(parser,
						"invalid Unicode character",
						parser.offset, int(value))
				}

			case yaml_UTF16LE_ENCODING, yaml_UTF16BE_ENCODING:
				// low and high are the offsets, within each two-byte unit, of
				// the least and most significant byte for the byte order in use.
				var low, high int
				if parser.encoding == yaml_UTF16LE_ENCODING {
					low, high = 0, 1
				} else {
					low, high = 1, 0
				}

				// The UTF-16 encoding is not as simple as one might
				// naively think.  Check RFC 2781
				// (http://www.ietf.org/rfc/rfc2781.txt).
				//
				// Normally, two subsequent bytes describe a Unicode
				// character.  However a special technique (called a
				// surrogate pair) is used for specifying character
				// values larger than 0xFFFF.
				//
				// A surrogate pair consists of two pseudo-characters:
				//      high surrogate area (0xD800-0xDBFF)
				//      low surrogate area (0xDC00-0xDFFF)
				//
				// The following formulas are used for decoding
				// and encoding characters using surrogate pairs:
				//
				//  U  = U' + 0x10000   (0x01 00 00 <= U <= 0x10 FF FF)
				//  U' = yyyyyyyyyyxxxxxxxxxx   (0 <= U' <= 0x0F FF FF)
				//  W1 = 110110yyyyyyyyyy
				//  W2 = 110111xxxxxxxxxx
				//
				// where U is the character value, W1 is the high surrogate
				// area, W2 is the low surrogate area.

				// Check for incomplete UTF-16 character.
				if raw_unread < 2 {
					if parser.eof {
						return yaml_parser_set_reader_error(parser,
							"incomplete UTF-16 character",
							parser.offset, -1)
					}
					break inner
				}

				// Get the character.
				value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) +
					(rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8)

				// Check for unexpected low surrogate area.
				if value&0xFC00 == 0xDC00 {
					return yaml_parser_set_reader_error(parser,
						"unexpected low surrogate area",
						parser.offset, int(value))
				}

				// Check for a high surrogate area.
				if value&0xFC00 == 0xD800 {
					width = 4

					// Check for incomplete surrogate pair.
					if raw_unread < 4 {
						if parser.eof {
							return yaml_parser_set_reader_error(parser,
								"incomplete UTF-16 surrogate pair",
								parser.offset, -1)
						}
						break inner
					}

					// Get the next character.
					value2 := rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) +
						(rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8)

					// Check for a low surrogate area.
					if value2&0xFC00 != 0xDC00 {
						return yaml_parser_set_reader_error(parser,
							"expected low surrogate area",
							parser.offset+2, int(value2))
					}

					// Generate the value of the surrogate pair.
					value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF)
				} else {
					width = 2
				}

			default:
				// The encoding was resolved above, so no other value is expected here.
				panic("impossible")
			}

			// Check if the character is in the allowed range:
			//      #x9 | #xA | #xD | [#x20-#x7E]               (8 bit)
			//      | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD]    (16 bit)
			//      | [#x10000-#x10FFFF]                        (32 bit)
			switch {
			case value == 0x09:
			case value == 0x0A:
			case value == 0x0D:
			case value >= 0x20 && value <= 0x7E:
			case value == 0x85:
			case value >= 0xA0 && value <= 0xD7FF:
			case value >= 0xE000 && value <= 0xFFFD:
			case value >= 0x10000 && value <= 0x10FFFF:
			default:
				return yaml_parser_set_reader_error(parser,
					"control characters are not allowed",
					parser.offset, int(value))
			}

			// Move the raw pointers.
			parser.raw_buffer_pos += width
			parser.offset += width

			// Finally put the character into the buffer, encoded as UTF-8.
			if value <= 0x7F {
				// 0000 0000-0000 007F . 0xxxxxxx
				parser.buffer[buffer_len+0] = byte(value)
				buffer_len += 1
			} else if value <= 0x7FF {
				// 0000 0080-0000 07FF . 110xxxxx 10xxxxxx
				parser.buffer[buffer_len+0] = byte(0xC0 + (value >> 6))
				parser.buffer[buffer_len+1] = byte(0x80 + (value & 0x3F))
				buffer_len += 2
			} else if value <= 0xFFFF {
				// 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx
				parser.buffer[buffer_len+0] = byte(0xE0 + (value >> 12))
				parser.buffer[buffer_len+1] = byte(0x80 + ((value >> 6) & 0x3F))
				parser.buffer[buffer_len+2] = byte(0x80 + (value & 0x3F))
				buffer_len += 3
			} else {
				// 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				parser.buffer[buffer_len+0] = byte(0xF0 + (value >> 18))
				parser.buffer[buffer_len+1] = byte(0x80 + ((value >> 12) & 0x3F))
				parser.buffer[buffer_len+2] = byte(0x80 + ((value >> 6) & 0x3F))
				parser.buffer[buffer_len+3] = byte(0x80 + (value & 0x3F))
				buffer_len += 4
			}

			parser.unread++
		}

		// On EOF, put NUL into the buffer and return. The NUL is counted as
		// one unread character.
		if parser.eof {
			parser.buffer[buffer_len] = 0
			buffer_len++
			parser.unread++
			break
		}
	}
	// [Go] Read the documentation of this function above. To return true,
	// we need to have the given length in the buffer. Not doing that means
	// every single check that calls this function to make sure the buffer
	// has a given length is Go) panicking; or C) accessing invalid memory.
	// This happens here due to the EOF above breaking early.
	// The padding below writes NUL bytes until the buffer holds at least
	// `length` bytes; parser.unread is not changed by it.
	for buffer_len < length {
		parser.buffer[buffer_len] = 0
		buffer_len++
	}
	parser.buffer = parser.buffer[:buffer_len]
	return true
}