gtsocial-umbx

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

decode_arm64.s (14954B)


      1 // Copyright 2020 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // +build !appengine
      6 // +build gc
      7 // +build !noasm
      8 
      9 #include "textflag.h"
     10 
     11 #define R_TMP0 R2  // scratch
     12 #define R_TMP1 R3  // scratch
     13 #define R_LEN R4   // length or x
     14 #define R_OFF R5   // offset
     15 #define R_SRC R6   // &src[s]
     16 #define R_DST R7   // &dst[d]
     17 #define R_DBASE R8  // dst_base
     18 #define R_DLEN R9   // dst_len
     19 #define R_DEND R10  // dst_base + dst_len
     20 #define R_SBASE R11 // src_base
     21 #define R_SLEN R12  // src_len
     22 #define R_SEND R13  // src_base + src_len
     23 #define R_TMP2 R14  // scratch, used by doCopy
     24 #define R_TMP3 R15  // scratch, used by doCopy
     25 
     26 // TEST_SRC branches to errCorrupt if R_SRC has advanced past R_SEND,
        // i.e. it enforces uint(s) <= uint(len(src)) after a bounds-relevant
        // increment of R_SRC. (Go asm operand order: CMP subtracts the first
        // operand from the second, so BGT is taken when R_SRC > R_SEND.)
     27 #define TEST_SRC() \
     28 	CMP R_SEND, R_SRC \
     29 	BGT errCorrupt
     30 
        // An alternative (unused) formulation of the same bounds check,
        // comparing the implicit offset s = R_SRC - R_SBASE against src_len
        // instead of comparing the end pointers directly:
     31 // MOVD R_SRC, R_TMP1
     32 // SUB  R_SBASE, R_TMP1, R_TMP1
     33 // CMP  R_SLEN, R_TMP1
     34 // BGT  errCorrupt
     35 
     36 // The asm code generally follows the pure Go code in decode_other.go, except
     37 // where marked with a "!!!".
     38 
     39 // func decode(dst, src []byte) int
     40 //
     41 // All local variables fit into registers. The non-zero stack size is only to
     42 // spill registers and push args when issuing a CALL. The register allocation:
     43 //	- R_TMP0	scratch
     44 //	- R_TMP1	scratch
     45 //	- R_LEN	length or x
     46 //	- R_OFF	offset
     47 //	- R_SRC	&src[s]
     48 //	- R_DST	&dst[d]
     49 //	+ R_DBASE	dst_base
     50 //	+ R_DLEN	dst_len
     51 //	+ R_DEND	dst_base + dst_len
     52 //	+ R_SBASE	src_base
     53 //	+ R_SLEN	src_len
     54 //	+ R_SEND	src_base + src_len
     55 //	- R_TMP2	used by doCopy
     56 //	- R_TMP3	used by doCopy
     57 //
     58 // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
     59 // function, and after a CALL returns, and are not otherwise modified.
     60 //
     61 // The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
     62 // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
     63 TEXT ·s2Decode(SB), NOSPLIT, $56-64
     64 	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
     65 	MOVD dst_base+0(FP), R_DBASE
     66 	MOVD dst_len+8(FP), R_DLEN
     67 	MOVD R_DBASE, R_DST
     68 	MOVD R_DBASE, R_DEND
     69 	ADD  R_DLEN, R_DEND, R_DEND
     70 	MOVD src_base+24(FP), R_SBASE
     71 	MOVD src_len+32(FP), R_SLEN
     72 	MOVD R_SBASE, R_SRC
     73 	MOVD R_SBASE, R_SEND
     74 	ADD  R_SLEN, R_SEND, R_SEND
     75 	MOVD $0, R_OFF
     76 
     77 loop:
     78 	// for s < len(src)
     79 	CMP R_SEND, R_SRC
     80 	BEQ end
     81 
     82 	// R_LEN = uint32(src[s])
     83 	//
     84 	// switch src[s] & 0x03
     85 	MOVBU (R_SRC), R_LEN
     86 	MOVW  R_LEN, R_TMP1
     87 	ANDW  $3, R_TMP1
     88 	MOVW  $1, R1
     89 	CMPW  R1, R_TMP1
     90 	BGE   tagCopy
     91 
     92 	// ----------------------------------------
     93 	// The code below handles literal tags.
     94 
     95 	// case tagLiteral:
     96 	// x := uint32(src[s] >> 2)
     97 	// switch
     98 	MOVW $60, R1
     99 	LSRW $2, R_LEN, R_LEN
    100 	CMPW R_LEN, R1
    101 	BLS  tagLit60Plus
    102 
    103 	// case x < 60:
    104 	// s++
    105 	ADD $1, R_SRC, R_SRC
    106 
    107 doLit:
    108 	// This is the end of the inner "switch", when we have a literal tag.
    109 	//
    110 	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
    111 	// used in the pure Go decode_other.go code.
    112 
    113 	// length = int(x) + 1
    114 	//
    115 	// Unlike the pure Go code, we don't need to check if length <= 0 because
    116 	// R_LEN can hold 64 bits, so the increment cannot overflow.
    117 	ADD $1, R_LEN, R_LEN
    118 
    119 	// Prepare to check if copying length bytes will run past the end of dst or
    120 	// src.
    121 	//
    122 	// R_TMP0 = len(dst) - d
    123 	// R_TMP1 = len(src) - s
    124 	MOVD R_DEND, R_TMP0
    125 	SUB  R_DST, R_TMP0, R_TMP0
    126 	MOVD R_SEND, R_TMP1
    127 	SUB  R_SRC, R_TMP1, R_TMP1
    128 
    129 	// !!! Try a faster technique for short (16 or fewer bytes) copies.
    130 	//
    131 	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
    132 	//   goto callMemmove // Fall back on calling runtime·memmove.
    133 	// }
    134 	//
    135 	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
    136 	// against 21 instead of 16, because it cannot assume that all of its input
    137 	// is contiguous in memory and so it needs to leave enough source bytes to
    138 	// read the next tag without refilling buffers, but Go's Decode assumes
    139 	// contiguousness (the src argument is a []byte).
    140 	CMP $16, R_LEN
    141 	BGT callMemmove
    142 	CMP $16, R_TMP0
    143 	BLT callMemmove
    144 	CMP $16, R_TMP1
    145 	BLT callMemmove
    146 
    147 	// !!! Implement the copy from src to dst as a 16-byte load and store.
    148 	// (Decode's documentation says that dst and src must not overlap.)
    149 	//
    150 	// This always copies 16 bytes, instead of only length bytes, but that's
    151 	// OK. If the input is a valid Snappy encoding then subsequent iterations
    152 	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
    153 	// non-nil error), so the overrun will be ignored.
    154 	//
    155 	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
    156 	// 16-byte loads and stores. This technique probably wouldn't be as
    157 	// effective on architectures that are fussier about alignment.
    158 	LDP 0(R_SRC), (R_TMP2, R_TMP3)
    159 	STP (R_TMP2, R_TMP3), 0(R_DST)
    160 
    161 	// d += length
    162 	// s += length
    163 	ADD R_LEN, R_DST, R_DST
    164 	ADD R_LEN, R_SRC, R_SRC
    165 	B   loop
    166 
    167 callMemmove:
    168 	// if length > len(dst)-d || length > len(src)-s { etc }
    169 	CMP R_TMP0, R_LEN
    170 	BGT errCorrupt
    171 	CMP R_TMP1, R_LEN
    172 	BGT errCorrupt
    173 
    174 	// copy(dst[d:], src[s:s+length])
    175 	//
    176 	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
    177 	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
    178 	// three registers to the stack, to save local variables across the CALL.
    179 	MOVD R_DST, 8(RSP)
    180 	MOVD R_SRC, 16(RSP)
    181 	MOVD R_LEN, 24(RSP)
    182 	MOVD R_DST, 32(RSP)
    183 	MOVD R_SRC, 40(RSP)
    184 	MOVD R_LEN, 48(RSP)
    185 	MOVD R_OFF, 56(RSP)
    186 	CALL runtime·memmove(SB)
    187 
    188 	// Restore local variables: unspill registers from the stack and
    189 	// re-calculate R_DBASE-R_SEND.
    190 	MOVD 32(RSP), R_DST
    191 	MOVD 40(RSP), R_SRC
    192 	MOVD 48(RSP), R_LEN
    193 	MOVD 56(RSP), R_OFF
    194 	MOVD dst_base+0(FP), R_DBASE
    195 	MOVD dst_len+8(FP), R_DLEN
    196 	MOVD R_DBASE, R_DEND
    197 	ADD  R_DLEN, R_DEND, R_DEND
    198 	MOVD src_base+24(FP), R_SBASE
    199 	MOVD src_len+32(FP), R_SLEN
    200 	MOVD R_SBASE, R_SEND
    201 	ADD  R_SLEN, R_SEND, R_SEND
    202 
    203 	// d += length
    204 	// s += length
    205 	ADD R_LEN, R_DST, R_DST
    206 	ADD R_LEN, R_SRC, R_SRC
    207 	B   loop
    208 
    209 tagLit60Plus:
    210 	// !!! This fragment does the
    211 	//
    212 	// s += x - 58; if uint(s) > uint(len(src)) { etc }
    213 	//
    214 	// checks. In the asm version, we code it once instead of once per switch case.
    215 	ADD R_LEN, R_SRC, R_SRC
    216 	SUB $58, R_SRC, R_SRC
    217 	TEST_SRC()
    218 
    219 	// case x == 60:
    220 	MOVW $61, R1
    221 	CMPW R1, R_LEN
    222 	BEQ  tagLit61
    223 	BGT  tagLit62Plus
    224 
    225 	// x = uint32(src[s-1])
    226 	MOVBU -1(R_SRC), R_LEN
    227 	B     doLit
    228 
    229 tagLit61:
    230 	// case x == 61:
    231 	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
    232 	MOVHU -2(R_SRC), R_LEN
    233 	B     doLit
    234 
    235 tagLit62Plus:
    236 	CMPW $62, R_LEN
    237 	BHI  tagLit63
    238 
    239 	// case x == 62:
    240 	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
    241 	MOVHU -3(R_SRC), R_LEN
    242 	MOVBU -1(R_SRC), R_TMP1
    243 	ORR   R_TMP1<<16, R_LEN
    244 	B     doLit
    245 
    246 tagLit63:
    247 	// case x == 63:
    248 	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
    249 	MOVWU -4(R_SRC), R_LEN
    250 	B     doLit
    251 
    252 	// The code above handles literal tags.
    253 	// ----------------------------------------
    254 	// The code below handles copy tags.
    255 
    256 tagCopy4:
    257 	// case tagCopy4:
    258 	// s += 5
    259 	ADD $5, R_SRC, R_SRC
    260 
    261 	// if uint(s) > uint(len(src)) { etc }
    262 	MOVD R_SRC, R_TMP1
    263 	SUB  R_SBASE, R_TMP1, R_TMP1
    264 	CMP  R_SLEN, R_TMP1
    265 	BGT  errCorrupt
    266 
    267 	// length = 1 + int(src[s-5])>>2
    268 	MOVD $1, R1
    269 	ADD  R_LEN>>2, R1, R_LEN
    270 
    271 	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
    272 	MOVWU -4(R_SRC), R_OFF
    273 	B     doCopy
    274 
    275 tagCopy2:
    276 	// case tagCopy2:
    277 	// s += 3
    278 	ADD $3, R_SRC, R_SRC
    279 
    280 	// if uint(s) > uint(len(src)) { etc }
    281 	TEST_SRC()
    282 
    283 	// length = 1 + int(src[s-3])>>2
    284 	MOVD $1, R1
    285 	ADD  R_LEN>>2, R1, R_LEN
    286 
    287 	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
    288 	MOVHU -2(R_SRC), R_OFF
    289 	B     doCopy
    290 
    291 tagCopy:
    292 	// We have a copy tag. We assume that:
    293 	//	- R_TMP1 == src[s] & 0x03
    294 	//	- R_LEN == src[s]
    295 	CMP $2, R_TMP1
    296 	BEQ tagCopy2
    297 	BGT tagCopy4
    298 
    299 	// case tagCopy1:
    300 	// s += 2
    301 	ADD $2, R_SRC, R_SRC
    302 
    303 	// if uint(s) > uint(len(src)) { etc }
    304 	TEST_SRC()
    305 
    306 	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
    307 	// Calculate offset in R_TMP0 in case it is a repeat.
    308 	MOVD  R_LEN, R_TMP0
    309 	AND   $0xe0, R_TMP0
    310 	MOVBU -1(R_SRC), R_TMP1
    311 	ORR   R_TMP0<<3, R_TMP1, R_TMP0
    312 
    313 	// length = 4 + int(src[s-2])>>2&0x7
    314 	MOVD $7, R1
    315 	AND  R_LEN>>2, R1, R_LEN
    316 	ADD  $4, R_LEN, R_LEN
    317 
    318 	// check if repeat code with offset 0.
    319 	CMP $0, R_TMP0
    320 	BEQ repeatCode
    321 
    322 	// This is a regular copy, transfer our temporary value to R_OFF (offset)
    323 	MOVD R_TMP0, R_OFF
    324 	B    doCopy
    325 
    326 	// This is a repeat code.
    327 repeatCode:
    328 	// If length < 9, reuse last offset, with the length already calculated.
    329 	CMP $9, R_LEN
    330 	BLT doCopyRepeat
    331 	BEQ repeatLen1
    332 	CMP $10, R_LEN
    333 	BEQ repeatLen2
    334 
    335 repeatLen3:
    336 	// s += 3
    337 	ADD $3, R_SRC, R_SRC
    338 
    339 	// if uint(s) > uint(len(src)) { etc }
    340 	TEST_SRC()
    341 
    342 	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
    343 	MOVBU -1(R_SRC), R_TMP0
    344 	MOVHU -3(R_SRC), R_LEN
    345 	ORR   R_TMP0<<16, R_LEN, R_LEN
    346 	ADD   $65540, R_LEN, R_LEN
    347 	B     doCopyRepeat
    348 
    349 repeatLen2:
    350 	// s += 2
    351 	ADD $2, R_SRC, R_SRC
    352 
    353 	// if uint(s) > uint(len(src)) { etc }
    354 	TEST_SRC()
    355 
    356 	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
    357 	MOVHU -2(R_SRC), R_LEN
    358 	ADD   $260, R_LEN, R_LEN
    359 	B     doCopyRepeat
    360 
    361 repeatLen1:
    362 	// s += 1
    363 	ADD $1, R_SRC, R_SRC
    364 
    365 	// if uint(s) > uint(len(src)) { etc }
    366 	TEST_SRC()
    367 
    368 	// length = src[s-1] + 8
    369 	MOVBU -1(R_SRC), R_LEN
    370 	ADD   $8, R_LEN, R_LEN
    371 	B     doCopyRepeat
    372 
    373 doCopy:
    374 	// This is the end of the outer "switch", when we have a copy tag.
    375 	//
    376 	// We assume that:
    377 	//	- R_LEN == length && R_LEN > 0
    378 	//	- R_OFF == offset
    379 
    380 	// if d < offset { etc }
    381 	MOVD R_DST, R_TMP1
    382 	SUB  R_DBASE, R_TMP1, R_TMP1
    383 	CMP  R_OFF, R_TMP1
    384 	BLT  errCorrupt
    385 
    386 	// Repeat values can skip the test above, since any offset > 0 will be in dst.
    387 doCopyRepeat:
    388 
    389 	// if offset <= 0 { etc }
    390 	CMP $0, R_OFF
    391 	BLE errCorrupt
    392 
    393 	// if length > len(dst)-d { etc }
    394 	MOVD R_DEND, R_TMP1
    395 	SUB  R_DST, R_TMP1, R_TMP1
    396 	CMP  R_TMP1, R_LEN
    397 	BGT  errCorrupt
    398 
    399 	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
    400 	//
    401 	// Set:
    402 	//	- R_TMP2 = len(dst)-d
    403 	//	- R_TMP3 = &dst[d-offset]
    404 	MOVD R_DEND, R_TMP2
    405 	SUB  R_DST, R_TMP2, R_TMP2
    406 	MOVD R_DST, R_TMP3
    407 	SUB  R_OFF, R_TMP3, R_TMP3
    408 
    409 	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
    410 	//
    411 	// First, try using two 8-byte load/stores, similar to the doLit technique
    412 	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
    413 	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
    414 	// and not one 16-byte load/store, and the first store has to be before the
    415 	// second load, due to the overlap if offset is in the range [8, 16).
    416 	//
    417 	// if length > 16 || offset < 8 || len(dst)-d < 16 {
    418 	//   goto slowForwardCopy
    419 	// }
    420 	// copy 16 bytes
    421 	// d += length
    422 	CMP  $16, R_LEN
    423 	BGT  slowForwardCopy
    424 	CMP  $8, R_OFF
    425 	BLT  slowForwardCopy
    426 	CMP  $16, R_TMP2
    427 	BLT  slowForwardCopy
    428 	MOVD 0(R_TMP3), R_TMP0
    429 	MOVD R_TMP0, 0(R_DST)
    430 	MOVD 8(R_TMP3), R_TMP1
    431 	MOVD R_TMP1, 8(R_DST)
    432 	ADD  R_LEN, R_DST, R_DST
    433 	B    loop
    434 
    435 slowForwardCopy:
    436 	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
    437 	// can still try 8-byte load stores, provided we can overrun up to 10 extra
    438 	// bytes. As above, the overrun will be fixed up by subsequent iterations
    439 	// of the outermost loop.
    440 	//
    441 	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
    442 	// commentary says:
    443 	//
    444 	// ----
    445 	//
    446 	// The main part of this loop is a simple copy of eight bytes at a time
    447 	// until we've copied (at least) the requested amount of bytes.  However,
    448 	// if d and d-offset are less than eight bytes apart (indicating a
    449 	// repeating pattern of length < 8), we first need to expand the pattern in
    450 	// order to get the correct results. For instance, if the buffer looks like
    451 	// this, with the eight-byte <d-offset> and <d> patterns marked as
    452 	// intervals:
    453 	//
    454 	//    abxxxxxxxxxxxx
    455 	//    [------]           d-offset
    456 	//      [------]         d
    457 	//
    458 	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
    459 	// once, after which we can move <d> two bytes without moving <d-offset>:
    460 	//
    461 	//    ababxxxxxxxxxx
    462 	//    [------]           d-offset
    463 	//        [------]       d
    464 	//
    465 	// and repeat the exercise until the two no longer overlap.
    466 	//
    467 	// This allows us to do very well in the special case of one single byte
    468 	// repeated many times, without taking a big hit for more general cases.
    469 	//
    470 	// The worst case of extra writing past the end of the match occurs when
    471 	// offset == 1 and length == 1; the last copy will read from byte positions
    472 	// [0..7] and write to [4..11], whereas it was only supposed to write to
    473 	// position 1. Thus, ten excess bytes.
    474 	//
    475 	// ----
    476 	//
    477 	// That "10 byte overrun" worst case is confirmed by Go's
    478 	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
    479 	// and finishSlowForwardCopy algorithm.
    480 	//
    481 	// if length > len(dst)-d-10 {
    482 	//   goto verySlowForwardCopy
    483 	// }
    484 	SUB $10, R_TMP2, R_TMP2
    485 	CMP R_TMP2, R_LEN
    486 	BGT verySlowForwardCopy
    487 
    488 	// We want to keep the offset, so we use R_TMP2 from here.
    489 	MOVD R_OFF, R_TMP2
    490 
    491 makeOffsetAtLeast8:
    492 	// !!! As above, expand the pattern so that offset >= 8 and we can use
    493 	// 8-byte load/stores.
    494 	//
    495 	// for offset < 8 {
    496 	//   copy 8 bytes from dst[d-offset:] to dst[d:]
    497 	//   length -= offset
    498 	//   d      += offset
    499 	//   offset += offset
    500 	//   // The two previous lines together means that d-offset, and therefore
    501 	//   // R_TMP3, is unchanged.
    502 	// }
    503 	CMP  $8, R_TMP2
    504 	BGE  fixUpSlowForwardCopy
    505 	MOVD (R_TMP3), R_TMP1
    506 	MOVD R_TMP1, (R_DST)
    507 	SUB  R_TMP2, R_LEN, R_LEN
    508 	ADD  R_TMP2, R_DST, R_DST
    509 	ADD  R_TMP2, R_TMP2, R_TMP2
    510 	B    makeOffsetAtLeast8
    511 
    512 fixUpSlowForwardCopy:
    513 	// !!! Add length (which might be negative now) to d (implied by R_DST being
    514 	// &dst[d]) so that d ends up at the right place when we jump back to the
    515 	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
    516 	// length is positive, copying the remaining length bytes will write to the
    517 	// right place.
    518 	MOVD R_DST, R_TMP0
    519 	ADD  R_LEN, R_DST, R_DST
    520 
    521 finishSlowForwardCopy:
    522 	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
    523 	// length means that we overrun, but as above, that will be fixed up by
    524 	// subsequent iterations of the outermost loop.
    525 	MOVD $0, R1
    526 	CMP  R1, R_LEN
    527 	BLE  loop
    528 	MOVD (R_TMP3), R_TMP1
    529 	MOVD R_TMP1, (R_TMP0)
    530 	ADD  $8, R_TMP3, R_TMP3
    531 	ADD  $8, R_TMP0, R_TMP0
    532 	SUB  $8, R_LEN, R_LEN
    533 	B    finishSlowForwardCopy
    534 
    535 verySlowForwardCopy:
    536 	// verySlowForwardCopy is a simple implementation of forward copy. In C
    537 	// parlance, this is a do/while loop instead of a while loop, since we know
    538 	// that length > 0. In Go syntax:
    539 	//
    540 	// for {
    541 	//   dst[d] = dst[d - offset]
    542 	//   d++
    543 	//   length--
    544 	//   if length == 0 {
    545 	//     break
    546 	//   }
    547 	// }
    548 	MOVB (R_TMP3), R_TMP1
    549 	MOVB R_TMP1, (R_DST)
    550 	ADD  $1, R_TMP3, R_TMP3
    551 	ADD  $1, R_DST, R_DST
    552 	SUB  $1, R_LEN, R_LEN
    553 	CBNZ R_LEN, verySlowForwardCopy
    554 	B    loop
    555 
    556 	// The code above handles copy tags.
    557 	// ----------------------------------------
    558 
    559 end:
    560 	// This is the end of the "for s < len(src)".
    561 	//
    562 	// if d != len(dst) { etc }
    563 	CMP R_DEND, R_DST
    564 	BNE errCorrupt
    565 
    566 	// return 0
    567 	MOVD $0, ret+48(FP)
    568 	RET
    569 
    570 errCorrupt:
    571 	// return decodeErrCodeCorrupt
    572 	MOVD $1, R_TMP0
    573 	MOVD R_TMP0, ret+48(FP)
    574 	RET