decode_amd64.s (14748B)
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// Role-based aliases for the physical registers, so the code below can be
// cross-checked against the allocation table in the function comment.
#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN 	length or x (shared)
//	- R_OFF 	offset
//	- R_SRC 	&src[s]
//	- R_DST 	&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $48-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// offset starts at 0, so a repeat tag seen before any copy tag falls
	// into the "offset <= 0" check at doCopyRepeat and reports corruption.
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ  end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL    R_LEN, R_TMP1
	ANDL    $3, R_TMP1
	CMPL    R_TMP1, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT  callMemmove
	CMPQ R_TMP0, $16
	JLT  callMemmove
	CMPQ R_TMP1, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT  errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	//
	// NOTE(review): the frame is declared $48, yet R_OFF is spilled at 48(SP),
	// one word past the declared locals — verify against the assembler's
	// actual frame layout (saved BP / return-address slots) that this spill
	// cannot clobber anything live across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP     doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	// We read one byte, safe to read one back, since we are just reading tag.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP  doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP  doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN  == src[s]
	CMPQ R_TMP1, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ    R_LEN, R_TMP0
	SHRQ    $2, R_LEN
	ANDQ    $0xe0, R_TMP0
	ANDQ    $7, R_LEN
	SHLQ    $3, R_TMP0
	ADDQ    $4, R_LEN
	ORQ     R_TMP1, R_TMP0

	// An all-zero offset marks a repeat code; ZF was set by the ORQ above.
	JZ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (offset).
	MOVQ R_TMP0, R_OFF
	JMP  doCopy

	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL   doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE   repeatLen2
	JMP  repeatLen3

	// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL    $8, R_LEN
	JMP     doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
	// The constant 260 == (1<<8) + 4, i.e. the base offset plus the minimum
	// copy length folded into one add.
	MOVWQZX -2(R_SRC), R_LEN
	ADDL    $260, R_LEN
	JMP     doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
	// Read one byte further back (just part of the tag, shifted out).
	// The constant 65540 == (1<<16) + 4, as in repeatLen2.
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP  doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT  slowForwardCopy
	CMPQ R_OFF, $8
	JLT  slowForwardCopy
	CMPQ R_TMP2, $16
	JLT  slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//	abxxxxxxxxxxxx
	//	[------]           d-offset
	//	  [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//	ababxxxxxxxxxx
	//	[------]           d-offset
	//	    [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT  verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE  loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ  verySlowForwardCopy
	JMP  loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET