decode_amd64.s (14748B)
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// Role-based aliases for the physical registers, so the code below can be
// cross-checked against the allocation table in the function comment.
#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN 	length or x (shared)
//	- R_OFF 	offset
//	- R_SRC 	&src[s]
//	- R_DST 	&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $48-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// offset starts at 0, so a repeat tag seen before any copy tag falls
	// into the "offset <= 0" check at doCopyRepeat and reports corruption.
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ  end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL    R_LEN, R_TMP1
	ANDL    $3, R_TMP1
	CMPL    R_TMP1, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT  callMemmove
	CMPQ R_TMP0, $16
	JLT  callMemmove
	CMPQ R_TMP1, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT  errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	//
	// NOTE(review): the frame is declared $48, yet R_OFF is spilled at 48(SP),
	// one word past the declared locals — verify against the assembler's
	// actual frame layout (saved BP / return-address slots) that this spill
	// cannot clobber anything live across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP     doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	// We read one byte, safe to read one back, since we are just reading tag.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP  doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP  doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN  == src[s]
	CMPQ R_TMP1, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ    R_LEN, R_TMP0
	SHRQ    $2, R_LEN
	ANDQ    $0xe0, R_TMP0
	ANDQ    $7, R_LEN
	SHLQ    $3, R_TMP0
	ADDQ    $4, R_LEN
	ORQ     R_TMP1, R_TMP0

	// An all-zero offset marks a repeat code; ZF was set by the ORQ above.
	JZ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (offset).
	MOVQ R_TMP0, R_OFF
	JMP  doCopy

	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL   doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE   repeatLen2
	JMP  repeatLen3

	// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL    $8, R_LEN
	JMP     doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
	// The constant 260 == (1<<8) + 4, i.e. the base offset plus the minimum
	// copy length folded into one add.
	MOVWQZX -2(R_SRC), R_LEN
	ADDL    $260, R_LEN
	JMP     doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
	// Read one byte further back (just part of the tag, shifted out).
	// The constant 65540 == (1<<16) + 4, as in repeatLen2.
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP  doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT  slowForwardCopy
	CMPQ R_OFF, $8
	JLT  slowForwardCopy
	CMPQ R_TMP2, $16
	JLT  slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//	abxxxxxxxxxxxx
	//	[------]           d-offset
	//	  [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//	ababxxxxxxxxxx
	//	[------]           d-offset
	//	    [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT  verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE  loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ  verySlowForwardCopy
	JMP  loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET