decode_arm64.s (14954B)
1 // Copyright 2020 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build !appengine 6 // +build gc 7 // +build !noasm 8 9 #include "textflag.h" 10 11 #define R_TMP0 R2 12 #define R_TMP1 R3 13 #define R_LEN R4 14 #define R_OFF R5 15 #define R_SRC R6 16 #define R_DST R7 17 #define R_DBASE R8 18 #define R_DLEN R9 19 #define R_DEND R10 20 #define R_SBASE R11 21 #define R_SLEN R12 22 #define R_SEND R13 23 #define R_TMP2 R14 24 #define R_TMP3 R15 25 26 // TEST_SRC will check if R_SRC is <= SRC_END 27 #define TEST_SRC() \ 28 CMP R_SEND, R_SRC \ 29 BGT errCorrupt 30 31 // MOVD R_SRC, R_TMP1 32 // SUB R_SBASE, R_TMP1, R_TMP1 33 // CMP R_SLEN, R_TMP1 34 // BGT errCorrupt 35 36 // The asm code generally follows the pure Go code in decode_other.go, except 37 // where marked with a "!!!". 38 39 // func decode(dst, src []byte) int 40 // 41 // All local variables fit into registers. The non-zero stack size is only to 42 // spill registers and push args when issuing a CALL. The register allocation: 43 // - R_TMP0 scratch 44 // - R_TMP1 scratch 45 // - R_LEN length or x 46 // - R_OFF offset 47 // - R_SRC &src[s] 48 // - R_DST &dst[d] 49 // + R_DBASE dst_base 50 // + R_DLEN dst_len 51 // + R_DEND dst_base + dst_len 52 // + R_SBASE src_base 53 // + R_SLEN src_len 54 // + R_SEND src_base + src_len 55 // - R_TMP2 used by doCopy 56 // - R_TMP3 used by doCopy 57 // 58 // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the 59 // function, and after a CALL returns, and are not otherwise modified. 60 // 61 // The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. 62 // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. 63 TEXT ·s2Decode(SB), NOSPLIT, $56-64 64 // Initialize R_SRC, R_DST and R_DBASE-R_SEND. 65 MOVD dst_base+0(FP), R_DBASE 66 MOVD dst_len+8(FP), R_DLEN 67 MOVD R_DBASE, R_DST 68 MOVD R_DBASE, R_DEND 69 ADD R_DLEN, R_DEND, R_DEND 70 MOVD src_base+24(FP), R_SBASE 71 MOVD src_len+32(FP), R_SLEN 72 MOVD R_SBASE, R_SRC 73 MOVD R_SBASE, R_SEND 74 ADD R_SLEN, R_SEND, R_SEND 75 MOVD $0, R_OFF 76 77 loop: 78 // for s < len(src) 79 CMP R_SEND, R_SRC 80 BEQ end 81 82 // R_LEN = uint32(src[s]) 83 // 84 // switch src[s] & 0x03 85 MOVBU (R_SRC), R_LEN 86 MOVW R_LEN, R_TMP1 87 ANDW $3, R_TMP1 88 MOVW $1, R1 89 CMPW R1, R_TMP1 90 BGE tagCopy 91 92 // ---------------------------------------- 93 // The code below handles literal tags. 94 95 // case tagLiteral: 96 // x := uint32(src[s] >> 2) 97 // switch 98 MOVW $60, R1 99 LSRW $2, R_LEN, R_LEN 100 CMPW R_LEN, R1 101 BLS tagLit60Plus 102 103 // case x < 60: 104 // s++ 105 ADD $1, R_SRC, R_SRC 106 107 doLit: 108 // This is the end of the inner "switch", when we have a literal tag. 109 // 110 // We assume that R_LEN == x and x fits in a uint32, where x is the variable 111 // used in the pure Go decode_other.go code. 112 113 // length = int(x) + 1 114 // 115 // Unlike the pure Go code, we don't need to check if length <= 0 because 116 // R_LEN can hold 64 bits, so the increment cannot overflow. 117 ADD $1, R_LEN, R_LEN 118 119 // Prepare to check if copying length bytes will run past the end of dst or 120 // src. 121 // 122 // R_TMP0 = len(dst) - d 123 // R_TMP1 = len(src) - s 124 MOVD R_DEND, R_TMP0 125 SUB R_DST, R_TMP0, R_TMP0 126 MOVD R_SEND, R_TMP1 127 SUB R_SRC, R_TMP1, R_TMP1 128 129 // !!! Try a faster technique for short (16 or fewer bytes) copies. 130 // 131 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { 132 // goto callMemmove // Fall back on calling runtime·memmove. 133 // } 134 // 135 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s 136 // against 21 instead of 16, because it cannot assume that all of its input 137 // is contiguous in memory and so it needs to leave enough source bytes to 138 // read the next tag without refilling buffers, but Go's Decode assumes 139 // contiguousness (the src argument is a []byte). 140 CMP $16, R_LEN 141 BGT callMemmove 142 CMP $16, R_TMP0 143 BLT callMemmove 144 CMP $16, R_TMP1 145 BLT callMemmove 146 147 // !!! Implement the copy from src to dst as a 16-byte load and store. 148 // (Decode's documentation says that dst and src must not overlap.) 149 // 150 // This always copies 16 bytes, instead of only length bytes, but that's 151 // OK. If the input is a valid Snappy encoding then subsequent iterations 152 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a 153 // non-nil error), so the overrun will be ignored. 154 // 155 // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or 156 // 16-byte loads and stores. This technique probably wouldn't be as 157 // effective on architectures that are fussier about alignment. 158 LDP 0(R_SRC), (R_TMP2, R_TMP3) 159 STP (R_TMP2, R_TMP3), 0(R_DST) 160 161 // d += length 162 // s += length 163 ADD R_LEN, R_DST, R_DST 164 ADD R_LEN, R_SRC, R_SRC 165 B loop 166 167 callMemmove: 168 // if length > len(dst)-d || length > len(src)-s { etc } 169 CMP R_TMP0, R_LEN 170 BGT errCorrupt 171 CMP R_TMP1, R_LEN 172 BGT errCorrupt 173 174 // copy(dst[d:], src[s:s+length]) 175 // 176 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push 177 // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those 178 // three registers to the stack, to save local variables across the CALL. 179 MOVD R_DST, 8(RSP) 180 MOVD R_SRC, 16(RSP) 181 MOVD R_LEN, 24(RSP) 182 MOVD R_DST, 32(RSP) 183 MOVD R_SRC, 40(RSP) 184 MOVD R_LEN, 48(RSP) 185 MOVD R_OFF, 56(RSP) 186 CALL runtime·memmove(SB) 187 188 // Restore local variables: unspill registers from the stack and 189 // re-calculate R_DBASE-R_SEND. 190 MOVD 32(RSP), R_DST 191 MOVD 40(RSP), R_SRC 192 MOVD 48(RSP), R_LEN 193 MOVD 56(RSP), R_OFF 194 MOVD dst_base+0(FP), R_DBASE 195 MOVD dst_len+8(FP), R_DLEN 196 MOVD R_DBASE, R_DEND 197 ADD R_DLEN, R_DEND, R_DEND 198 MOVD src_base+24(FP), R_SBASE 199 MOVD src_len+32(FP), R_SLEN 200 MOVD R_SBASE, R_SEND 201 ADD R_SLEN, R_SEND, R_SEND 202 203 // d += length 204 // s += length 205 ADD R_LEN, R_DST, R_DST 206 ADD R_LEN, R_SRC, R_SRC 207 B loop 208 209 tagLit60Plus: 210 // !!! This fragment does the 211 // 212 // s += x - 58; if uint(s) > uint(len(src)) { etc } 213 // 214 // checks. In the asm version, we code it once instead of once per switch case. 215 ADD R_LEN, R_SRC, R_SRC 216 SUB $58, R_SRC, R_SRC 217 TEST_SRC() 218 219 // case x == 60: 220 MOVW $61, R1 221 CMPW R1, R_LEN 222 BEQ tagLit61 223 BGT tagLit62Plus 224 225 // x = uint32(src[s-1]) 226 MOVBU -1(R_SRC), R_LEN 227 B doLit 228 229 tagLit61: 230 // case x == 61: 231 // x = uint32(src[s-2]) | uint32(src[s-1])<<8 232 MOVHU -2(R_SRC), R_LEN 233 B doLit 234 235 tagLit62Plus: 236 CMPW $62, R_LEN 237 BHI tagLit63 238 239 // case x == 62: 240 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 241 MOVHU -3(R_SRC), R_LEN 242 MOVBU -1(R_SRC), R_TMP1 243 ORR R_TMP1<<16, R_LEN 244 B doLit 245 246 tagLit63: 247 // case x == 63: 248 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 249 MOVWU -4(R_SRC), R_LEN 250 B doLit 251 252 // The code above handles literal tags. 253 // ---------------------------------------- 254 // The code below handles copy tags. 255 256 tagCopy4: 257 // case tagCopy4: 258 // s += 5 259 ADD $5, R_SRC, R_SRC 260 261 // if uint(s) > uint(len(src)) { etc } 262 MOVD R_SRC, R_TMP1 263 SUB R_SBASE, R_TMP1, R_TMP1 264 CMP R_SLEN, R_TMP1 265 BGT errCorrupt 266 267 // length = 1 + int(src[s-5])>>2 268 MOVD $1, R1 269 ADD R_LEN>>2, R1, R_LEN 270 271 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) 272 MOVWU -4(R_SRC), R_OFF 273 B doCopy 274 275 tagCopy2: 276 // case tagCopy2: 277 // s += 3 278 ADD $3, R_SRC, R_SRC 279 280 // if uint(s) > uint(len(src)) { etc } 281 TEST_SRC() 282 283 // length = 1 + int(src[s-3])>>2 284 MOVD $1, R1 285 ADD R_LEN>>2, R1, R_LEN 286 287 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) 288 MOVHU -2(R_SRC), R_OFF 289 B doCopy 290 291 tagCopy: 292 // We have a copy tag. We assume that: 293 // - R_TMP1 == src[s] & 0x03 294 // - R_LEN == src[s] 295 CMP $2, R_TMP1 296 BEQ tagCopy2 297 BGT tagCopy4 298 299 // case tagCopy1: 300 // s += 2 301 ADD $2, R_SRC, R_SRC 302 303 // if uint(s) > uint(len(src)) { etc } 304 TEST_SRC() 305 306 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) 307 // Calculate offset in R_TMP0 in case it is a repeat. 308 MOVD R_LEN, R_TMP0 309 AND $0xe0, R_TMP0 310 MOVBU -1(R_SRC), R_TMP1 311 ORR R_TMP0<<3, R_TMP1, R_TMP0 312 313 // length = 4 + int(src[s-2])>>2&0x7 314 MOVD $7, R1 315 AND R_LEN>>2, R1, R_LEN 316 ADD $4, R_LEN, R_LEN 317 318 // check if repeat code with offset 0. 319 CMP $0, R_TMP0 320 BEQ repeatCode 321 322 // This is a regular copy, transfer our temporary value to R_OFF (offset) 323 MOVD R_TMP0, R_OFF 324 B doCopy 325 326 // This is a repeat code. 327 repeatCode: 328 // If length < 9, reuse last offset, with the length already calculated. 329 CMP $9, R_LEN 330 BLT doCopyRepeat 331 BEQ repeatLen1 332 CMP $10, R_LEN 333 BEQ repeatLen2 334 335 repeatLen3: 336 // s +=3 337 ADD $3, R_SRC, R_SRC 338 339 // if uint(s) > uint(len(src)) { etc } 340 TEST_SRC() 341 342 // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 343 MOVBU -1(R_SRC), R_TMP0 344 MOVHU -3(R_SRC), R_LEN 345 ORR R_TMP0<<16, R_LEN, R_LEN 346 ADD $65540, R_LEN, R_LEN 347 B doCopyRepeat 348 349 repeatLen2: 350 // s +=2 351 ADD $2, R_SRC, R_SRC 352 353 // if uint(s) > uint(len(src)) { etc } 354 TEST_SRC() 355 356 // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 357 MOVHU -2(R_SRC), R_LEN 358 ADD $260, R_LEN, R_LEN 359 B doCopyRepeat 360 361 repeatLen1: 362 // s +=1 363 ADD $1, R_SRC, R_SRC 364 365 // if uint(s) > uint(len(src)) { etc } 366 TEST_SRC() 367 368 // length = src[s-1] + 8 369 MOVBU -1(R_SRC), R_LEN 370 ADD $8, R_LEN, R_LEN 371 B doCopyRepeat 372 373 doCopy: 374 // This is the end of the outer "switch", when we have a copy tag. 375 // 376 // We assume that: 377 // - R_LEN == length && R_LEN > 0 378 // - R_OFF == offset 379 380 // if d < offset { etc } 381 MOVD R_DST, R_TMP1 382 SUB R_DBASE, R_TMP1, R_TMP1 383 CMP R_OFF, R_TMP1 384 BLT errCorrupt 385 386 // Repeat values can skip the test above, since any offset > 0 will be in dst. 387 doCopyRepeat: 388 389 // if offset <= 0 { etc } 390 CMP $0, R_OFF 391 BLE errCorrupt 392 393 // if length > len(dst)-d { etc } 394 MOVD R_DEND, R_TMP1 395 SUB R_DST, R_TMP1, R_TMP1 396 CMP R_TMP1, R_LEN 397 BGT errCorrupt 398 399 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length 400 // 401 // Set: 402 // - R_TMP2 = len(dst)-d 403 // - R_TMP3 = &dst[d-offset] 404 MOVD R_DEND, R_TMP2 405 SUB R_DST, R_TMP2, R_TMP2 406 MOVD R_DST, R_TMP3 407 SUB R_OFF, R_TMP3, R_TMP3 408 409 // !!! Try a faster technique for short (16 or fewer bytes) forward copies. 410 // 411 // First, try using two 8-byte load/stores, similar to the doLit technique 412 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is 413 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores 414 // and not one 16-byte load/store, and the first store has to be before the 415 // second load, due to the overlap if offset is in the range [8, 16). 416 // 417 // if length > 16 || offset < 8 || len(dst)-d < 16 { 418 // goto slowForwardCopy 419 // } 420 // copy 16 bytes 421 // d += length 422 CMP $16, R_LEN 423 BGT slowForwardCopy 424 CMP $8, R_OFF 425 BLT slowForwardCopy 426 CMP $16, R_TMP2 427 BLT slowForwardCopy 428 MOVD 0(R_TMP3), R_TMP0 429 MOVD R_TMP0, 0(R_DST) 430 MOVD 8(R_TMP3), R_TMP1 431 MOVD R_TMP1, 8(R_DST) 432 ADD R_LEN, R_DST, R_DST 433 B loop 434 435 slowForwardCopy: 436 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we 437 // can still try 8-byte load stores, provided we can overrun up to 10 extra 438 // bytes. As above, the overrun will be fixed up by subsequent iterations 439 // of the outermost loop. 440 // 441 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its 442 // commentary says: 443 // 444 // ---- 445 // 446 // The main part of this loop is a simple copy of eight bytes at a time 447 // until we've copied (at least) the requested amount of bytes. However, 448 // if d and d-offset are less than eight bytes apart (indicating a 449 // repeating pattern of length < 8), we first need to expand the pattern in 450 // order to get the correct results. For instance, if the buffer looks like 451 // this, with the eight-byte <d-offset> and <d> patterns marked as 452 // intervals: 453 // 454 // abxxxxxxxxxxxx 455 // [------] d-offset 456 // [------] d 457 // 458 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern 459 // once, after which we can move <d> two bytes without moving <d-offset>: 460 // 461 // ababxxxxxxxxxx 462 // [------] d-offset 463 // [------] d 464 // 465 // and repeat the exercise until the two no longer overlap. 466 // 467 // This allows us to do very well in the special case of one single byte 468 // repeated many times, without taking a big hit for more general cases. 469 // 470 // The worst case of extra writing past the end of the match occurs when 471 // offset == 1 and length == 1; the last copy will read from byte positions 472 // [0..7] and write to [4..11], whereas it was only supposed to write to 473 // position 1. Thus, ten excess bytes. 474 // 475 // ---- 476 // 477 // That "10 byte overrun" worst case is confirmed by Go's 478 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy 479 // and finishSlowForwardCopy algorithm. 480 // 481 // if length > len(dst)-d-10 { 482 // goto verySlowForwardCopy 483 // } 484 SUB $10, R_TMP2, R_TMP2 485 CMP R_TMP2, R_LEN 486 BGT verySlowForwardCopy 487 488 // We want to keep the offset, so we use R_TMP2 from here. 489 MOVD R_OFF, R_TMP2 490 491 makeOffsetAtLeast8: 492 // !!! As above, expand the pattern so that offset >= 8 and we can use 493 // 8-byte load/stores. 494 // 495 // for offset < 8 { 496 // copy 8 bytes from dst[d-offset:] to dst[d:] 497 // length -= offset 498 // d += offset 499 // offset += offset 500 // // The two previous lines together means that d-offset, and therefore 501 // // R_TMP3, is unchanged. 502 // } 503 CMP $8, R_TMP2 504 BGE fixUpSlowForwardCopy 505 MOVD (R_TMP3), R_TMP1 506 MOVD R_TMP1, (R_DST) 507 SUB R_TMP2, R_LEN, R_LEN 508 ADD R_TMP2, R_DST, R_DST 509 ADD R_TMP2, R_TMP2, R_TMP2 510 B makeOffsetAtLeast8 511 512 fixUpSlowForwardCopy: 513 // !!! Add length (which might be negative now) to d (implied by R_DST being 514 // &dst[d]) so that d ends up at the right place when we jump back to the 515 // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if 516 // length is positive, copying the remaining length bytes will write to the 517 // right place. 518 MOVD R_DST, R_TMP0 519 ADD R_LEN, R_DST, R_DST 520 521 finishSlowForwardCopy: 522 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative 523 // length means that we overrun, but as above, that will be fixed up by 524 // subsequent iterations of the outermost loop. 525 MOVD $0, R1 526 CMP R1, R_LEN 527 BLE loop 528 MOVD (R_TMP3), R_TMP1 529 MOVD R_TMP1, (R_TMP0) 530 ADD $8, R_TMP3, R_TMP3 531 ADD $8, R_TMP0, R_TMP0 532 SUB $8, R_LEN, R_LEN 533 B finishSlowForwardCopy 534 535 verySlowForwardCopy: 536 // verySlowForwardCopy is a simple implementation of forward copy. In C 537 // parlance, this is a do/while loop instead of a while loop, since we know 538 // that length > 0. In Go syntax: 539 // 540 // for { 541 // dst[d] = dst[d - offset] 542 // d++ 543 // length-- 544 // if length == 0 { 545 // break 546 // } 547 // } 548 MOVB (R_TMP3), R_TMP1 549 MOVB R_TMP1, (R_DST) 550 ADD $1, R_TMP3, R_TMP3 551 ADD $1, R_DST, R_DST 552 SUB $1, R_LEN, R_LEN 553 CBNZ R_LEN, verySlowForwardCopy 554 B loop 555 556 // The code above handles copy tags. 557 // ---------------------------------------- 558 559 end: 560 // This is the end of the "for s < len(src)". 561 // 562 // if d != len(dst) { etc } 563 CMP R_DEND, R_DST 564 BNE errCorrupt 565 566 // return 0 567 MOVD $0, ret+48(FP) 568 RET 569 570 errCorrupt: 571 // return decodeErrCodeCorrupt 572 MOVD $1, R_TMP0 573 MOVD R_TMP0, ret+48(FP) 574 RET