encodeblock_amd64.s (512643B)
1 // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. 2 3 //go:build !appengine && !noasm && gc && !noasm 4 5 #include "textflag.h" 6 7 // func _dummy_() 8 TEXT ·_dummy_(SB), $0 9 #ifdef GOAMD64_v4 10 #ifndef GOAMD64_v3 11 #define GOAMD64_v3 12 #endif 13 #endif 14 RET 15 16 // func encodeBlockAsm(dst []byte, src []byte) int 17 // Requires: BMI, SSE2 18 TEXT ·encodeBlockAsm(SB), $65560-56 19 MOVQ dst_base+0(FP), AX 20 MOVQ $0x00000200, CX 21 LEAQ 24(SP), DX 22 PXOR X0, X0 23 24 zero_loop_encodeBlockAsm: 25 MOVOU X0, (DX) 26 MOVOU X0, 16(DX) 27 MOVOU X0, 32(DX) 28 MOVOU X0, 48(DX) 29 MOVOU X0, 64(DX) 30 MOVOU X0, 80(DX) 31 MOVOU X0, 96(DX) 32 MOVOU X0, 112(DX) 33 ADDQ $0x80, DX 34 DECQ CX 35 JNZ zero_loop_encodeBlockAsm 36 MOVL $0x00000000, 12(SP) 37 MOVQ src_len+32(FP), CX 38 LEAQ -9(CX), DX 39 LEAQ -8(CX), BX 40 MOVL BX, 8(SP) 41 SHRQ $0x05, CX 42 SUBL CX, DX 43 LEAQ (AX)(DX*1), DX 44 MOVQ DX, (SP) 45 MOVL $0x00000001, CX 46 MOVL CX, 16(SP) 47 MOVQ src_base+24(FP), DX 48 49 search_loop_encodeBlockAsm: 50 MOVL CX, BX 51 SUBL 12(SP), BX 52 SHRL $0x06, BX 53 LEAL 4(CX)(BX*1), BX 54 CMPL BX, 8(SP) 55 JAE emit_remainder_encodeBlockAsm 56 MOVQ (DX)(CX*1), SI 57 MOVL BX, 20(SP) 58 MOVQ $0x0000cf1bbcdcbf9b, R8 59 MOVQ SI, R9 60 MOVQ SI, R10 61 SHRQ $0x08, R10 62 SHLQ $0x10, R9 63 IMULQ R8, R9 64 SHRQ $0x32, R9 65 SHLQ $0x10, R10 66 IMULQ R8, R10 67 SHRQ $0x32, R10 68 MOVL 24(SP)(R9*4), BX 69 MOVL 24(SP)(R10*4), DI 70 MOVL CX, 24(SP)(R9*4) 71 LEAL 1(CX), R9 72 MOVL R9, 24(SP)(R10*4) 73 MOVQ SI, R9 74 SHRQ $0x10, R9 75 SHLQ $0x10, R9 76 IMULQ R8, R9 77 SHRQ $0x32, R9 78 MOVL CX, R8 79 SUBL 16(SP), R8 80 MOVL 1(DX)(R8*1), R10 81 MOVQ SI, R8 82 SHRQ $0x08, R8 83 CMPL R8, R10 84 JNE no_repeat_found_encodeBlockAsm 85 LEAL 1(CX), SI 86 MOVL 12(SP), DI 87 MOVL SI, BX 88 SUBL 16(SP), BX 89 JZ repeat_extend_back_end_encodeBlockAsm 90 91 repeat_extend_back_loop_encodeBlockAsm: 92 CMPL SI, DI 93 JBE 
repeat_extend_back_end_encodeBlockAsm 94 MOVB -1(DX)(BX*1), R8 95 MOVB -1(DX)(SI*1), R9 96 CMPB R8, R9 97 JNE repeat_extend_back_end_encodeBlockAsm 98 LEAL -1(SI), SI 99 DECL BX 100 JNZ repeat_extend_back_loop_encodeBlockAsm 101 102 repeat_extend_back_end_encodeBlockAsm: 103 MOVL 12(SP), BX 104 CMPL BX, SI 105 JEQ emit_literal_done_repeat_emit_encodeBlockAsm 106 MOVL SI, R8 107 MOVL SI, 12(SP) 108 LEAQ (DX)(BX*1), R9 109 SUBL BX, R8 110 LEAL -1(R8), BX 111 CMPL BX, $0x3c 112 JB one_byte_repeat_emit_encodeBlockAsm 113 CMPL BX, $0x00000100 114 JB two_bytes_repeat_emit_encodeBlockAsm 115 CMPL BX, $0x00010000 116 JB three_bytes_repeat_emit_encodeBlockAsm 117 CMPL BX, $0x01000000 118 JB four_bytes_repeat_emit_encodeBlockAsm 119 MOVB $0xfc, (AX) 120 MOVL BX, 1(AX) 121 ADDQ $0x05, AX 122 JMP memmove_long_repeat_emit_encodeBlockAsm 123 124 four_bytes_repeat_emit_encodeBlockAsm: 125 MOVL BX, R10 126 SHRL $0x10, R10 127 MOVB $0xf8, (AX) 128 MOVW BX, 1(AX) 129 MOVB R10, 3(AX) 130 ADDQ $0x04, AX 131 JMP memmove_long_repeat_emit_encodeBlockAsm 132 133 three_bytes_repeat_emit_encodeBlockAsm: 134 MOVB $0xf4, (AX) 135 MOVW BX, 1(AX) 136 ADDQ $0x03, AX 137 JMP memmove_long_repeat_emit_encodeBlockAsm 138 139 two_bytes_repeat_emit_encodeBlockAsm: 140 MOVB $0xf0, (AX) 141 MOVB BL, 1(AX) 142 ADDQ $0x02, AX 143 CMPL BX, $0x40 144 JB memmove_repeat_emit_encodeBlockAsm 145 JMP memmove_long_repeat_emit_encodeBlockAsm 146 147 one_byte_repeat_emit_encodeBlockAsm: 148 SHLB $0x02, BL 149 MOVB BL, (AX) 150 ADDQ $0x01, AX 151 152 memmove_repeat_emit_encodeBlockAsm: 153 LEAQ (AX)(R8*1), BX 154 155 // genMemMoveShort 156 CMPQ R8, $0x08 157 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 158 CMPQ R8, $0x10 159 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 160 CMPQ R8, $0x20 161 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 162 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 163 164 
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: 165 MOVQ (R9), R10 166 MOVQ R10, (AX) 167 JMP memmove_end_copy_repeat_emit_encodeBlockAsm 168 169 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: 170 MOVQ (R9), R10 171 MOVQ -8(R9)(R8*1), R9 172 MOVQ R10, (AX) 173 MOVQ R9, -8(AX)(R8*1) 174 JMP memmove_end_copy_repeat_emit_encodeBlockAsm 175 176 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: 177 MOVOU (R9), X0 178 MOVOU -16(R9)(R8*1), X1 179 MOVOU X0, (AX) 180 MOVOU X1, -16(AX)(R8*1) 181 JMP memmove_end_copy_repeat_emit_encodeBlockAsm 182 183 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: 184 MOVOU (R9), X0 185 MOVOU 16(R9), X1 186 MOVOU -32(R9)(R8*1), X2 187 MOVOU -16(R9)(R8*1), X3 188 MOVOU X0, (AX) 189 MOVOU X1, 16(AX) 190 MOVOU X2, -32(AX)(R8*1) 191 MOVOU X3, -16(AX)(R8*1) 192 193 memmove_end_copy_repeat_emit_encodeBlockAsm: 194 MOVQ BX, AX 195 JMP emit_literal_done_repeat_emit_encodeBlockAsm 196 197 memmove_long_repeat_emit_encodeBlockAsm: 198 LEAQ (AX)(R8*1), BX 199 200 // genMemMoveLong 201 MOVOU (R9), X0 202 MOVOU 16(R9), X1 203 MOVOU -32(R9)(R8*1), X2 204 MOVOU -16(R9)(R8*1), X3 205 MOVQ R8, R11 206 SHRQ $0x05, R11 207 MOVQ AX, R10 208 ANDL $0x0000001f, R10 209 MOVQ $0x00000040, R12 210 SUBQ R10, R12 211 DECQ R11 212 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 213 LEAQ -32(R9)(R12*1), R10 214 LEAQ -32(AX)(R12*1), R13 215 216 emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: 217 MOVOU (R10), X4 218 MOVOU 16(R10), X5 219 MOVOA X4, (R13) 220 MOVOA X5, 16(R13) 221 ADDQ $0x20, R13 222 ADDQ $0x20, R10 223 ADDQ $0x20, R12 224 DECQ R11 225 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back 226 227 emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: 228 MOVOU -32(R9)(R12*1), X4 229 MOVOU -16(R9)(R12*1), X5 230 MOVOA X4, -32(AX)(R12*1) 231 MOVOA X5, -16(AX)(R12*1) 232 ADDQ $0x20, R12 233 CMPQ R8, 
R12 234 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 235 MOVOU X0, (AX) 236 MOVOU X1, 16(AX) 237 MOVOU X2, -32(AX)(R8*1) 238 MOVOU X3, -16(AX)(R8*1) 239 MOVQ BX, AX 240 241 emit_literal_done_repeat_emit_encodeBlockAsm: 242 ADDL $0x05, CX 243 MOVL CX, BX 244 SUBL 16(SP), BX 245 MOVQ src_len+32(FP), R8 246 SUBL CX, R8 247 LEAQ (DX)(CX*1), R9 248 LEAQ (DX)(BX*1), BX 249 250 // matchLen 251 XORL R11, R11 252 CMPL R8, $0x08 253 JB matchlen_match4_repeat_extend_encodeBlockAsm 254 255 matchlen_loopback_repeat_extend_encodeBlockAsm: 256 MOVQ (R9)(R11*1), R10 257 XORQ (BX)(R11*1), R10 258 TESTQ R10, R10 259 JZ matchlen_loop_repeat_extend_encodeBlockAsm 260 261 #ifdef GOAMD64_v3 262 TZCNTQ R10, R10 263 264 #else 265 BSFQ R10, R10 266 267 #endif 268 SARQ $0x03, R10 269 LEAL (R11)(R10*1), R11 270 JMP repeat_extend_forward_end_encodeBlockAsm 271 272 matchlen_loop_repeat_extend_encodeBlockAsm: 273 LEAL -8(R8), R8 274 LEAL 8(R11), R11 275 CMPL R8, $0x08 276 JAE matchlen_loopback_repeat_extend_encodeBlockAsm 277 JZ repeat_extend_forward_end_encodeBlockAsm 278 279 matchlen_match4_repeat_extend_encodeBlockAsm: 280 CMPL R8, $0x04 281 JB matchlen_match2_repeat_extend_encodeBlockAsm 282 MOVL (R9)(R11*1), R10 283 CMPL (BX)(R11*1), R10 284 JNE matchlen_match2_repeat_extend_encodeBlockAsm 285 SUBL $0x04, R8 286 LEAL 4(R11), R11 287 288 matchlen_match2_repeat_extend_encodeBlockAsm: 289 CMPL R8, $0x02 290 JB matchlen_match1_repeat_extend_encodeBlockAsm 291 MOVW (R9)(R11*1), R10 292 CMPW (BX)(R11*1), R10 293 JNE matchlen_match1_repeat_extend_encodeBlockAsm 294 SUBL $0x02, R8 295 LEAL 2(R11), R11 296 297 matchlen_match1_repeat_extend_encodeBlockAsm: 298 CMPL R8, $0x01 299 JB repeat_extend_forward_end_encodeBlockAsm 300 MOVB (R9)(R11*1), R10 301 CMPB (BX)(R11*1), R10 302 JNE repeat_extend_forward_end_encodeBlockAsm 303 LEAL 1(R11), R11 304 305 repeat_extend_forward_end_encodeBlockAsm: 306 ADDL R11, CX 307 MOVL CX, BX 308 SUBL SI, BX 309 MOVL 16(SP), SI 310 
TESTL DI, DI 311 JZ repeat_as_copy_encodeBlockAsm 312 313 // emitRepeat 314 emit_repeat_again_match_repeat_encodeBlockAsm: 315 MOVL BX, DI 316 LEAL -4(BX), BX 317 CMPL DI, $0x08 318 JBE repeat_two_match_repeat_encodeBlockAsm 319 CMPL DI, $0x0c 320 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm 321 CMPL SI, $0x00000800 322 JB repeat_two_offset_match_repeat_encodeBlockAsm 323 324 cant_repeat_two_offset_match_repeat_encodeBlockAsm: 325 CMPL BX, $0x00000104 326 JB repeat_three_match_repeat_encodeBlockAsm 327 CMPL BX, $0x00010100 328 JB repeat_four_match_repeat_encodeBlockAsm 329 CMPL BX, $0x0100ffff 330 JB repeat_five_match_repeat_encodeBlockAsm 331 LEAL -16842747(BX), BX 332 MOVL $0xfffb001d, (AX) 333 MOVB $0xff, 4(AX) 334 ADDQ $0x05, AX 335 JMP emit_repeat_again_match_repeat_encodeBlockAsm 336 337 repeat_five_match_repeat_encodeBlockAsm: 338 LEAL -65536(BX), BX 339 MOVL BX, SI 340 MOVW $0x001d, (AX) 341 MOVW BX, 2(AX) 342 SARL $0x10, SI 343 MOVB SI, 4(AX) 344 ADDQ $0x05, AX 345 JMP repeat_end_emit_encodeBlockAsm 346 347 repeat_four_match_repeat_encodeBlockAsm: 348 LEAL -256(BX), BX 349 MOVW $0x0019, (AX) 350 MOVW BX, 2(AX) 351 ADDQ $0x04, AX 352 JMP repeat_end_emit_encodeBlockAsm 353 354 repeat_three_match_repeat_encodeBlockAsm: 355 LEAL -4(BX), BX 356 MOVW $0x0015, (AX) 357 MOVB BL, 2(AX) 358 ADDQ $0x03, AX 359 JMP repeat_end_emit_encodeBlockAsm 360 361 repeat_two_match_repeat_encodeBlockAsm: 362 SHLL $0x02, BX 363 ORL $0x01, BX 364 MOVW BX, (AX) 365 ADDQ $0x02, AX 366 JMP repeat_end_emit_encodeBlockAsm 367 368 repeat_two_offset_match_repeat_encodeBlockAsm: 369 XORQ DI, DI 370 LEAL 1(DI)(BX*4), BX 371 MOVB SI, 1(AX) 372 SARL $0x08, SI 373 SHLL $0x05, SI 374 ORL SI, BX 375 MOVB BL, (AX) 376 ADDQ $0x02, AX 377 JMP repeat_end_emit_encodeBlockAsm 378 379 repeat_as_copy_encodeBlockAsm: 380 // emitCopy 381 CMPL SI, $0x00010000 382 JB two_byte_offset_repeat_as_copy_encodeBlockAsm 383 CMPL BX, $0x40 384 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm 385 MOVB 
$0xff, (AX) 386 MOVL SI, 1(AX) 387 LEAL -64(BX), BX 388 ADDQ $0x05, AX 389 CMPL BX, $0x04 390 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm 391 392 // emitRepeat 393 emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: 394 MOVL BX, DI 395 LEAL -4(BX), BX 396 CMPL DI, $0x08 397 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy 398 CMPL DI, $0x0c 399 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy 400 CMPL SI, $0x00000800 401 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy 402 403 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: 404 CMPL BX, $0x00000104 405 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy 406 CMPL BX, $0x00010100 407 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy 408 CMPL BX, $0x0100ffff 409 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy 410 LEAL -16842747(BX), BX 411 MOVL $0xfffb001d, (AX) 412 MOVB $0xff, 4(AX) 413 ADDQ $0x05, AX 414 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy 415 416 repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: 417 LEAL -65536(BX), BX 418 MOVL BX, SI 419 MOVW $0x001d, (AX) 420 MOVW BX, 2(AX) 421 SARL $0x10, SI 422 MOVB SI, 4(AX) 423 ADDQ $0x05, AX 424 JMP repeat_end_emit_encodeBlockAsm 425 426 repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: 427 LEAL -256(BX), BX 428 MOVW $0x0019, (AX) 429 MOVW BX, 2(AX) 430 ADDQ $0x04, AX 431 JMP repeat_end_emit_encodeBlockAsm 432 433 repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: 434 LEAL -4(BX), BX 435 MOVW $0x0015, (AX) 436 MOVB BL, 2(AX) 437 ADDQ $0x03, AX 438 JMP repeat_end_emit_encodeBlockAsm 439 440 repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: 441 SHLL $0x02, BX 442 ORL $0x01, BX 443 MOVW BX, (AX) 444 ADDQ $0x02, AX 445 JMP repeat_end_emit_encodeBlockAsm 446 447 repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: 448 XORQ DI, DI 449 LEAL 1(DI)(BX*4), BX 450 MOVB SI, 1(AX) 451 SARL $0x08, SI 452 SHLL $0x05, SI 453 ORL SI, BX 454 MOVB BL, (AX) 455 ADDQ 
$0x02, AX 456 JMP repeat_end_emit_encodeBlockAsm 457 458 four_bytes_remain_repeat_as_copy_encodeBlockAsm: 459 TESTL BX, BX 460 JZ repeat_end_emit_encodeBlockAsm 461 XORL DI, DI 462 LEAL -1(DI)(BX*4), BX 463 MOVB BL, (AX) 464 MOVL SI, 1(AX) 465 ADDQ $0x05, AX 466 JMP repeat_end_emit_encodeBlockAsm 467 468 two_byte_offset_repeat_as_copy_encodeBlockAsm: 469 CMPL BX, $0x40 470 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm 471 CMPL SI, $0x00000800 472 JAE long_offset_short_repeat_as_copy_encodeBlockAsm 473 MOVL $0x00000001, DI 474 LEAL 16(DI), DI 475 MOVB SI, 1(AX) 476 MOVL SI, R8 477 SHRL $0x08, R8 478 SHLL $0x05, R8 479 ORL R8, DI 480 MOVB DI, (AX) 481 ADDQ $0x02, AX 482 SUBL $0x08, BX 483 484 // emitRepeat 485 LEAL -4(BX), BX 486 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 487 488 emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 489 MOVL BX, DI 490 LEAL -4(BX), BX 491 CMPL DI, $0x08 492 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 493 CMPL DI, $0x0c 494 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 495 CMPL SI, $0x00000800 496 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 497 498 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 499 CMPL BX, $0x00000104 500 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 501 CMPL BX, $0x00010100 502 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 503 CMPL BX, $0x0100ffff 504 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 505 LEAL -16842747(BX), BX 506 MOVL $0xfffb001d, (AX) 507 MOVB $0xff, 4(AX) 508 ADDQ $0x05, AX 509 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b 510 511 repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 512 LEAL -65536(BX), BX 513 MOVL BX, SI 514 MOVW $0x001d, (AX) 515 MOVW BX, 2(AX) 516 SARL $0x10, SI 517 MOVB SI, 4(AX) 518 ADDQ $0x05, AX 519 JMP repeat_end_emit_encodeBlockAsm 
520 521 repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 522 LEAL -256(BX), BX 523 MOVW $0x0019, (AX) 524 MOVW BX, 2(AX) 525 ADDQ $0x04, AX 526 JMP repeat_end_emit_encodeBlockAsm 527 528 repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 529 LEAL -4(BX), BX 530 MOVW $0x0015, (AX) 531 MOVB BL, 2(AX) 532 ADDQ $0x03, AX 533 JMP repeat_end_emit_encodeBlockAsm 534 535 repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 536 SHLL $0x02, BX 537 ORL $0x01, BX 538 MOVW BX, (AX) 539 ADDQ $0x02, AX 540 JMP repeat_end_emit_encodeBlockAsm 541 542 repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: 543 XORQ DI, DI 544 LEAL 1(DI)(BX*4), BX 545 MOVB SI, 1(AX) 546 SARL $0x08, SI 547 SHLL $0x05, SI 548 ORL SI, BX 549 MOVB BL, (AX) 550 ADDQ $0x02, AX 551 JMP repeat_end_emit_encodeBlockAsm 552 553 long_offset_short_repeat_as_copy_encodeBlockAsm: 554 MOVB $0xee, (AX) 555 MOVW SI, 1(AX) 556 LEAL -60(BX), BX 557 ADDQ $0x03, AX 558 559 // emitRepeat 560 emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: 561 MOVL BX, DI 562 LEAL -4(BX), BX 563 CMPL DI, $0x08 564 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short 565 CMPL DI, $0x0c 566 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short 567 CMPL SI, $0x00000800 568 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short 569 570 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: 571 CMPL BX, $0x00000104 572 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short 573 CMPL BX, $0x00010100 574 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short 575 CMPL BX, $0x0100ffff 576 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short 577 LEAL -16842747(BX), BX 578 MOVL $0xfffb001d, (AX) 579 MOVB $0xff, 4(AX) 580 ADDQ $0x05, AX 581 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short 582 583 repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: 584 LEAL -65536(BX), BX 585 MOVL BX, SI 586 
MOVW $0x001d, (AX) 587 MOVW BX, 2(AX) 588 SARL $0x10, SI 589 MOVB SI, 4(AX) 590 ADDQ $0x05, AX 591 JMP repeat_end_emit_encodeBlockAsm 592 593 repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: 594 LEAL -256(BX), BX 595 MOVW $0x0019, (AX) 596 MOVW BX, 2(AX) 597 ADDQ $0x04, AX 598 JMP repeat_end_emit_encodeBlockAsm 599 600 repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: 601 LEAL -4(BX), BX 602 MOVW $0x0015, (AX) 603 MOVB BL, 2(AX) 604 ADDQ $0x03, AX 605 JMP repeat_end_emit_encodeBlockAsm 606 607 repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: 608 SHLL $0x02, BX 609 ORL $0x01, BX 610 MOVW BX, (AX) 611 ADDQ $0x02, AX 612 JMP repeat_end_emit_encodeBlockAsm 613 614 repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: 615 XORQ DI, DI 616 LEAL 1(DI)(BX*4), BX 617 MOVB SI, 1(AX) 618 SARL $0x08, SI 619 SHLL $0x05, SI 620 ORL SI, BX 621 MOVB BL, (AX) 622 ADDQ $0x02, AX 623 JMP repeat_end_emit_encodeBlockAsm 624 625 two_byte_offset_short_repeat_as_copy_encodeBlockAsm: 626 MOVL BX, DI 627 SHLL $0x02, DI 628 CMPL BX, $0x0c 629 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm 630 CMPL SI, $0x00000800 631 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm 632 LEAL -15(DI), DI 633 MOVB SI, 1(AX) 634 SHRL $0x08, SI 635 SHLL $0x05, SI 636 ORL SI, DI 637 MOVB DI, (AX) 638 ADDQ $0x02, AX 639 JMP repeat_end_emit_encodeBlockAsm 640 641 emit_copy_three_repeat_as_copy_encodeBlockAsm: 642 LEAL -2(DI), DI 643 MOVB DI, (AX) 644 MOVW SI, 1(AX) 645 ADDQ $0x03, AX 646 647 repeat_end_emit_encodeBlockAsm: 648 MOVL CX, 12(SP) 649 JMP search_loop_encodeBlockAsm 650 651 no_repeat_found_encodeBlockAsm: 652 CMPL (DX)(BX*1), SI 653 JEQ candidate_match_encodeBlockAsm 654 SHRQ $0x08, SI 655 MOVL 24(SP)(R9*4), BX 656 LEAL 2(CX), R8 657 CMPL (DX)(DI*1), SI 658 JEQ candidate2_match_encodeBlockAsm 659 MOVL R8, 24(SP)(R9*4) 660 SHRQ $0x08, SI 661 CMPL (DX)(BX*1), SI 662 JEQ candidate3_match_encodeBlockAsm 663 MOVL 20(SP), CX 664 JMP search_loop_encodeBlockAsm 665 666 
candidate3_match_encodeBlockAsm: 667 ADDL $0x02, CX 668 JMP candidate_match_encodeBlockAsm 669 670 candidate2_match_encodeBlockAsm: 671 MOVL R8, 24(SP)(R9*4) 672 INCL CX 673 MOVL DI, BX 674 675 candidate_match_encodeBlockAsm: 676 MOVL 12(SP), SI 677 TESTL BX, BX 678 JZ match_extend_back_end_encodeBlockAsm 679 680 match_extend_back_loop_encodeBlockAsm: 681 CMPL CX, SI 682 JBE match_extend_back_end_encodeBlockAsm 683 MOVB -1(DX)(BX*1), DI 684 MOVB -1(DX)(CX*1), R8 685 CMPB DI, R8 686 JNE match_extend_back_end_encodeBlockAsm 687 LEAL -1(CX), CX 688 DECL BX 689 JZ match_extend_back_end_encodeBlockAsm 690 JMP match_extend_back_loop_encodeBlockAsm 691 692 match_extend_back_end_encodeBlockAsm: 693 MOVL CX, SI 694 SUBL 12(SP), SI 695 LEAQ 5(AX)(SI*1), SI 696 CMPQ SI, (SP) 697 JB match_dst_size_check_encodeBlockAsm 698 MOVQ $0x00000000, ret+48(FP) 699 RET 700 701 match_dst_size_check_encodeBlockAsm: 702 MOVL CX, SI 703 MOVL 12(SP), DI 704 CMPL DI, SI 705 JEQ emit_literal_done_match_emit_encodeBlockAsm 706 MOVL SI, R8 707 MOVL SI, 12(SP) 708 LEAQ (DX)(DI*1), SI 709 SUBL DI, R8 710 LEAL -1(R8), DI 711 CMPL DI, $0x3c 712 JB one_byte_match_emit_encodeBlockAsm 713 CMPL DI, $0x00000100 714 JB two_bytes_match_emit_encodeBlockAsm 715 CMPL DI, $0x00010000 716 JB three_bytes_match_emit_encodeBlockAsm 717 CMPL DI, $0x01000000 718 JB four_bytes_match_emit_encodeBlockAsm 719 MOVB $0xfc, (AX) 720 MOVL DI, 1(AX) 721 ADDQ $0x05, AX 722 JMP memmove_long_match_emit_encodeBlockAsm 723 724 four_bytes_match_emit_encodeBlockAsm: 725 MOVL DI, R9 726 SHRL $0x10, R9 727 MOVB $0xf8, (AX) 728 MOVW DI, 1(AX) 729 MOVB R9, 3(AX) 730 ADDQ $0x04, AX 731 JMP memmove_long_match_emit_encodeBlockAsm 732 733 three_bytes_match_emit_encodeBlockAsm: 734 MOVB $0xf4, (AX) 735 MOVW DI, 1(AX) 736 ADDQ $0x03, AX 737 JMP memmove_long_match_emit_encodeBlockAsm 738 739 two_bytes_match_emit_encodeBlockAsm: 740 MOVB $0xf0, (AX) 741 MOVB DI, 1(AX) 742 ADDQ $0x02, AX 743 CMPL DI, $0x40 744 JB 
memmove_match_emit_encodeBlockAsm 745 JMP memmove_long_match_emit_encodeBlockAsm 746 747 one_byte_match_emit_encodeBlockAsm: 748 SHLB $0x02, DI 749 MOVB DI, (AX) 750 ADDQ $0x01, AX 751 752 memmove_match_emit_encodeBlockAsm: 753 LEAQ (AX)(R8*1), DI 754 755 // genMemMoveShort 756 CMPQ R8, $0x08 757 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 758 CMPQ R8, $0x10 759 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 760 CMPQ R8, $0x20 761 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 762 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 763 764 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: 765 MOVQ (SI), R9 766 MOVQ R9, (AX) 767 JMP memmove_end_copy_match_emit_encodeBlockAsm 768 769 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: 770 MOVQ (SI), R9 771 MOVQ -8(SI)(R8*1), SI 772 MOVQ R9, (AX) 773 MOVQ SI, -8(AX)(R8*1) 774 JMP memmove_end_copy_match_emit_encodeBlockAsm 775 776 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: 777 MOVOU (SI), X0 778 MOVOU -16(SI)(R8*1), X1 779 MOVOU X0, (AX) 780 MOVOU X1, -16(AX)(R8*1) 781 JMP memmove_end_copy_match_emit_encodeBlockAsm 782 783 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: 784 MOVOU (SI), X0 785 MOVOU 16(SI), X1 786 MOVOU -32(SI)(R8*1), X2 787 MOVOU -16(SI)(R8*1), X3 788 MOVOU X0, (AX) 789 MOVOU X1, 16(AX) 790 MOVOU X2, -32(AX)(R8*1) 791 MOVOU X3, -16(AX)(R8*1) 792 793 memmove_end_copy_match_emit_encodeBlockAsm: 794 MOVQ DI, AX 795 JMP emit_literal_done_match_emit_encodeBlockAsm 796 797 memmove_long_match_emit_encodeBlockAsm: 798 LEAQ (AX)(R8*1), DI 799 800 // genMemMoveLong 801 MOVOU (SI), X0 802 MOVOU 16(SI), X1 803 MOVOU -32(SI)(R8*1), X2 804 MOVOU -16(SI)(R8*1), X3 805 MOVQ R8, R10 806 SHRQ $0x05, R10 807 MOVQ AX, R9 808 ANDL $0x0000001f, R9 809 MOVQ $0x00000040, R11 810 SUBQ R9, R11 811 DECQ R10 812 JA 
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 813 LEAQ -32(SI)(R11*1), R9 814 LEAQ -32(AX)(R11*1), R12 815 816 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: 817 MOVOU (R9), X4 818 MOVOU 16(R9), X5 819 MOVOA X4, (R12) 820 MOVOA X5, 16(R12) 821 ADDQ $0x20, R12 822 ADDQ $0x20, R9 823 ADDQ $0x20, R11 824 DECQ R10 825 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back 826 827 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: 828 MOVOU -32(SI)(R11*1), X4 829 MOVOU -16(SI)(R11*1), X5 830 MOVOA X4, -32(AX)(R11*1) 831 MOVOA X5, -16(AX)(R11*1) 832 ADDQ $0x20, R11 833 CMPQ R8, R11 834 JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 835 MOVOU X0, (AX) 836 MOVOU X1, 16(AX) 837 MOVOU X2, -32(AX)(R8*1) 838 MOVOU X3, -16(AX)(R8*1) 839 MOVQ DI, AX 840 841 emit_literal_done_match_emit_encodeBlockAsm: 842 match_nolit_loop_encodeBlockAsm: 843 MOVL CX, SI 844 SUBL BX, SI 845 MOVL SI, 16(SP) 846 ADDL $0x04, CX 847 ADDL $0x04, BX 848 MOVQ src_len+32(FP), SI 849 SUBL CX, SI 850 LEAQ (DX)(CX*1), DI 851 LEAQ (DX)(BX*1), BX 852 853 // matchLen 854 XORL R9, R9 855 CMPL SI, $0x08 856 JB matchlen_match4_match_nolit_encodeBlockAsm 857 858 matchlen_loopback_match_nolit_encodeBlockAsm: 859 MOVQ (DI)(R9*1), R8 860 XORQ (BX)(R9*1), R8 861 TESTQ R8, R8 862 JZ matchlen_loop_match_nolit_encodeBlockAsm 863 864 #ifdef GOAMD64_v3 865 TZCNTQ R8, R8 866 867 #else 868 BSFQ R8, R8 869 870 #endif 871 SARQ $0x03, R8 872 LEAL (R9)(R8*1), R9 873 JMP match_nolit_end_encodeBlockAsm 874 875 matchlen_loop_match_nolit_encodeBlockAsm: 876 LEAL -8(SI), SI 877 LEAL 8(R9), R9 878 CMPL SI, $0x08 879 JAE matchlen_loopback_match_nolit_encodeBlockAsm 880 JZ match_nolit_end_encodeBlockAsm 881 882 matchlen_match4_match_nolit_encodeBlockAsm: 883 CMPL SI, $0x04 884 JB matchlen_match2_match_nolit_encodeBlockAsm 885 MOVL (DI)(R9*1), R8 886 CMPL (BX)(R9*1), R8 887 JNE matchlen_match2_match_nolit_encodeBlockAsm 
888 SUBL $0x04, SI 889 LEAL 4(R9), R9 890 891 matchlen_match2_match_nolit_encodeBlockAsm: 892 CMPL SI, $0x02 893 JB matchlen_match1_match_nolit_encodeBlockAsm 894 MOVW (DI)(R9*1), R8 895 CMPW (BX)(R9*1), R8 896 JNE matchlen_match1_match_nolit_encodeBlockAsm 897 SUBL $0x02, SI 898 LEAL 2(R9), R9 899 900 matchlen_match1_match_nolit_encodeBlockAsm: 901 CMPL SI, $0x01 902 JB match_nolit_end_encodeBlockAsm 903 MOVB (DI)(R9*1), R8 904 CMPB (BX)(R9*1), R8 905 JNE match_nolit_end_encodeBlockAsm 906 LEAL 1(R9), R9 907 908 match_nolit_end_encodeBlockAsm: 909 ADDL R9, CX 910 MOVL 16(SP), BX 911 ADDL $0x04, R9 912 MOVL CX, 12(SP) 913 914 // emitCopy 915 CMPL BX, $0x00010000 916 JB two_byte_offset_match_nolit_encodeBlockAsm 917 CMPL R9, $0x40 918 JBE four_bytes_remain_match_nolit_encodeBlockAsm 919 MOVB $0xff, (AX) 920 MOVL BX, 1(AX) 921 LEAL -64(R9), R9 922 ADDQ $0x05, AX 923 CMPL R9, $0x04 924 JB four_bytes_remain_match_nolit_encodeBlockAsm 925 926 // emitRepeat 927 emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: 928 MOVL R9, SI 929 LEAL -4(R9), R9 930 CMPL SI, $0x08 931 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy 932 CMPL SI, $0x0c 933 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy 934 CMPL BX, $0x00000800 935 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy 936 937 cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: 938 CMPL R9, $0x00000104 939 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy 940 CMPL R9, $0x00010100 941 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy 942 CMPL R9, $0x0100ffff 943 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy 944 LEAL -16842747(R9), R9 945 MOVL $0xfffb001d, (AX) 946 MOVB $0xff, 4(AX) 947 ADDQ $0x05, AX 948 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy 949 950 repeat_five_match_nolit_encodeBlockAsm_emit_copy: 951 LEAL -65536(R9), R9 952 MOVL R9, BX 953 MOVW $0x001d, (AX) 954 MOVW R9, 2(AX) 955 SARL $0x10, BX 956 MOVB BL, 4(AX) 957 ADDQ $0x05, AX 958 JMP 
match_nolit_emitcopy_end_encodeBlockAsm 959 960 repeat_four_match_nolit_encodeBlockAsm_emit_copy: 961 LEAL -256(R9), R9 962 MOVW $0x0019, (AX) 963 MOVW R9, 2(AX) 964 ADDQ $0x04, AX 965 JMP match_nolit_emitcopy_end_encodeBlockAsm 966 967 repeat_three_match_nolit_encodeBlockAsm_emit_copy: 968 LEAL -4(R9), R9 969 MOVW $0x0015, (AX) 970 MOVB R9, 2(AX) 971 ADDQ $0x03, AX 972 JMP match_nolit_emitcopy_end_encodeBlockAsm 973 974 repeat_two_match_nolit_encodeBlockAsm_emit_copy: 975 SHLL $0x02, R9 976 ORL $0x01, R9 977 MOVW R9, (AX) 978 ADDQ $0x02, AX 979 JMP match_nolit_emitcopy_end_encodeBlockAsm 980 981 repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: 982 XORQ SI, SI 983 LEAL 1(SI)(R9*4), R9 984 MOVB BL, 1(AX) 985 SARL $0x08, BX 986 SHLL $0x05, BX 987 ORL BX, R9 988 MOVB R9, (AX) 989 ADDQ $0x02, AX 990 JMP match_nolit_emitcopy_end_encodeBlockAsm 991 992 four_bytes_remain_match_nolit_encodeBlockAsm: 993 TESTL R9, R9 994 JZ match_nolit_emitcopy_end_encodeBlockAsm 995 XORL SI, SI 996 LEAL -1(SI)(R9*4), R9 997 MOVB R9, (AX) 998 MOVL BX, 1(AX) 999 ADDQ $0x05, AX 1000 JMP match_nolit_emitcopy_end_encodeBlockAsm 1001 1002 two_byte_offset_match_nolit_encodeBlockAsm: 1003 CMPL R9, $0x40 1004 JBE two_byte_offset_short_match_nolit_encodeBlockAsm 1005 CMPL BX, $0x00000800 1006 JAE long_offset_short_match_nolit_encodeBlockAsm 1007 MOVL $0x00000001, SI 1008 LEAL 16(SI), SI 1009 MOVB BL, 1(AX) 1010 MOVL BX, DI 1011 SHRL $0x08, DI 1012 SHLL $0x05, DI 1013 ORL DI, SI 1014 MOVB SI, (AX) 1015 ADDQ $0x02, AX 1016 SUBL $0x08, R9 1017 1018 // emitRepeat 1019 LEAL -4(R9), R9 1020 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b 1021 1022 emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1023 MOVL R9, SI 1024 LEAL -4(R9), R9 1025 CMPL SI, $0x08 1026 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b 1027 CMPL SI, $0x0c 1028 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b 1029 CMPL BX, $0x00000800 1030 JB 
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b 1031 1032 cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1033 CMPL R9, $0x00000104 1034 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b 1035 CMPL R9, $0x00010100 1036 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b 1037 CMPL R9, $0x0100ffff 1038 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b 1039 LEAL -16842747(R9), R9 1040 MOVL $0xfffb001d, (AX) 1041 MOVB $0xff, 4(AX) 1042 ADDQ $0x05, AX 1043 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b 1044 1045 repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1046 LEAL -65536(R9), R9 1047 MOVL R9, BX 1048 MOVW $0x001d, (AX) 1049 MOVW R9, 2(AX) 1050 SARL $0x10, BX 1051 MOVB BL, 4(AX) 1052 ADDQ $0x05, AX 1053 JMP match_nolit_emitcopy_end_encodeBlockAsm 1054 1055 repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1056 LEAL -256(R9), R9 1057 MOVW $0x0019, (AX) 1058 MOVW R9, 2(AX) 1059 ADDQ $0x04, AX 1060 JMP match_nolit_emitcopy_end_encodeBlockAsm 1061 1062 repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1063 LEAL -4(R9), R9 1064 MOVW $0x0015, (AX) 1065 MOVB R9, 2(AX) 1066 ADDQ $0x03, AX 1067 JMP match_nolit_emitcopy_end_encodeBlockAsm 1068 1069 repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1070 SHLL $0x02, R9 1071 ORL $0x01, R9 1072 MOVW R9, (AX) 1073 ADDQ $0x02, AX 1074 JMP match_nolit_emitcopy_end_encodeBlockAsm 1075 1076 repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: 1077 XORQ SI, SI 1078 LEAL 1(SI)(R9*4), R9 1079 MOVB BL, 1(AX) 1080 SARL $0x08, BX 1081 SHLL $0x05, BX 1082 ORL BX, R9 1083 MOVB R9, (AX) 1084 ADDQ $0x02, AX 1085 JMP match_nolit_emitcopy_end_encodeBlockAsm 1086 1087 long_offset_short_match_nolit_encodeBlockAsm: 1088 MOVB $0xee, (AX) 1089 MOVW BX, 1(AX) 1090 LEAL -60(R9), R9 1091 ADDQ $0x03, AX 1092 1093 // emitRepeat 1094 emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: 1095 MOVL R9, SI 1096 
LEAL -4(R9), R9 1097 CMPL SI, $0x08 1098 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short 1099 CMPL SI, $0x0c 1100 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short 1101 CMPL BX, $0x00000800 1102 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short 1103 1104 cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: 1105 CMPL R9, $0x00000104 1106 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short 1107 CMPL R9, $0x00010100 1108 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short 1109 CMPL R9, $0x0100ffff 1110 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short 1111 LEAL -16842747(R9), R9 1112 MOVL $0xfffb001d, (AX) 1113 MOVB $0xff, 4(AX) 1114 ADDQ $0x05, AX 1115 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short 1116 1117 repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: 1118 LEAL -65536(R9), R9 1119 MOVL R9, BX 1120 MOVW $0x001d, (AX) 1121 MOVW R9, 2(AX) 1122 SARL $0x10, BX 1123 MOVB BL, 4(AX) 1124 ADDQ $0x05, AX 1125 JMP match_nolit_emitcopy_end_encodeBlockAsm 1126 1127 repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: 1128 LEAL -256(R9), R9 1129 MOVW $0x0019, (AX) 1130 MOVW R9, 2(AX) 1131 ADDQ $0x04, AX 1132 JMP match_nolit_emitcopy_end_encodeBlockAsm 1133 1134 repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: 1135 LEAL -4(R9), R9 1136 MOVW $0x0015, (AX) 1137 MOVB R9, 2(AX) 1138 ADDQ $0x03, AX 1139 JMP match_nolit_emitcopy_end_encodeBlockAsm 1140 1141 repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: 1142 SHLL $0x02, R9 1143 ORL $0x01, R9 1144 MOVW R9, (AX) 1145 ADDQ $0x02, AX 1146 JMP match_nolit_emitcopy_end_encodeBlockAsm 1147 1148 repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: 1149 XORQ SI, SI 1150 LEAL 1(SI)(R9*4), R9 1151 MOVB BL, 1(AX) 1152 SARL $0x08, BX 1153 SHLL $0x05, BX 1154 ORL BX, R9 1155 MOVB R9, (AX) 1156 ADDQ $0x02, AX 1157 JMP match_nolit_emitcopy_end_encodeBlockAsm 1158 1159 two_byte_offset_short_match_nolit_encodeBlockAsm: 
1160 MOVL R9, SI 1161 SHLL $0x02, SI 1162 CMPL R9, $0x0c 1163 JAE emit_copy_three_match_nolit_encodeBlockAsm 1164 CMPL BX, $0x00000800 1165 JAE emit_copy_three_match_nolit_encodeBlockAsm 1166 LEAL -15(SI), SI 1167 MOVB BL, 1(AX) 1168 SHRL $0x08, BX 1169 SHLL $0x05, BX 1170 ORL BX, SI 1171 MOVB SI, (AX) 1172 ADDQ $0x02, AX 1173 JMP match_nolit_emitcopy_end_encodeBlockAsm 1174 1175 emit_copy_three_match_nolit_encodeBlockAsm: 1176 LEAL -2(SI), SI 1177 MOVB SI, (AX) 1178 MOVW BX, 1(AX) 1179 ADDQ $0x03, AX 1180 1181 match_nolit_emitcopy_end_encodeBlockAsm: 1182 CMPL CX, 8(SP) 1183 JAE emit_remainder_encodeBlockAsm 1184 MOVQ -2(DX)(CX*1), SI 1185 CMPQ AX, (SP) 1186 JB match_nolit_dst_ok_encodeBlockAsm 1187 MOVQ $0x00000000, ret+48(FP) 1188 RET 1189 1190 match_nolit_dst_ok_encodeBlockAsm: 1191 MOVQ $0x0000cf1bbcdcbf9b, R8 1192 MOVQ SI, DI 1193 SHRQ $0x10, SI 1194 MOVQ SI, BX 1195 SHLQ $0x10, DI 1196 IMULQ R8, DI 1197 SHRQ $0x32, DI 1198 SHLQ $0x10, BX 1199 IMULQ R8, BX 1200 SHRQ $0x32, BX 1201 LEAL -2(CX), R8 1202 LEAQ 24(SP)(BX*4), R9 1203 MOVL (R9), BX 1204 MOVL R8, 24(SP)(DI*4) 1205 MOVL CX, (R9) 1206 CMPL (DX)(BX*1), SI 1207 JEQ match_nolit_loop_encodeBlockAsm 1208 INCL CX 1209 JMP search_loop_encodeBlockAsm 1210 1211 emit_remainder_encodeBlockAsm: 1212 MOVQ src_len+32(FP), CX 1213 SUBL 12(SP), CX 1214 LEAQ 5(AX)(CX*1), CX 1215 CMPQ CX, (SP) 1216 JB emit_remainder_ok_encodeBlockAsm 1217 MOVQ $0x00000000, ret+48(FP) 1218 RET 1219 1220 emit_remainder_ok_encodeBlockAsm: 1221 MOVQ src_len+32(FP), CX 1222 MOVL 12(SP), BX 1223 CMPL BX, CX 1224 JEQ emit_literal_done_emit_remainder_encodeBlockAsm 1225 MOVL CX, SI 1226 MOVL CX, 12(SP) 1227 LEAQ (DX)(BX*1), CX 1228 SUBL BX, SI 1229 LEAL -1(SI), DX 1230 CMPL DX, $0x3c 1231 JB one_byte_emit_remainder_encodeBlockAsm 1232 CMPL DX, $0x00000100 1233 JB two_bytes_emit_remainder_encodeBlockAsm 1234 CMPL DX, $0x00010000 1235 JB three_bytes_emit_remainder_encodeBlockAsm 1236 CMPL DX, $0x01000000 1237 JB 
four_bytes_emit_remainder_encodeBlockAsm 1238 MOVB $0xfc, (AX) 1239 MOVL DX, 1(AX) 1240 ADDQ $0x05, AX 1241 JMP memmove_long_emit_remainder_encodeBlockAsm 1242 1243 four_bytes_emit_remainder_encodeBlockAsm: 1244 MOVL DX, BX 1245 SHRL $0x10, BX 1246 MOVB $0xf8, (AX) 1247 MOVW DX, 1(AX) 1248 MOVB BL, 3(AX) 1249 ADDQ $0x04, AX 1250 JMP memmove_long_emit_remainder_encodeBlockAsm 1251 1252 three_bytes_emit_remainder_encodeBlockAsm: 1253 MOVB $0xf4, (AX) 1254 MOVW DX, 1(AX) 1255 ADDQ $0x03, AX 1256 JMP memmove_long_emit_remainder_encodeBlockAsm 1257 1258 two_bytes_emit_remainder_encodeBlockAsm: 1259 MOVB $0xf0, (AX) 1260 MOVB DL, 1(AX) 1261 ADDQ $0x02, AX 1262 CMPL DX, $0x40 1263 JB memmove_emit_remainder_encodeBlockAsm 1264 JMP memmove_long_emit_remainder_encodeBlockAsm 1265 1266 one_byte_emit_remainder_encodeBlockAsm: 1267 SHLB $0x02, DL 1268 MOVB DL, (AX) 1269 ADDQ $0x01, AX 1270 1271 memmove_emit_remainder_encodeBlockAsm: 1272 LEAQ (AX)(SI*1), DX 1273 MOVL SI, BX 1274 1275 // genMemMoveShort 1276 CMPQ BX, $0x03 1277 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 1278 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 1279 CMPQ BX, $0x08 1280 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 1281 CMPQ BX, $0x10 1282 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 1283 CMPQ BX, $0x20 1284 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 1285 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 1286 1287 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: 1288 MOVB (CX), SI 1289 MOVB -1(CX)(BX*1), CL 1290 MOVB SI, (AX) 1291 MOVB CL, -1(AX)(BX*1) 1292 JMP memmove_end_copy_emit_remainder_encodeBlockAsm 1293 1294 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: 1295 MOVW (CX), SI 1296 MOVB 2(CX), CL 1297 MOVW SI, (AX) 1298 MOVB CL, 2(AX) 1299 JMP memmove_end_copy_emit_remainder_encodeBlockAsm 1300 1301 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: 1302 MOVL (CX), SI 1303 MOVL -4(CX)(BX*1), CX 1304 MOVL SI, (AX) 1305 MOVL CX, -4(AX)(BX*1) 1306 JMP memmove_end_copy_emit_remainder_encodeBlockAsm 1307 1308 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: 1309 MOVQ (CX), SI 1310 MOVQ -8(CX)(BX*1), CX 1311 MOVQ SI, (AX) 1312 MOVQ CX, -8(AX)(BX*1) 1313 JMP memmove_end_copy_emit_remainder_encodeBlockAsm 1314 1315 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: 1316 MOVOU (CX), X0 1317 MOVOU -16(CX)(BX*1), X1 1318 MOVOU X0, (AX) 1319 MOVOU X1, -16(AX)(BX*1) 1320 JMP memmove_end_copy_emit_remainder_encodeBlockAsm 1321 1322 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: 1323 MOVOU (CX), X0 1324 MOVOU 16(CX), X1 1325 MOVOU -32(CX)(BX*1), X2 1326 MOVOU -16(CX)(BX*1), X3 1327 MOVOU X0, (AX) 1328 MOVOU X1, 16(AX) 1329 MOVOU X2, -32(AX)(BX*1) 1330 MOVOU X3, -16(AX)(BX*1) 1331 1332 memmove_end_copy_emit_remainder_encodeBlockAsm: 1333 MOVQ DX, AX 1334 JMP emit_literal_done_emit_remainder_encodeBlockAsm 1335 1336 memmove_long_emit_remainder_encodeBlockAsm: 1337 LEAQ (AX)(SI*1), DX 1338 MOVL SI, BX 1339 1340 // genMemMoveLong 1341 MOVOU (CX), X0 1342 MOVOU 16(CX), X1 1343 MOVOU -32(CX)(BX*1), X2 1344 MOVOU -16(CX)(BX*1), X3 1345 MOVQ BX, DI 1346 SHRQ $0x05, DI 1347 MOVQ AX, SI 1348 ANDL $0x0000001f, SI 1349 MOVQ $0x00000040, R8 1350 SUBQ SI, R8 1351 DECQ DI 1352 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 1353 LEAQ -32(CX)(R8*1), SI 1354 LEAQ -32(AX)(R8*1), R9 1355 1356 emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: 1357 MOVOU (SI), X4 1358 MOVOU 16(SI), X5 1359 MOVOA X4, (R9) 1360 MOVOA X5, 16(R9) 1361 ADDQ $0x20, R9 1362 ADDQ $0x20, SI 1363 ADDQ $0x20, R8 1364 DECQ DI 1365 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back 1366 1367 
emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: 1368 MOVOU -32(CX)(R8*1), X4 1369 MOVOU -16(CX)(R8*1), X5 1370 MOVOA X4, -32(AX)(R8*1) 1371 MOVOA X5, -16(AX)(R8*1) 1372 ADDQ $0x20, R8 1373 CMPQ BX, R8 1374 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 1375 MOVOU X0, (AX) 1376 MOVOU X1, 16(AX) 1377 MOVOU X2, -32(AX)(BX*1) 1378 MOVOU X3, -16(AX)(BX*1) 1379 MOVQ DX, AX 1380 1381 emit_literal_done_emit_remainder_encodeBlockAsm: 1382 MOVQ dst_base+0(FP), CX 1383 SUBQ CX, AX 1384 MOVQ AX, ret+48(FP) 1385 RET 1386 1387 // func encodeBlockAsm4MB(dst []byte, src []byte) int 1388 // Requires: BMI, SSE2 1389 TEXT ·encodeBlockAsm4MB(SB), $65560-56 1390 MOVQ dst_base+0(FP), AX 1391 MOVQ $0x00000200, CX 1392 LEAQ 24(SP), DX 1393 PXOR X0, X0 1394 1395 zero_loop_encodeBlockAsm4MB: 1396 MOVOU X0, (DX) 1397 MOVOU X0, 16(DX) 1398 MOVOU X0, 32(DX) 1399 MOVOU X0, 48(DX) 1400 MOVOU X0, 64(DX) 1401 MOVOU X0, 80(DX) 1402 MOVOU X0, 96(DX) 1403 MOVOU X0, 112(DX) 1404 ADDQ $0x80, DX 1405 DECQ CX 1406 JNZ zero_loop_encodeBlockAsm4MB 1407 MOVL $0x00000000, 12(SP) 1408 MOVQ src_len+32(FP), CX 1409 LEAQ -9(CX), DX 1410 LEAQ -8(CX), BX 1411 MOVL BX, 8(SP) 1412 SHRQ $0x05, CX 1413 SUBL CX, DX 1414 LEAQ (AX)(DX*1), DX 1415 MOVQ DX, (SP) 1416 MOVL $0x00000001, CX 1417 MOVL CX, 16(SP) 1418 MOVQ src_base+24(FP), DX 1419 1420 search_loop_encodeBlockAsm4MB: 1421 MOVL CX, BX 1422 SUBL 12(SP), BX 1423 SHRL $0x06, BX 1424 LEAL 4(CX)(BX*1), BX 1425 CMPL BX, 8(SP) 1426 JAE emit_remainder_encodeBlockAsm4MB 1427 MOVQ (DX)(CX*1), SI 1428 MOVL BX, 20(SP) 1429 MOVQ $0x0000cf1bbcdcbf9b, R8 1430 MOVQ SI, R9 1431 MOVQ SI, R10 1432 SHRQ $0x08, R10 1433 SHLQ $0x10, R9 1434 IMULQ R8, R9 1435 SHRQ $0x32, R9 1436 SHLQ $0x10, R10 1437 IMULQ R8, R10 1438 SHRQ $0x32, R10 1439 MOVL 24(SP)(R9*4), BX 1440 MOVL 24(SP)(R10*4), DI 1441 MOVL CX, 24(SP)(R9*4) 1442 LEAL 1(CX), R9 1443 MOVL R9, 24(SP)(R10*4) 1444 MOVQ SI, R9 1445 SHRQ $0x10, R9 1446 SHLQ $0x10, R9 1447 
IMULQ R8, R9 1448 SHRQ $0x32, R9 1449 MOVL CX, R8 1450 SUBL 16(SP), R8 1451 MOVL 1(DX)(R8*1), R10 1452 MOVQ SI, R8 1453 SHRQ $0x08, R8 1454 CMPL R8, R10 1455 JNE no_repeat_found_encodeBlockAsm4MB 1456 LEAL 1(CX), SI 1457 MOVL 12(SP), DI 1458 MOVL SI, BX 1459 SUBL 16(SP), BX 1460 JZ repeat_extend_back_end_encodeBlockAsm4MB 1461 1462 repeat_extend_back_loop_encodeBlockAsm4MB: 1463 CMPL SI, DI 1464 JBE repeat_extend_back_end_encodeBlockAsm4MB 1465 MOVB -1(DX)(BX*1), R8 1466 MOVB -1(DX)(SI*1), R9 1467 CMPB R8, R9 1468 JNE repeat_extend_back_end_encodeBlockAsm4MB 1469 LEAL -1(SI), SI 1470 DECL BX 1471 JNZ repeat_extend_back_loop_encodeBlockAsm4MB 1472 1473 repeat_extend_back_end_encodeBlockAsm4MB: 1474 MOVL 12(SP), BX 1475 CMPL BX, SI 1476 JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB 1477 MOVL SI, R8 1478 MOVL SI, 12(SP) 1479 LEAQ (DX)(BX*1), R9 1480 SUBL BX, R8 1481 LEAL -1(R8), BX 1482 CMPL BX, $0x3c 1483 JB one_byte_repeat_emit_encodeBlockAsm4MB 1484 CMPL BX, $0x00000100 1485 JB two_bytes_repeat_emit_encodeBlockAsm4MB 1486 CMPL BX, $0x00010000 1487 JB three_bytes_repeat_emit_encodeBlockAsm4MB 1488 MOVL BX, R10 1489 SHRL $0x10, R10 1490 MOVB $0xf8, (AX) 1491 MOVW BX, 1(AX) 1492 MOVB R10, 3(AX) 1493 ADDQ $0x04, AX 1494 JMP memmove_long_repeat_emit_encodeBlockAsm4MB 1495 1496 three_bytes_repeat_emit_encodeBlockAsm4MB: 1497 MOVB $0xf4, (AX) 1498 MOVW BX, 1(AX) 1499 ADDQ $0x03, AX 1500 JMP memmove_long_repeat_emit_encodeBlockAsm4MB 1501 1502 two_bytes_repeat_emit_encodeBlockAsm4MB: 1503 MOVB $0xf0, (AX) 1504 MOVB BL, 1(AX) 1505 ADDQ $0x02, AX 1506 CMPL BX, $0x40 1507 JB memmove_repeat_emit_encodeBlockAsm4MB 1508 JMP memmove_long_repeat_emit_encodeBlockAsm4MB 1509 1510 one_byte_repeat_emit_encodeBlockAsm4MB: 1511 SHLB $0x02, BL 1512 MOVB BL, (AX) 1513 ADDQ $0x01, AX 1514 1515 memmove_repeat_emit_encodeBlockAsm4MB: 1516 LEAQ (AX)(R8*1), BX 1517 1518 // genMemMoveShort 1519 CMPQ R8, $0x08 1520 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 1521 CMPQ 
R8, $0x10 1522 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 1523 CMPQ R8, $0x20 1524 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 1525 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 1526 1527 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: 1528 MOVQ (R9), R10 1529 MOVQ R10, (AX) 1530 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB 1531 1532 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: 1533 MOVQ (R9), R10 1534 MOVQ -8(R9)(R8*1), R9 1535 MOVQ R10, (AX) 1536 MOVQ R9, -8(AX)(R8*1) 1537 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB 1538 1539 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: 1540 MOVOU (R9), X0 1541 MOVOU -16(R9)(R8*1), X1 1542 MOVOU X0, (AX) 1543 MOVOU X1, -16(AX)(R8*1) 1544 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB 1545 1546 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: 1547 MOVOU (R9), X0 1548 MOVOU 16(R9), X1 1549 MOVOU -32(R9)(R8*1), X2 1550 MOVOU -16(R9)(R8*1), X3 1551 MOVOU X0, (AX) 1552 MOVOU X1, 16(AX) 1553 MOVOU X2, -32(AX)(R8*1) 1554 MOVOU X3, -16(AX)(R8*1) 1555 1556 memmove_end_copy_repeat_emit_encodeBlockAsm4MB: 1557 MOVQ BX, AX 1558 JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB 1559 1560 memmove_long_repeat_emit_encodeBlockAsm4MB: 1561 LEAQ (AX)(R8*1), BX 1562 1563 // genMemMoveLong 1564 MOVOU (R9), X0 1565 MOVOU 16(R9), X1 1566 MOVOU -32(R9)(R8*1), X2 1567 MOVOU -16(R9)(R8*1), X3 1568 MOVQ R8, R11 1569 SHRQ $0x05, R11 1570 MOVQ AX, R10 1571 ANDL $0x0000001f, R10 1572 MOVQ $0x00000040, R12 1573 SUBQ R10, R12 1574 DECQ R11 1575 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 1576 LEAQ -32(R9)(R12*1), R10 1577 LEAQ -32(AX)(R12*1), R13 1578 1579 emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: 1580 MOVOU (R10), X4 1581 MOVOU 16(R10), X5 1582 MOVOA X4, (R13) 1583 MOVOA X5, 16(R13) 1584 
ADDQ $0x20, R13 1585 ADDQ $0x20, R10 1586 ADDQ $0x20, R12 1587 DECQ R11 1588 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back 1589 1590 emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: 1591 MOVOU -32(R9)(R12*1), X4 1592 MOVOU -16(R9)(R12*1), X5 1593 MOVOA X4, -32(AX)(R12*1) 1594 MOVOA X5, -16(AX)(R12*1) 1595 ADDQ $0x20, R12 1596 CMPQ R8, R12 1597 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 1598 MOVOU X0, (AX) 1599 MOVOU X1, 16(AX) 1600 MOVOU X2, -32(AX)(R8*1) 1601 MOVOU X3, -16(AX)(R8*1) 1602 MOVQ BX, AX 1603 1604 emit_literal_done_repeat_emit_encodeBlockAsm4MB: 1605 ADDL $0x05, CX 1606 MOVL CX, BX 1607 SUBL 16(SP), BX 1608 MOVQ src_len+32(FP), R8 1609 SUBL CX, R8 1610 LEAQ (DX)(CX*1), R9 1611 LEAQ (DX)(BX*1), BX 1612 1613 // matchLen 1614 XORL R11, R11 1615 CMPL R8, $0x08 1616 JB matchlen_match4_repeat_extend_encodeBlockAsm4MB 1617 1618 matchlen_loopback_repeat_extend_encodeBlockAsm4MB: 1619 MOVQ (R9)(R11*1), R10 1620 XORQ (BX)(R11*1), R10 1621 TESTQ R10, R10 1622 JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB 1623 1624 #ifdef GOAMD64_v3 1625 TZCNTQ R10, R10 1626 1627 #else 1628 BSFQ R10, R10 1629 1630 #endif 1631 SARQ $0x03, R10 1632 LEAL (R11)(R10*1), R11 1633 JMP repeat_extend_forward_end_encodeBlockAsm4MB 1634 1635 matchlen_loop_repeat_extend_encodeBlockAsm4MB: 1636 LEAL -8(R8), R8 1637 LEAL 8(R11), R11 1638 CMPL R8, $0x08 1639 JAE matchlen_loopback_repeat_extend_encodeBlockAsm4MB 1640 JZ repeat_extend_forward_end_encodeBlockAsm4MB 1641 1642 matchlen_match4_repeat_extend_encodeBlockAsm4MB: 1643 CMPL R8, $0x04 1644 JB matchlen_match2_repeat_extend_encodeBlockAsm4MB 1645 MOVL (R9)(R11*1), R10 1646 CMPL (BX)(R11*1), R10 1647 JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB 1648 SUBL $0x04, R8 1649 LEAL 4(R11), R11 1650 1651 matchlen_match2_repeat_extend_encodeBlockAsm4MB: 1652 CMPL R8, $0x02 1653 JB matchlen_match1_repeat_extend_encodeBlockAsm4MB 1654 MOVW 
(R9)(R11*1), R10 1655 CMPW (BX)(R11*1), R10 1656 JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB 1657 SUBL $0x02, R8 1658 LEAL 2(R11), R11 1659 1660 matchlen_match1_repeat_extend_encodeBlockAsm4MB: 1661 CMPL R8, $0x01 1662 JB repeat_extend_forward_end_encodeBlockAsm4MB 1663 MOVB (R9)(R11*1), R10 1664 CMPB (BX)(R11*1), R10 1665 JNE repeat_extend_forward_end_encodeBlockAsm4MB 1666 LEAL 1(R11), R11 1667 1668 repeat_extend_forward_end_encodeBlockAsm4MB: 1669 ADDL R11, CX 1670 MOVL CX, BX 1671 SUBL SI, BX 1672 MOVL 16(SP), SI 1673 TESTL DI, DI 1674 JZ repeat_as_copy_encodeBlockAsm4MB 1675 1676 // emitRepeat 1677 MOVL BX, DI 1678 LEAL -4(BX), BX 1679 CMPL DI, $0x08 1680 JBE repeat_two_match_repeat_encodeBlockAsm4MB 1681 CMPL DI, $0x0c 1682 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB 1683 CMPL SI, $0x00000800 1684 JB repeat_two_offset_match_repeat_encodeBlockAsm4MB 1685 1686 cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: 1687 CMPL BX, $0x00000104 1688 JB repeat_three_match_repeat_encodeBlockAsm4MB 1689 CMPL BX, $0x00010100 1690 JB repeat_four_match_repeat_encodeBlockAsm4MB 1691 LEAL -65536(BX), BX 1692 MOVL BX, SI 1693 MOVW $0x001d, (AX) 1694 MOVW BX, 2(AX) 1695 SARL $0x10, SI 1696 MOVB SI, 4(AX) 1697 ADDQ $0x05, AX 1698 JMP repeat_end_emit_encodeBlockAsm4MB 1699 1700 repeat_four_match_repeat_encodeBlockAsm4MB: 1701 LEAL -256(BX), BX 1702 MOVW $0x0019, (AX) 1703 MOVW BX, 2(AX) 1704 ADDQ $0x04, AX 1705 JMP repeat_end_emit_encodeBlockAsm4MB 1706 1707 repeat_three_match_repeat_encodeBlockAsm4MB: 1708 LEAL -4(BX), BX 1709 MOVW $0x0015, (AX) 1710 MOVB BL, 2(AX) 1711 ADDQ $0x03, AX 1712 JMP repeat_end_emit_encodeBlockAsm4MB 1713 1714 repeat_two_match_repeat_encodeBlockAsm4MB: 1715 SHLL $0x02, BX 1716 ORL $0x01, BX 1717 MOVW BX, (AX) 1718 ADDQ $0x02, AX 1719 JMP repeat_end_emit_encodeBlockAsm4MB 1720 1721 repeat_two_offset_match_repeat_encodeBlockAsm4MB: 1722 XORQ DI, DI 1723 LEAL 1(DI)(BX*4), BX 1724 MOVB SI, 1(AX) 1725 SARL $0x08, SI 1726 SHLL $0x05, 
SI 1727 ORL SI, BX 1728 MOVB BL, (AX) 1729 ADDQ $0x02, AX 1730 JMP repeat_end_emit_encodeBlockAsm4MB 1731 1732 repeat_as_copy_encodeBlockAsm4MB: 1733 // emitCopy 1734 CMPL SI, $0x00010000 1735 JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB 1736 CMPL BX, $0x40 1737 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB 1738 MOVB $0xff, (AX) 1739 MOVL SI, 1(AX) 1740 LEAL -64(BX), BX 1741 ADDQ $0x05, AX 1742 CMPL BX, $0x04 1743 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB 1744 1745 // emitRepeat 1746 MOVL BX, DI 1747 LEAL -4(BX), BX 1748 CMPL DI, $0x08 1749 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy 1750 CMPL DI, $0x0c 1751 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy 1752 CMPL SI, $0x00000800 1753 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy 1754 1755 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: 1756 CMPL BX, $0x00000104 1757 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy 1758 CMPL BX, $0x00010100 1759 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy 1760 LEAL -65536(BX), BX 1761 MOVL BX, SI 1762 MOVW $0x001d, (AX) 1763 MOVW BX, 2(AX) 1764 SARL $0x10, SI 1765 MOVB SI, 4(AX) 1766 ADDQ $0x05, AX 1767 JMP repeat_end_emit_encodeBlockAsm4MB 1768 1769 repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: 1770 LEAL -256(BX), BX 1771 MOVW $0x0019, (AX) 1772 MOVW BX, 2(AX) 1773 ADDQ $0x04, AX 1774 JMP repeat_end_emit_encodeBlockAsm4MB 1775 1776 repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: 1777 LEAL -4(BX), BX 1778 MOVW $0x0015, (AX) 1779 MOVB BL, 2(AX) 1780 ADDQ $0x03, AX 1781 JMP repeat_end_emit_encodeBlockAsm4MB 1782 1783 repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: 1784 SHLL $0x02, BX 1785 ORL $0x01, BX 1786 MOVW BX, (AX) 1787 ADDQ $0x02, AX 1788 JMP repeat_end_emit_encodeBlockAsm4MB 1789 1790 repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: 1791 XORQ DI, DI 1792 LEAL 1(DI)(BX*4), BX 1793 MOVB SI, 1(AX) 1794 SARL $0x08, SI 
1795 SHLL $0x05, SI 1796 ORL SI, BX 1797 MOVB BL, (AX) 1798 ADDQ $0x02, AX 1799 JMP repeat_end_emit_encodeBlockAsm4MB 1800 1801 four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: 1802 TESTL BX, BX 1803 JZ repeat_end_emit_encodeBlockAsm4MB 1804 XORL DI, DI 1805 LEAL -1(DI)(BX*4), BX 1806 MOVB BL, (AX) 1807 MOVL SI, 1(AX) 1808 ADDQ $0x05, AX 1809 JMP repeat_end_emit_encodeBlockAsm4MB 1810 1811 two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: 1812 CMPL BX, $0x40 1813 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB 1814 CMPL SI, $0x00000800 1815 JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB 1816 MOVL $0x00000001, DI 1817 LEAL 16(DI), DI 1818 MOVB SI, 1(AX) 1819 SHRL $0x08, SI 1820 SHLL $0x05, SI 1821 ORL SI, DI 1822 MOVB DI, (AX) 1823 ADDQ $0x02, AX 1824 SUBL $0x08, BX 1825 1826 // emitRepeat 1827 LEAL -4(BX), BX 1828 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b 1829 MOVL BX, DI 1830 LEAL -4(BX), BX 1831 CMPL DI, $0x08 1832 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b 1833 CMPL DI, $0x0c 1834 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b 1835 CMPL SI, $0x00000800 1836 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b 1837 1838 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: 1839 CMPL BX, $0x00000104 1840 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b 1841 CMPL BX, $0x00010100 1842 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b 1843 LEAL -65536(BX), BX 1844 MOVL BX, SI 1845 MOVW $0x001d, (AX) 1846 MOVW BX, 2(AX) 1847 SARL $0x10, SI 1848 MOVB SI, 4(AX) 1849 ADDQ $0x05, AX 1850 JMP repeat_end_emit_encodeBlockAsm4MB 1851 1852 repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: 1853 LEAL -256(BX), BX 1854 MOVW $0x0019, (AX) 1855 MOVW BX, 2(AX) 1856 ADDQ $0x04, AX 1857 JMP repeat_end_emit_encodeBlockAsm4MB 1858 1859 
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: 1860 LEAL -4(BX), BX 1861 MOVW $0x0015, (AX) 1862 MOVB BL, 2(AX) 1863 ADDQ $0x03, AX 1864 JMP repeat_end_emit_encodeBlockAsm4MB 1865 1866 repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: 1867 SHLL $0x02, BX 1868 ORL $0x01, BX 1869 MOVW BX, (AX) 1870 ADDQ $0x02, AX 1871 JMP repeat_end_emit_encodeBlockAsm4MB 1872 1873 repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: 1874 XORQ DI, DI 1875 LEAL 1(DI)(BX*4), BX 1876 MOVB SI, 1(AX) 1877 SARL $0x08, SI 1878 SHLL $0x05, SI 1879 ORL SI, BX 1880 MOVB BL, (AX) 1881 ADDQ $0x02, AX 1882 JMP repeat_end_emit_encodeBlockAsm4MB 1883 1884 long_offset_short_repeat_as_copy_encodeBlockAsm4MB: 1885 MOVB $0xee, (AX) 1886 MOVW SI, 1(AX) 1887 LEAL -60(BX), BX 1888 ADDQ $0x03, AX 1889 1890 // emitRepeat 1891 MOVL BX, DI 1892 LEAL -4(BX), BX 1893 CMPL DI, $0x08 1894 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short 1895 CMPL DI, $0x0c 1896 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short 1897 CMPL SI, $0x00000800 1898 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short 1899 1900 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: 1901 CMPL BX, $0x00000104 1902 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short 1903 CMPL BX, $0x00010100 1904 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short 1905 LEAL -65536(BX), BX 1906 MOVL BX, SI 1907 MOVW $0x001d, (AX) 1908 MOVW BX, 2(AX) 1909 SARL $0x10, SI 1910 MOVB SI, 4(AX) 1911 ADDQ $0x05, AX 1912 JMP repeat_end_emit_encodeBlockAsm4MB 1913 1914 repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: 1915 LEAL -256(BX), BX 1916 MOVW $0x0019, (AX) 1917 MOVW BX, 2(AX) 1918 ADDQ $0x04, AX 1919 JMP repeat_end_emit_encodeBlockAsm4MB 1920 1921 repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: 1922 LEAL -4(BX), BX 1923 MOVW $0x0015, (AX) 1924 MOVB BL, 2(AX) 1925 ADDQ $0x03, AX 1926 
JMP repeat_end_emit_encodeBlockAsm4MB 1927 1928 repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: 1929 SHLL $0x02, BX 1930 ORL $0x01, BX 1931 MOVW BX, (AX) 1932 ADDQ $0x02, AX 1933 JMP repeat_end_emit_encodeBlockAsm4MB 1934 1935 repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: 1936 XORQ DI, DI 1937 LEAL 1(DI)(BX*4), BX 1938 MOVB SI, 1(AX) 1939 SARL $0x08, SI 1940 SHLL $0x05, SI 1941 ORL SI, BX 1942 MOVB BL, (AX) 1943 ADDQ $0x02, AX 1944 JMP repeat_end_emit_encodeBlockAsm4MB 1945 1946 two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: 1947 MOVL BX, DI 1948 SHLL $0x02, DI 1949 CMPL BX, $0x0c 1950 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB 1951 CMPL SI, $0x00000800 1952 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB 1953 LEAL -15(DI), DI 1954 MOVB SI, 1(AX) 1955 SHRL $0x08, SI 1956 SHLL $0x05, SI 1957 ORL SI, DI 1958 MOVB DI, (AX) 1959 ADDQ $0x02, AX 1960 JMP repeat_end_emit_encodeBlockAsm4MB 1961 1962 emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: 1963 LEAL -2(DI), DI 1964 MOVB DI, (AX) 1965 MOVW SI, 1(AX) 1966 ADDQ $0x03, AX 1967 1968 repeat_end_emit_encodeBlockAsm4MB: 1969 MOVL CX, 12(SP) 1970 JMP search_loop_encodeBlockAsm4MB 1971 1972 no_repeat_found_encodeBlockAsm4MB: 1973 CMPL (DX)(BX*1), SI 1974 JEQ candidate_match_encodeBlockAsm4MB 1975 SHRQ $0x08, SI 1976 MOVL 24(SP)(R9*4), BX 1977 LEAL 2(CX), R8 1978 CMPL (DX)(DI*1), SI 1979 JEQ candidate2_match_encodeBlockAsm4MB 1980 MOVL R8, 24(SP)(R9*4) 1981 SHRQ $0x08, SI 1982 CMPL (DX)(BX*1), SI 1983 JEQ candidate3_match_encodeBlockAsm4MB 1984 MOVL 20(SP), CX 1985 JMP search_loop_encodeBlockAsm4MB 1986 1987 candidate3_match_encodeBlockAsm4MB: 1988 ADDL $0x02, CX 1989 JMP candidate_match_encodeBlockAsm4MB 1990 1991 candidate2_match_encodeBlockAsm4MB: 1992 MOVL R8, 24(SP)(R9*4) 1993 INCL CX 1994 MOVL DI, BX 1995 1996 candidate_match_encodeBlockAsm4MB: 1997 MOVL 12(SP), SI 1998 TESTL BX, BX 1999 JZ match_extend_back_end_encodeBlockAsm4MB 2000 2001 
match_extend_back_loop_encodeBlockAsm4MB: 2002 CMPL CX, SI 2003 JBE match_extend_back_end_encodeBlockAsm4MB 2004 MOVB -1(DX)(BX*1), DI 2005 MOVB -1(DX)(CX*1), R8 2006 CMPB DI, R8 2007 JNE match_extend_back_end_encodeBlockAsm4MB 2008 LEAL -1(CX), CX 2009 DECL BX 2010 JZ match_extend_back_end_encodeBlockAsm4MB 2011 JMP match_extend_back_loop_encodeBlockAsm4MB 2012 2013 match_extend_back_end_encodeBlockAsm4MB: 2014 MOVL CX, SI 2015 SUBL 12(SP), SI 2016 LEAQ 4(AX)(SI*1), SI 2017 CMPQ SI, (SP) 2018 JB match_dst_size_check_encodeBlockAsm4MB 2019 MOVQ $0x00000000, ret+48(FP) 2020 RET 2021 2022 match_dst_size_check_encodeBlockAsm4MB: 2023 MOVL CX, SI 2024 MOVL 12(SP), DI 2025 CMPL DI, SI 2026 JEQ emit_literal_done_match_emit_encodeBlockAsm4MB 2027 MOVL SI, R8 2028 MOVL SI, 12(SP) 2029 LEAQ (DX)(DI*1), SI 2030 SUBL DI, R8 2031 LEAL -1(R8), DI 2032 CMPL DI, $0x3c 2033 JB one_byte_match_emit_encodeBlockAsm4MB 2034 CMPL DI, $0x00000100 2035 JB two_bytes_match_emit_encodeBlockAsm4MB 2036 CMPL DI, $0x00010000 2037 JB three_bytes_match_emit_encodeBlockAsm4MB 2038 MOVL DI, R9 2039 SHRL $0x10, R9 2040 MOVB $0xf8, (AX) 2041 MOVW DI, 1(AX) 2042 MOVB R9, 3(AX) 2043 ADDQ $0x04, AX 2044 JMP memmove_long_match_emit_encodeBlockAsm4MB 2045 2046 three_bytes_match_emit_encodeBlockAsm4MB: 2047 MOVB $0xf4, (AX) 2048 MOVW DI, 1(AX) 2049 ADDQ $0x03, AX 2050 JMP memmove_long_match_emit_encodeBlockAsm4MB 2051 2052 two_bytes_match_emit_encodeBlockAsm4MB: 2053 MOVB $0xf0, (AX) 2054 MOVB DI, 1(AX) 2055 ADDQ $0x02, AX 2056 CMPL DI, $0x40 2057 JB memmove_match_emit_encodeBlockAsm4MB 2058 JMP memmove_long_match_emit_encodeBlockAsm4MB 2059 2060 one_byte_match_emit_encodeBlockAsm4MB: 2061 SHLB $0x02, DI 2062 MOVB DI, (AX) 2063 ADDQ $0x01, AX 2064 2065 memmove_match_emit_encodeBlockAsm4MB: 2066 LEAQ (AX)(R8*1), DI 2067 2068 // genMemMoveShort 2069 CMPQ R8, $0x08 2070 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 2071 CMPQ R8, $0x10 2072 JBE 
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 2073 CMPQ R8, $0x20 2074 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 2075 JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 2076 2077 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: 2078 MOVQ (SI), R9 2079 MOVQ R9, (AX) 2080 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB 2081 2082 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: 2083 MOVQ (SI), R9 2084 MOVQ -8(SI)(R8*1), SI 2085 MOVQ R9, (AX) 2086 MOVQ SI, -8(AX)(R8*1) 2087 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB 2088 2089 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: 2090 MOVOU (SI), X0 2091 MOVOU -16(SI)(R8*1), X1 2092 MOVOU X0, (AX) 2093 MOVOU X1, -16(AX)(R8*1) 2094 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB 2095 2096 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: 2097 MOVOU (SI), X0 2098 MOVOU 16(SI), X1 2099 MOVOU -32(SI)(R8*1), X2 2100 MOVOU -16(SI)(R8*1), X3 2101 MOVOU X0, (AX) 2102 MOVOU X1, 16(AX) 2103 MOVOU X2, -32(AX)(R8*1) 2104 MOVOU X3, -16(AX)(R8*1) 2105 2106 memmove_end_copy_match_emit_encodeBlockAsm4MB: 2107 MOVQ DI, AX 2108 JMP emit_literal_done_match_emit_encodeBlockAsm4MB 2109 2110 memmove_long_match_emit_encodeBlockAsm4MB: 2111 LEAQ (AX)(R8*1), DI 2112 2113 // genMemMoveLong 2114 MOVOU (SI), X0 2115 MOVOU 16(SI), X1 2116 MOVOU -32(SI)(R8*1), X2 2117 MOVOU -16(SI)(R8*1), X3 2118 MOVQ R8, R10 2119 SHRQ $0x05, R10 2120 MOVQ AX, R9 2121 ANDL $0x0000001f, R9 2122 MOVQ $0x00000040, R11 2123 SUBQ R9, R11 2124 DECQ R10 2125 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 2126 LEAQ -32(SI)(R11*1), R9 2127 LEAQ -32(AX)(R11*1), R12 2128 2129 emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: 2130 MOVOU (R9), X4 2131 MOVOU 16(R9), X5 2132 MOVOA X4, (R12) 2133 MOVOA X5, 16(R12) 2134 ADDQ $0x20, R12 2135 ADDQ $0x20, R9 2136 ADDQ 
$0x20, R11 2137 DECQ R10 2138 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back 2139 2140 emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: 2141 MOVOU -32(SI)(R11*1), X4 2142 MOVOU -16(SI)(R11*1), X5 2143 MOVOA X4, -32(AX)(R11*1) 2144 MOVOA X5, -16(AX)(R11*1) 2145 ADDQ $0x20, R11 2146 CMPQ R8, R11 2147 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 2148 MOVOU X0, (AX) 2149 MOVOU X1, 16(AX) 2150 MOVOU X2, -32(AX)(R8*1) 2151 MOVOU X3, -16(AX)(R8*1) 2152 MOVQ DI, AX 2153 2154 emit_literal_done_match_emit_encodeBlockAsm4MB: 2155 match_nolit_loop_encodeBlockAsm4MB: 2156 MOVL CX, SI 2157 SUBL BX, SI 2158 MOVL SI, 16(SP) 2159 ADDL $0x04, CX 2160 ADDL $0x04, BX 2161 MOVQ src_len+32(FP), SI 2162 SUBL CX, SI 2163 LEAQ (DX)(CX*1), DI 2164 LEAQ (DX)(BX*1), BX 2165 2166 // matchLen 2167 XORL R9, R9 2168 CMPL SI, $0x08 2169 JB matchlen_match4_match_nolit_encodeBlockAsm4MB 2170 2171 matchlen_loopback_match_nolit_encodeBlockAsm4MB: 2172 MOVQ (DI)(R9*1), R8 2173 XORQ (BX)(R9*1), R8 2174 TESTQ R8, R8 2175 JZ matchlen_loop_match_nolit_encodeBlockAsm4MB 2176 2177 #ifdef GOAMD64_v3 2178 TZCNTQ R8, R8 2179 2180 #else 2181 BSFQ R8, R8 2182 2183 #endif 2184 SARQ $0x03, R8 2185 LEAL (R9)(R8*1), R9 2186 JMP match_nolit_end_encodeBlockAsm4MB 2187 2188 matchlen_loop_match_nolit_encodeBlockAsm4MB: 2189 LEAL -8(SI), SI 2190 LEAL 8(R9), R9 2191 CMPL SI, $0x08 2192 JAE matchlen_loopback_match_nolit_encodeBlockAsm4MB 2193 JZ match_nolit_end_encodeBlockAsm4MB 2194 2195 matchlen_match4_match_nolit_encodeBlockAsm4MB: 2196 CMPL SI, $0x04 2197 JB matchlen_match2_match_nolit_encodeBlockAsm4MB 2198 MOVL (DI)(R9*1), R8 2199 CMPL (BX)(R9*1), R8 2200 JNE matchlen_match2_match_nolit_encodeBlockAsm4MB 2201 SUBL $0x04, SI 2202 LEAL 4(R9), R9 2203 2204 matchlen_match2_match_nolit_encodeBlockAsm4MB: 2205 CMPL SI, $0x02 2206 JB matchlen_match1_match_nolit_encodeBlockAsm4MB 2207 MOVW (DI)(R9*1), R8 2208 CMPW (BX)(R9*1), R8 2209 JNE 
matchlen_match1_match_nolit_encodeBlockAsm4MB 2210 SUBL $0x02, SI 2211 LEAL 2(R9), R9 2212 2213 matchlen_match1_match_nolit_encodeBlockAsm4MB: 2214 CMPL SI, $0x01 2215 JB match_nolit_end_encodeBlockAsm4MB 2216 MOVB (DI)(R9*1), R8 2217 CMPB (BX)(R9*1), R8 2218 JNE match_nolit_end_encodeBlockAsm4MB 2219 LEAL 1(R9), R9 2220 2221 match_nolit_end_encodeBlockAsm4MB: 2222 ADDL R9, CX 2223 MOVL 16(SP), BX 2224 ADDL $0x04, R9 2225 MOVL CX, 12(SP) 2226 2227 // emitCopy 2228 CMPL BX, $0x00010000 2229 JB two_byte_offset_match_nolit_encodeBlockAsm4MB 2230 CMPL R9, $0x40 2231 JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB 2232 MOVB $0xff, (AX) 2233 MOVL BX, 1(AX) 2234 LEAL -64(R9), R9 2235 ADDQ $0x05, AX 2236 CMPL R9, $0x04 2237 JB four_bytes_remain_match_nolit_encodeBlockAsm4MB 2238 2239 // emitRepeat 2240 MOVL R9, SI 2241 LEAL -4(R9), R9 2242 CMPL SI, $0x08 2243 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy 2244 CMPL SI, $0x0c 2245 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy 2246 CMPL BX, $0x00000800 2247 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy 2248 2249 cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: 2250 CMPL R9, $0x00000104 2251 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy 2252 CMPL R9, $0x00010100 2253 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy 2254 LEAL -65536(R9), R9 2255 MOVL R9, BX 2256 MOVW $0x001d, (AX) 2257 MOVW R9, 2(AX) 2258 SARL $0x10, BX 2259 MOVB BL, 4(AX) 2260 ADDQ $0x05, AX 2261 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2262 2263 repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: 2264 LEAL -256(R9), R9 2265 MOVW $0x0019, (AX) 2266 MOVW R9, 2(AX) 2267 ADDQ $0x04, AX 2268 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2269 2270 repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: 2271 LEAL -4(R9), R9 2272 MOVW $0x0015, (AX) 2273 MOVB R9, 2(AX) 2274 ADDQ $0x03, AX 2275 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2276 2277 
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: 2278 SHLL $0x02, R9 2279 ORL $0x01, R9 2280 MOVW R9, (AX) 2281 ADDQ $0x02, AX 2282 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2283 2284 repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: 2285 XORQ SI, SI 2286 LEAL 1(SI)(R9*4), R9 2287 MOVB BL, 1(AX) 2288 SARL $0x08, BX 2289 SHLL $0x05, BX 2290 ORL BX, R9 2291 MOVB R9, (AX) 2292 ADDQ $0x02, AX 2293 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2294 2295 four_bytes_remain_match_nolit_encodeBlockAsm4MB: 2296 TESTL R9, R9 2297 JZ match_nolit_emitcopy_end_encodeBlockAsm4MB 2298 XORL SI, SI 2299 LEAL -1(SI)(R9*4), R9 2300 MOVB R9, (AX) 2301 MOVL BX, 1(AX) 2302 ADDQ $0x05, AX 2303 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2304 2305 two_byte_offset_match_nolit_encodeBlockAsm4MB: 2306 CMPL R9, $0x40 2307 JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB 2308 CMPL BX, $0x00000800 2309 JAE long_offset_short_match_nolit_encodeBlockAsm4MB 2310 MOVL $0x00000001, SI 2311 LEAL 16(SI), SI 2312 MOVB BL, 1(AX) 2313 SHRL $0x08, BX 2314 SHLL $0x05, BX 2315 ORL BX, SI 2316 MOVB SI, (AX) 2317 ADDQ $0x02, AX 2318 SUBL $0x08, R9 2319 2320 // emitRepeat 2321 LEAL -4(R9), R9 2322 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b 2323 MOVL R9, SI 2324 LEAL -4(R9), R9 2325 CMPL SI, $0x08 2326 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b 2327 CMPL SI, $0x0c 2328 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b 2329 CMPL BX, $0x00000800 2330 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b 2331 2332 cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: 2333 CMPL R9, $0x00000104 2334 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b 2335 CMPL R9, $0x00010100 2336 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b 2337 LEAL -65536(R9), R9 2338 MOVL R9, BX 2339 MOVW $0x001d, (AX) 2340 MOVW R9, 2(AX) 2341 SARL $0x10, BX 2342 MOVB BL, 
4(AX) 2343 ADDQ $0x05, AX 2344 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2345 2346 repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: 2347 LEAL -256(R9), R9 2348 MOVW $0x0019, (AX) 2349 MOVW R9, 2(AX) 2350 ADDQ $0x04, AX 2351 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2352 2353 repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: 2354 LEAL -4(R9), R9 2355 MOVW $0x0015, (AX) 2356 MOVB R9, 2(AX) 2357 ADDQ $0x03, AX 2358 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2359 2360 repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: 2361 SHLL $0x02, R9 2362 ORL $0x01, R9 2363 MOVW R9, (AX) 2364 ADDQ $0x02, AX 2365 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2366 2367 repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: 2368 XORQ SI, SI 2369 LEAL 1(SI)(R9*4), R9 2370 MOVB BL, 1(AX) 2371 SARL $0x08, BX 2372 SHLL $0x05, BX 2373 ORL BX, R9 2374 MOVB R9, (AX) 2375 ADDQ $0x02, AX 2376 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2377 2378 long_offset_short_match_nolit_encodeBlockAsm4MB: 2379 MOVB $0xee, (AX) 2380 MOVW BX, 1(AX) 2381 LEAL -60(R9), R9 2382 ADDQ $0x03, AX 2383 2384 // emitRepeat 2385 MOVL R9, SI 2386 LEAL -4(R9), R9 2387 CMPL SI, $0x08 2388 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short 2389 CMPL SI, $0x0c 2390 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short 2391 CMPL BX, $0x00000800 2392 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short 2393 2394 cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: 2395 CMPL R9, $0x00000104 2396 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short 2397 CMPL R9, $0x00010100 2398 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short 2399 LEAL -65536(R9), R9 2400 MOVL R9, BX 2401 MOVW $0x001d, (AX) 2402 MOVW R9, 2(AX) 2403 SARL $0x10, BX 2404 MOVB BL, 4(AX) 2405 ADDQ $0x05, AX 2406 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2407 2408 
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: 2409 LEAL -256(R9), R9 2410 MOVW $0x0019, (AX) 2411 MOVW R9, 2(AX) 2412 ADDQ $0x04, AX 2413 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2414 2415 repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: 2416 LEAL -4(R9), R9 2417 MOVW $0x0015, (AX) 2418 MOVB R9, 2(AX) 2419 ADDQ $0x03, AX 2420 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2421 2422 repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: 2423 SHLL $0x02, R9 2424 ORL $0x01, R9 2425 MOVW R9, (AX) 2426 ADDQ $0x02, AX 2427 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2428 2429 repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: 2430 XORQ SI, SI 2431 LEAL 1(SI)(R9*4), R9 2432 MOVB BL, 1(AX) 2433 SARL $0x08, BX 2434 SHLL $0x05, BX 2435 ORL BX, R9 2436 MOVB R9, (AX) 2437 ADDQ $0x02, AX 2438 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2439 2440 two_byte_offset_short_match_nolit_encodeBlockAsm4MB: 2441 MOVL R9, SI 2442 SHLL $0x02, SI 2443 CMPL R9, $0x0c 2444 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB 2445 CMPL BX, $0x00000800 2446 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB 2447 LEAL -15(SI), SI 2448 MOVB BL, 1(AX) 2449 SHRL $0x08, BX 2450 SHLL $0x05, BX 2451 ORL BX, SI 2452 MOVB SI, (AX) 2453 ADDQ $0x02, AX 2454 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB 2455 2456 emit_copy_three_match_nolit_encodeBlockAsm4MB: 2457 LEAL -2(SI), SI 2458 MOVB SI, (AX) 2459 MOVW BX, 1(AX) 2460 ADDQ $0x03, AX 2461 2462 match_nolit_emitcopy_end_encodeBlockAsm4MB: 2463 CMPL CX, 8(SP) 2464 JAE emit_remainder_encodeBlockAsm4MB 2465 MOVQ -2(DX)(CX*1), SI 2466 CMPQ AX, (SP) 2467 JB match_nolit_dst_ok_encodeBlockAsm4MB 2468 MOVQ $0x00000000, ret+48(FP) 2469 RET 2470 2471 match_nolit_dst_ok_encodeBlockAsm4MB: 2472 MOVQ $0x0000cf1bbcdcbf9b, R8 2473 MOVQ SI, DI 2474 SHRQ $0x10, SI 2475 MOVQ SI, BX 2476 SHLQ $0x10, DI 2477 IMULQ R8, DI 2478 SHRQ $0x32, DI 2479 SHLQ $0x10, BX 2480 IMULQ R8, BX 2481 SHRQ $0x32, BX 2482 LEAL -2(CX), R8 2483 
LEAQ 24(SP)(BX*4), R9 2484 MOVL (R9), BX 2485 MOVL R8, 24(SP)(DI*4) 2486 MOVL CX, (R9) 2487 CMPL (DX)(BX*1), SI 2488 JEQ match_nolit_loop_encodeBlockAsm4MB 2489 INCL CX 2490 JMP search_loop_encodeBlockAsm4MB 2491 2492 emit_remainder_encodeBlockAsm4MB: 2493 MOVQ src_len+32(FP), CX 2494 SUBL 12(SP), CX 2495 LEAQ 4(AX)(CX*1), CX 2496 CMPQ CX, (SP) 2497 JB emit_remainder_ok_encodeBlockAsm4MB 2498 MOVQ $0x00000000, ret+48(FP) 2499 RET 2500 2501 emit_remainder_ok_encodeBlockAsm4MB: 2502 MOVQ src_len+32(FP), CX 2503 MOVL 12(SP), BX 2504 CMPL BX, CX 2505 JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB 2506 MOVL CX, SI 2507 MOVL CX, 12(SP) 2508 LEAQ (DX)(BX*1), CX 2509 SUBL BX, SI 2510 LEAL -1(SI), DX 2511 CMPL DX, $0x3c 2512 JB one_byte_emit_remainder_encodeBlockAsm4MB 2513 CMPL DX, $0x00000100 2514 JB two_bytes_emit_remainder_encodeBlockAsm4MB 2515 CMPL DX, $0x00010000 2516 JB three_bytes_emit_remainder_encodeBlockAsm4MB 2517 MOVL DX, BX 2518 SHRL $0x10, BX 2519 MOVB $0xf8, (AX) 2520 MOVW DX, 1(AX) 2521 MOVB BL, 3(AX) 2522 ADDQ $0x04, AX 2523 JMP memmove_long_emit_remainder_encodeBlockAsm4MB 2524 2525 three_bytes_emit_remainder_encodeBlockAsm4MB: 2526 MOVB $0xf4, (AX) 2527 MOVW DX, 1(AX) 2528 ADDQ $0x03, AX 2529 JMP memmove_long_emit_remainder_encodeBlockAsm4MB 2530 2531 two_bytes_emit_remainder_encodeBlockAsm4MB: 2532 MOVB $0xf0, (AX) 2533 MOVB DL, 1(AX) 2534 ADDQ $0x02, AX 2535 CMPL DX, $0x40 2536 JB memmove_emit_remainder_encodeBlockAsm4MB 2537 JMP memmove_long_emit_remainder_encodeBlockAsm4MB 2538 2539 one_byte_emit_remainder_encodeBlockAsm4MB: 2540 SHLB $0x02, DL 2541 MOVB DL, (AX) 2542 ADDQ $0x01, AX 2543 2544 memmove_emit_remainder_encodeBlockAsm4MB: 2545 LEAQ (AX)(SI*1), DX 2546 MOVL SI, BX 2547 2548 // genMemMoveShort 2549 CMPQ BX, $0x03 2550 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 2551 JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 2552 CMPQ BX, $0x08 2553 JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 2554 CMPQ BX, $0x10 2555 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 2556 CMPQ BX, $0x20 2557 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 2558 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 2559 2560 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: 2561 MOVB (CX), SI 2562 MOVB -1(CX)(BX*1), CL 2563 MOVB SI, (AX) 2564 MOVB CL, -1(AX)(BX*1) 2565 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB 2566 2567 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: 2568 MOVW (CX), SI 2569 MOVB 2(CX), CL 2570 MOVW SI, (AX) 2571 MOVB CL, 2(AX) 2572 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB 2573 2574 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: 2575 MOVL (CX), SI 2576 MOVL -4(CX)(BX*1), CX 2577 MOVL SI, (AX) 2578 MOVL CX, -4(AX)(BX*1) 2579 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB 2580 2581 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: 2582 MOVQ (CX), SI 2583 MOVQ -8(CX)(BX*1), CX 2584 MOVQ SI, (AX) 2585 MOVQ CX, -8(AX)(BX*1) 2586 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB 2587 2588 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: 2589 MOVOU (CX), X0 2590 MOVOU -16(CX)(BX*1), X1 2591 MOVOU X0, (AX) 2592 MOVOU X1, -16(AX)(BX*1) 2593 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB 2594 2595 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: 2596 MOVOU (CX), X0 2597 MOVOU 16(CX), X1 2598 MOVOU -32(CX)(BX*1), X2 2599 MOVOU -16(CX)(BX*1), X3 2600 MOVOU X0, (AX) 2601 MOVOU X1, 16(AX) 2602 MOVOU X2, -32(AX)(BX*1) 2603 MOVOU X3, -16(AX)(BX*1) 2604 2605 memmove_end_copy_emit_remainder_encodeBlockAsm4MB: 2606 MOVQ DX, AX 2607 JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB 2608 2609 
memmove_long_emit_remainder_encodeBlockAsm4MB: 2610 LEAQ (AX)(SI*1), DX 2611 MOVL SI, BX 2612 2613 // genMemMoveLong 2614 MOVOU (CX), X0 2615 MOVOU 16(CX), X1 2616 MOVOU -32(CX)(BX*1), X2 2617 MOVOU -16(CX)(BX*1), X3 2618 MOVQ BX, DI 2619 SHRQ $0x05, DI 2620 MOVQ AX, SI 2621 ANDL $0x0000001f, SI 2622 MOVQ $0x00000040, R8 2623 SUBQ SI, R8 2624 DECQ DI 2625 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 2626 LEAQ -32(CX)(R8*1), SI 2627 LEAQ -32(AX)(R8*1), R9 2628 2629 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: 2630 MOVOU (SI), X4 2631 MOVOU 16(SI), X5 2632 MOVOA X4, (R9) 2633 MOVOA X5, 16(R9) 2634 ADDQ $0x20, R9 2635 ADDQ $0x20, SI 2636 ADDQ $0x20, R8 2637 DECQ DI 2638 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back 2639 2640 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: 2641 MOVOU -32(CX)(R8*1), X4 2642 MOVOU -16(CX)(R8*1), X5 2643 MOVOA X4, -32(AX)(R8*1) 2644 MOVOA X5, -16(AX)(R8*1) 2645 ADDQ $0x20, R8 2646 CMPQ BX, R8 2647 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 2648 MOVOU X0, (AX) 2649 MOVOU X1, 16(AX) 2650 MOVOU X2, -32(AX)(BX*1) 2651 MOVOU X3, -16(AX)(BX*1) 2652 MOVQ DX, AX 2653 2654 emit_literal_done_emit_remainder_encodeBlockAsm4MB: 2655 MOVQ dst_base+0(FP), CX 2656 SUBQ CX, AX 2657 MOVQ AX, ret+48(FP) 2658 RET 2659 2660 // func encodeBlockAsm12B(dst []byte, src []byte) int 2661 // Requires: BMI, SSE2 2662 TEXT ·encodeBlockAsm12B(SB), $16408-56 2663 MOVQ dst_base+0(FP), AX 2664 MOVQ $0x00000080, CX 2665 LEAQ 24(SP), DX 2666 PXOR X0, X0 2667 2668 zero_loop_encodeBlockAsm12B: 2669 MOVOU X0, (DX) 2670 MOVOU X0, 16(DX) 2671 MOVOU X0, 32(DX) 2672 MOVOU X0, 48(DX) 2673 MOVOU X0, 64(DX) 2674 MOVOU X0, 80(DX) 2675 MOVOU X0, 96(DX) 2676 MOVOU X0, 112(DX) 2677 ADDQ $0x80, DX 2678 DECQ CX 2679 JNZ zero_loop_encodeBlockAsm12B 2680 MOVL $0x00000000, 12(SP) 2681 MOVQ src_len+32(FP), CX 2682 LEAQ 
-9(CX), DX 2683 LEAQ -8(CX), BX 2684 MOVL BX, 8(SP) 2685 SHRQ $0x05, CX 2686 SUBL CX, DX 2687 LEAQ (AX)(DX*1), DX 2688 MOVQ DX, (SP) 2689 MOVL $0x00000001, CX 2690 MOVL CX, 16(SP) 2691 MOVQ src_base+24(FP), DX 2692 2693 search_loop_encodeBlockAsm12B: 2694 MOVL CX, BX 2695 SUBL 12(SP), BX 2696 SHRL $0x05, BX 2697 LEAL 4(CX)(BX*1), BX 2698 CMPL BX, 8(SP) 2699 JAE emit_remainder_encodeBlockAsm12B 2700 MOVQ (DX)(CX*1), SI 2701 MOVL BX, 20(SP) 2702 MOVQ $0x000000cf1bbcdcbb, R8 2703 MOVQ SI, R9 2704 MOVQ SI, R10 2705 SHRQ $0x08, R10 2706 SHLQ $0x18, R9 2707 IMULQ R8, R9 2708 SHRQ $0x34, R9 2709 SHLQ $0x18, R10 2710 IMULQ R8, R10 2711 SHRQ $0x34, R10 2712 MOVL 24(SP)(R9*4), BX 2713 MOVL 24(SP)(R10*4), DI 2714 MOVL CX, 24(SP)(R9*4) 2715 LEAL 1(CX), R9 2716 MOVL R9, 24(SP)(R10*4) 2717 MOVQ SI, R9 2718 SHRQ $0x10, R9 2719 SHLQ $0x18, R9 2720 IMULQ R8, R9 2721 SHRQ $0x34, R9 2722 MOVL CX, R8 2723 SUBL 16(SP), R8 2724 MOVL 1(DX)(R8*1), R10 2725 MOVQ SI, R8 2726 SHRQ $0x08, R8 2727 CMPL R8, R10 2728 JNE no_repeat_found_encodeBlockAsm12B 2729 LEAL 1(CX), SI 2730 MOVL 12(SP), DI 2731 MOVL SI, BX 2732 SUBL 16(SP), BX 2733 JZ repeat_extend_back_end_encodeBlockAsm12B 2734 2735 repeat_extend_back_loop_encodeBlockAsm12B: 2736 CMPL SI, DI 2737 JBE repeat_extend_back_end_encodeBlockAsm12B 2738 MOVB -1(DX)(BX*1), R8 2739 MOVB -1(DX)(SI*1), R9 2740 CMPB R8, R9 2741 JNE repeat_extend_back_end_encodeBlockAsm12B 2742 LEAL -1(SI), SI 2743 DECL BX 2744 JNZ repeat_extend_back_loop_encodeBlockAsm12B 2745 2746 repeat_extend_back_end_encodeBlockAsm12B: 2747 MOVL 12(SP), BX 2748 CMPL BX, SI 2749 JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B 2750 MOVL SI, R8 2751 MOVL SI, 12(SP) 2752 LEAQ (DX)(BX*1), R9 2753 SUBL BX, R8 2754 LEAL -1(R8), BX 2755 CMPL BX, $0x3c 2756 JB one_byte_repeat_emit_encodeBlockAsm12B 2757 CMPL BX, $0x00000100 2758 JB two_bytes_repeat_emit_encodeBlockAsm12B 2759 JB three_bytes_repeat_emit_encodeBlockAsm12B 2760 2761 three_bytes_repeat_emit_encodeBlockAsm12B: 2762 MOVB 
$0xf4, (AX) 2763 MOVW BX, 1(AX) 2764 ADDQ $0x03, AX 2765 JMP memmove_long_repeat_emit_encodeBlockAsm12B 2766 2767 two_bytes_repeat_emit_encodeBlockAsm12B: 2768 MOVB $0xf0, (AX) 2769 MOVB BL, 1(AX) 2770 ADDQ $0x02, AX 2771 CMPL BX, $0x40 2772 JB memmove_repeat_emit_encodeBlockAsm12B 2773 JMP memmove_long_repeat_emit_encodeBlockAsm12B 2774 2775 one_byte_repeat_emit_encodeBlockAsm12B: 2776 SHLB $0x02, BL 2777 MOVB BL, (AX) 2778 ADDQ $0x01, AX 2779 2780 memmove_repeat_emit_encodeBlockAsm12B: 2781 LEAQ (AX)(R8*1), BX 2782 2783 // genMemMoveShort 2784 CMPQ R8, $0x08 2785 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 2786 CMPQ R8, $0x10 2787 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 2788 CMPQ R8, $0x20 2789 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 2790 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 2791 2792 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: 2793 MOVQ (R9), R10 2794 MOVQ R10, (AX) 2795 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B 2796 2797 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: 2798 MOVQ (R9), R10 2799 MOVQ -8(R9)(R8*1), R9 2800 MOVQ R10, (AX) 2801 MOVQ R9, -8(AX)(R8*1) 2802 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B 2803 2804 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: 2805 MOVOU (R9), X0 2806 MOVOU -16(R9)(R8*1), X1 2807 MOVOU X0, (AX) 2808 MOVOU X1, -16(AX)(R8*1) 2809 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B 2810 2811 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: 2812 MOVOU (R9), X0 2813 MOVOU 16(R9), X1 2814 MOVOU -32(R9)(R8*1), X2 2815 MOVOU -16(R9)(R8*1), X3 2816 MOVOU X0, (AX) 2817 MOVOU X1, 16(AX) 2818 MOVOU X2, -32(AX)(R8*1) 2819 MOVOU X3, -16(AX)(R8*1) 2820 2821 memmove_end_copy_repeat_emit_encodeBlockAsm12B: 2822 MOVQ BX, AX 2823 JMP emit_literal_done_repeat_emit_encodeBlockAsm12B 2824 2825 
memmove_long_repeat_emit_encodeBlockAsm12B: 2826 LEAQ (AX)(R8*1), BX 2827 2828 // genMemMoveLong 2829 MOVOU (R9), X0 2830 MOVOU 16(R9), X1 2831 MOVOU -32(R9)(R8*1), X2 2832 MOVOU -16(R9)(R8*1), X3 2833 MOVQ R8, R11 2834 SHRQ $0x05, R11 2835 MOVQ AX, R10 2836 ANDL $0x0000001f, R10 2837 MOVQ $0x00000040, R12 2838 SUBQ R10, R12 2839 DECQ R11 2840 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 2841 LEAQ -32(R9)(R12*1), R10 2842 LEAQ -32(AX)(R12*1), R13 2843 2844 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: 2845 MOVOU (R10), X4 2846 MOVOU 16(R10), X5 2847 MOVOA X4, (R13) 2848 MOVOA X5, 16(R13) 2849 ADDQ $0x20, R13 2850 ADDQ $0x20, R10 2851 ADDQ $0x20, R12 2852 DECQ R11 2853 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back 2854 2855 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: 2856 MOVOU -32(R9)(R12*1), X4 2857 MOVOU -16(R9)(R12*1), X5 2858 MOVOA X4, -32(AX)(R12*1) 2859 MOVOA X5, -16(AX)(R12*1) 2860 ADDQ $0x20, R12 2861 CMPQ R8, R12 2862 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 2863 MOVOU X0, (AX) 2864 MOVOU X1, 16(AX) 2865 MOVOU X2, -32(AX)(R8*1) 2866 MOVOU X3, -16(AX)(R8*1) 2867 MOVQ BX, AX 2868 2869 emit_literal_done_repeat_emit_encodeBlockAsm12B: 2870 ADDL $0x05, CX 2871 MOVL CX, BX 2872 SUBL 16(SP), BX 2873 MOVQ src_len+32(FP), R8 2874 SUBL CX, R8 2875 LEAQ (DX)(CX*1), R9 2876 LEAQ (DX)(BX*1), BX 2877 2878 // matchLen 2879 XORL R11, R11 2880 CMPL R8, $0x08 2881 JB matchlen_match4_repeat_extend_encodeBlockAsm12B 2882 2883 matchlen_loopback_repeat_extend_encodeBlockAsm12B: 2884 MOVQ (R9)(R11*1), R10 2885 XORQ (BX)(R11*1), R10 2886 TESTQ R10, R10 2887 JZ matchlen_loop_repeat_extend_encodeBlockAsm12B 2888 2889 #ifdef GOAMD64_v3 2890 TZCNTQ R10, R10 2891 2892 #else 2893 BSFQ R10, R10 2894 2895 #endif 2896 SARQ $0x03, R10 2897 LEAL (R11)(R10*1), R11 2898 JMP repeat_extend_forward_end_encodeBlockAsm12B 2899 2900 
matchlen_loop_repeat_extend_encodeBlockAsm12B: 2901 LEAL -8(R8), R8 2902 LEAL 8(R11), R11 2903 CMPL R8, $0x08 2904 JAE matchlen_loopback_repeat_extend_encodeBlockAsm12B 2905 JZ repeat_extend_forward_end_encodeBlockAsm12B 2906 2907 matchlen_match4_repeat_extend_encodeBlockAsm12B: 2908 CMPL R8, $0x04 2909 JB matchlen_match2_repeat_extend_encodeBlockAsm12B 2910 MOVL (R9)(R11*1), R10 2911 CMPL (BX)(R11*1), R10 2912 JNE matchlen_match2_repeat_extend_encodeBlockAsm12B 2913 SUBL $0x04, R8 2914 LEAL 4(R11), R11 2915 2916 matchlen_match2_repeat_extend_encodeBlockAsm12B: 2917 CMPL R8, $0x02 2918 JB matchlen_match1_repeat_extend_encodeBlockAsm12B 2919 MOVW (R9)(R11*1), R10 2920 CMPW (BX)(R11*1), R10 2921 JNE matchlen_match1_repeat_extend_encodeBlockAsm12B 2922 SUBL $0x02, R8 2923 LEAL 2(R11), R11 2924 2925 matchlen_match1_repeat_extend_encodeBlockAsm12B: 2926 CMPL R8, $0x01 2927 JB repeat_extend_forward_end_encodeBlockAsm12B 2928 MOVB (R9)(R11*1), R10 2929 CMPB (BX)(R11*1), R10 2930 JNE repeat_extend_forward_end_encodeBlockAsm12B 2931 LEAL 1(R11), R11 2932 2933 repeat_extend_forward_end_encodeBlockAsm12B: 2934 ADDL R11, CX 2935 MOVL CX, BX 2936 SUBL SI, BX 2937 MOVL 16(SP), SI 2938 TESTL DI, DI 2939 JZ repeat_as_copy_encodeBlockAsm12B 2940 2941 // emitRepeat 2942 MOVL BX, DI 2943 LEAL -4(BX), BX 2944 CMPL DI, $0x08 2945 JBE repeat_two_match_repeat_encodeBlockAsm12B 2946 CMPL DI, $0x0c 2947 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B 2948 CMPL SI, $0x00000800 2949 JB repeat_two_offset_match_repeat_encodeBlockAsm12B 2950 2951 cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: 2952 CMPL BX, $0x00000104 2953 JB repeat_three_match_repeat_encodeBlockAsm12B 2954 LEAL -256(BX), BX 2955 MOVW $0x0019, (AX) 2956 MOVW BX, 2(AX) 2957 ADDQ $0x04, AX 2958 JMP repeat_end_emit_encodeBlockAsm12B 2959 2960 repeat_three_match_repeat_encodeBlockAsm12B: 2961 LEAL -4(BX), BX 2962 MOVW $0x0015, (AX) 2963 MOVB BL, 2(AX) 2964 ADDQ $0x03, AX 2965 JMP repeat_end_emit_encodeBlockAsm12B 
2966 2967 repeat_two_match_repeat_encodeBlockAsm12B: 2968 SHLL $0x02, BX 2969 ORL $0x01, BX 2970 MOVW BX, (AX) 2971 ADDQ $0x02, AX 2972 JMP repeat_end_emit_encodeBlockAsm12B 2973 2974 repeat_two_offset_match_repeat_encodeBlockAsm12B: 2975 XORQ DI, DI 2976 LEAL 1(DI)(BX*4), BX 2977 MOVB SI, 1(AX) 2978 SARL $0x08, SI 2979 SHLL $0x05, SI 2980 ORL SI, BX 2981 MOVB BL, (AX) 2982 ADDQ $0x02, AX 2983 JMP repeat_end_emit_encodeBlockAsm12B 2984 2985 repeat_as_copy_encodeBlockAsm12B: 2986 // emitCopy 2987 CMPL BX, $0x40 2988 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B 2989 CMPL SI, $0x00000800 2990 JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B 2991 MOVL $0x00000001, DI 2992 LEAL 16(DI), DI 2993 MOVB SI, 1(AX) 2994 SHRL $0x08, SI 2995 SHLL $0x05, SI 2996 ORL SI, DI 2997 MOVB DI, (AX) 2998 ADDQ $0x02, AX 2999 SUBL $0x08, BX 3000 3001 // emitRepeat 3002 LEAL -4(BX), BX 3003 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b 3004 MOVL BX, DI 3005 LEAL -4(BX), BX 3006 CMPL DI, $0x08 3007 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b 3008 CMPL DI, $0x0c 3009 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b 3010 CMPL SI, $0x00000800 3011 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b 3012 3013 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: 3014 CMPL BX, $0x00000104 3015 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b 3016 LEAL -256(BX), BX 3017 MOVW $0x0019, (AX) 3018 MOVW BX, 2(AX) 3019 ADDQ $0x04, AX 3020 JMP repeat_end_emit_encodeBlockAsm12B 3021 3022 repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: 3023 LEAL -4(BX), BX 3024 MOVW $0x0015, (AX) 3025 MOVB BL, 2(AX) 3026 ADDQ $0x03, AX 3027 JMP repeat_end_emit_encodeBlockAsm12B 3028 3029 repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: 3030 SHLL $0x02, BX 3031 ORL $0x01, BX 3032 MOVW BX, (AX) 3033 ADDQ $0x02, AX 3034 JMP 
repeat_end_emit_encodeBlockAsm12B 3035 3036 repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: 3037 XORQ DI, DI 3038 LEAL 1(DI)(BX*4), BX 3039 MOVB SI, 1(AX) 3040 SARL $0x08, SI 3041 SHLL $0x05, SI 3042 ORL SI, BX 3043 MOVB BL, (AX) 3044 ADDQ $0x02, AX 3045 JMP repeat_end_emit_encodeBlockAsm12B 3046 3047 long_offset_short_repeat_as_copy_encodeBlockAsm12B: 3048 MOVB $0xee, (AX) 3049 MOVW SI, 1(AX) 3050 LEAL -60(BX), BX 3051 ADDQ $0x03, AX 3052 3053 // emitRepeat 3054 MOVL BX, DI 3055 LEAL -4(BX), BX 3056 CMPL DI, $0x08 3057 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short 3058 CMPL DI, $0x0c 3059 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short 3060 CMPL SI, $0x00000800 3061 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short 3062 3063 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: 3064 CMPL BX, $0x00000104 3065 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short 3066 LEAL -256(BX), BX 3067 MOVW $0x0019, (AX) 3068 MOVW BX, 2(AX) 3069 ADDQ $0x04, AX 3070 JMP repeat_end_emit_encodeBlockAsm12B 3071 3072 repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: 3073 LEAL -4(BX), BX 3074 MOVW $0x0015, (AX) 3075 MOVB BL, 2(AX) 3076 ADDQ $0x03, AX 3077 JMP repeat_end_emit_encodeBlockAsm12B 3078 3079 repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: 3080 SHLL $0x02, BX 3081 ORL $0x01, BX 3082 MOVW BX, (AX) 3083 ADDQ $0x02, AX 3084 JMP repeat_end_emit_encodeBlockAsm12B 3085 3086 repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: 3087 XORQ DI, DI 3088 LEAL 1(DI)(BX*4), BX 3089 MOVB SI, 1(AX) 3090 SARL $0x08, SI 3091 SHLL $0x05, SI 3092 ORL SI, BX 3093 MOVB BL, (AX) 3094 ADDQ $0x02, AX 3095 JMP repeat_end_emit_encodeBlockAsm12B 3096 3097 two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: 3098 MOVL BX, DI 3099 SHLL $0x02, DI 3100 CMPL BX, $0x0c 3101 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B 3102 CMPL SI, 
$0x00000800 3103 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B 3104 LEAL -15(DI), DI 3105 MOVB SI, 1(AX) 3106 SHRL $0x08, SI 3107 SHLL $0x05, SI 3108 ORL SI, DI 3109 MOVB DI, (AX) 3110 ADDQ $0x02, AX 3111 JMP repeat_end_emit_encodeBlockAsm12B 3112 3113 emit_copy_three_repeat_as_copy_encodeBlockAsm12B: 3114 LEAL -2(DI), DI 3115 MOVB DI, (AX) 3116 MOVW SI, 1(AX) 3117 ADDQ $0x03, AX 3118 3119 repeat_end_emit_encodeBlockAsm12B: 3120 MOVL CX, 12(SP) 3121 JMP search_loop_encodeBlockAsm12B 3122 3123 no_repeat_found_encodeBlockAsm12B: 3124 CMPL (DX)(BX*1), SI 3125 JEQ candidate_match_encodeBlockAsm12B 3126 SHRQ $0x08, SI 3127 MOVL 24(SP)(R9*4), BX 3128 LEAL 2(CX), R8 3129 CMPL (DX)(DI*1), SI 3130 JEQ candidate2_match_encodeBlockAsm12B 3131 MOVL R8, 24(SP)(R9*4) 3132 SHRQ $0x08, SI 3133 CMPL (DX)(BX*1), SI 3134 JEQ candidate3_match_encodeBlockAsm12B 3135 MOVL 20(SP), CX 3136 JMP search_loop_encodeBlockAsm12B 3137 3138 candidate3_match_encodeBlockAsm12B: 3139 ADDL $0x02, CX 3140 JMP candidate_match_encodeBlockAsm12B 3141 3142 candidate2_match_encodeBlockAsm12B: 3143 MOVL R8, 24(SP)(R9*4) 3144 INCL CX 3145 MOVL DI, BX 3146 3147 candidate_match_encodeBlockAsm12B: 3148 MOVL 12(SP), SI 3149 TESTL BX, BX 3150 JZ match_extend_back_end_encodeBlockAsm12B 3151 3152 match_extend_back_loop_encodeBlockAsm12B: 3153 CMPL CX, SI 3154 JBE match_extend_back_end_encodeBlockAsm12B 3155 MOVB -1(DX)(BX*1), DI 3156 MOVB -1(DX)(CX*1), R8 3157 CMPB DI, R8 3158 JNE match_extend_back_end_encodeBlockAsm12B 3159 LEAL -1(CX), CX 3160 DECL BX 3161 JZ match_extend_back_end_encodeBlockAsm12B 3162 JMP match_extend_back_loop_encodeBlockAsm12B 3163 3164 match_extend_back_end_encodeBlockAsm12B: 3165 MOVL CX, SI 3166 SUBL 12(SP), SI 3167 LEAQ 3(AX)(SI*1), SI 3168 CMPQ SI, (SP) 3169 JB match_dst_size_check_encodeBlockAsm12B 3170 MOVQ $0x00000000, ret+48(FP) 3171 RET 3172 3173 match_dst_size_check_encodeBlockAsm12B: 3174 MOVL CX, SI 3175 MOVL 12(SP), DI 3176 CMPL DI, SI 3177 JEQ 
emit_literal_done_match_emit_encodeBlockAsm12B 3178 MOVL SI, R8 3179 MOVL SI, 12(SP) 3180 LEAQ (DX)(DI*1), SI 3181 SUBL DI, R8 3182 LEAL -1(R8), DI 3183 CMPL DI, $0x3c 3184 JB one_byte_match_emit_encodeBlockAsm12B 3185 CMPL DI, $0x00000100 3186 JB two_bytes_match_emit_encodeBlockAsm12B 3187 JB three_bytes_match_emit_encodeBlockAsm12B 3188 3189 three_bytes_match_emit_encodeBlockAsm12B: 3190 MOVB $0xf4, (AX) 3191 MOVW DI, 1(AX) 3192 ADDQ $0x03, AX 3193 JMP memmove_long_match_emit_encodeBlockAsm12B 3194 3195 two_bytes_match_emit_encodeBlockAsm12B: 3196 MOVB $0xf0, (AX) 3197 MOVB DI, 1(AX) 3198 ADDQ $0x02, AX 3199 CMPL DI, $0x40 3200 JB memmove_match_emit_encodeBlockAsm12B 3201 JMP memmove_long_match_emit_encodeBlockAsm12B 3202 3203 one_byte_match_emit_encodeBlockAsm12B: 3204 SHLB $0x02, DI 3205 MOVB DI, (AX) 3206 ADDQ $0x01, AX 3207 3208 memmove_match_emit_encodeBlockAsm12B: 3209 LEAQ (AX)(R8*1), DI 3210 3211 // genMemMoveShort 3212 CMPQ R8, $0x08 3213 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 3214 CMPQ R8, $0x10 3215 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 3216 CMPQ R8, $0x20 3217 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 3218 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 3219 3220 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: 3221 MOVQ (SI), R9 3222 MOVQ R9, (AX) 3223 JMP memmove_end_copy_match_emit_encodeBlockAsm12B 3224 3225 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: 3226 MOVQ (SI), R9 3227 MOVQ -8(SI)(R8*1), SI 3228 MOVQ R9, (AX) 3229 MOVQ SI, -8(AX)(R8*1) 3230 JMP memmove_end_copy_match_emit_encodeBlockAsm12B 3231 3232 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: 3233 MOVOU (SI), X0 3234 MOVOU -16(SI)(R8*1), X1 3235 MOVOU X0, (AX) 3236 MOVOU X1, -16(AX)(R8*1) 3237 JMP memmove_end_copy_match_emit_encodeBlockAsm12B 3238 3239 
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: 3240 MOVOU (SI), X0 3241 MOVOU 16(SI), X1 3242 MOVOU -32(SI)(R8*1), X2 3243 MOVOU -16(SI)(R8*1), X3 3244 MOVOU X0, (AX) 3245 MOVOU X1, 16(AX) 3246 MOVOU X2, -32(AX)(R8*1) 3247 MOVOU X3, -16(AX)(R8*1) 3248 3249 memmove_end_copy_match_emit_encodeBlockAsm12B: 3250 MOVQ DI, AX 3251 JMP emit_literal_done_match_emit_encodeBlockAsm12B 3252 3253 memmove_long_match_emit_encodeBlockAsm12B: 3254 LEAQ (AX)(R8*1), DI 3255 3256 // genMemMoveLong 3257 MOVOU (SI), X0 3258 MOVOU 16(SI), X1 3259 MOVOU -32(SI)(R8*1), X2 3260 MOVOU -16(SI)(R8*1), X3 3261 MOVQ R8, R10 3262 SHRQ $0x05, R10 3263 MOVQ AX, R9 3264 ANDL $0x0000001f, R9 3265 MOVQ $0x00000040, R11 3266 SUBQ R9, R11 3267 DECQ R10 3268 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 3269 LEAQ -32(SI)(R11*1), R9 3270 LEAQ -32(AX)(R11*1), R12 3271 3272 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: 3273 MOVOU (R9), X4 3274 MOVOU 16(R9), X5 3275 MOVOA X4, (R12) 3276 MOVOA X5, 16(R12) 3277 ADDQ $0x20, R12 3278 ADDQ $0x20, R9 3279 ADDQ $0x20, R11 3280 DECQ R10 3281 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back 3282 3283 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: 3284 MOVOU -32(SI)(R11*1), X4 3285 MOVOU -16(SI)(R11*1), X5 3286 MOVOA X4, -32(AX)(R11*1) 3287 MOVOA X5, -16(AX)(R11*1) 3288 ADDQ $0x20, R11 3289 CMPQ R8, R11 3290 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 3291 MOVOU X0, (AX) 3292 MOVOU X1, 16(AX) 3293 MOVOU X2, -32(AX)(R8*1) 3294 MOVOU X3, -16(AX)(R8*1) 3295 MOVQ DI, AX 3296 3297 emit_literal_done_match_emit_encodeBlockAsm12B: 3298 match_nolit_loop_encodeBlockAsm12B: 3299 MOVL CX, SI 3300 SUBL BX, SI 3301 MOVL SI, 16(SP) 3302 ADDL $0x04, CX 3303 ADDL $0x04, BX 3304 MOVQ src_len+32(FP), SI 3305 SUBL CX, SI 3306 LEAQ (DX)(CX*1), DI 3307 LEAQ (DX)(BX*1), BX 3308 3309 // matchLen 3310 XORL R9, R9 3311 
CMPL SI, $0x08 3312 JB matchlen_match4_match_nolit_encodeBlockAsm12B 3313 3314 matchlen_loopback_match_nolit_encodeBlockAsm12B: 3315 MOVQ (DI)(R9*1), R8 3316 XORQ (BX)(R9*1), R8 3317 TESTQ R8, R8 3318 JZ matchlen_loop_match_nolit_encodeBlockAsm12B 3319 3320 #ifdef GOAMD64_v3 3321 TZCNTQ R8, R8 3322 3323 #else 3324 BSFQ R8, R8 3325 3326 #endif 3327 SARQ $0x03, R8 3328 LEAL (R9)(R8*1), R9 3329 JMP match_nolit_end_encodeBlockAsm12B 3330 3331 matchlen_loop_match_nolit_encodeBlockAsm12B: 3332 LEAL -8(SI), SI 3333 LEAL 8(R9), R9 3334 CMPL SI, $0x08 3335 JAE matchlen_loopback_match_nolit_encodeBlockAsm12B 3336 JZ match_nolit_end_encodeBlockAsm12B 3337 3338 matchlen_match4_match_nolit_encodeBlockAsm12B: 3339 CMPL SI, $0x04 3340 JB matchlen_match2_match_nolit_encodeBlockAsm12B 3341 MOVL (DI)(R9*1), R8 3342 CMPL (BX)(R9*1), R8 3343 JNE matchlen_match2_match_nolit_encodeBlockAsm12B 3344 SUBL $0x04, SI 3345 LEAL 4(R9), R9 3346 3347 matchlen_match2_match_nolit_encodeBlockAsm12B: 3348 CMPL SI, $0x02 3349 JB matchlen_match1_match_nolit_encodeBlockAsm12B 3350 MOVW (DI)(R9*1), R8 3351 CMPW (BX)(R9*1), R8 3352 JNE matchlen_match1_match_nolit_encodeBlockAsm12B 3353 SUBL $0x02, SI 3354 LEAL 2(R9), R9 3355 3356 matchlen_match1_match_nolit_encodeBlockAsm12B: 3357 CMPL SI, $0x01 3358 JB match_nolit_end_encodeBlockAsm12B 3359 MOVB (DI)(R9*1), R8 3360 CMPB (BX)(R9*1), R8 3361 JNE match_nolit_end_encodeBlockAsm12B 3362 LEAL 1(R9), R9 3363 3364 match_nolit_end_encodeBlockAsm12B: 3365 ADDL R9, CX 3366 MOVL 16(SP), BX 3367 ADDL $0x04, R9 3368 MOVL CX, 12(SP) 3369 3370 // emitCopy 3371 CMPL R9, $0x40 3372 JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B 3373 CMPL BX, $0x00000800 3374 JAE long_offset_short_match_nolit_encodeBlockAsm12B 3375 MOVL $0x00000001, SI 3376 LEAL 16(SI), SI 3377 MOVB BL, 1(AX) 3378 SHRL $0x08, BX 3379 SHLL $0x05, BX 3380 ORL BX, SI 3381 MOVB SI, (AX) 3382 ADDQ $0x02, AX 3383 SUBL $0x08, R9 3384 3385 // emitRepeat 3386 LEAL -4(R9), R9 3387 JMP 
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b 3388 MOVL R9, SI 3389 LEAL -4(R9), R9 3390 CMPL SI, $0x08 3391 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b 3392 CMPL SI, $0x0c 3393 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b 3394 CMPL BX, $0x00000800 3395 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b 3396 3397 cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: 3398 CMPL R9, $0x00000104 3399 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b 3400 LEAL -256(R9), R9 3401 MOVW $0x0019, (AX) 3402 MOVW R9, 2(AX) 3403 ADDQ $0x04, AX 3404 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3405 3406 repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: 3407 LEAL -4(R9), R9 3408 MOVW $0x0015, (AX) 3409 MOVB R9, 2(AX) 3410 ADDQ $0x03, AX 3411 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3412 3413 repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: 3414 SHLL $0x02, R9 3415 ORL $0x01, R9 3416 MOVW R9, (AX) 3417 ADDQ $0x02, AX 3418 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3419 3420 repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: 3421 XORQ SI, SI 3422 LEAL 1(SI)(R9*4), R9 3423 MOVB BL, 1(AX) 3424 SARL $0x08, BX 3425 SHLL $0x05, BX 3426 ORL BX, R9 3427 MOVB R9, (AX) 3428 ADDQ $0x02, AX 3429 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3430 3431 long_offset_short_match_nolit_encodeBlockAsm12B: 3432 MOVB $0xee, (AX) 3433 MOVW BX, 1(AX) 3434 LEAL -60(R9), R9 3435 ADDQ $0x03, AX 3436 3437 // emitRepeat 3438 MOVL R9, SI 3439 LEAL -4(R9), R9 3440 CMPL SI, $0x08 3441 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short 3442 CMPL SI, $0x0c 3443 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short 3444 CMPL BX, $0x00000800 3445 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short 3446 3447 cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: 3448 CMPL R9, 
$0x00000104 3449 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short 3450 LEAL -256(R9), R9 3451 MOVW $0x0019, (AX) 3452 MOVW R9, 2(AX) 3453 ADDQ $0x04, AX 3454 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3455 3456 repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: 3457 LEAL -4(R9), R9 3458 MOVW $0x0015, (AX) 3459 MOVB R9, 2(AX) 3460 ADDQ $0x03, AX 3461 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3462 3463 repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: 3464 SHLL $0x02, R9 3465 ORL $0x01, R9 3466 MOVW R9, (AX) 3467 ADDQ $0x02, AX 3468 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3469 3470 repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: 3471 XORQ SI, SI 3472 LEAL 1(SI)(R9*4), R9 3473 MOVB BL, 1(AX) 3474 SARL $0x08, BX 3475 SHLL $0x05, BX 3476 ORL BX, R9 3477 MOVB R9, (AX) 3478 ADDQ $0x02, AX 3479 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3480 3481 two_byte_offset_short_match_nolit_encodeBlockAsm12B: 3482 MOVL R9, SI 3483 SHLL $0x02, SI 3484 CMPL R9, $0x0c 3485 JAE emit_copy_three_match_nolit_encodeBlockAsm12B 3486 CMPL BX, $0x00000800 3487 JAE emit_copy_three_match_nolit_encodeBlockAsm12B 3488 LEAL -15(SI), SI 3489 MOVB BL, 1(AX) 3490 SHRL $0x08, BX 3491 SHLL $0x05, BX 3492 ORL BX, SI 3493 MOVB SI, (AX) 3494 ADDQ $0x02, AX 3495 JMP match_nolit_emitcopy_end_encodeBlockAsm12B 3496 3497 emit_copy_three_match_nolit_encodeBlockAsm12B: 3498 LEAL -2(SI), SI 3499 MOVB SI, (AX) 3500 MOVW BX, 1(AX) 3501 ADDQ $0x03, AX 3502 3503 match_nolit_emitcopy_end_encodeBlockAsm12B: 3504 CMPL CX, 8(SP) 3505 JAE emit_remainder_encodeBlockAsm12B 3506 MOVQ -2(DX)(CX*1), SI 3507 CMPQ AX, (SP) 3508 JB match_nolit_dst_ok_encodeBlockAsm12B 3509 MOVQ $0x00000000, ret+48(FP) 3510 RET 3511 3512 match_nolit_dst_ok_encodeBlockAsm12B: 3513 MOVQ $0x000000cf1bbcdcbb, R8 3514 MOVQ SI, DI 3515 SHRQ $0x10, SI 3516 MOVQ SI, BX 3517 SHLQ $0x18, DI 3518 IMULQ R8, DI 3519 SHRQ $0x34, DI 3520 SHLQ $0x18, BX 3521 IMULQ R8, BX 3522 SHRQ $0x34, BX 3523 
LEAL -2(CX), R8 3524 LEAQ 24(SP)(BX*4), R9 3525 MOVL (R9), BX 3526 MOVL R8, 24(SP)(DI*4) 3527 MOVL CX, (R9) 3528 CMPL (DX)(BX*1), SI 3529 JEQ match_nolit_loop_encodeBlockAsm12B 3530 INCL CX 3531 JMP search_loop_encodeBlockAsm12B 3532 3533 emit_remainder_encodeBlockAsm12B: 3534 MOVQ src_len+32(FP), CX 3535 SUBL 12(SP), CX 3536 LEAQ 3(AX)(CX*1), CX 3537 CMPQ CX, (SP) 3538 JB emit_remainder_ok_encodeBlockAsm12B 3539 MOVQ $0x00000000, ret+48(FP) 3540 RET 3541 3542 emit_remainder_ok_encodeBlockAsm12B: 3543 MOVQ src_len+32(FP), CX 3544 MOVL 12(SP), BX 3545 CMPL BX, CX 3546 JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B 3547 MOVL CX, SI 3548 MOVL CX, 12(SP) 3549 LEAQ (DX)(BX*1), CX 3550 SUBL BX, SI 3551 LEAL -1(SI), DX 3552 CMPL DX, $0x3c 3553 JB one_byte_emit_remainder_encodeBlockAsm12B 3554 CMPL DX, $0x00000100 3555 JB two_bytes_emit_remainder_encodeBlockAsm12B 3556 JB three_bytes_emit_remainder_encodeBlockAsm12B 3557 3558 three_bytes_emit_remainder_encodeBlockAsm12B: 3559 MOVB $0xf4, (AX) 3560 MOVW DX, 1(AX) 3561 ADDQ $0x03, AX 3562 JMP memmove_long_emit_remainder_encodeBlockAsm12B 3563 3564 two_bytes_emit_remainder_encodeBlockAsm12B: 3565 MOVB $0xf0, (AX) 3566 MOVB DL, 1(AX) 3567 ADDQ $0x02, AX 3568 CMPL DX, $0x40 3569 JB memmove_emit_remainder_encodeBlockAsm12B 3570 JMP memmove_long_emit_remainder_encodeBlockAsm12B 3571 3572 one_byte_emit_remainder_encodeBlockAsm12B: 3573 SHLB $0x02, DL 3574 MOVB DL, (AX) 3575 ADDQ $0x01, AX 3576 3577 memmove_emit_remainder_encodeBlockAsm12B: 3578 LEAQ (AX)(SI*1), DX 3579 MOVL SI, BX 3580 3581 // genMemMoveShort 3582 CMPQ BX, $0x03 3583 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 3584 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 3585 CMPQ BX, $0x08 3586 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 3587 CMPQ BX, $0x10 3588 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 3589 CMPQ BX, $0x20 3590 JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 3591 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 3592 3593 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: 3594 MOVB (CX), SI 3595 MOVB -1(CX)(BX*1), CL 3596 MOVB SI, (AX) 3597 MOVB CL, -1(AX)(BX*1) 3598 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B 3599 3600 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: 3601 MOVW (CX), SI 3602 MOVB 2(CX), CL 3603 MOVW SI, (AX) 3604 MOVB CL, 2(AX) 3605 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B 3606 3607 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: 3608 MOVL (CX), SI 3609 MOVL -4(CX)(BX*1), CX 3610 MOVL SI, (AX) 3611 MOVL CX, -4(AX)(BX*1) 3612 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B 3613 3614 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: 3615 MOVQ (CX), SI 3616 MOVQ -8(CX)(BX*1), CX 3617 MOVQ SI, (AX) 3618 MOVQ CX, -8(AX)(BX*1) 3619 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B 3620 3621 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: 3622 MOVOU (CX), X0 3623 MOVOU -16(CX)(BX*1), X1 3624 MOVOU X0, (AX) 3625 MOVOU X1, -16(AX)(BX*1) 3626 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B 3627 3628 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: 3629 MOVOU (CX), X0 3630 MOVOU 16(CX), X1 3631 MOVOU -32(CX)(BX*1), X2 3632 MOVOU -16(CX)(BX*1), X3 3633 MOVOU X0, (AX) 3634 MOVOU X1, 16(AX) 3635 MOVOU X2, -32(AX)(BX*1) 3636 MOVOU X3, -16(AX)(BX*1) 3637 3638 memmove_end_copy_emit_remainder_encodeBlockAsm12B: 3639 MOVQ DX, AX 3640 JMP emit_literal_done_emit_remainder_encodeBlockAsm12B 3641 3642 memmove_long_emit_remainder_encodeBlockAsm12B: 3643 LEAQ (AX)(SI*1), DX 3644 MOVL SI, BX 3645 3646 // genMemMoveLong 3647 MOVOU (CX), X0 3648 MOVOU 16(CX), X1 3649 MOVOU -32(CX)(BX*1), X2 3650 MOVOU -16(CX)(BX*1), X3 3651 MOVQ BX, DI 3652 SHRQ 
$0x05, DI 3653 MOVQ AX, SI 3654 ANDL $0x0000001f, SI 3655 MOVQ $0x00000040, R8 3656 SUBQ SI, R8 3657 DECQ DI 3658 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 3659 LEAQ -32(CX)(R8*1), SI 3660 LEAQ -32(AX)(R8*1), R9 3661 3662 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: 3663 MOVOU (SI), X4 3664 MOVOU 16(SI), X5 3665 MOVOA X4, (R9) 3666 MOVOA X5, 16(R9) 3667 ADDQ $0x20, R9 3668 ADDQ $0x20, SI 3669 ADDQ $0x20, R8 3670 DECQ DI 3671 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back 3672 3673 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: 3674 MOVOU -32(CX)(R8*1), X4 3675 MOVOU -16(CX)(R8*1), X5 3676 MOVOA X4, -32(AX)(R8*1) 3677 MOVOA X5, -16(AX)(R8*1) 3678 ADDQ $0x20, R8 3679 CMPQ BX, R8 3680 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 3681 MOVOU X0, (AX) 3682 MOVOU X1, 16(AX) 3683 MOVOU X2, -32(AX)(BX*1) 3684 MOVOU X3, -16(AX)(BX*1) 3685 MOVQ DX, AX 3686 3687 emit_literal_done_emit_remainder_encodeBlockAsm12B: 3688 MOVQ dst_base+0(FP), CX 3689 SUBQ CX, AX 3690 MOVQ AX, ret+48(FP) 3691 RET 3692 3693
// ---------------------------------------------------------------------------
// Reviewer notes for encodeBlockAsm10B. Only comment lines were added; every
// instruction of the generated listing is unchanged.
//
// Result: AX is the output cursor and starts at dst_base. The normal exit stores
// AX - dst_base in ret. Three size checks store 0 in ret and return instead:
//   - before literals are written for a candidate match
//     (AX + pending literal length + 3 is not below the limit),
//   - after a copy has been written (AX is not below the limit),
//   - before the final literals (AX + remaining length + 3 is not below the limit).
//
// Registers held across the main loop:
//   AX = output cursor, DX = src_base, CX = current position in src.
//
// Stack slots:
//   (SP)    dst_base + (src_len - 9 - (src_len >> 5)): the output limit.
//   8(SP)   src_len - 8: positions at or beyond this value go to emit_remainder.
//   12(SP)  position in src where the not-yet-written literal bytes start (0 at entry).
//   16(SP)  offset used by the repeat test and stored for each new match (1 at entry).
//   20(SP)  next position to probe; reloaded into CX when no candidate matches.
//   24(SP)  table of 32-bit src positions. The zero loop clears 0x20 * 128 = 4096
//           bytes (1024 entries) before the search starts.
//
// Search step: the next probe position is CX + 4 + ((CX - 12(SP)) >> 5).
//
// Table index: ((x << 32) * 0x9e3779b1) >> 54. Only the low four bytes of x
// contribute and the index has 10 bits. It is computed for the 8 bytes loaded at
// CX and for that value shifted right by 8 and by 16 bits.
//
// Literal header written before each literal copy (n = literal length - 1):
//   n < 60    1 byte:  n << 2
//   n < 256   2 bytes: 0xf0, n
//   else      3 bytes: 0xf4, n as a 16-bit value
// The "JB three_bytes_*" that directly follows "JB two_bytes_*" tests the same
// flags and targets the next instruction, so the 3-byte case is reached by
// fall-through.
//
// NOTE(review): the tag values look like the literal/copy/repeat encodings of the
// s2 package named in the generator command line - confirm against the format
// description before relying on that.
// ---------------------------------------------------------------------------
// func encodeBlockAsm10B(dst []byte, src []byte) int 3694 // Requires: BMI, SSE2 3695 TEXT ·encodeBlockAsm10B(SB), $4120-56 3696 MOVQ dst_base+0(FP), AX 3697 MOVQ $0x00000020, CX 3698 LEAQ 24(SP), DX 3699 PXOR X0, X0 3700 3701 zero_loop_encodeBlockAsm10B: 3702 MOVOU X0, (DX) 3703 MOVOU X0, 16(DX) 3704 MOVOU X0, 32(DX) 3705 MOVOU X0, 48(DX) 3706 MOVOU X0, 64(DX) 3707 MOVOU X0, 80(DX) 3708 MOVOU X0, 96(DX) 3709 MOVOU X0, 112(DX) 3710 ADDQ $0x80, DX 3711 DECQ CX 3712 JNZ zero_loop_encodeBlockAsm10B 3713 MOVL $0x00000000, 12(SP) 3714 MOVQ src_len+32(FP), CX 3715 LEAQ -9(CX), DX 3716 LEAQ -8(CX), BX 3717 MOVL BX, 8(SP) 3718 SHRQ $0x05, CX 3719 SUBL CX, DX 3720 LEAQ (AX)(DX*1), DX 3721 MOVQ DX, (SP) 3722 MOVL $0x00000001, CX 3723 MOVL CX, 16(SP) 3724 MOVQ src_base+24(FP), DX 3725 3726 
search_loop_encodeBlockAsm10B: 3727 MOVL CX, BX 3728 SUBL 12(SP), BX 3729 SHRL $0x05, BX 3730 LEAL 4(CX)(BX*1), BX 3731 CMPL BX, 8(SP) 3732 JAE emit_remainder_encodeBlockAsm10B 3733 MOVQ (DX)(CX*1), SI 3734 MOVL BX, 20(SP) 3735 MOVQ $0x9e3779b1, R8 3736 MOVQ SI, R9 3737 MOVQ SI, R10 3738 SHRQ $0x08, R10 3739 SHLQ $0x20, R9 3740 IMULQ R8, R9 3741 SHRQ $0x36, R9 3742 SHLQ $0x20, R10 3743 IMULQ R8, R10 3744 SHRQ $0x36, R10 3745 MOVL 24(SP)(R9*4), BX 3746 MOVL 24(SP)(R10*4), DI 3747 MOVL CX, 24(SP)(R9*4) 3748 LEAL 1(CX), R9 3749 MOVL R9, 24(SP)(R10*4) 3750 MOVQ SI, R9 3751 SHRQ $0x10, R9 3752 SHLQ $0x20, R9 3753 IMULQ R8, R9 3754 SHRQ $0x36, R9 3755 MOVL CX, R8 3756 SUBL 16(SP), R8 3757 MOVL 1(DX)(R8*1), R10 3758 MOVQ SI, R8 3759 SHRQ $0x08, R8 3760 CMPL R8, R10 3761 JNE no_repeat_found_encodeBlockAsm10B 3762 LEAL 1(CX), SI 3763 MOVL 12(SP), DI 3764 MOVL SI, BX 3765 SUBL 16(SP), BX 3766 JZ repeat_extend_back_end_encodeBlockAsm10B 3767 3768 repeat_extend_back_loop_encodeBlockAsm10B: 3769 CMPL SI, DI 3770 JBE repeat_extend_back_end_encodeBlockAsm10B 3771 MOVB -1(DX)(BX*1), R8 3772 MOVB -1(DX)(SI*1), R9 3773 CMPB R8, R9 3774 JNE repeat_extend_back_end_encodeBlockAsm10B 3775 LEAL -1(SI), SI 3776 DECL BX 3777 JNZ repeat_extend_back_loop_encodeBlockAsm10B 3778 3779 repeat_extend_back_end_encodeBlockAsm10B: 3780 MOVL 12(SP), BX 3781 CMPL BX, SI 3782 JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B 3783 MOVL SI, R8 3784 MOVL SI, 12(SP) 3785 LEAQ (DX)(BX*1), R9 3786 SUBL BX, R8 3787 LEAL -1(R8), BX 3788 CMPL BX, $0x3c 3789 JB one_byte_repeat_emit_encodeBlockAsm10B 3790 CMPL BX, $0x00000100 3791 JB two_bytes_repeat_emit_encodeBlockAsm10B 3792 JB three_bytes_repeat_emit_encodeBlockAsm10B 3793 3794 three_bytes_repeat_emit_encodeBlockAsm10B: 3795 MOVB $0xf4, (AX) 3796 MOVW BX, 1(AX) 3797 ADDQ $0x03, AX 3798 JMP memmove_long_repeat_emit_encodeBlockAsm10B 3799 3800 two_bytes_repeat_emit_encodeBlockAsm10B: 3801 MOVB $0xf0, (AX) 3802 MOVB BL, 1(AX) 3803 ADDQ $0x02, AX 3804 CMPL BX, 
$0x40 3805 JB memmove_repeat_emit_encodeBlockAsm10B 3806 JMP memmove_long_repeat_emit_encodeBlockAsm10B 3807 3808 one_byte_repeat_emit_encodeBlockAsm10B: 3809 SHLB $0x02, BL 3810 MOVB BL, (AX) 3811 ADDQ $0x01, AX 3812 3813 memmove_repeat_emit_encodeBlockAsm10B: 3814 LEAQ (AX)(R8*1), BX 3815 3816
// Literal copy for the repeat path, R8 <= 64 bytes (the header code only comes
// here when length - 1 is below 0x40). R9 = source, AX = destination, and
// BX = AX + R8 becomes the cursor after the copy. Lengths up to 8 use a single
// 8-byte load and store, so this variant can read and write more than R8 bytes.
// genMemMoveShort 3817 CMPQ R8, $0x08 3818 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 3819 CMPQ R8, $0x10 3820 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 3821 CMPQ R8, $0x20 3822 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 3823 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 3824 3825 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: 3826 MOVQ (R9), R10 3827 MOVQ R10, (AX) 3828 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B 3829 3830 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: 3831 MOVQ (R9), R10 3832 MOVQ -8(R9)(R8*1), R9 3833 MOVQ R10, (AX) 3834 MOVQ R9, -8(AX)(R8*1) 3835 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B 3836 3837 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: 3838 MOVOU (R9), X0 3839 MOVOU -16(R9)(R8*1), X1 3840 MOVOU X0, (AX) 3841 MOVOU X1, -16(AX)(R8*1) 3842 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B 3843 3844 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: 3845 MOVOU (R9), X0 3846 MOVOU 16(R9), X1 3847 MOVOU -32(R9)(R8*1), X2 3848 MOVOU -16(R9)(R8*1), X3 3849 MOVOU X0, (AX) 3850 MOVOU X1, 16(AX) 3851 MOVOU X2, -32(AX)(R8*1) 3852 MOVOU X3, -16(AX)(R8*1) 3853 3854 memmove_end_copy_repeat_emit_encodeBlockAsm10B: 3855 MOVQ BX, AX 3856 JMP emit_literal_done_repeat_emit_encodeBlockAsm10B 3857 3858 memmove_long_repeat_emit_encodeBlockAsm10B: 3859 LEAQ (AX)(R8*1), BX 3860 3861
// Literal copy for R8 >= 65 bytes. The first and last 32 bytes are loaded into
// X0-X3 up front and stored last with MOVOU. In between, 32 bytes per iteration
// are loaded with MOVOU and stored with MOVOA. R12 starts at 64 - (AX & 31), which
// makes the store address -32(AX)(R12*1) a multiple of 32.
// genMemMoveLong 3862 MOVOU (R9), X0 3863 MOVOU 16(R9), X1 3864 MOVOU -32(R9)(R8*1), X2 3865 MOVOU -16(R9)(R8*1), X3 3866 MOVQ R8, R11 3867 SHRQ $0x05, R11 3868 MOVQ 
AX, R10 3869 ANDL $0x0000001f, R10 3870 MOVQ $0x00000040, R12 3871 SUBQ R10, R12 3872 DECQ R11 3873 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 3874 LEAQ -32(R9)(R12*1), R10 3875 LEAQ -32(AX)(R12*1), R13 3876 3877 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: 3878 MOVOU (R10), X4 3879 MOVOU 16(R10), X5 3880 MOVOA X4, (R13) 3881 MOVOA X5, 16(R13) 3882 ADDQ $0x20, R13 3883 ADDQ $0x20, R10 3884 ADDQ $0x20, R12 3885 DECQ R11 3886 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back 3887 3888 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: 3889 MOVOU -32(R9)(R12*1), X4 3890 MOVOU -16(R9)(R12*1), X5 3891 MOVOA X4, -32(AX)(R12*1) 3892 MOVOA X5, -16(AX)(R12*1) 3893 ADDQ $0x20, R12 3894 CMPQ R8, R12 3895 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 3896 MOVOU X0, (AX) 3897 MOVOU X1, 16(AX) 3898 MOVOU X2, -32(AX)(R8*1) 3899 MOVOU X3, -16(AX)(R8*1) 3900 MOVQ BX, AX 3901 3902 emit_literal_done_repeat_emit_encodeBlockAsm10B: 3903 ADDL $0x05, CX 3904 MOVL CX, BX 3905 SUBL 16(SP), BX 3906 MOVQ src_len+32(FP), R8 3907 SUBL CX, R8 3908 LEAQ (DX)(CX*1), R9 3909 LEAQ (DX)(BX*1), BX 3910 3911
// Extends the repeat forward: counts equal bytes at (R9) and (BX), looking at no
// more than R8 = src_len - CX bytes, and leaves the count in R11. Eight bytes are
// XORed per iteration; on a difference, the index of the lowest set bit (TZCNTQ or
// BSFQ, selected by GOAMD64_v3) shifted right by 3 is the number of equal bytes in
// that word. Fewer than 8 remaining bytes are checked with 4-, 2- and 1-byte compares.
// matchLen 3912 XORL R11, R11 3913 CMPL R8, $0x08 3914 JB matchlen_match4_repeat_extend_encodeBlockAsm10B 3915 3916 matchlen_loopback_repeat_extend_encodeBlockAsm10B: 3917 MOVQ (R9)(R11*1), R10 3918 XORQ (BX)(R11*1), R10 3919 TESTQ R10, R10 3920 JZ matchlen_loop_repeat_extend_encodeBlockAsm10B 3921 3922 #ifdef GOAMD64_v3 3923 TZCNTQ R10, R10 3924 3925 #else 3926 BSFQ R10, R10 3927 3928 #endif 3929 SARQ $0x03, R10 3930 LEAL (R11)(R10*1), R11 3931 JMP repeat_extend_forward_end_encodeBlockAsm10B 3932 3933 matchlen_loop_repeat_extend_encodeBlockAsm10B: 3934 LEAL -8(R8), R8 3935 LEAL 8(R11), R11 3936 CMPL R8, $0x08 3937 JAE matchlen_loopback_repeat_extend_encodeBlockAsm10B 3938 JZ repeat_extend_forward_end_encodeBlockAsm10B 3939 3940 
matchlen_match4_repeat_extend_encodeBlockAsm10B: 3941 CMPL R8, $0x04 3942 JB matchlen_match2_repeat_extend_encodeBlockAsm10B 3943 MOVL (R9)(R11*1), R10 3944 CMPL (BX)(R11*1), R10 3945 JNE matchlen_match2_repeat_extend_encodeBlockAsm10B 3946 SUBL $0x04, R8 3947 LEAL 4(R11), R11 3948 3949 matchlen_match2_repeat_extend_encodeBlockAsm10B: 3950 CMPL R8, $0x02 3951 JB matchlen_match1_repeat_extend_encodeBlockAsm10B 3952 MOVW (R9)(R11*1), R10 3953 CMPW (BX)(R11*1), R10 3954 JNE matchlen_match1_repeat_extend_encodeBlockAsm10B 3955 SUBL $0x02, R8 3956 LEAL 2(R11), R11 3957 3958 matchlen_match1_repeat_extend_encodeBlockAsm10B: 3959 CMPL R8, $0x01 3960 JB repeat_extend_forward_end_encodeBlockAsm10B 3961 MOVB (R9)(R11*1), R10 3962 CMPB (BX)(R11*1), R10 3963 JNE repeat_extend_forward_end_encodeBlockAsm10B 3964 LEAL 1(R11), R11 3965 3966 repeat_extend_forward_end_encodeBlockAsm10B: 3967 ADDL R11, CX 3968 MOVL CX, BX 3969 SUBL SI, BX 3970 MOVL 16(SP), SI 3971 TESTL DI, DI 3972 JZ repeat_as_copy_encodeBlockAsm10B 3973 3974
// Repeat encoding. On entry BX = match length and SI = offset from 16(SP); DI
// keeps the length and BX becomes length - 4. Bytes written:
//   length <= 8                     2 bytes: 16-bit word ((length - 4) << 2) | 1
//   length < 12 and offset < 2048   2 bytes: ((length - 4) << 2) | 1 | ((offset >> 8) << 5),
//                                   then the low offset byte
//   length - 4 < 260                3 bytes: 16-bit word 0x0015, then byte length - 8
//   otherwise                       4 bytes: 16-bit word 0x0019, then 16-bit length - 260
// emitRepeat 3975 MOVL BX, DI 3976 LEAL -4(BX), BX 3977 CMPL DI, $0x08 3978 JBE repeat_two_match_repeat_encodeBlockAsm10B 3979 CMPL DI, $0x0c 3980 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B 3981 CMPL SI, $0x00000800 3982 JB repeat_two_offset_match_repeat_encodeBlockAsm10B 3983 3984 cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: 3985 CMPL BX, $0x00000104 3986 JB repeat_three_match_repeat_encodeBlockAsm10B 3987 LEAL -256(BX), BX 3988 MOVW $0x0019, (AX) 3989 MOVW BX, 2(AX) 3990 ADDQ $0x04, AX 3991 JMP repeat_end_emit_encodeBlockAsm10B 3992 3993 repeat_three_match_repeat_encodeBlockAsm10B: 3994 LEAL -4(BX), BX 3995 MOVW $0x0015, (AX) 3996 MOVB BL, 2(AX) 3997 ADDQ $0x03, AX 3998 JMP repeat_end_emit_encodeBlockAsm10B 3999 4000 repeat_two_match_repeat_encodeBlockAsm10B: 4001 SHLL $0x02, BX 4002 ORL $0x01, BX 4003 MOVW BX, (AX) 4004 ADDQ $0x02, AX 4005 JMP repeat_end_emit_encodeBlockAsm10B 4006 4007 
repeat_two_offset_match_repeat_encodeBlockAsm10B: 4008 XORQ DI, DI 4009 LEAL 1(DI)(BX*4), BX 4010 MOVB SI, 1(AX) 4011 SARL $0x08, SI 4012 SHLL $0x05, SI 4013 ORL SI, BX 4014 MOVB BL, (AX) 4015 ADDQ $0x02, AX 4016 JMP repeat_end_emit_encodeBlockAsm10B 4017 4018 repeat_as_copy_encodeBlockAsm10B: 4019
// Copy encoding, taken when DI (loaded from 12(SP) when the repeat was found) is 0.
// BX = match length, SI = offset. Lengths <= 64 are handled at
// two_byte_offset_short_*. For longer matches with offset < 2048 a 2-byte element is
// written (first byte 17 | ((offset >> 8) << 5), then the low offset byte) and 8 is
// subtracted from the length; the rest is written by the repeat code that follows.
// emitCopy 4020 CMPL BX, $0x40 4021 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B 4022 CMPL SI, $0x00000800 4023 JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B 4024 MOVL $0x00000001, DI 4025 LEAL 16(DI), DI 4026 MOVB SI, 1(AX) 4027 SHRL $0x08, SI 4028 SHLL $0x05, SI 4029 ORL SI, DI 4030 MOVB DI, (AX) 4031 ADDQ $0x02, AX 4032 SUBL $0x08, BX 4033 4034
// Only the first two instructions of this section run: BX is reduced by 4 and
// control jumps to cant_repeat_two_offset_*_emit_copy_short_2b. The instructions
// between that JMP and the label carry no label of their own, so nothing in this
// listing reaches them.
// emitRepeat 4035 LEAL -4(BX), BX 4036 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b 4037 MOVL BX, DI 4038 LEAL -4(BX), BX 4039 CMPL DI, $0x08 4040 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b 4041 CMPL DI, $0x0c 4042 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b 4043 CMPL SI, $0x00000800 4044 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b 4045 4046 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: 4047 CMPL BX, $0x00000104 4048 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b 4049 LEAL -256(BX), BX 4050 MOVW $0x0019, (AX) 4051 MOVW BX, 2(AX) 4052 ADDQ $0x04, AX 4053 JMP repeat_end_emit_encodeBlockAsm10B 4054 4055 repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: 4056 LEAL -4(BX), BX 4057 MOVW $0x0015, (AX) 4058 MOVB BL, 2(AX) 4059 ADDQ $0x03, AX 4060 JMP repeat_end_emit_encodeBlockAsm10B 4061 4062 repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: 4063 SHLL $0x02, BX 4064 ORL $0x01, BX 4065 MOVW BX, (AX) 4066 ADDQ $0x02, AX 4067 JMP repeat_end_emit_encodeBlockAsm10B 4068 4069 repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: 4070 XORQ DI, DI 4071 LEAL 1(DI)(BX*4), BX 4072 MOVB SI, 1(AX) 4073 SARL 
$0x08, SI 4074 SHLL $0x05, SI 4075 ORL SI, BX 4076 MOVB BL, (AX) 4077 ADDQ $0x02, AX 4078 JMP repeat_end_emit_encodeBlockAsm10B 4079 4080 long_offset_short_repeat_as_copy_encodeBlockAsm10B: 4081 MOVB $0xee, (AX) 4082 MOVW SI, 1(AX) 4083 LEAL -60(BX), BX 4084 ADDQ $0x03, AX 4085 4086
// Offset >= 2048 and length > 64: the 3 bytes 0xee plus the 16-bit offset were
// written just above and 60 was subtracted from the length. The remaining length
// is encoded with the same repeat cases as the first emitRepeat section.
// emitRepeat 4087 MOVL BX, DI 4088 LEAL -4(BX), BX 4089 CMPL DI, $0x08 4090 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short 4091 CMPL DI, $0x0c 4092 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short 4093 CMPL SI, $0x00000800 4094 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short 4095 4096 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: 4097 CMPL BX, $0x00000104 4098 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short 4099 LEAL -256(BX), BX 4100 MOVW $0x0019, (AX) 4101 MOVW BX, 2(AX) 4102 ADDQ $0x04, AX 4103 JMP repeat_end_emit_encodeBlockAsm10B 4104 4105 repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: 4106 LEAL -4(BX), BX 4107 MOVW $0x0015, (AX) 4108 MOVB BL, 2(AX) 4109 ADDQ $0x03, AX 4110 JMP repeat_end_emit_encodeBlockAsm10B 4111 4112 repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: 4113 SHLL $0x02, BX 4114 ORL $0x01, BX 4115 MOVW BX, (AX) 4116 ADDQ $0x02, AX 4117 JMP repeat_end_emit_encodeBlockAsm10B 4118 4119 repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: 4120 XORQ DI, DI 4121 LEAL 1(DI)(BX*4), BX 4122 MOVB SI, 1(AX) 4123 SARL $0x08, SI 4124 SHLL $0x05, SI 4125 ORL SI, BX 4126 MOVB BL, (AX) 4127 ADDQ $0x02, AX 4128 JMP repeat_end_emit_encodeBlockAsm10B 4129 4130 two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: 4131 MOVL BX, DI 4132 SHLL $0x02, DI 4133 CMPL BX, $0x0c 4134 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B 4135 CMPL SI, $0x00000800 4136 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B 4137 LEAL -15(DI), DI 4138 MOVB SI, 1(AX) 4139 SHRL $0x08, SI 4140 SHLL $0x05, SI 4141 ORL SI, DI 4142 MOVB DI, (AX) 4143 ADDQ 
$0x02, AX 4144 JMP repeat_end_emit_encodeBlockAsm10B 4145 4146 emit_copy_three_repeat_as_copy_encodeBlockAsm10B: 4147 LEAL -2(DI), DI 4148 MOVB DI, (AX) 4149 MOVW SI, 1(AX) 4150 ADDQ $0x03, AX 4151 4152 repeat_end_emit_encodeBlockAsm10B: 4153 MOVL CX, 12(SP) 4154 JMP search_loop_encodeBlockAsm10B 4155 4156 no_repeat_found_encodeBlockAsm10B: 4157 CMPL (DX)(BX*1), SI 4158 JEQ candidate_match_encodeBlockAsm10B 4159 SHRQ $0x08, SI 4160 MOVL 24(SP)(R9*4), BX 4161 LEAL 2(CX), R8 4162 CMPL (DX)(DI*1), SI 4163 JEQ candidate2_match_encodeBlockAsm10B 4164 MOVL R8, 24(SP)(R9*4) 4165 SHRQ $0x08, SI 4166 CMPL (DX)(BX*1), SI 4167 JEQ candidate3_match_encodeBlockAsm10B 4168 MOVL 20(SP), CX 4169 JMP search_loop_encodeBlockAsm10B 4170 4171 candidate3_match_encodeBlockAsm10B: 4172 ADDL $0x02, CX 4173 JMP candidate_match_encodeBlockAsm10B 4174 4175 candidate2_match_encodeBlockAsm10B: 4176 MOVL R8, 24(SP)(R9*4) 4177 INCL CX 4178 MOVL DI, BX 4179 4180 candidate_match_encodeBlockAsm10B: 4181 MOVL 12(SP), SI 4182 TESTL BX, BX 4183 JZ match_extend_back_end_encodeBlockAsm10B 4184 4185 match_extend_back_loop_encodeBlockAsm10B: 4186 CMPL CX, SI 4187 JBE match_extend_back_end_encodeBlockAsm10B 4188 MOVB -1(DX)(BX*1), DI 4189 MOVB -1(DX)(CX*1), R8 4190 CMPB DI, R8 4191 JNE match_extend_back_end_encodeBlockAsm10B 4192 LEAL -1(CX), CX 4193 DECL BX 4194 JZ match_extend_back_end_encodeBlockAsm10B 4195 JMP match_extend_back_loop_encodeBlockAsm10B 4196 4197 match_extend_back_end_encodeBlockAsm10B: 4198 MOVL CX, SI 4199 SUBL 12(SP), SI 4200 LEAQ 3(AX)(SI*1), SI 4201 CMPQ SI, (SP) 4202 JB match_dst_size_check_encodeBlockAsm10B 4203 MOVQ $0x00000000, ret+48(FP) 4204 RET 4205 4206 match_dst_size_check_encodeBlockAsm10B: 4207 MOVL CX, SI 4208 MOVL 12(SP), DI 4209 CMPL DI, SI 4210 JEQ emit_literal_done_match_emit_encodeBlockAsm10B 4211 MOVL SI, R8 4212 MOVL SI, 12(SP) 4213 LEAQ (DX)(DI*1), SI 4214 SUBL DI, R8 4215 LEAL -1(R8), DI 4216 CMPL DI, $0x3c 4217 JB one_byte_match_emit_encodeBlockAsm10B 4218 CMPL 
DI, $0x00000100 4219 JB two_bytes_match_emit_encodeBlockAsm10B 4220 JB three_bytes_match_emit_encodeBlockAsm10B 4221 4222 three_bytes_match_emit_encodeBlockAsm10B: 4223 MOVB $0xf4, (AX) 4224 MOVW DI, 1(AX) 4225 ADDQ $0x03, AX 4226 JMP memmove_long_match_emit_encodeBlockAsm10B 4227 4228 two_bytes_match_emit_encodeBlockAsm10B: 4229 MOVB $0xf0, (AX) 4230 MOVB DI, 1(AX) 4231 ADDQ $0x02, AX 4232 CMPL DI, $0x40 4233 JB memmove_match_emit_encodeBlockAsm10B 4234 JMP memmove_long_match_emit_encodeBlockAsm10B 4235 4236 one_byte_match_emit_encodeBlockAsm10B: 4237 SHLB $0x02, DI 4238 MOVB DI, (AX) 4239 ADDQ $0x01, AX 4240 4241 memmove_match_emit_encodeBlockAsm10B: 4242 LEAQ (AX)(R8*1), DI 4243 4244
// Short literal copy for the candidate-match path, same shape as the repeat-path
// one: SI = source, R8 = length (<= 64), DI = AX + R8 is the cursor afterwards.
// genMemMoveShort 4245 CMPQ R8, $0x08 4246 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 4247 CMPQ R8, $0x10 4248 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 4249 CMPQ R8, $0x20 4250 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 4251 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 4252 4253 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: 4254 MOVQ (SI), R9 4255 MOVQ R9, (AX) 4256 JMP memmove_end_copy_match_emit_encodeBlockAsm10B 4257 4258 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: 4259 MOVQ (SI), R9 4260 MOVQ -8(SI)(R8*1), SI 4261 MOVQ R9, (AX) 4262 MOVQ SI, -8(AX)(R8*1) 4263 JMP memmove_end_copy_match_emit_encodeBlockAsm10B 4264 4265 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: 4266 MOVOU (SI), X0 4267 MOVOU -16(SI)(R8*1), X1 4268 MOVOU X0, (AX) 4269 MOVOU X1, -16(AX)(R8*1) 4270 JMP memmove_end_copy_match_emit_encodeBlockAsm10B 4271 4272 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: 4273 MOVOU (SI), X0 4274 MOVOU 16(SI), X1 4275 MOVOU -32(SI)(R8*1), X2 4276 MOVOU -16(SI)(R8*1), X3 4277 MOVOU X0, (AX) 4278 MOVOU X1, 16(AX) 4279 MOVOU X2, -32(AX)(R8*1) 4280 MOVOU X3, -16(AX)(R8*1) 4281 
4282 memmove_end_copy_match_emit_encodeBlockAsm10B: 4283 MOVQ DI, AX 4284 JMP emit_literal_done_match_emit_encodeBlockAsm10B 4285 4286 memmove_long_match_emit_encodeBlockAsm10B: 4287 LEAQ (AX)(R8*1), DI 4288 4289
// Long literal copy (R8 >= 65) for the candidate-match path, same shape as the
// repeat-path one: SI = source, R11 = running offset, DI = AX + R8.
// genMemMoveLong 4290 MOVOU (SI), X0 4291 MOVOU 16(SI), X1 4292 MOVOU -32(SI)(R8*1), X2 4293 MOVOU -16(SI)(R8*1), X3 4294 MOVQ R8, R10 4295 SHRQ $0x05, R10 4296 MOVQ AX, R9 4297 ANDL $0x0000001f, R9 4298 MOVQ $0x00000040, R11 4299 SUBQ R9, R11 4300 DECQ R10 4301 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 4302 LEAQ -32(SI)(R11*1), R9 4303 LEAQ -32(AX)(R11*1), R12 4304 4305 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: 4306 MOVOU (R9), X4 4307 MOVOU 16(R9), X5 4308 MOVOA X4, (R12) 4309 MOVOA X5, 16(R12) 4310 ADDQ $0x20, R12 4311 ADDQ $0x20, R9 4312 ADDQ $0x20, R11 4313 DECQ R10 4314 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back 4315 4316 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: 4317 MOVOU -32(SI)(R11*1), X4 4318 MOVOU -16(SI)(R11*1), X5 4319 MOVOA X4, -32(AX)(R11*1) 4320 MOVOA X5, -16(AX)(R11*1) 4321 ADDQ $0x20, R11 4322 CMPQ R8, R11 4323 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 4324 MOVOU X0, (AX) 4325 MOVOU X1, 16(AX) 4326 MOVOU X2, -32(AX)(R8*1) 4327 MOVOU X3, -16(AX)(R8*1) 4328 MOVQ DI, AX 4329 4330 emit_literal_done_match_emit_encodeBlockAsm10B: 4331 match_nolit_loop_encodeBlockAsm10B: 4332 MOVL CX, SI 4333 SUBL BX, SI 4334 MOVL SI, 16(SP) 4335 ADDL $0x04, CX 4336 ADDL $0x04, BX 4337 MOVQ src_len+32(FP), SI 4338 SUBL CX, SI 4339 LEAQ (DX)(CX*1), DI 4340 LEAQ (DX)(BX*1), BX 4341 4342
// Match length beyond the first 4 bytes: 16(SP) was just set to CX - BX (position
// minus candidate) and both positions were advanced by 4. Compares (DI) with (BX)
// for at most SI = src_len - CX bytes and leaves the count in R9.
// matchLen 4343 XORL R9, R9 4344 CMPL SI, $0x08 4345 JB matchlen_match4_match_nolit_encodeBlockAsm10B 4346 4347 matchlen_loopback_match_nolit_encodeBlockAsm10B: 4348 MOVQ (DI)(R9*1), R8 4349 XORQ (BX)(R9*1), R8 4350 TESTQ R8, R8 4351 JZ matchlen_loop_match_nolit_encodeBlockAsm10B 4352 4353 #ifdef GOAMD64_v3 4354 
TZCNTQ R8, R8 4355 4356 #else 4357 BSFQ R8, R8 4358 4359 #endif 4360 SARQ $0x03, R8 4361 LEAL (R9)(R8*1), R9 4362 JMP match_nolit_end_encodeBlockAsm10B 4363 4364 matchlen_loop_match_nolit_encodeBlockAsm10B: 4365 LEAL -8(SI), SI 4366 LEAL 8(R9), R9 4367 CMPL SI, $0x08 4368 JAE matchlen_loopback_match_nolit_encodeBlockAsm10B 4369 JZ match_nolit_end_encodeBlockAsm10B 4370 4371 matchlen_match4_match_nolit_encodeBlockAsm10B: 4372 CMPL SI, $0x04 4373 JB matchlen_match2_match_nolit_encodeBlockAsm10B 4374 MOVL (DI)(R9*1), R8 4375 CMPL (BX)(R9*1), R8 4376 JNE matchlen_match2_match_nolit_encodeBlockAsm10B 4377 SUBL $0x04, SI 4378 LEAL 4(R9), R9 4379 4380 matchlen_match2_match_nolit_encodeBlockAsm10B: 4381 CMPL SI, $0x02 4382 JB matchlen_match1_match_nolit_encodeBlockAsm10B 4383 MOVW (DI)(R9*1), R8 4384 CMPW (BX)(R9*1), R8 4385 JNE matchlen_match1_match_nolit_encodeBlockAsm10B 4386 SUBL $0x02, SI 4387 LEAL 2(R9), R9 4388 4389 matchlen_match1_match_nolit_encodeBlockAsm10B: 4390 CMPL SI, $0x01 4391 JB match_nolit_end_encodeBlockAsm10B 4392 MOVB (DI)(R9*1), R8 4393 CMPB (BX)(R9*1), R8 4394 JNE match_nolit_end_encodeBlockAsm10B 4395 LEAL 1(R9), R9 4396 4397 match_nolit_end_encodeBlockAsm10B: 4398 ADDL R9, CX 4399 MOVL 16(SP), BX 4400 ADDL $0x04, R9 4401 MOVL CX, 12(SP) 4402 4403
// Copy encoding for a table match: R9 = match length (counted bytes + 4),
// BX = offset from 16(SP). Same cases as the repeat_as_copy section, with
// R9/BX/SI in place of BX/SI/DI.
// emitCopy 4404 CMPL R9, $0x40 4405 JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B 4406 CMPL BX, $0x00000800 4407 JAE long_offset_short_match_nolit_encodeBlockAsm10B 4408 MOVL $0x00000001, SI 4409 LEAL 16(SI), SI 4410 MOVB BL, 1(AX) 4411 SHRL $0x08, BX 4412 SHLL $0x05, BX 4413 ORL BX, SI 4414 MOVB SI, (AX) 4415 ADDQ $0x02, AX 4416 SUBL $0x08, R9 4417 4418
// As in the repeat_as_copy path, only the LEAL and the JMP below execute; the
// instructions up to the cant_repeat_two_offset_*_2b label have no label of their own.
// emitRepeat 4419 LEAL -4(R9), R9 4420 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b 4421 MOVL R9, SI 4422 LEAL -4(R9), R9 4423 CMPL SI, $0x08 4424 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b 4425 CMPL SI, $0x0c 4426 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b 
4427 CMPL BX, $0x00000800 4428 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b 4429 4430 cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: 4431 CMPL R9, $0x00000104 4432 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b 4433 LEAL -256(R9), R9 4434 MOVW $0x0019, (AX) 4435 MOVW R9, 2(AX) 4436 ADDQ $0x04, AX 4437 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4438 4439 repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: 4440 LEAL -4(R9), R9 4441 MOVW $0x0015, (AX) 4442 MOVB R9, 2(AX) 4443 ADDQ $0x03, AX 4444 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4445 4446 repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: 4447 SHLL $0x02, R9 4448 ORL $0x01, R9 4449 MOVW R9, (AX) 4450 ADDQ $0x02, AX 4451 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4452 4453 repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: 4454 XORQ SI, SI 4455 LEAL 1(SI)(R9*4), R9 4456 MOVB BL, 1(AX) 4457 SARL $0x08, BX 4458 SHLL $0x05, BX 4459 ORL BX, R9 4460 MOVB R9, (AX) 4461 ADDQ $0x02, AX 4462 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4463 4464 long_offset_short_match_nolit_encodeBlockAsm10B: 4465 MOVB $0xee, (AX) 4466 MOVW BX, 1(AX) 4467 LEAL -60(R9), R9 4468 ADDQ $0x03, AX 4469 4470
// Offset >= 2048 and length > 64: 0xee plus the 16-bit offset were written above
// and 60 was subtracted from R9; the remaining length is encoded as a repeat.
// emitRepeat 4471 MOVL R9, SI 4472 LEAL -4(R9), R9 4473 CMPL SI, $0x08 4474 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short 4475 CMPL SI, $0x0c 4476 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short 4477 CMPL BX, $0x00000800 4478 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short 4479 4480 cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: 4481 CMPL R9, $0x00000104 4482 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short 4483 LEAL -256(R9), R9 4484 MOVW $0x0019, (AX) 4485 MOVW R9, 2(AX) 4486 ADDQ $0x04, AX 4487 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4488 4489 repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: 4490 LEAL 
-4(R9), R9 4491 MOVW $0x0015, (AX) 4492 MOVB R9, 2(AX) 4493 ADDQ $0x03, AX 4494 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4495 4496 repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: 4497 SHLL $0x02, R9 4498 ORL $0x01, R9 4499 MOVW R9, (AX) 4500 ADDQ $0x02, AX 4501 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4502 4503 repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: 4504 XORQ SI, SI 4505 LEAL 1(SI)(R9*4), R9 4506 MOVB BL, 1(AX) 4507 SARL $0x08, BX 4508 SHLL $0x05, BX 4509 ORL BX, R9 4510 MOVB R9, (AX) 4511 ADDQ $0x02, AX 4512 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4513 4514 two_byte_offset_short_match_nolit_encodeBlockAsm10B: 4515 MOVL R9, SI 4516 SHLL $0x02, SI 4517 CMPL R9, $0x0c 4518 JAE emit_copy_three_match_nolit_encodeBlockAsm10B 4519 CMPL BX, $0x00000800 4520 JAE emit_copy_three_match_nolit_encodeBlockAsm10B 4521 LEAL -15(SI), SI 4522 MOVB BL, 1(AX) 4523 SHRL $0x08, BX 4524 SHLL $0x05, BX 4525 ORL BX, SI 4526 MOVB SI, (AX) 4527 ADDQ $0x02, AX 4528 JMP match_nolit_emitcopy_end_encodeBlockAsm10B 4529 4530 emit_copy_three_match_nolit_encodeBlockAsm10B: 4531 LEAL -2(SI), SI 4532 MOVB SI, (AX) 4533 MOVW BX, 1(AX) 4534 ADDQ $0x03, AX 4535 4536 match_nolit_emitcopy_end_encodeBlockAsm10B: 4537 CMPL CX, 8(SP) 4538 JAE emit_remainder_encodeBlockAsm10B 4539 MOVQ -2(DX)(CX*1), SI 4540 CMPQ AX, (SP) 4541 JB match_nolit_dst_ok_encodeBlockAsm10B 4542 MOVQ $0x00000000, ret+48(FP) 4543 RET 4544 4545 match_nolit_dst_ok_encodeBlockAsm10B: 4546 MOVQ $0x9e3779b1, R8 4547 MOVQ SI, DI 4548 SHRQ $0x10, SI 4549 MOVQ SI, BX 4550 SHLQ $0x20, DI 4551 IMULQ R8, DI 4552 SHRQ $0x36, DI 4553 SHLQ $0x20, BX 4554 IMULQ R8, BX 4555 SHRQ $0x36, BX 4556 LEAL -2(CX), R8 4557 LEAQ 24(SP)(BX*4), R9 4558 MOVL (R9), BX 4559 MOVL R8, 24(SP)(DI*4) 4560 MOVL CX, (R9) 4561 CMPL (DX)(BX*1), SI 4562 JEQ match_nolit_loop_encodeBlockAsm10B 4563 INCL CX 4564 JMP search_loop_encodeBlockAsm10B 4565 4566 emit_remainder_encodeBlockAsm10B: 4567 MOVQ src_len+32(FP), CX 4568 
SUBL 12(SP), CX 4569 LEAQ 3(AX)(CX*1), CX 4570 CMPQ CX, (SP) 4571 JB emit_remainder_ok_encodeBlockAsm10B 4572 MOVQ $0x00000000, ret+48(FP) 4573 RET 4574 4575 emit_remainder_ok_encodeBlockAsm10B: 4576 MOVQ src_len+32(FP), CX 4577 MOVL 12(SP), BX 4578 CMPL BX, CX 4579 JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B 4580 MOVL CX, SI 4581 MOVL CX, 12(SP) 4582 LEAQ (DX)(BX*1), CX 4583 SUBL BX, SI 4584 LEAL -1(SI), DX 4585 CMPL DX, $0x3c 4586 JB one_byte_emit_remainder_encodeBlockAsm10B 4587 CMPL DX, $0x00000100 4588 JB two_bytes_emit_remainder_encodeBlockAsm10B 4589 JB three_bytes_emit_remainder_encodeBlockAsm10B 4590 4591 three_bytes_emit_remainder_encodeBlockAsm10B: 4592 MOVB $0xf4, (AX) 4593 MOVW DX, 1(AX) 4594 ADDQ $0x03, AX 4595 JMP memmove_long_emit_remainder_encodeBlockAsm10B 4596 4597 two_bytes_emit_remainder_encodeBlockAsm10B: 4598 MOVB $0xf0, (AX) 4599 MOVB DL, 1(AX) 4600 ADDQ $0x02, AX 4601 CMPL DX, $0x40 4602 JB memmove_emit_remainder_encodeBlockAsm10B 4603 JMP memmove_long_emit_remainder_encodeBlockAsm10B 4604 4605 one_byte_emit_remainder_encodeBlockAsm10B: 4606 SHLB $0x02, DL 4607 MOVB DL, (AX) 4608 ADDQ $0x01, AX 4609 4610 memmove_emit_remainder_encodeBlockAsm10B: 4611 LEAQ (AX)(SI*1), DX 4612 MOVL SI, BX 4613 4614
// Final literals, short case: CX = source, BX = length (<= 64), DX = AX + BX (DX no
// longer holds src_base at this point). Unlike the two copies in the main loop,
// this variant has 1-2, 3, 4-7, 8-16, 17-32 and 33-64 byte cases and moves exactly
// BX bytes.
// genMemMoveShort 4615 CMPQ BX, $0x03 4616 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 4617 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 4618 CMPQ BX, $0x08 4619 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 4620 CMPQ BX, $0x10 4621 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 4622 CMPQ BX, $0x20 4623 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 4624 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 4625 4626 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: 4627 MOVB (CX), SI 4628 MOVB -1(CX)(BX*1), CL 4629 MOVB SI, (AX) 4630 MOVB CL, 
-1(AX)(BX*1) 4631 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B 4632 4633 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: 4634 MOVW (CX), SI 4635 MOVB 2(CX), CL 4636 MOVW SI, (AX) 4637 MOVB CL, 2(AX) 4638 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B 4639 4640 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: 4641 MOVL (CX), SI 4642 MOVL -4(CX)(BX*1), CX 4643 MOVL SI, (AX) 4644 MOVL CX, -4(AX)(BX*1) 4645 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B 4646 4647 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: 4648 MOVQ (CX), SI 4649 MOVQ -8(CX)(BX*1), CX 4650 MOVQ SI, (AX) 4651 MOVQ CX, -8(AX)(BX*1) 4652 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B 4653 4654 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: 4655 MOVOU (CX), X0 4656 MOVOU -16(CX)(BX*1), X1 4657 MOVOU X0, (AX) 4658 MOVOU X1, -16(AX)(BX*1) 4659 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B 4660 4661 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: 4662 MOVOU (CX), X0 4663 MOVOU 16(CX), X1 4664 MOVOU -32(CX)(BX*1), X2 4665 MOVOU -16(CX)(BX*1), X3 4666 MOVOU X0, (AX) 4667 MOVOU X1, 16(AX) 4668 MOVOU X2, -32(AX)(BX*1) 4669 MOVOU X3, -16(AX)(BX*1) 4670 4671 memmove_end_copy_emit_remainder_encodeBlockAsm10B: 4672 MOVQ DX, AX 4673 JMP emit_literal_done_emit_remainder_encodeBlockAsm10B 4674 4675 memmove_long_emit_remainder_encodeBlockAsm10B: 4676 LEAQ (AX)(SI*1), DX 4677 MOVL SI, BX 4678 4679
// Final literals, long case (BX >= 65): same 32-byte-per-iteration copy as in the
// main loop, with CX = source, R8 = running offset and DX = AX + BX.
// genMemMoveLong 4680 MOVOU (CX), X0 4681 MOVOU 16(CX), X1 4682 MOVOU -32(CX)(BX*1), X2 4683 MOVOU -16(CX)(BX*1), X3 4684 MOVQ BX, DI 4685 SHRQ $0x05, DI 4686 MOVQ AX, SI 4687 ANDL $0x0000001f, SI 4688 MOVQ $0x00000040, R8 4689 SUBQ SI, R8 4690 DECQ DI 4691 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 4692 LEAQ -32(CX)(R8*1), SI 4693 LEAQ -32(AX)(R8*1), R9 4694 4695 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: 4696 MOVOU (SI), X4 4697 MOVOU 16(SI), X5 4698 MOVOA X4, (R9) 4699 MOVOA X5, 16(R9) 4700 ADDQ $0x20, R9 4701 ADDQ $0x20, SI 4702 ADDQ $0x20, R8 4703 DECQ DI 4704 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back 4705 4706 emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: 4707 MOVOU -32(CX)(R8*1), X4 4708 MOVOU -16(CX)(R8*1), X5 4709 MOVOA X4, -32(AX)(R8*1) 4710 MOVOA X5, -16(AX)(R8*1) 4711 ADDQ $0x20, R8 4712 CMPQ BX, R8 4713 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 4714 MOVOU X0, (AX) 4715 MOVOU X1, 16(AX) 4716 MOVOU X2, -32(AX)(BX*1) 4717 MOVOU X3, -16(AX)(BX*1) 4718 MOVQ DX, AX 4719 4720 emit_literal_done_emit_remainder_encodeBlockAsm10B: 4721 MOVQ dst_base+0(FP), CX 4722 SUBQ CX, AX 4723 MOVQ AX, ret+48(FP) 4724 RET 4725 4726 // func encodeBlockAsm8B(dst []byte, src []byte) int 4727 // Requires: BMI, SSE2 4728 TEXT ·encodeBlockAsm8B(SB), $1048-56 4729 MOVQ dst_base+0(FP), AX 4730 MOVQ $0x00000008, CX 4731 LEAQ 24(SP), DX 4732 PXOR X0, X0 4733 4734 zero_loop_encodeBlockAsm8B: 4735 MOVOU X0, (DX) 4736 MOVOU X0, 16(DX) 4737 MOVOU X0, 32(DX) 4738 MOVOU X0, 48(DX) 4739 MOVOU X0, 64(DX) 4740 MOVOU X0, 80(DX) 4741 MOVOU X0, 96(DX) 4742 MOVOU X0, 112(DX) 4743 ADDQ $0x80, DX 4744 DECQ CX 4745 JNZ zero_loop_encodeBlockAsm8B 4746 MOVL $0x00000000, 12(SP) 4747 MOVQ src_len+32(FP), CX 4748 LEAQ -9(CX), DX 4749 LEAQ -8(CX), BX 4750 MOVL BX, 8(SP) 4751 SHRQ $0x05, CX 4752 SUBL CX, DX 4753 LEAQ (AX)(DX*1), DX 4754 MOVQ DX, (SP) 4755 MOVL $0x00000001, CX 4756 MOVL CX, 16(SP) 4757 MOVQ src_base+24(FP), DX 4758 4759 search_loop_encodeBlockAsm8B: 4760 MOVL CX, BX 4761 SUBL 12(SP), BX 4762 SHRL $0x04, BX 4763 LEAL 4(CX)(BX*1), BX 4764 CMPL BX, 8(SP) 4765 JAE emit_remainder_encodeBlockAsm8B 4766 MOVQ (DX)(CX*1), SI 4767 MOVL BX, 20(SP) 4768 MOVQ $0x9e3779b1, R8 4769 MOVQ SI, R9 4770 MOVQ SI, R10 4771 SHRQ 
$0x08, R10 4772 SHLQ $0x20, R9 4773 IMULQ R8, R9 4774 SHRQ $0x38, R9 4775 SHLQ $0x20, R10 4776 IMULQ R8, R10 4777 SHRQ $0x38, R10 4778 MOVL 24(SP)(R9*4), BX 4779 MOVL 24(SP)(R10*4), DI 4780 MOVL CX, 24(SP)(R9*4) 4781 LEAL 1(CX), R9 4782 MOVL R9, 24(SP)(R10*4) 4783 MOVQ SI, R9 4784 SHRQ $0x10, R9 4785 SHLQ $0x20, R9 4786 IMULQ R8, R9 4787 SHRQ $0x38, R9 4788 MOVL CX, R8 4789 SUBL 16(SP), R8 4790 MOVL 1(DX)(R8*1), R10 4791 MOVQ SI, R8 4792 SHRQ $0x08, R8 4793 CMPL R8, R10 4794 JNE no_repeat_found_encodeBlockAsm8B 4795 LEAL 1(CX), SI 4796 MOVL 12(SP), DI 4797 MOVL SI, BX 4798 SUBL 16(SP), BX 4799 JZ repeat_extend_back_end_encodeBlockAsm8B 4800 4801 repeat_extend_back_loop_encodeBlockAsm8B: 4802 CMPL SI, DI 4803 JBE repeat_extend_back_end_encodeBlockAsm8B 4804 MOVB -1(DX)(BX*1), R8 4805 MOVB -1(DX)(SI*1), R9 4806 CMPB R8, R9 4807 JNE repeat_extend_back_end_encodeBlockAsm8B 4808 LEAL -1(SI), SI 4809 DECL BX 4810 JNZ repeat_extend_back_loop_encodeBlockAsm8B 4811 4812 repeat_extend_back_end_encodeBlockAsm8B: 4813 MOVL 12(SP), BX 4814 CMPL BX, SI 4815 JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B 4816 MOVL SI, R8 4817 MOVL SI, 12(SP) 4818 LEAQ (DX)(BX*1), R9 4819 SUBL BX, R8 4820 LEAL -1(R8), BX 4821 CMPL BX, $0x3c 4822 JB one_byte_repeat_emit_encodeBlockAsm8B 4823 CMPL BX, $0x00000100 4824 JB two_bytes_repeat_emit_encodeBlockAsm8B 4825 JB three_bytes_repeat_emit_encodeBlockAsm8B 4826 4827 three_bytes_repeat_emit_encodeBlockAsm8B: 4828 MOVB $0xf4, (AX) 4829 MOVW BX, 1(AX) 4830 ADDQ $0x03, AX 4831 JMP memmove_long_repeat_emit_encodeBlockAsm8B 4832 4833 two_bytes_repeat_emit_encodeBlockAsm8B: 4834 MOVB $0xf0, (AX) 4835 MOVB BL, 1(AX) 4836 ADDQ $0x02, AX 4837 CMPL BX, $0x40 4838 JB memmove_repeat_emit_encodeBlockAsm8B 4839 JMP memmove_long_repeat_emit_encodeBlockAsm8B 4840 4841 one_byte_repeat_emit_encodeBlockAsm8B: 4842 SHLB $0x02, BL 4843 MOVB BL, (AX) 4844 ADDQ $0x01, AX 4845 4846 memmove_repeat_emit_encodeBlockAsm8B: 4847 LEAQ (AX)(R8*1), BX 4848 4849 // 
genMemMoveShort 4850 CMPQ R8, $0x08 4851 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 4852 CMPQ R8, $0x10 4853 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 4854 CMPQ R8, $0x20 4855 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 4856 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 4857 4858 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: 4859 MOVQ (R9), R10 4860 MOVQ R10, (AX) 4861 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B 4862 4863 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: 4864 MOVQ (R9), R10 4865 MOVQ -8(R9)(R8*1), R9 4866 MOVQ R10, (AX) 4867 MOVQ R9, -8(AX)(R8*1) 4868 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B 4869 4870 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: 4871 MOVOU (R9), X0 4872 MOVOU -16(R9)(R8*1), X1 4873 MOVOU X0, (AX) 4874 MOVOU X1, -16(AX)(R8*1) 4875 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B 4876 4877 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: 4878 MOVOU (R9), X0 4879 MOVOU 16(R9), X1 4880 MOVOU -32(R9)(R8*1), X2 4881 MOVOU -16(R9)(R8*1), X3 4882 MOVOU X0, (AX) 4883 MOVOU X1, 16(AX) 4884 MOVOU X2, -32(AX)(R8*1) 4885 MOVOU X3, -16(AX)(R8*1) 4886 4887 memmove_end_copy_repeat_emit_encodeBlockAsm8B: 4888 MOVQ BX, AX 4889 JMP emit_literal_done_repeat_emit_encodeBlockAsm8B 4890 4891 memmove_long_repeat_emit_encodeBlockAsm8B: 4892 LEAQ (AX)(R8*1), BX 4893 4894 // genMemMoveLong 4895 MOVOU (R9), X0 4896 MOVOU 16(R9), X1 4897 MOVOU -32(R9)(R8*1), X2 4898 MOVOU -16(R9)(R8*1), X3 4899 MOVQ R8, R11 4900 SHRQ $0x05, R11 4901 MOVQ AX, R10 4902 ANDL $0x0000001f, R10 4903 MOVQ $0x00000040, R12 4904 SUBQ R10, R12 4905 DECQ R11 4906 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 4907 LEAQ -32(R9)(R12*1), R10 4908 LEAQ -32(AX)(R12*1), R13 4909 4910 
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: 4911 MOVOU (R10), X4 4912 MOVOU 16(R10), X5 4913 MOVOA X4, (R13) 4914 MOVOA X5, 16(R13) 4915 ADDQ $0x20, R13 4916 ADDQ $0x20, R10 4917 ADDQ $0x20, R12 4918 DECQ R11 4919 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back 4920 4921 emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: 4922 MOVOU -32(R9)(R12*1), X4 4923 MOVOU -16(R9)(R12*1), X5 4924 MOVOA X4, -32(AX)(R12*1) 4925 MOVOA X5, -16(AX)(R12*1) 4926 ADDQ $0x20, R12 4927 CMPQ R8, R12 4928 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 4929 MOVOU X0, (AX) 4930 MOVOU X1, 16(AX) 4931 MOVOU X2, -32(AX)(R8*1) 4932 MOVOU X3, -16(AX)(R8*1) 4933 MOVQ BX, AX 4934 4935 emit_literal_done_repeat_emit_encodeBlockAsm8B: 4936 ADDL $0x05, CX 4937 MOVL CX, BX 4938 SUBL 16(SP), BX 4939 MOVQ src_len+32(FP), R8 4940 SUBL CX, R8 4941 LEAQ (DX)(CX*1), R9 4942 LEAQ (DX)(BX*1), BX 4943 4944 // matchLen 4945 XORL R11, R11 4946 CMPL R8, $0x08 4947 JB matchlen_match4_repeat_extend_encodeBlockAsm8B 4948 4949 matchlen_loopback_repeat_extend_encodeBlockAsm8B: 4950 MOVQ (R9)(R11*1), R10 4951 XORQ (BX)(R11*1), R10 4952 TESTQ R10, R10 4953 JZ matchlen_loop_repeat_extend_encodeBlockAsm8B 4954 4955 #ifdef GOAMD64_v3 4956 TZCNTQ R10, R10 4957 4958 #else 4959 BSFQ R10, R10 4960 4961 #endif 4962 SARQ $0x03, R10 4963 LEAL (R11)(R10*1), R11 4964 JMP repeat_extend_forward_end_encodeBlockAsm8B 4965 4966 matchlen_loop_repeat_extend_encodeBlockAsm8B: 4967 LEAL -8(R8), R8 4968 LEAL 8(R11), R11 4969 CMPL R8, $0x08 4970 JAE matchlen_loopback_repeat_extend_encodeBlockAsm8B 4971 JZ repeat_extend_forward_end_encodeBlockAsm8B 4972 4973 matchlen_match4_repeat_extend_encodeBlockAsm8B: 4974 CMPL R8, $0x04 4975 JB matchlen_match2_repeat_extend_encodeBlockAsm8B 4976 MOVL (R9)(R11*1), R10 4977 CMPL (BX)(R11*1), R10 4978 JNE matchlen_match2_repeat_extend_encodeBlockAsm8B 4979 SUBL $0x04, R8 4980 LEAL 4(R11), R11 4981 
4982 matchlen_match2_repeat_extend_encodeBlockAsm8B: 4983 CMPL R8, $0x02 4984 JB matchlen_match1_repeat_extend_encodeBlockAsm8B 4985 MOVW (R9)(R11*1), R10 4986 CMPW (BX)(R11*1), R10 4987 JNE matchlen_match1_repeat_extend_encodeBlockAsm8B 4988 SUBL $0x02, R8 4989 LEAL 2(R11), R11 4990 4991 matchlen_match1_repeat_extend_encodeBlockAsm8B: 4992 CMPL R8, $0x01 4993 JB repeat_extend_forward_end_encodeBlockAsm8B 4994 MOVB (R9)(R11*1), R10 4995 CMPB (BX)(R11*1), R10 4996 JNE repeat_extend_forward_end_encodeBlockAsm8B 4997 LEAL 1(R11), R11 4998 4999 repeat_extend_forward_end_encodeBlockAsm8B: 5000 ADDL R11, CX 5001 MOVL CX, BX 5002 SUBL SI, BX 5003 MOVL 16(SP), SI 5004 TESTL DI, DI 5005 JZ repeat_as_copy_encodeBlockAsm8B 5006 5007 // emitRepeat 5008 MOVL BX, SI 5009 LEAL -4(BX), BX 5010 CMPL SI, $0x08 5011 JBE repeat_two_match_repeat_encodeBlockAsm8B 5012 CMPL SI, $0x0c 5013 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B 5014 5015 cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: 5016 CMPL BX, $0x00000104 5017 JB repeat_three_match_repeat_encodeBlockAsm8B 5018 LEAL -256(BX), BX 5019 MOVW $0x0019, (AX) 5020 MOVW BX, 2(AX) 5021 ADDQ $0x04, AX 5022 JMP repeat_end_emit_encodeBlockAsm8B 5023 5024 repeat_three_match_repeat_encodeBlockAsm8B: 5025 LEAL -4(BX), BX 5026 MOVW $0x0015, (AX) 5027 MOVB BL, 2(AX) 5028 ADDQ $0x03, AX 5029 JMP repeat_end_emit_encodeBlockAsm8B 5030 5031 repeat_two_match_repeat_encodeBlockAsm8B: 5032 SHLL $0x02, BX 5033 ORL $0x01, BX 5034 MOVW BX, (AX) 5035 ADDQ $0x02, AX 5036 JMP repeat_end_emit_encodeBlockAsm8B 5037 XORQ DI, DI 5038 LEAL 1(DI)(BX*4), BX 5039 MOVB SI, 1(AX) 5040 SARL $0x08, SI 5041 SHLL $0x05, SI 5042 ORL SI, BX 5043 MOVB BL, (AX) 5044 ADDQ $0x02, AX 5045 JMP repeat_end_emit_encodeBlockAsm8B 5046 5047 repeat_as_copy_encodeBlockAsm8B: 5048 // emitCopy 5049 CMPL BX, $0x40 5050 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B 5051 CMPL SI, $0x00000800 5052 JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B 5053 MOVL 
$0x00000001, DI 5054 LEAL 16(DI), DI 5055 MOVB SI, 1(AX) 5056 SHRL $0x08, SI 5057 SHLL $0x05, SI 5058 ORL SI, DI 5059 MOVB DI, (AX) 5060 ADDQ $0x02, AX 5061 SUBL $0x08, BX 5062 5063 // emitRepeat 5064 LEAL -4(BX), BX 5065 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b 5066 MOVL BX, SI 5067 LEAL -4(BX), BX 5068 CMPL SI, $0x08 5069 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b 5070 CMPL SI, $0x0c 5071 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b 5072 5073 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: 5074 CMPL BX, $0x00000104 5075 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b 5076 LEAL -256(BX), BX 5077 MOVW $0x0019, (AX) 5078 MOVW BX, 2(AX) 5079 ADDQ $0x04, AX 5080 JMP repeat_end_emit_encodeBlockAsm8B 5081 5082 repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: 5083 LEAL -4(BX), BX 5084 MOVW $0x0015, (AX) 5085 MOVB BL, 2(AX) 5086 ADDQ $0x03, AX 5087 JMP repeat_end_emit_encodeBlockAsm8B 5088 5089 repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: 5090 SHLL $0x02, BX 5091 ORL $0x01, BX 5092 MOVW BX, (AX) 5093 ADDQ $0x02, AX 5094 JMP repeat_end_emit_encodeBlockAsm8B 5095 XORQ DI, DI 5096 LEAL 1(DI)(BX*4), BX 5097 MOVB SI, 1(AX) 5098 SARL $0x08, SI 5099 SHLL $0x05, SI 5100 ORL SI, BX 5101 MOVB BL, (AX) 5102 ADDQ $0x02, AX 5103 JMP repeat_end_emit_encodeBlockAsm8B 5104 5105 long_offset_short_repeat_as_copy_encodeBlockAsm8B: 5106 MOVB $0xee, (AX) 5107 MOVW SI, 1(AX) 5108 LEAL -60(BX), BX 5109 ADDQ $0x03, AX 5110 5111 // emitRepeat 5112 MOVL BX, SI 5113 LEAL -4(BX), BX 5114 CMPL SI, $0x08 5115 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short 5116 CMPL SI, $0x0c 5117 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short 5118 5119 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: 5120 CMPL BX, $0x00000104 5121 JB 
repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short 5122 LEAL -256(BX), BX 5123 MOVW $0x0019, (AX) 5124 MOVW BX, 2(AX) 5125 ADDQ $0x04, AX 5126 JMP repeat_end_emit_encodeBlockAsm8B 5127 5128 repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: 5129 LEAL -4(BX), BX 5130 MOVW $0x0015, (AX) 5131 MOVB BL, 2(AX) 5132 ADDQ $0x03, AX 5133 JMP repeat_end_emit_encodeBlockAsm8B 5134 5135 repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: 5136 SHLL $0x02, BX 5137 ORL $0x01, BX 5138 MOVW BX, (AX) 5139 ADDQ $0x02, AX 5140 JMP repeat_end_emit_encodeBlockAsm8B 5141 XORQ DI, DI 5142 LEAL 1(DI)(BX*4), BX 5143 MOVB SI, 1(AX) 5144 SARL $0x08, SI 5145 SHLL $0x05, SI 5146 ORL SI, BX 5147 MOVB BL, (AX) 5148 ADDQ $0x02, AX 5149 JMP repeat_end_emit_encodeBlockAsm8B 5150 5151 two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: 5152 MOVL BX, DI 5153 SHLL $0x02, DI 5154 CMPL BX, $0x0c 5155 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B 5156 LEAL -15(DI), DI 5157 MOVB SI, 1(AX) 5158 SHRL $0x08, SI 5159 SHLL $0x05, SI 5160 ORL SI, DI 5161 MOVB DI, (AX) 5162 ADDQ $0x02, AX 5163 JMP repeat_end_emit_encodeBlockAsm8B 5164 5165 emit_copy_three_repeat_as_copy_encodeBlockAsm8B: 5166 LEAL -2(DI), DI 5167 MOVB DI, (AX) 5168 MOVW SI, 1(AX) 5169 ADDQ $0x03, AX 5170 5171 repeat_end_emit_encodeBlockAsm8B: 5172 MOVL CX, 12(SP) 5173 JMP search_loop_encodeBlockAsm8B 5174 5175 no_repeat_found_encodeBlockAsm8B: 5176 CMPL (DX)(BX*1), SI 5177 JEQ candidate_match_encodeBlockAsm8B 5178 SHRQ $0x08, SI 5179 MOVL 24(SP)(R9*4), BX 5180 LEAL 2(CX), R8 5181 CMPL (DX)(DI*1), SI 5182 JEQ candidate2_match_encodeBlockAsm8B 5183 MOVL R8, 24(SP)(R9*4) 5184 SHRQ $0x08, SI 5185 CMPL (DX)(BX*1), SI 5186 JEQ candidate3_match_encodeBlockAsm8B 5187 MOVL 20(SP), CX 5188 JMP search_loop_encodeBlockAsm8B 5189 5190 candidate3_match_encodeBlockAsm8B: 5191 ADDL $0x02, CX 5192 JMP candidate_match_encodeBlockAsm8B 5193 5194 candidate2_match_encodeBlockAsm8B: 5195 MOVL R8, 24(SP)(R9*4) 5196 INCL CX 5197 
MOVL DI, BX 5198 5199 candidate_match_encodeBlockAsm8B: 5200 MOVL 12(SP), SI 5201 TESTL BX, BX 5202 JZ match_extend_back_end_encodeBlockAsm8B 5203 5204 match_extend_back_loop_encodeBlockAsm8B: 5205 CMPL CX, SI 5206 JBE match_extend_back_end_encodeBlockAsm8B 5207 MOVB -1(DX)(BX*1), DI 5208 MOVB -1(DX)(CX*1), R8 5209 CMPB DI, R8 5210 JNE match_extend_back_end_encodeBlockAsm8B 5211 LEAL -1(CX), CX 5212 DECL BX 5213 JZ match_extend_back_end_encodeBlockAsm8B 5214 JMP match_extend_back_loop_encodeBlockAsm8B 5215 5216 match_extend_back_end_encodeBlockAsm8B: 5217 MOVL CX, SI 5218 SUBL 12(SP), SI 5219 LEAQ 3(AX)(SI*1), SI 5220 CMPQ SI, (SP) 5221 JB match_dst_size_check_encodeBlockAsm8B 5222 MOVQ $0x00000000, ret+48(FP) 5223 RET 5224 5225 match_dst_size_check_encodeBlockAsm8B: 5226 MOVL CX, SI 5227 MOVL 12(SP), DI 5228 CMPL DI, SI 5229 JEQ emit_literal_done_match_emit_encodeBlockAsm8B 5230 MOVL SI, R8 5231 MOVL SI, 12(SP) 5232 LEAQ (DX)(DI*1), SI 5233 SUBL DI, R8 5234 LEAL -1(R8), DI 5235 CMPL DI, $0x3c 5236 JB one_byte_match_emit_encodeBlockAsm8B 5237 CMPL DI, $0x00000100 5238 JB two_bytes_match_emit_encodeBlockAsm8B 5239 JB three_bytes_match_emit_encodeBlockAsm8B 5240 5241 three_bytes_match_emit_encodeBlockAsm8B: 5242 MOVB $0xf4, (AX) 5243 MOVW DI, 1(AX) 5244 ADDQ $0x03, AX 5245 JMP memmove_long_match_emit_encodeBlockAsm8B 5246 5247 two_bytes_match_emit_encodeBlockAsm8B: 5248 MOVB $0xf0, (AX) 5249 MOVB DI, 1(AX) 5250 ADDQ $0x02, AX 5251 CMPL DI, $0x40 5252 JB memmove_match_emit_encodeBlockAsm8B 5253 JMP memmove_long_match_emit_encodeBlockAsm8B 5254 5255 one_byte_match_emit_encodeBlockAsm8B: 5256 SHLB $0x02, DI 5257 MOVB DI, (AX) 5258 ADDQ $0x01, AX 5259 5260 memmove_match_emit_encodeBlockAsm8B: 5261 LEAQ (AX)(R8*1), DI 5262 5263 // genMemMoveShort 5264 CMPQ R8, $0x08 5265 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 5266 CMPQ R8, $0x10 5267 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 5268 CMPQ R8, $0x20 5269 JBE 
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 5270 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 5271 5272 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: 5273 MOVQ (SI), R9 5274 MOVQ R9, (AX) 5275 JMP memmove_end_copy_match_emit_encodeBlockAsm8B 5276 5277 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: 5278 MOVQ (SI), R9 5279 MOVQ -8(SI)(R8*1), SI 5280 MOVQ R9, (AX) 5281 MOVQ SI, -8(AX)(R8*1) 5282 JMP memmove_end_copy_match_emit_encodeBlockAsm8B 5283 5284 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: 5285 MOVOU (SI), X0 5286 MOVOU -16(SI)(R8*1), X1 5287 MOVOU X0, (AX) 5288 MOVOU X1, -16(AX)(R8*1) 5289 JMP memmove_end_copy_match_emit_encodeBlockAsm8B 5290 5291 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: 5292 MOVOU (SI), X0 5293 MOVOU 16(SI), X1 5294 MOVOU -32(SI)(R8*1), X2 5295 MOVOU -16(SI)(R8*1), X3 5296 MOVOU X0, (AX) 5297 MOVOU X1, 16(AX) 5298 MOVOU X2, -32(AX)(R8*1) 5299 MOVOU X3, -16(AX)(R8*1) 5300 5301 memmove_end_copy_match_emit_encodeBlockAsm8B: 5302 MOVQ DI, AX 5303 JMP emit_literal_done_match_emit_encodeBlockAsm8B 5304 5305 memmove_long_match_emit_encodeBlockAsm8B: 5306 LEAQ (AX)(R8*1), DI 5307 5308 // genMemMoveLong 5309 MOVOU (SI), X0 5310 MOVOU 16(SI), X1 5311 MOVOU -32(SI)(R8*1), X2 5312 MOVOU -16(SI)(R8*1), X3 5313 MOVQ R8, R10 5314 SHRQ $0x05, R10 5315 MOVQ AX, R9 5316 ANDL $0x0000001f, R9 5317 MOVQ $0x00000040, R11 5318 SUBQ R9, R11 5319 DECQ R10 5320 JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 5321 LEAQ -32(SI)(R11*1), R9 5322 LEAQ -32(AX)(R11*1), R12 5323 5324 emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: 5325 MOVOU (R9), X4 5326 MOVOU 16(R9), X5 5327 MOVOA X4, (R12) 5328 MOVOA X5, 16(R12) 5329 ADDQ $0x20, R12 5330 ADDQ $0x20, R9 5331 ADDQ $0x20, R11 5332 DECQ R10 5333 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back 5334 5335 
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: 5336 MOVOU -32(SI)(R11*1), X4 5337 MOVOU -16(SI)(R11*1), X5 5338 MOVOA X4, -32(AX)(R11*1) 5339 MOVOA X5, -16(AX)(R11*1) 5340 ADDQ $0x20, R11 5341 CMPQ R8, R11 5342 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 5343 MOVOU X0, (AX) 5344 MOVOU X1, 16(AX) 5345 MOVOU X2, -32(AX)(R8*1) 5346 MOVOU X3, -16(AX)(R8*1) 5347 MOVQ DI, AX 5348 5349 emit_literal_done_match_emit_encodeBlockAsm8B: 5350 match_nolit_loop_encodeBlockAsm8B: 5351 MOVL CX, SI 5352 SUBL BX, SI 5353 MOVL SI, 16(SP) 5354 ADDL $0x04, CX 5355 ADDL $0x04, BX 5356 MOVQ src_len+32(FP), SI 5357 SUBL CX, SI 5358 LEAQ (DX)(CX*1), DI 5359 LEAQ (DX)(BX*1), BX 5360 5361 // matchLen 5362 XORL R9, R9 5363 CMPL SI, $0x08 5364 JB matchlen_match4_match_nolit_encodeBlockAsm8B 5365 5366 matchlen_loopback_match_nolit_encodeBlockAsm8B: 5367 MOVQ (DI)(R9*1), R8 5368 XORQ (BX)(R9*1), R8 5369 TESTQ R8, R8 5370 JZ matchlen_loop_match_nolit_encodeBlockAsm8B 5371 5372 #ifdef GOAMD64_v3 5373 TZCNTQ R8, R8 5374 5375 #else 5376 BSFQ R8, R8 5377 5378 #endif 5379 SARQ $0x03, R8 5380 LEAL (R9)(R8*1), R9 5381 JMP match_nolit_end_encodeBlockAsm8B 5382 5383 matchlen_loop_match_nolit_encodeBlockAsm8B: 5384 LEAL -8(SI), SI 5385 LEAL 8(R9), R9 5386 CMPL SI, $0x08 5387 JAE matchlen_loopback_match_nolit_encodeBlockAsm8B 5388 JZ match_nolit_end_encodeBlockAsm8B 5389 5390 matchlen_match4_match_nolit_encodeBlockAsm8B: 5391 CMPL SI, $0x04 5392 JB matchlen_match2_match_nolit_encodeBlockAsm8B 5393 MOVL (DI)(R9*1), R8 5394 CMPL (BX)(R9*1), R8 5395 JNE matchlen_match2_match_nolit_encodeBlockAsm8B 5396 SUBL $0x04, SI 5397 LEAL 4(R9), R9 5398 5399 matchlen_match2_match_nolit_encodeBlockAsm8B: 5400 CMPL SI, $0x02 5401 JB matchlen_match1_match_nolit_encodeBlockAsm8B 5402 MOVW (DI)(R9*1), R8 5403 CMPW (BX)(R9*1), R8 5404 JNE matchlen_match1_match_nolit_encodeBlockAsm8B 5405 SUBL $0x02, SI 5406 LEAL 2(R9), R9 5407 5408 
matchlen_match1_match_nolit_encodeBlockAsm8B: 5409 CMPL SI, $0x01 5410 JB match_nolit_end_encodeBlockAsm8B 5411 MOVB (DI)(R9*1), R8 5412 CMPB (BX)(R9*1), R8 5413 JNE match_nolit_end_encodeBlockAsm8B 5414 LEAL 1(R9), R9 5415 5416 match_nolit_end_encodeBlockAsm8B: 5417 ADDL R9, CX 5418 MOVL 16(SP), BX 5419 ADDL $0x04, R9 5420 MOVL CX, 12(SP) 5421 5422 // emitCopy 5423 CMPL R9, $0x40 5424 JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B 5425 CMPL BX, $0x00000800 5426 JAE long_offset_short_match_nolit_encodeBlockAsm8B 5427 MOVL $0x00000001, SI 5428 LEAL 16(SI), SI 5429 MOVB BL, 1(AX) 5430 SHRL $0x08, BX 5431 SHLL $0x05, BX 5432 ORL BX, SI 5433 MOVB SI, (AX) 5434 ADDQ $0x02, AX 5435 SUBL $0x08, R9 5436 5437 // emitRepeat 5438 LEAL -4(R9), R9 5439 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b 5440 MOVL R9, BX 5441 LEAL -4(R9), R9 5442 CMPL BX, $0x08 5443 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b 5444 CMPL BX, $0x0c 5445 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b 5446 5447 cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: 5448 CMPL R9, $0x00000104 5449 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b 5450 LEAL -256(R9), R9 5451 MOVW $0x0019, (AX) 5452 MOVW R9, 2(AX) 5453 ADDQ $0x04, AX 5454 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5455 5456 repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: 5457 LEAL -4(R9), R9 5458 MOVW $0x0015, (AX) 5459 MOVB R9, 2(AX) 5460 ADDQ $0x03, AX 5461 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5462 5463 repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: 5464 SHLL $0x02, R9 5465 ORL $0x01, R9 5466 MOVW R9, (AX) 5467 ADDQ $0x02, AX 5468 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5469 XORQ SI, SI 5470 LEAL 1(SI)(R9*4), R9 5471 MOVB BL, 1(AX) 5472 SARL $0x08, BX 5473 SHLL $0x05, BX 5474 ORL BX, R9 5475 MOVB R9, (AX) 5476 ADDQ $0x02, AX 5477 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5478 5479 
long_offset_short_match_nolit_encodeBlockAsm8B: 5480 MOVB $0xee, (AX) 5481 MOVW BX, 1(AX) 5482 LEAL -60(R9), R9 5483 ADDQ $0x03, AX 5484 5485 // emitRepeat 5486 MOVL R9, BX 5487 LEAL -4(R9), R9 5488 CMPL BX, $0x08 5489 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short 5490 CMPL BX, $0x0c 5491 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short 5492 5493 cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: 5494 CMPL R9, $0x00000104 5495 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short 5496 LEAL -256(R9), R9 5497 MOVW $0x0019, (AX) 5498 MOVW R9, 2(AX) 5499 ADDQ $0x04, AX 5500 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5501 5502 repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: 5503 LEAL -4(R9), R9 5504 MOVW $0x0015, (AX) 5505 MOVB R9, 2(AX) 5506 ADDQ $0x03, AX 5507 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5508 5509 repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: 5510 SHLL $0x02, R9 5511 ORL $0x01, R9 5512 MOVW R9, (AX) 5513 ADDQ $0x02, AX 5514 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5515 XORQ SI, SI 5516 LEAL 1(SI)(R9*4), R9 5517 MOVB BL, 1(AX) 5518 SARL $0x08, BX 5519 SHLL $0x05, BX 5520 ORL BX, R9 5521 MOVB R9, (AX) 5522 ADDQ $0x02, AX 5523 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5524 5525 two_byte_offset_short_match_nolit_encodeBlockAsm8B: 5526 MOVL R9, SI 5527 SHLL $0x02, SI 5528 CMPL R9, $0x0c 5529 JAE emit_copy_three_match_nolit_encodeBlockAsm8B 5530 LEAL -15(SI), SI 5531 MOVB BL, 1(AX) 5532 SHRL $0x08, BX 5533 SHLL $0x05, BX 5534 ORL BX, SI 5535 MOVB SI, (AX) 5536 ADDQ $0x02, AX 5537 JMP match_nolit_emitcopy_end_encodeBlockAsm8B 5538 5539 emit_copy_three_match_nolit_encodeBlockAsm8B: 5540 LEAL -2(SI), SI 5541 MOVB SI, (AX) 5542 MOVW BX, 1(AX) 5543 ADDQ $0x03, AX 5544 5545 match_nolit_emitcopy_end_encodeBlockAsm8B: 5546 CMPL CX, 8(SP) 5547 JAE emit_remainder_encodeBlockAsm8B 5548 MOVQ -2(DX)(CX*1), SI 5549 CMPQ AX, (SP) 5550 JB match_nolit_dst_ok_encodeBlockAsm8B 
5551 MOVQ $0x00000000, ret+48(FP) 5552 RET 5553 5554 match_nolit_dst_ok_encodeBlockAsm8B: 5555 MOVQ $0x9e3779b1, R8 5556 MOVQ SI, DI 5557 SHRQ $0x10, SI 5558 MOVQ SI, BX 5559 SHLQ $0x20, DI 5560 IMULQ R8, DI 5561 SHRQ $0x38, DI 5562 SHLQ $0x20, BX 5563 IMULQ R8, BX 5564 SHRQ $0x38, BX 5565 LEAL -2(CX), R8 5566 LEAQ 24(SP)(BX*4), R9 5567 MOVL (R9), BX 5568 MOVL R8, 24(SP)(DI*4) 5569 MOVL CX, (R9) 5570 CMPL (DX)(BX*1), SI 5571 JEQ match_nolit_loop_encodeBlockAsm8B 5572 INCL CX 5573 JMP search_loop_encodeBlockAsm8B 5574 5575 emit_remainder_encodeBlockAsm8B: 5576 MOVQ src_len+32(FP), CX 5577 SUBL 12(SP), CX 5578 LEAQ 3(AX)(CX*1), CX 5579 CMPQ CX, (SP) 5580 JB emit_remainder_ok_encodeBlockAsm8B 5581 MOVQ $0x00000000, ret+48(FP) 5582 RET 5583 5584 emit_remainder_ok_encodeBlockAsm8B: 5585 MOVQ src_len+32(FP), CX 5586 MOVL 12(SP), BX 5587 CMPL BX, CX 5588 JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B 5589 MOVL CX, SI 5590 MOVL CX, 12(SP) 5591 LEAQ (DX)(BX*1), CX 5592 SUBL BX, SI 5593 LEAL -1(SI), DX 5594 CMPL DX, $0x3c 5595 JB one_byte_emit_remainder_encodeBlockAsm8B 5596 CMPL DX, $0x00000100 5597 JB two_bytes_emit_remainder_encodeBlockAsm8B 5598 JB three_bytes_emit_remainder_encodeBlockAsm8B 5599 5600 three_bytes_emit_remainder_encodeBlockAsm8B: 5601 MOVB $0xf4, (AX) 5602 MOVW DX, 1(AX) 5603 ADDQ $0x03, AX 5604 JMP memmove_long_emit_remainder_encodeBlockAsm8B 5605 5606 two_bytes_emit_remainder_encodeBlockAsm8B: 5607 MOVB $0xf0, (AX) 5608 MOVB DL, 1(AX) 5609 ADDQ $0x02, AX 5610 CMPL DX, $0x40 5611 JB memmove_emit_remainder_encodeBlockAsm8B 5612 JMP memmove_long_emit_remainder_encodeBlockAsm8B 5613 5614 one_byte_emit_remainder_encodeBlockAsm8B: 5615 SHLB $0x02, DL 5616 MOVB DL, (AX) 5617 ADDQ $0x01, AX 5618 5619 memmove_emit_remainder_encodeBlockAsm8B: 5620 LEAQ (AX)(SI*1), DX 5621 MOVL SI, BX 5622 5623 // genMemMoveShort 5624 CMPQ BX, $0x03 5625 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 5626 JE 
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 5627 CMPQ BX, $0x08 5628 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 5629 CMPQ BX, $0x10 5630 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 5631 CMPQ BX, $0x20 5632 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 5633 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 5634 5635 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: 5636 MOVB (CX), SI 5637 MOVB -1(CX)(BX*1), CL 5638 MOVB SI, (AX) 5639 MOVB CL, -1(AX)(BX*1) 5640 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B 5641 5642 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: 5643 MOVW (CX), SI 5644 MOVB 2(CX), CL 5645 MOVW SI, (AX) 5646 MOVB CL, 2(AX) 5647 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B 5648 5649 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: 5650 MOVL (CX), SI 5651 MOVL -4(CX)(BX*1), CX 5652 MOVL SI, (AX) 5653 MOVL CX, -4(AX)(BX*1) 5654 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B 5655 5656 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: 5657 MOVQ (CX), SI 5658 MOVQ -8(CX)(BX*1), CX 5659 MOVQ SI, (AX) 5660 MOVQ CX, -8(AX)(BX*1) 5661 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B 5662 5663 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: 5664 MOVOU (CX), X0 5665 MOVOU -16(CX)(BX*1), X1 5666 MOVOU X0, (AX) 5667 MOVOU X1, -16(AX)(BX*1) 5668 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B 5669 5670 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: 5671 MOVOU (CX), X0 5672 MOVOU 16(CX), X1 5673 MOVOU -32(CX)(BX*1), X2 5674 MOVOU -16(CX)(BX*1), X3 5675 MOVOU X0, (AX) 5676 MOVOU X1, 16(AX) 5677 MOVOU X2, -32(AX)(BX*1) 5678 MOVOU X3, -16(AX)(BX*1) 5679 5680 memmove_end_copy_emit_remainder_encodeBlockAsm8B: 5681 MOVQ DX, AX 5682 JMP 
emit_literal_done_emit_remainder_encodeBlockAsm8B 5683 5684 memmove_long_emit_remainder_encodeBlockAsm8B: 5685 LEAQ (AX)(SI*1), DX 5686 MOVL SI, BX 5687 5688 // genMemMoveLong 5689 MOVOU (CX), X0 5690 MOVOU 16(CX), X1 5691 MOVOU -32(CX)(BX*1), X2 5692 MOVOU -16(CX)(BX*1), X3 5693 MOVQ BX, DI 5694 SHRQ $0x05, DI 5695 MOVQ AX, SI 5696 ANDL $0x0000001f, SI 5697 MOVQ $0x00000040, R8 5698 SUBQ SI, R8 5699 DECQ DI 5700 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 5701 LEAQ -32(CX)(R8*1), SI 5702 LEAQ -32(AX)(R8*1), R9 5703 5704 emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: 5705 MOVOU (SI), X4 5706 MOVOU 16(SI), X5 5707 MOVOA X4, (R9) 5708 MOVOA X5, 16(R9) 5709 ADDQ $0x20, R9 5710 ADDQ $0x20, SI 5711 ADDQ $0x20, R8 5712 DECQ DI 5713 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back 5714 5715 emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: 5716 MOVOU -32(CX)(R8*1), X4 5717 MOVOU -16(CX)(R8*1), X5 5718 MOVOA X4, -32(AX)(R8*1) 5719 MOVOA X5, -16(AX)(R8*1) 5720 ADDQ $0x20, R8 5721 CMPQ BX, R8 5722 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 5723 MOVOU X0, (AX) 5724 MOVOU X1, 16(AX) 5725 MOVOU X2, -32(AX)(BX*1) 5726 MOVOU X3, -16(AX)(BX*1) 5727 MOVQ DX, AX 5728 5729 emit_literal_done_emit_remainder_encodeBlockAsm8B: 5730 MOVQ dst_base+0(FP), CX 5731 SUBQ CX, AX 5732 MOVQ AX, ret+48(FP) 5733 RET 5734 5735 // func encodeBetterBlockAsm(dst []byte, src []byte) int 5736 // Requires: BMI, SSE2 5737 TEXT ·encodeBetterBlockAsm(SB), $589848-56 5738 MOVQ dst_base+0(FP), AX 5739 MOVQ $0x00001200, CX 5740 LEAQ 24(SP), DX 5741 PXOR X0, X0 5742 5743 zero_loop_encodeBetterBlockAsm: 5744 MOVOU X0, (DX) 5745 MOVOU X0, 16(DX) 5746 MOVOU X0, 32(DX) 5747 MOVOU X0, 48(DX) 5748 MOVOU X0, 64(DX) 5749 MOVOU X0, 80(DX) 5750 MOVOU X0, 96(DX) 5751 MOVOU X0, 112(DX) 5752 ADDQ $0x80, DX 5753 DECQ CX 5754 JNZ zero_loop_encodeBetterBlockAsm 5755 
MOVL $0x00000000, 12(SP) 5756 MOVQ src_len+32(FP), CX 5757 LEAQ -6(CX), DX 5758 LEAQ -8(CX), BX 5759 MOVL BX, 8(SP) 5760 SHRQ $0x05, CX 5761 SUBL CX, DX 5762 LEAQ (AX)(DX*1), DX 5763 MOVQ DX, (SP) 5764 MOVL $0x00000001, CX 5765 MOVL $0x00000000, 16(SP) 5766 MOVQ src_base+24(FP), DX 5767 5768 search_loop_encodeBetterBlockAsm: 5769 MOVL CX, BX 5770 SUBL 12(SP), BX 5771 SHRL $0x07, BX 5772 CMPL BX, $0x63 5773 JBE check_maxskip_ok_encodeBetterBlockAsm 5774 LEAL 100(CX), BX 5775 JMP check_maxskip_cont_encodeBetterBlockAsm 5776 5777 check_maxskip_ok_encodeBetterBlockAsm: 5778 LEAL 1(CX)(BX*1), BX 5779 5780 check_maxskip_cont_encodeBetterBlockAsm: 5781 CMPL BX, 8(SP) 5782 JAE emit_remainder_encodeBetterBlockAsm 5783 MOVQ (DX)(CX*1), SI 5784 MOVL BX, 20(SP) 5785 MOVQ $0x00cf1bbcdcbfa563, R8 5786 MOVQ $0x9e3779b1, BX 5787 MOVQ SI, R9 5788 MOVQ SI, R10 5789 SHLQ $0x08, R9 5790 IMULQ R8, R9 5791 SHRQ $0x2f, R9 5792 SHLQ $0x20, R10 5793 IMULQ BX, R10 5794 SHRQ $0x32, R10 5795 MOVL 24(SP)(R9*4), BX 5796 MOVL 524312(SP)(R10*4), DI 5797 MOVL CX, 24(SP)(R9*4) 5798 MOVL CX, 524312(SP)(R10*4) 5799 MOVQ (DX)(BX*1), R9 5800 MOVQ (DX)(DI*1), R10 5801 CMPQ R9, SI 5802 JEQ candidate_match_encodeBetterBlockAsm 5803 CMPQ R10, SI 5804 JNE no_short_found_encodeBetterBlockAsm 5805 MOVL DI, BX 5806 JMP candidate_match_encodeBetterBlockAsm 5807 5808 no_short_found_encodeBetterBlockAsm: 5809 CMPL R9, SI 5810 JEQ candidate_match_encodeBetterBlockAsm 5811 CMPL R10, SI 5812 JEQ candidateS_match_encodeBetterBlockAsm 5813 MOVL 20(SP), CX 5814 JMP search_loop_encodeBetterBlockAsm 5815 5816 candidateS_match_encodeBetterBlockAsm: 5817 SHRQ $0x08, SI 5818 MOVQ SI, R9 5819 SHLQ $0x08, R9 5820 IMULQ R8, R9 5821 SHRQ $0x2f, R9 5822 MOVL 24(SP)(R9*4), BX 5823 INCL CX 5824 MOVL CX, 24(SP)(R9*4) 5825 CMPL (DX)(BX*1), SI 5826 JEQ candidate_match_encodeBetterBlockAsm 5827 DECL CX 5828 MOVL DI, BX 5829 5830 candidate_match_encodeBetterBlockAsm: 5831 MOVL 12(SP), SI 5832 TESTL BX, BX 5833 JZ 
match_extend_back_end_encodeBetterBlockAsm 5834 5835 match_extend_back_loop_encodeBetterBlockAsm: 5836 CMPL CX, SI 5837 JBE match_extend_back_end_encodeBetterBlockAsm 5838 MOVB -1(DX)(BX*1), DI 5839 MOVB -1(DX)(CX*1), R8 5840 CMPB DI, R8 5841 JNE match_extend_back_end_encodeBetterBlockAsm 5842 LEAL -1(CX), CX 5843 DECL BX 5844 JZ match_extend_back_end_encodeBetterBlockAsm 5845 JMP match_extend_back_loop_encodeBetterBlockAsm 5846 5847 match_extend_back_end_encodeBetterBlockAsm: 5848 MOVL CX, SI 5849 SUBL 12(SP), SI 5850 LEAQ 5(AX)(SI*1), SI 5851 CMPQ SI, (SP) 5852 JB match_dst_size_check_encodeBetterBlockAsm 5853 MOVQ $0x00000000, ret+48(FP) 5854 RET 5855 5856 match_dst_size_check_encodeBetterBlockAsm: 5857 MOVL CX, SI 5858 ADDL $0x04, CX 5859 ADDL $0x04, BX 5860 MOVQ src_len+32(FP), DI 5861 SUBL CX, DI 5862 LEAQ (DX)(CX*1), R8 5863 LEAQ (DX)(BX*1), R9 5864 5865 // matchLen 5866 XORL R11, R11 5867 CMPL DI, $0x08 5868 JB matchlen_match4_match_nolit_encodeBetterBlockAsm 5869 5870 matchlen_loopback_match_nolit_encodeBetterBlockAsm: 5871 MOVQ (R8)(R11*1), R10 5872 XORQ (R9)(R11*1), R10 5873 TESTQ R10, R10 5874 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm 5875 5876 #ifdef GOAMD64_v3 5877 TZCNTQ R10, R10 5878 5879 #else 5880 BSFQ R10, R10 5881 5882 #endif 5883 SARQ $0x03, R10 5884 LEAL (R11)(R10*1), R11 5885 JMP match_nolit_end_encodeBetterBlockAsm 5886 5887 matchlen_loop_match_nolit_encodeBetterBlockAsm: 5888 LEAL -8(DI), DI 5889 LEAL 8(R11), R11 5890 CMPL DI, $0x08 5891 JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm 5892 JZ match_nolit_end_encodeBetterBlockAsm 5893 5894 matchlen_match4_match_nolit_encodeBetterBlockAsm: 5895 CMPL DI, $0x04 5896 JB matchlen_match2_match_nolit_encodeBetterBlockAsm 5897 MOVL (R8)(R11*1), R10 5898 CMPL (R9)(R11*1), R10 5899 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm 5900 SUBL $0x04, DI 5901 LEAL 4(R11), R11 5902 5903 matchlen_match2_match_nolit_encodeBetterBlockAsm: 5904 CMPL DI, $0x02 5905 JB 
matchlen_match1_match_nolit_encodeBetterBlockAsm 5906 MOVW (R8)(R11*1), R10 5907 CMPW (R9)(R11*1), R10 5908 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm 5909 SUBL $0x02, DI 5910 LEAL 2(R11), R11 5911 5912 matchlen_match1_match_nolit_encodeBetterBlockAsm: 5913 CMPL DI, $0x01 5914 JB match_nolit_end_encodeBetterBlockAsm 5915 MOVB (R8)(R11*1), R10 5916 CMPB (R9)(R11*1), R10 5917 JNE match_nolit_end_encodeBetterBlockAsm 5918 LEAL 1(R11), R11 5919 5920 match_nolit_end_encodeBetterBlockAsm: 5921 MOVL CX, DI 5922 SUBL BX, DI 5923 5924 // Check if repeat 5925 CMPL 16(SP), DI 5926 JEQ match_is_repeat_encodeBetterBlockAsm 5927 CMPL R11, $0x01 5928 JA match_length_ok_encodeBetterBlockAsm 5929 CMPL DI, $0x0000ffff 5930 JBE match_length_ok_encodeBetterBlockAsm 5931 MOVL 20(SP), CX 5932 INCL CX 5933 JMP search_loop_encodeBetterBlockAsm 5934 5935 match_length_ok_encodeBetterBlockAsm: 5936 MOVL DI, 16(SP) 5937 MOVL 12(SP), BX 5938 CMPL BX, SI 5939 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm 5940 MOVL SI, R8 5941 MOVL SI, 12(SP) 5942 LEAQ (DX)(BX*1), R9 5943 SUBL BX, R8 5944 LEAL -1(R8), BX 5945 CMPL BX, $0x3c 5946 JB one_byte_match_emit_encodeBetterBlockAsm 5947 CMPL BX, $0x00000100 5948 JB two_bytes_match_emit_encodeBetterBlockAsm 5949 CMPL BX, $0x00010000 5950 JB three_bytes_match_emit_encodeBetterBlockAsm 5951 CMPL BX, $0x01000000 5952 JB four_bytes_match_emit_encodeBetterBlockAsm 5953 MOVB $0xfc, (AX) 5954 MOVL BX, 1(AX) 5955 ADDQ $0x05, AX 5956 JMP memmove_long_match_emit_encodeBetterBlockAsm 5957 5958 four_bytes_match_emit_encodeBetterBlockAsm: 5959 MOVL BX, R10 5960 SHRL $0x10, R10 5961 MOVB $0xf8, (AX) 5962 MOVW BX, 1(AX) 5963 MOVB R10, 3(AX) 5964 ADDQ $0x04, AX 5965 JMP memmove_long_match_emit_encodeBetterBlockAsm 5966 5967 three_bytes_match_emit_encodeBetterBlockAsm: 5968 MOVB $0xf4, (AX) 5969 MOVW BX, 1(AX) 5970 ADDQ $0x03, AX 5971 JMP memmove_long_match_emit_encodeBetterBlockAsm 5972 5973 two_bytes_match_emit_encodeBetterBlockAsm: 5974 MOVB $0xf0, 
(AX) 5975 MOVB BL, 1(AX) 5976 ADDQ $0x02, AX 5977 CMPL BX, $0x40 5978 JB memmove_match_emit_encodeBetterBlockAsm 5979 JMP memmove_long_match_emit_encodeBetterBlockAsm 5980 5981 one_byte_match_emit_encodeBetterBlockAsm: 5982 SHLB $0x02, BL 5983 MOVB BL, (AX) 5984 ADDQ $0x01, AX 5985 5986 memmove_match_emit_encodeBetterBlockAsm: 5987 LEAQ (AX)(R8*1), BX 5988 5989 // genMemMoveShort 5990 CMPQ R8, $0x04 5991 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 5992 CMPQ R8, $0x08 5993 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 5994 CMPQ R8, $0x10 5995 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 5996 CMPQ R8, $0x20 5997 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 5998 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 5999 6000 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: 6001 MOVL (R9), R10 6002 MOVL R10, (AX) 6003 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm 6004 6005 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: 6006 MOVL (R9), R10 6007 MOVL -4(R9)(R8*1), R9 6008 MOVL R10, (AX) 6009 MOVL R9, -4(AX)(R8*1) 6010 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm 6011 6012 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: 6013 MOVQ (R9), R10 6014 MOVQ -8(R9)(R8*1), R9 6015 MOVQ R10, (AX) 6016 MOVQ R9, -8(AX)(R8*1) 6017 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm 6018 6019 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: 6020 MOVOU (R9), X0 6021 MOVOU -16(R9)(R8*1), X1 6022 MOVOU X0, (AX) 6023 MOVOU X1, -16(AX)(R8*1) 6024 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm 6025 6026 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: 6027 MOVOU (R9), X0 6028 MOVOU 16(R9), X1 6029 MOVOU -32(R9)(R8*1), X2 6030 MOVOU -16(R9)(R8*1), X3 6031 MOVOU X0, (AX) 6032 MOVOU X1, 16(AX) 6033 MOVOU X2, 
-32(AX)(R8*1) 6034 MOVOU X3, -16(AX)(R8*1) 6035 6036 memmove_end_copy_match_emit_encodeBetterBlockAsm: 6037 MOVQ BX, AX 6038 JMP emit_literal_done_match_emit_encodeBetterBlockAsm 6039 6040 memmove_long_match_emit_encodeBetterBlockAsm: 6041 LEAQ (AX)(R8*1), BX 6042 6043 // genMemMoveLong 6044 MOVOU (R9), X0 6045 MOVOU 16(R9), X1 6046 MOVOU -32(R9)(R8*1), X2 6047 MOVOU -16(R9)(R8*1), X3 6048 MOVQ R8, R12 6049 SHRQ $0x05, R12 6050 MOVQ AX, R10 6051 ANDL $0x0000001f, R10 6052 MOVQ $0x00000040, R13 6053 SUBQ R10, R13 6054 DECQ R12 6055 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 6056 LEAQ -32(R9)(R13*1), R10 6057 LEAQ -32(AX)(R13*1), R14 6058 6059 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: 6060 MOVOU (R10), X4 6061 MOVOU 16(R10), X5 6062 MOVOA X4, (R14) 6063 MOVOA X5, 16(R14) 6064 ADDQ $0x20, R14 6065 ADDQ $0x20, R10 6066 ADDQ $0x20, R13 6067 DECQ R12 6068 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back 6069 6070 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: 6071 MOVOU -32(R9)(R13*1), X4 6072 MOVOU -16(R9)(R13*1), X5 6073 MOVOA X4, -32(AX)(R13*1) 6074 MOVOA X5, -16(AX)(R13*1) 6075 ADDQ $0x20, R13 6076 CMPQ R8, R13 6077 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 6078 MOVOU X0, (AX) 6079 MOVOU X1, 16(AX) 6080 MOVOU X2, -32(AX)(R8*1) 6081 MOVOU X3, -16(AX)(R8*1) 6082 MOVQ BX, AX 6083 6084 emit_literal_done_match_emit_encodeBetterBlockAsm: 6085 ADDL R11, CX 6086 ADDL $0x04, R11 6087 MOVL CX, 12(SP) 6088 6089 // emitCopy 6090 CMPL DI, $0x00010000 6091 JB two_byte_offset_match_nolit_encodeBetterBlockAsm 6092 CMPL R11, $0x40 6093 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm 6094 MOVB $0xff, (AX) 6095 MOVL DI, 1(AX) 6096 LEAL -64(R11), R11 6097 ADDQ $0x05, AX 6098 CMPL R11, $0x04 6099 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm 6100 6101 // emitRepeat 6102 
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: 6103 MOVL R11, BX 6104 LEAL -4(R11), R11 6105 CMPL BX, $0x08 6106 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy 6107 CMPL BX, $0x0c 6108 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy 6109 CMPL DI, $0x00000800 6110 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy 6111 6112 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: 6113 CMPL R11, $0x00000104 6114 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy 6115 CMPL R11, $0x00010100 6116 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy 6117 CMPL R11, $0x0100ffff 6118 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy 6119 LEAL -16842747(R11), R11 6120 MOVL $0xfffb001d, (AX) 6121 MOVB $0xff, 4(AX) 6122 ADDQ $0x05, AX 6123 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy 6124 6125 repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: 6126 LEAL -65536(R11), R11 6127 MOVL R11, DI 6128 MOVW $0x001d, (AX) 6129 MOVW R11, 2(AX) 6130 SARL $0x10, DI 6131 MOVB DI, 4(AX) 6132 ADDQ $0x05, AX 6133 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6134 6135 repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: 6136 LEAL -256(R11), R11 6137 MOVW $0x0019, (AX) 6138 MOVW R11, 2(AX) 6139 ADDQ $0x04, AX 6140 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6141 6142 repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: 6143 LEAL -4(R11), R11 6144 MOVW $0x0015, (AX) 6145 MOVB R11, 2(AX) 6146 ADDQ $0x03, AX 6147 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6148 6149 repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: 6150 SHLL $0x02, R11 6151 ORL $0x01, R11 6152 MOVW R11, (AX) 6153 ADDQ $0x02, AX 6154 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6155 6156 repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: 6157 XORQ BX, BX 6158 LEAL 1(BX)(R11*4), R11 6159 MOVB DI, 1(AX) 6160 SARL $0x08, DI 6161 SHLL $0x05, DI 6162 ORL DI, R11 6163 MOVB R11, 
(AX) 6164 ADDQ $0x02, AX 6165 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6166 6167 four_bytes_remain_match_nolit_encodeBetterBlockAsm: 6168 TESTL R11, R11 6169 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm 6170 XORL BX, BX 6171 LEAL -1(BX)(R11*4), R11 6172 MOVB R11, (AX) 6173 MOVL DI, 1(AX) 6174 ADDQ $0x05, AX 6175 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6176 6177 two_byte_offset_match_nolit_encodeBetterBlockAsm: 6178 CMPL R11, $0x40 6179 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm 6180 CMPL DI, $0x00000800 6181 JAE long_offset_short_match_nolit_encodeBetterBlockAsm 6182 MOVL $0x00000001, BX 6183 LEAL 16(BX), BX 6184 MOVB DI, 1(AX) 6185 MOVL DI, R8 6186 SHRL $0x08, R8 6187 SHLL $0x05, R8 6188 ORL R8, BX 6189 MOVB BL, (AX) 6190 ADDQ $0x02, AX 6191 SUBL $0x08, R11 6192 6193 // emitRepeat 6194 LEAL -4(R11), R11 6195 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6196 6197 emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 6198 MOVL R11, BX 6199 LEAL -4(R11), R11 6200 CMPL BX, $0x08 6201 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6202 CMPL BX, $0x0c 6203 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6204 CMPL DI, $0x00000800 6205 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6206 6207 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 6208 CMPL R11, $0x00000104 6209 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6210 CMPL R11, $0x00010100 6211 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6212 CMPL R11, $0x0100ffff 6213 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6214 LEAL -16842747(R11), R11 6215 MOVL $0xfffb001d, (AX) 6216 MOVB $0xff, 4(AX) 6217 ADDQ $0x05, AX 6218 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b 6219 6220 repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 
6221 LEAL -65536(R11), R11 6222 MOVL R11, DI 6223 MOVW $0x001d, (AX) 6224 MOVW R11, 2(AX) 6225 SARL $0x10, DI 6226 MOVB DI, 4(AX) 6227 ADDQ $0x05, AX 6228 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6229 6230 repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 6231 LEAL -256(R11), R11 6232 MOVW $0x0019, (AX) 6233 MOVW R11, 2(AX) 6234 ADDQ $0x04, AX 6235 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6236 6237 repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 6238 LEAL -4(R11), R11 6239 MOVW $0x0015, (AX) 6240 MOVB R11, 2(AX) 6241 ADDQ $0x03, AX 6242 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6243 6244 repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 6245 SHLL $0x02, R11 6246 ORL $0x01, R11 6247 MOVW R11, (AX) 6248 ADDQ $0x02, AX 6249 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6250 6251 repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: 6252 XORQ BX, BX 6253 LEAL 1(BX)(R11*4), R11 6254 MOVB DI, 1(AX) 6255 SARL $0x08, DI 6256 SHLL $0x05, DI 6257 ORL DI, R11 6258 MOVB R11, (AX) 6259 ADDQ $0x02, AX 6260 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6261 6262 long_offset_short_match_nolit_encodeBetterBlockAsm: 6263 MOVB $0xee, (AX) 6264 MOVW DI, 1(AX) 6265 LEAL -60(R11), R11 6266 ADDQ $0x03, AX 6267 6268 // emitRepeat 6269 emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6270 MOVL R11, BX 6271 LEAL -4(R11), R11 6272 CMPL BX, $0x08 6273 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short 6274 CMPL BX, $0x0c 6275 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short 6276 CMPL DI, $0x00000800 6277 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short 6278 6279 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6280 CMPL R11, $0x00000104 6281 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short 6282 CMPL R11, $0x00010100 6283 JB 
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short 6284 CMPL R11, $0x0100ffff 6285 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short 6286 LEAL -16842747(R11), R11 6287 MOVL $0xfffb001d, (AX) 6288 MOVB $0xff, 4(AX) 6289 ADDQ $0x05, AX 6290 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short 6291 6292 repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6293 LEAL -65536(R11), R11 6294 MOVL R11, DI 6295 MOVW $0x001d, (AX) 6296 MOVW R11, 2(AX) 6297 SARL $0x10, DI 6298 MOVB DI, 4(AX) 6299 ADDQ $0x05, AX 6300 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6301 6302 repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6303 LEAL -256(R11), R11 6304 MOVW $0x0019, (AX) 6305 MOVW R11, 2(AX) 6306 ADDQ $0x04, AX 6307 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6308 6309 repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6310 LEAL -4(R11), R11 6311 MOVW $0x0015, (AX) 6312 MOVB R11, 2(AX) 6313 ADDQ $0x03, AX 6314 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6315 6316 repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6317 SHLL $0x02, R11 6318 ORL $0x01, R11 6319 MOVW R11, (AX) 6320 ADDQ $0x02, AX 6321 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6322 6323 repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: 6324 XORQ BX, BX 6325 LEAL 1(BX)(R11*4), R11 6326 MOVB DI, 1(AX) 6327 SARL $0x08, DI 6328 SHLL $0x05, DI 6329 ORL DI, R11 6330 MOVB R11, (AX) 6331 ADDQ $0x02, AX 6332 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6333 6334 two_byte_offset_short_match_nolit_encodeBetterBlockAsm: 6335 MOVL R11, BX 6336 SHLL $0x02, BX 6337 CMPL R11, $0x0c 6338 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm 6339 CMPL DI, $0x00000800 6340 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm 6341 LEAL -15(BX), BX 6342 MOVB DI, 1(AX) 6343 SHRL $0x08, DI 6344 SHLL $0x05, DI 6345 ORL DI, BX 6346 MOVB BL, (AX) 6347 ADDQ $0x02, AX 6348 JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm 6349 6350 emit_copy_three_match_nolit_encodeBetterBlockAsm: 6351 LEAL -2(BX), BX 6352 MOVB BL, (AX) 6353 MOVW DI, 1(AX) 6354 ADDQ $0x03, AX 6355 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6356 6357 match_is_repeat_encodeBetterBlockAsm: 6358 MOVL 12(SP), BX 6359 CMPL BX, SI 6360 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm 6361 MOVL SI, R8 6362 MOVL SI, 12(SP) 6363 LEAQ (DX)(BX*1), R9 6364 SUBL BX, R8 6365 LEAL -1(R8), BX 6366 CMPL BX, $0x3c 6367 JB one_byte_match_emit_repeat_encodeBetterBlockAsm 6368 CMPL BX, $0x00000100 6369 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm 6370 CMPL BX, $0x00010000 6371 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm 6372 CMPL BX, $0x01000000 6373 JB four_bytes_match_emit_repeat_encodeBetterBlockAsm 6374 MOVB $0xfc, (AX) 6375 MOVL BX, 1(AX) 6376 ADDQ $0x05, AX 6377 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm 6378 6379 four_bytes_match_emit_repeat_encodeBetterBlockAsm: 6380 MOVL BX, R10 6381 SHRL $0x10, R10 6382 MOVB $0xf8, (AX) 6383 MOVW BX, 1(AX) 6384 MOVB R10, 3(AX) 6385 ADDQ $0x04, AX 6386 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm 6387 6388 three_bytes_match_emit_repeat_encodeBetterBlockAsm: 6389 MOVB $0xf4, (AX) 6390 MOVW BX, 1(AX) 6391 ADDQ $0x03, AX 6392 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm 6393 6394 two_bytes_match_emit_repeat_encodeBetterBlockAsm: 6395 MOVB $0xf0, (AX) 6396 MOVB BL, 1(AX) 6397 ADDQ $0x02, AX 6398 CMPL BX, $0x40 6399 JB memmove_match_emit_repeat_encodeBetterBlockAsm 6400 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm 6401 6402 one_byte_match_emit_repeat_encodeBetterBlockAsm: 6403 SHLB $0x02, BL 6404 MOVB BL, (AX) 6405 ADDQ $0x01, AX 6406 6407 memmove_match_emit_repeat_encodeBetterBlockAsm: 6408 LEAQ (AX)(R8*1), BX 6409 6410 // genMemMoveShort 6411 CMPQ R8, $0x04 6412 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 6413 CMPQ R8, $0x08 6414 JB 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 6415 CMPQ R8, $0x10 6416 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 6417 CMPQ R8, $0x20 6418 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 6419 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 6420 6421 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: 6422 MOVL (R9), R10 6423 MOVL R10, (AX) 6424 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm 6425 6426 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: 6427 MOVL (R9), R10 6428 MOVL -4(R9)(R8*1), R9 6429 MOVL R10, (AX) 6430 MOVL R9, -4(AX)(R8*1) 6431 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm 6432 6433 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: 6434 MOVQ (R9), R10 6435 MOVQ -8(R9)(R8*1), R9 6436 MOVQ R10, (AX) 6437 MOVQ R9, -8(AX)(R8*1) 6438 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm 6439 6440 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: 6441 MOVOU (R9), X0 6442 MOVOU -16(R9)(R8*1), X1 6443 MOVOU X0, (AX) 6444 MOVOU X1, -16(AX)(R8*1) 6445 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm 6446 6447 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: 6448 MOVOU (R9), X0 6449 MOVOU 16(R9), X1 6450 MOVOU -32(R9)(R8*1), X2 6451 MOVOU -16(R9)(R8*1), X3 6452 MOVOU X0, (AX) 6453 MOVOU X1, 16(AX) 6454 MOVOU X2, -32(AX)(R8*1) 6455 MOVOU X3, -16(AX)(R8*1) 6456 6457 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: 6458 MOVQ BX, AX 6459 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm 6460 6461 memmove_long_match_emit_repeat_encodeBetterBlockAsm: 6462 LEAQ (AX)(R8*1), BX 6463 6464 // genMemMoveLong 6465 MOVOU (R9), X0 6466 MOVOU 16(R9), X1 6467 MOVOU -32(R9)(R8*1), X2 6468 MOVOU -16(R9)(R8*1), X3 6469 MOVQ 
R8, R12 6470 SHRQ $0x05, R12 6471 MOVQ AX, R10 6472 ANDL $0x0000001f, R10 6473 MOVQ $0x00000040, R13 6474 SUBQ R10, R13 6475 DECQ R12 6476 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 6477 LEAQ -32(R9)(R13*1), R10 6478 LEAQ -32(AX)(R13*1), R14 6479 6480 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: 6481 MOVOU (R10), X4 6482 MOVOU 16(R10), X5 6483 MOVOA X4, (R14) 6484 MOVOA X5, 16(R14) 6485 ADDQ $0x20, R14 6486 ADDQ $0x20, R10 6487 ADDQ $0x20, R13 6488 DECQ R12 6489 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back 6490 6491 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: 6492 MOVOU -32(R9)(R13*1), X4 6493 MOVOU -16(R9)(R13*1), X5 6494 MOVOA X4, -32(AX)(R13*1) 6495 MOVOA X5, -16(AX)(R13*1) 6496 ADDQ $0x20, R13 6497 CMPQ R8, R13 6498 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 6499 MOVOU X0, (AX) 6500 MOVOU X1, 16(AX) 6501 MOVOU X2, -32(AX)(R8*1) 6502 MOVOU X3, -16(AX)(R8*1) 6503 MOVQ BX, AX 6504 6505 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: 6506 ADDL R11, CX 6507 ADDL $0x04, R11 6508 MOVL CX, 12(SP) 6509 6510 // emitRepeat 6511 emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: 6512 MOVL R11, BX 6513 LEAL -4(R11), R11 6514 CMPL BX, $0x08 6515 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm 6516 CMPL BX, $0x0c 6517 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm 6518 CMPL DI, $0x00000800 6519 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm 6520 6521 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: 6522 CMPL R11, $0x00000104 6523 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm 6524 CMPL R11, $0x00010100 6525 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm 6526 CMPL R11, $0x0100ffff 6527 JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm 6528 LEAL -16842747(R11), R11 6529 MOVL 
$0xfffb001d, (AX) 6530 MOVB $0xff, 4(AX) 6531 ADDQ $0x05, AX 6532 JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm 6533 6534 repeat_five_match_nolit_repeat_encodeBetterBlockAsm: 6535 LEAL -65536(R11), R11 6536 MOVL R11, DI 6537 MOVW $0x001d, (AX) 6538 MOVW R11, 2(AX) 6539 SARL $0x10, DI 6540 MOVB DI, 4(AX) 6541 ADDQ $0x05, AX 6542 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6543 6544 repeat_four_match_nolit_repeat_encodeBetterBlockAsm: 6545 LEAL -256(R11), R11 6546 MOVW $0x0019, (AX) 6547 MOVW R11, 2(AX) 6548 ADDQ $0x04, AX 6549 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6550 6551 repeat_three_match_nolit_repeat_encodeBetterBlockAsm: 6552 LEAL -4(R11), R11 6553 MOVW $0x0015, (AX) 6554 MOVB R11, 2(AX) 6555 ADDQ $0x03, AX 6556 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6557 6558 repeat_two_match_nolit_repeat_encodeBetterBlockAsm: 6559 SHLL $0x02, R11 6560 ORL $0x01, R11 6561 MOVW R11, (AX) 6562 ADDQ $0x02, AX 6563 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 6564 6565 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: 6566 XORQ BX, BX 6567 LEAL 1(BX)(R11*4), R11 6568 MOVB DI, 1(AX) 6569 SARL $0x08, DI 6570 SHLL $0x05, DI 6571 ORL DI, R11 6572 MOVB R11, (AX) 6573 ADDQ $0x02, AX 6574 6575 match_nolit_emitcopy_end_encodeBetterBlockAsm: 6576 CMPL CX, 8(SP) 6577 JAE emit_remainder_encodeBetterBlockAsm 6578 CMPQ AX, (SP) 6579 JB match_nolit_dst_ok_encodeBetterBlockAsm 6580 MOVQ $0x00000000, ret+48(FP) 6581 RET 6582 6583 match_nolit_dst_ok_encodeBetterBlockAsm: 6584 MOVQ $0x00cf1bbcdcbfa563, BX 6585 MOVQ $0x9e3779b1, DI 6586 LEAQ 1(SI), SI 6587 LEAQ -2(CX), R8 6588 MOVQ (DX)(SI*1), R9 6589 MOVQ 1(DX)(SI*1), R10 6590 MOVQ (DX)(R8*1), R11 6591 MOVQ 1(DX)(R8*1), R12 6592 SHLQ $0x08, R9 6593 IMULQ BX, R9 6594 SHRQ $0x2f, R9 6595 SHLQ $0x20, R10 6596 IMULQ DI, R10 6597 SHRQ $0x32, R10 6598 SHLQ $0x08, R11 6599 IMULQ BX, R11 6600 SHRQ $0x2f, R11 6601 SHLQ $0x20, R12 6602 IMULQ DI, R12 6603 SHRQ $0x32, R12 6604 LEAQ 1(SI), DI 6605 LEAQ 
1(R8), R13 6606 MOVL SI, 24(SP)(R9*4) 6607 MOVL R8, 24(SP)(R11*4) 6608 MOVL DI, 524312(SP)(R10*4) 6609 MOVL R13, 524312(SP)(R12*4) 6610 ADDQ $0x01, SI 6611 SUBQ $0x01, R8 6612 6613 index_loop_encodeBetterBlockAsm: 6614 CMPQ SI, R8 6615 JAE search_loop_encodeBetterBlockAsm 6616 MOVQ (DX)(SI*1), DI 6617 MOVQ (DX)(R8*1), R9 6618 SHLQ $0x08, DI 6619 IMULQ BX, DI 6620 SHRQ $0x2f, DI 6621 SHLQ $0x08, R9 6622 IMULQ BX, R9 6623 SHRQ $0x2f, R9 6624 MOVL SI, 24(SP)(DI*4) 6625 MOVL R8, 24(SP)(R9*4) 6626 ADDQ $0x02, SI 6627 SUBQ $0x02, R8 6628 JMP index_loop_encodeBetterBlockAsm 6629 6630 emit_remainder_encodeBetterBlockAsm: 6631 MOVQ src_len+32(FP), CX 6632 SUBL 12(SP), CX 6633 LEAQ 5(AX)(CX*1), CX 6634 CMPQ CX, (SP) 6635 JB emit_remainder_ok_encodeBetterBlockAsm 6636 MOVQ $0x00000000, ret+48(FP) 6637 RET 6638 6639 emit_remainder_ok_encodeBetterBlockAsm: 6640 MOVQ src_len+32(FP), CX 6641 MOVL 12(SP), BX 6642 CMPL BX, CX 6643 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm 6644 MOVL CX, SI 6645 MOVL CX, 12(SP) 6646 LEAQ (DX)(BX*1), CX 6647 SUBL BX, SI 6648 LEAL -1(SI), DX 6649 CMPL DX, $0x3c 6650 JB one_byte_emit_remainder_encodeBetterBlockAsm 6651 CMPL DX, $0x00000100 6652 JB two_bytes_emit_remainder_encodeBetterBlockAsm 6653 CMPL DX, $0x00010000 6654 JB three_bytes_emit_remainder_encodeBetterBlockAsm 6655 CMPL DX, $0x01000000 6656 JB four_bytes_emit_remainder_encodeBetterBlockAsm 6657 MOVB $0xfc, (AX) 6658 MOVL DX, 1(AX) 6659 ADDQ $0x05, AX 6660 JMP memmove_long_emit_remainder_encodeBetterBlockAsm 6661 6662 four_bytes_emit_remainder_encodeBetterBlockAsm: 6663 MOVL DX, BX 6664 SHRL $0x10, BX 6665 MOVB $0xf8, (AX) 6666 MOVW DX, 1(AX) 6667 MOVB BL, 3(AX) 6668 ADDQ $0x04, AX 6669 JMP memmove_long_emit_remainder_encodeBetterBlockAsm 6670 6671 three_bytes_emit_remainder_encodeBetterBlockAsm: 6672 MOVB $0xf4, (AX) 6673 MOVW DX, 1(AX) 6674 ADDQ $0x03, AX 6675 JMP memmove_long_emit_remainder_encodeBetterBlockAsm 6676 6677 two_bytes_emit_remainder_encodeBetterBlockAsm: 6678 
MOVB $0xf0, (AX) 6679 MOVB DL, 1(AX) 6680 ADDQ $0x02, AX 6681 CMPL DX, $0x40 6682 JB memmove_emit_remainder_encodeBetterBlockAsm 6683 JMP memmove_long_emit_remainder_encodeBetterBlockAsm 6684 6685 one_byte_emit_remainder_encodeBetterBlockAsm: 6686 SHLB $0x02, DL 6687 MOVB DL, (AX) 6688 ADDQ $0x01, AX 6689 6690 memmove_emit_remainder_encodeBetterBlockAsm: 6691 LEAQ (AX)(SI*1), DX 6692 MOVL SI, BX 6693 6694 // genMemMoveShort 6695 CMPQ BX, $0x03 6696 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 6697 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 6698 CMPQ BX, $0x08 6699 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 6700 CMPQ BX, $0x10 6701 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 6702 CMPQ BX, $0x20 6703 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 6704 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 6705 6706 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: 6707 MOVB (CX), SI 6708 MOVB -1(CX)(BX*1), CL 6709 MOVB SI, (AX) 6710 MOVB CL, -1(AX)(BX*1) 6711 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm 6712 6713 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: 6714 MOVW (CX), SI 6715 MOVB 2(CX), CL 6716 MOVW SI, (AX) 6717 MOVB CL, 2(AX) 6718 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm 6719 6720 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: 6721 MOVL (CX), SI 6722 MOVL -4(CX)(BX*1), CX 6723 MOVL SI, (AX) 6724 MOVL CX, -4(AX)(BX*1) 6725 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm 6726 6727 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: 6728 MOVQ (CX), SI 6729 MOVQ -8(CX)(BX*1), CX 6730 MOVQ SI, (AX) 6731 MOVQ CX, -8(AX)(BX*1) 6732 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm 6733 6734 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: 6735 MOVOU (CX), X0 6736 MOVOU -16(CX)(BX*1), X1 6737 MOVOU X0, (AX) 6738 MOVOU X1, -16(AX)(BX*1) 6739 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm 6740 6741 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: 6742 MOVOU (CX), X0 6743 MOVOU 16(CX), X1 6744 MOVOU -32(CX)(BX*1), X2 6745 MOVOU -16(CX)(BX*1), X3 6746 MOVOU X0, (AX) 6747 MOVOU X1, 16(AX) 6748 MOVOU X2, -32(AX)(BX*1) 6749 MOVOU X3, -16(AX)(BX*1) 6750 6751 memmove_end_copy_emit_remainder_encodeBetterBlockAsm: 6752 MOVQ DX, AX 6753 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm 6754 6755 memmove_long_emit_remainder_encodeBetterBlockAsm: 6756 LEAQ (AX)(SI*1), DX 6757 MOVL SI, BX 6758 6759 // genMemMoveLong 6760 MOVOU (CX), X0 6761 MOVOU 16(CX), X1 6762 MOVOU -32(CX)(BX*1), X2 6763 MOVOU -16(CX)(BX*1), X3 6764 MOVQ BX, DI 6765 SHRQ $0x05, DI 6766 MOVQ AX, SI 6767 ANDL $0x0000001f, SI 6768 MOVQ $0x00000040, R8 6769 SUBQ SI, R8 6770 DECQ DI 6771 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 6772 LEAQ -32(CX)(R8*1), SI 6773 LEAQ -32(AX)(R8*1), R9 6774 6775 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: 6776 MOVOU (SI), X4 6777 MOVOU 16(SI), X5 6778 MOVOA X4, (R9) 6779 MOVOA X5, 16(R9) 6780 ADDQ $0x20, R9 6781 ADDQ $0x20, SI 6782 ADDQ $0x20, R8 6783 DECQ DI 6784 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back 6785 6786 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: 6787 MOVOU -32(CX)(R8*1), X4 6788 MOVOU -16(CX)(R8*1), X5 6789 MOVOA X4, -32(AX)(R8*1) 6790 MOVOA X5, -16(AX)(R8*1) 6791 ADDQ $0x20, R8 6792 CMPQ BX, R8 6793 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 6794 MOVOU X0, (AX) 6795 MOVOU X1, 16(AX) 6796 MOVOU X2, -32(AX)(BX*1) 6797 MOVOU X3, -16(AX)(BX*1) 6798 MOVQ DX, AX 6799 6800 
emit_literal_done_emit_remainder_encodeBetterBlockAsm: 6801 MOVQ dst_base+0(FP), CX 6802 SUBQ CX, AX 6803 MOVQ AX, ret+48(FP) 6804 RET 6805 6806 // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int 6807 // Requires: BMI, SSE2 6808 TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 6809 MOVQ dst_base+0(FP), AX 6810 MOVQ $0x00001200, CX 6811 LEAQ 24(SP), DX 6812 PXOR X0, X0 6813 6814 zero_loop_encodeBetterBlockAsm4MB: 6815 MOVOU X0, (DX) 6816 MOVOU X0, 16(DX) 6817 MOVOU X0, 32(DX) 6818 MOVOU X0, 48(DX) 6819 MOVOU X0, 64(DX) 6820 MOVOU X0, 80(DX) 6821 MOVOU X0, 96(DX) 6822 MOVOU X0, 112(DX) 6823 ADDQ $0x80, DX 6824 DECQ CX 6825 JNZ zero_loop_encodeBetterBlockAsm4MB 6826 MOVL $0x00000000, 12(SP) 6827 MOVQ src_len+32(FP), CX 6828 LEAQ -6(CX), DX 6829 LEAQ -8(CX), BX 6830 MOVL BX, 8(SP) 6831 SHRQ $0x05, CX 6832 SUBL CX, DX 6833 LEAQ (AX)(DX*1), DX 6834 MOVQ DX, (SP) 6835 MOVL $0x00000001, CX 6836 MOVL $0x00000000, 16(SP) 6837 MOVQ src_base+24(FP), DX 6838 6839 search_loop_encodeBetterBlockAsm4MB: 6840 MOVL CX, BX 6841 SUBL 12(SP), BX 6842 SHRL $0x07, BX 6843 CMPL BX, $0x63 6844 JBE check_maxskip_ok_encodeBetterBlockAsm4MB 6845 LEAL 100(CX), BX 6846 JMP check_maxskip_cont_encodeBetterBlockAsm4MB 6847 6848 check_maxskip_ok_encodeBetterBlockAsm4MB: 6849 LEAL 1(CX)(BX*1), BX 6850 6851 check_maxskip_cont_encodeBetterBlockAsm4MB: 6852 CMPL BX, 8(SP) 6853 JAE emit_remainder_encodeBetterBlockAsm4MB 6854 MOVQ (DX)(CX*1), SI 6855 MOVL BX, 20(SP) 6856 MOVQ $0x00cf1bbcdcbfa563, R8 6857 MOVQ $0x9e3779b1, BX 6858 MOVQ SI, R9 6859 MOVQ SI, R10 6860 SHLQ $0x08, R9 6861 IMULQ R8, R9 6862 SHRQ $0x2f, R9 6863 SHLQ $0x20, R10 6864 IMULQ BX, R10 6865 SHRQ $0x32, R10 6866 MOVL 24(SP)(R9*4), BX 6867 MOVL 524312(SP)(R10*4), DI 6868 MOVL CX, 24(SP)(R9*4) 6869 MOVL CX, 524312(SP)(R10*4) 6870 MOVQ (DX)(BX*1), R9 6871 MOVQ (DX)(DI*1), R10 6872 CMPQ R9, SI 6873 JEQ candidate_match_encodeBetterBlockAsm4MB 6874 CMPQ R10, SI 6875 JNE no_short_found_encodeBetterBlockAsm4MB 6876 MOVL DI, BX 6877 JMP 
candidate_match_encodeBetterBlockAsm4MB 6878 6879 no_short_found_encodeBetterBlockAsm4MB: 6880 CMPL R9, SI 6881 JEQ candidate_match_encodeBetterBlockAsm4MB 6882 CMPL R10, SI 6883 JEQ candidateS_match_encodeBetterBlockAsm4MB 6884 MOVL 20(SP), CX 6885 JMP search_loop_encodeBetterBlockAsm4MB 6886 6887 candidateS_match_encodeBetterBlockAsm4MB: 6888 SHRQ $0x08, SI 6889 MOVQ SI, R9 6890 SHLQ $0x08, R9 6891 IMULQ R8, R9 6892 SHRQ $0x2f, R9 6893 MOVL 24(SP)(R9*4), BX 6894 INCL CX 6895 MOVL CX, 24(SP)(R9*4) 6896 CMPL (DX)(BX*1), SI 6897 JEQ candidate_match_encodeBetterBlockAsm4MB 6898 DECL CX 6899 MOVL DI, BX 6900 6901 candidate_match_encodeBetterBlockAsm4MB: 6902 MOVL 12(SP), SI 6903 TESTL BX, BX 6904 JZ match_extend_back_end_encodeBetterBlockAsm4MB 6905 6906 match_extend_back_loop_encodeBetterBlockAsm4MB: 6907 CMPL CX, SI 6908 JBE match_extend_back_end_encodeBetterBlockAsm4MB 6909 MOVB -1(DX)(BX*1), DI 6910 MOVB -1(DX)(CX*1), R8 6911 CMPB DI, R8 6912 JNE match_extend_back_end_encodeBetterBlockAsm4MB 6913 LEAL -1(CX), CX 6914 DECL BX 6915 JZ match_extend_back_end_encodeBetterBlockAsm4MB 6916 JMP match_extend_back_loop_encodeBetterBlockAsm4MB 6917 6918 match_extend_back_end_encodeBetterBlockAsm4MB: 6919 MOVL CX, SI 6920 SUBL 12(SP), SI 6921 LEAQ 4(AX)(SI*1), SI 6922 CMPQ SI, (SP) 6923 JB match_dst_size_check_encodeBetterBlockAsm4MB 6924 MOVQ $0x00000000, ret+48(FP) 6925 RET 6926 6927 match_dst_size_check_encodeBetterBlockAsm4MB: 6928 MOVL CX, SI 6929 ADDL $0x04, CX 6930 ADDL $0x04, BX 6931 MOVQ src_len+32(FP), DI 6932 SUBL CX, DI 6933 LEAQ (DX)(CX*1), R8 6934 LEAQ (DX)(BX*1), R9 6935 6936 // matchLen 6937 XORL R11, R11 6938 CMPL DI, $0x08 6939 JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB 6940 6941 matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: 6942 MOVQ (R8)(R11*1), R10 6943 XORQ (R9)(R11*1), R10 6944 TESTQ R10, R10 6945 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB 6946 6947 #ifdef GOAMD64_v3 6948 TZCNTQ R10, R10 6949 6950 #else 6951 BSFQ R10, R10 
6952 6953 #endif 6954 SARQ $0x03, R10 6955 LEAL (R11)(R10*1), R11 6956 JMP match_nolit_end_encodeBetterBlockAsm4MB 6957 6958 matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: 6959 LEAL -8(DI), DI 6960 LEAL 8(R11), R11 6961 CMPL DI, $0x08 6962 JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB 6963 JZ match_nolit_end_encodeBetterBlockAsm4MB 6964 6965 matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: 6966 CMPL DI, $0x04 6967 JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB 6968 MOVL (R8)(R11*1), R10 6969 CMPL (R9)(R11*1), R10 6970 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB 6971 SUBL $0x04, DI 6972 LEAL 4(R11), R11 6973 6974 matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: 6975 CMPL DI, $0x02 6976 JB matchlen_match1_match_nolit_encodeBetterBlockAsm4MB 6977 MOVW (R8)(R11*1), R10 6978 CMPW (R9)(R11*1), R10 6979 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB 6980 SUBL $0x02, DI 6981 LEAL 2(R11), R11 6982 6983 matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: 6984 CMPL DI, $0x01 6985 JB match_nolit_end_encodeBetterBlockAsm4MB 6986 MOVB (R8)(R11*1), R10 6987 CMPB (R9)(R11*1), R10 6988 JNE match_nolit_end_encodeBetterBlockAsm4MB 6989 LEAL 1(R11), R11 6990 6991 match_nolit_end_encodeBetterBlockAsm4MB: 6992 MOVL CX, DI 6993 SUBL BX, DI 6994 6995 // Check if repeat 6996 CMPL 16(SP), DI 6997 JEQ match_is_repeat_encodeBetterBlockAsm4MB 6998 CMPL R11, $0x01 6999 JA match_length_ok_encodeBetterBlockAsm4MB 7000 CMPL DI, $0x0000ffff 7001 JBE match_length_ok_encodeBetterBlockAsm4MB 7002 MOVL 20(SP), CX 7003 INCL CX 7004 JMP search_loop_encodeBetterBlockAsm4MB 7005 7006 match_length_ok_encodeBetterBlockAsm4MB: 7007 MOVL DI, 16(SP) 7008 MOVL 12(SP), BX 7009 CMPL BX, SI 7010 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB 7011 MOVL SI, R8 7012 MOVL SI, 12(SP) 7013 LEAQ (DX)(BX*1), R9 7014 SUBL BX, R8 7015 LEAL -1(R8), BX 7016 CMPL BX, $0x3c 7017 JB one_byte_match_emit_encodeBetterBlockAsm4MB 7018 CMPL BX, $0x00000100 7019 JB 
two_bytes_match_emit_encodeBetterBlockAsm4MB 7020 CMPL BX, $0x00010000 7021 JB three_bytes_match_emit_encodeBetterBlockAsm4MB 7022 MOVL BX, R10 7023 SHRL $0x10, R10 7024 MOVB $0xf8, (AX) 7025 MOVW BX, 1(AX) 7026 MOVB R10, 3(AX) 7027 ADDQ $0x04, AX 7028 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB 7029 7030 three_bytes_match_emit_encodeBetterBlockAsm4MB: 7031 MOVB $0xf4, (AX) 7032 MOVW BX, 1(AX) 7033 ADDQ $0x03, AX 7034 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB 7035 7036 two_bytes_match_emit_encodeBetterBlockAsm4MB: 7037 MOVB $0xf0, (AX) 7038 MOVB BL, 1(AX) 7039 ADDQ $0x02, AX 7040 CMPL BX, $0x40 7041 JB memmove_match_emit_encodeBetterBlockAsm4MB 7042 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB 7043 7044 one_byte_match_emit_encodeBetterBlockAsm4MB: 7045 SHLB $0x02, BL 7046 MOVB BL, (AX) 7047 ADDQ $0x01, AX 7048 7049 memmove_match_emit_encodeBetterBlockAsm4MB: 7050 LEAQ (AX)(R8*1), BX 7051 7052 // genMemMoveShort 7053 CMPQ R8, $0x04 7054 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 7055 CMPQ R8, $0x08 7056 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 7057 CMPQ R8, $0x10 7058 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 7059 CMPQ R8, $0x20 7060 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 7061 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 7062 7063 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: 7064 MOVL (R9), R10 7065 MOVL R10, (AX) 7066 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB 7067 7068 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: 7069 MOVL (R9), R10 7070 MOVL -4(R9)(R8*1), R9 7071 MOVL R10, (AX) 7072 MOVL R9, -4(AX)(R8*1) 7073 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB 7074 7075 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: 7076 MOVQ (R9), R10 7077 MOVQ -8(R9)(R8*1), R9 
7078 MOVQ R10, (AX) 7079 MOVQ R9, -8(AX)(R8*1) 7080 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB 7081 7082 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: 7083 MOVOU (R9), X0 7084 MOVOU -16(R9)(R8*1), X1 7085 MOVOU X0, (AX) 7086 MOVOU X1, -16(AX)(R8*1) 7087 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB 7088 7089 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: 7090 MOVOU (R9), X0 7091 MOVOU 16(R9), X1 7092 MOVOU -32(R9)(R8*1), X2 7093 MOVOU -16(R9)(R8*1), X3 7094 MOVOU X0, (AX) 7095 MOVOU X1, 16(AX) 7096 MOVOU X2, -32(AX)(R8*1) 7097 MOVOU X3, -16(AX)(R8*1) 7098 7099 memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: 7100 MOVQ BX, AX 7101 JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB 7102 7103 memmove_long_match_emit_encodeBetterBlockAsm4MB: 7104 LEAQ (AX)(R8*1), BX 7105 7106 // genMemMoveLong 7107 MOVOU (R9), X0 7108 MOVOU 16(R9), X1 7109 MOVOU -32(R9)(R8*1), X2 7110 MOVOU -16(R9)(R8*1), X3 7111 MOVQ R8, R12 7112 SHRQ $0x05, R12 7113 MOVQ AX, R10 7114 ANDL $0x0000001f, R10 7115 MOVQ $0x00000040, R13 7116 SUBQ R10, R13 7117 DECQ R12 7118 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 7119 LEAQ -32(R9)(R13*1), R10 7120 LEAQ -32(AX)(R13*1), R14 7121 7122 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: 7123 MOVOU (R10), X4 7124 MOVOU 16(R10), X5 7125 MOVOA X4, (R14) 7126 MOVOA X5, 16(R14) 7127 ADDQ $0x20, R14 7128 ADDQ $0x20, R10 7129 ADDQ $0x20, R13 7130 DECQ R12 7131 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back 7132 7133 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: 7134 MOVOU -32(R9)(R13*1), X4 7135 MOVOU -16(R9)(R13*1), X5 7136 MOVOA X4, -32(AX)(R13*1) 7137 MOVOA X5, -16(AX)(R13*1) 7138 ADDQ $0x20, R13 7139 CMPQ R8, R13 7140 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 7141 MOVOU X0, (AX) 7142 
MOVOU X1, 16(AX) 7143 MOVOU X2, -32(AX)(R8*1) 7144 MOVOU X3, -16(AX)(R8*1) 7145 MOVQ BX, AX 7146 7147 emit_literal_done_match_emit_encodeBetterBlockAsm4MB: 7148 ADDL R11, CX 7149 ADDL $0x04, R11 7150 MOVL CX, 12(SP) 7151 7152 // emitCopy 7153 CMPL DI, $0x00010000 7154 JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB 7155 CMPL R11, $0x40 7156 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB 7157 MOVB $0xff, (AX) 7158 MOVL DI, 1(AX) 7159 LEAL -64(R11), R11 7160 ADDQ $0x05, AX 7161 CMPL R11, $0x04 7162 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB 7163 7164 // emitRepeat 7165 MOVL R11, BX 7166 LEAL -4(R11), R11 7167 CMPL BX, $0x08 7168 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy 7169 CMPL BX, $0x0c 7170 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy 7171 CMPL DI, $0x00000800 7172 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy 7173 7174 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: 7175 CMPL R11, $0x00000104 7176 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy 7177 CMPL R11, $0x00010100 7178 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy 7179 LEAL -65536(R11), R11 7180 MOVL R11, DI 7181 MOVW $0x001d, (AX) 7182 MOVW R11, 2(AX) 7183 SARL $0x10, DI 7184 MOVB DI, 4(AX) 7185 ADDQ $0x05, AX 7186 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7187 7188 repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: 7189 LEAL -256(R11), R11 7190 MOVW $0x0019, (AX) 7191 MOVW R11, 2(AX) 7192 ADDQ $0x04, AX 7193 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7194 7195 repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: 7196 LEAL -4(R11), R11 7197 MOVW $0x0015, (AX) 7198 MOVB R11, 2(AX) 7199 ADDQ $0x03, AX 7200 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7201 7202 repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: 7203 SHLL $0x02, R11 7204 ORL $0x01, R11 7205 MOVW R11, (AX) 7206 ADDQ $0x02, AX 7207 JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7208 7209 repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: 7210 XORQ BX, BX 7211 LEAL 1(BX)(R11*4), R11 7212 MOVB DI, 1(AX) 7213 SARL $0x08, DI 7214 SHLL $0x05, DI 7215 ORL DI, R11 7216 MOVB R11, (AX) 7217 ADDQ $0x02, AX 7218 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7219 7220 four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: 7221 TESTL R11, R11 7222 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7223 XORL BX, BX 7224 LEAL -1(BX)(R11*4), R11 7225 MOVB R11, (AX) 7226 MOVL DI, 1(AX) 7227 ADDQ $0x05, AX 7228 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7229 7230 two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: 7231 CMPL R11, $0x40 7232 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB 7233 CMPL DI, $0x00000800 7234 JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB 7235 MOVL $0x00000001, BX 7236 LEAL 16(BX), BX 7237 MOVB DI, 1(AX) 7238 SHRL $0x08, DI 7239 SHLL $0x05, DI 7240 ORL DI, BX 7241 MOVB BL, (AX) 7242 ADDQ $0x02, AX 7243 SUBL $0x08, R11 7244 7245 // emitRepeat 7246 LEAL -4(R11), R11 7247 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b 7248 MOVL R11, BX 7249 LEAL -4(R11), R11 7250 CMPL BX, $0x08 7251 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b 7252 CMPL BX, $0x0c 7253 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b 7254 CMPL DI, $0x00000800 7255 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b 7256 7257 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: 7258 CMPL R11, $0x00000104 7259 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b 7260 CMPL R11, $0x00010100 7261 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b 7262 LEAL -65536(R11), R11 7263 MOVL R11, DI 7264 MOVW $0x001d, (AX) 7265 MOVW R11, 2(AX) 7266 SARL $0x10, DI 7267 MOVB DI, 4(AX) 7268 ADDQ $0x05, 
AX 7269 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7270 7271 repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: 7272 LEAL -256(R11), R11 7273 MOVW $0x0019, (AX) 7274 MOVW R11, 2(AX) 7275 ADDQ $0x04, AX 7276 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7277 7278 repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: 7279 LEAL -4(R11), R11 7280 MOVW $0x0015, (AX) 7281 MOVB R11, 2(AX) 7282 ADDQ $0x03, AX 7283 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7284 7285 repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: 7286 SHLL $0x02, R11 7287 ORL $0x01, R11 7288 MOVW R11, (AX) 7289 ADDQ $0x02, AX 7290 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7291 7292 repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: 7293 XORQ BX, BX 7294 LEAL 1(BX)(R11*4), R11 7295 MOVB DI, 1(AX) 7296 SARL $0x08, DI 7297 SHLL $0x05, DI 7298 ORL DI, R11 7299 MOVB R11, (AX) 7300 ADDQ $0x02, AX 7301 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7302 7303 long_offset_short_match_nolit_encodeBetterBlockAsm4MB: 7304 MOVB $0xee, (AX) 7305 MOVW DI, 1(AX) 7306 LEAL -60(R11), R11 7307 ADDQ $0x03, AX 7308 7309 // emitRepeat 7310 MOVL R11, BX 7311 LEAL -4(R11), R11 7312 CMPL BX, $0x08 7313 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short 7314 CMPL BX, $0x0c 7315 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short 7316 CMPL DI, $0x00000800 7317 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short 7318 7319 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: 7320 CMPL R11, $0x00000104 7321 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short 7322 CMPL R11, $0x00010100 7323 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short 7324 LEAL -65536(R11), R11 7325 MOVL R11, DI 7326 MOVW $0x001d, (AX) 7327 MOVW R11, 2(AX) 7328 SARL $0x10, DI 7329 MOVB DI, 4(AX) 7330 ADDQ $0x05, AX 7331 JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7332 7333 repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: 7334 LEAL -256(R11), R11 7335 MOVW $0x0019, (AX) 7336 MOVW R11, 2(AX) 7337 ADDQ $0x04, AX 7338 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7339 7340 repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: 7341 LEAL -4(R11), R11 7342 MOVW $0x0015, (AX) 7343 MOVB R11, 2(AX) 7344 ADDQ $0x03, AX 7345 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7346 7347 repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: 7348 SHLL $0x02, R11 7349 ORL $0x01, R11 7350 MOVW R11, (AX) 7351 ADDQ $0x02, AX 7352 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7353 7354 repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: 7355 XORQ BX, BX 7356 LEAL 1(BX)(R11*4), R11 7357 MOVB DI, 1(AX) 7358 SARL $0x08, DI 7359 SHLL $0x05, DI 7360 ORL DI, R11 7361 MOVB R11, (AX) 7362 ADDQ $0x02, AX 7363 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7364 7365 two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: 7366 MOVL R11, BX 7367 SHLL $0x02, BX 7368 CMPL R11, $0x0c 7369 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB 7370 CMPL DI, $0x00000800 7371 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB 7372 LEAL -15(BX), BX 7373 MOVB DI, 1(AX) 7374 SHRL $0x08, DI 7375 SHLL $0x05, DI 7376 ORL DI, BX 7377 MOVB BL, (AX) 7378 ADDQ $0x02, AX 7379 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7380 7381 emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: 7382 LEAL -2(BX), BX 7383 MOVB BL, (AX) 7384 MOVW DI, 1(AX) 7385 ADDQ $0x03, AX 7386 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7387 7388 match_is_repeat_encodeBetterBlockAsm4MB: 7389 MOVL 12(SP), BX 7390 CMPL BX, SI 7391 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB 7392 MOVL SI, R8 7393 MOVL SI, 12(SP) 7394 LEAQ (DX)(BX*1), R9 7395 SUBL BX, R8 7396 LEAL -1(R8), BX 7397 CMPL BX, $0x3c 7398 JB 
one_byte_match_emit_repeat_encodeBetterBlockAsm4MB 7399 CMPL BX, $0x00000100 7400 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB 7401 CMPL BX, $0x00010000 7402 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB 7403 MOVL BX, R10 7404 SHRL $0x10, R10 7405 MOVB $0xf8, (AX) 7406 MOVW BX, 1(AX) 7407 MOVB R10, 3(AX) 7408 ADDQ $0x04, AX 7409 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB 7410 7411 three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: 7412 MOVB $0xf4, (AX) 7413 MOVW BX, 1(AX) 7414 ADDQ $0x03, AX 7415 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB 7416 7417 two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: 7418 MOVB $0xf0, (AX) 7419 MOVB BL, 1(AX) 7420 ADDQ $0x02, AX 7421 CMPL BX, $0x40 7422 JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB 7423 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB 7424 7425 one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: 7426 SHLB $0x02, BL 7427 MOVB BL, (AX) 7428 ADDQ $0x01, AX 7429 7430 memmove_match_emit_repeat_encodeBetterBlockAsm4MB: 7431 LEAQ (AX)(R8*1), BX 7432 7433 // genMemMoveShort 7434 CMPQ R8, $0x04 7435 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 7436 CMPQ R8, $0x08 7437 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 7438 CMPQ R8, $0x10 7439 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 7440 CMPQ R8, $0x20 7441 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 7442 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 7443 7444 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: 7445 MOVL (R9), R10 7446 MOVL R10, (AX) 7447 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB 7448 7449 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: 7450 MOVL (R9), R10 7451 MOVL -4(R9)(R8*1), R9 7452 MOVL R10, (AX) 7453 MOVL 
R9, -4(AX)(R8*1) 7454 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB 7455 7456 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: 7457 MOVQ (R9), R10 7458 MOVQ -8(R9)(R8*1), R9 7459 MOVQ R10, (AX) 7460 MOVQ R9, -8(AX)(R8*1) 7461 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB 7462 7463 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: 7464 MOVOU (R9), X0 7465 MOVOU -16(R9)(R8*1), X1 7466 MOVOU X0, (AX) 7467 MOVOU X1, -16(AX)(R8*1) 7468 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB 7469 7470 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: 7471 MOVOU (R9), X0 7472 MOVOU 16(R9), X1 7473 MOVOU -32(R9)(R8*1), X2 7474 MOVOU -16(R9)(R8*1), X3 7475 MOVOU X0, (AX) 7476 MOVOU X1, 16(AX) 7477 MOVOU X2, -32(AX)(R8*1) 7478 MOVOU X3, -16(AX)(R8*1) 7479 7480 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: 7481 MOVQ BX, AX 7482 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB 7483 7484 memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: 7485 LEAQ (AX)(R8*1), BX 7486 7487 // genMemMoveLong 7488 MOVOU (R9), X0 7489 MOVOU 16(R9), X1 7490 MOVOU -32(R9)(R8*1), X2 7491 MOVOU -16(R9)(R8*1), X3 7492 MOVQ R8, R12 7493 SHRQ $0x05, R12 7494 MOVQ AX, R10 7495 ANDL $0x0000001f, R10 7496 MOVQ $0x00000040, R13 7497 SUBQ R10, R13 7498 DECQ R12 7499 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 7500 LEAQ -32(R9)(R13*1), R10 7501 LEAQ -32(AX)(R13*1), R14 7502 7503 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: 7504 MOVOU (R10), X4 7505 MOVOU 16(R10), X5 7506 MOVOA X4, (R14) 7507 MOVOA X5, 16(R14) 7508 ADDQ $0x20, R14 7509 ADDQ $0x20, R10 7510 ADDQ $0x20, R13 7511 DECQ R12 7512 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back 7513 7514 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: 7515 MOVOU -32(R9)(R13*1), X4 7516 MOVOU -16(R9)(R13*1), X5 7517 MOVOA X4, -32(AX)(R13*1) 7518 MOVOA X5, -16(AX)(R13*1) 7519 ADDQ $0x20, R13 7520 CMPQ R8, R13 7521 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 7522 MOVOU X0, (AX) 7523 MOVOU X1, 16(AX) 7524 MOVOU X2, -32(AX)(R8*1) 7525 MOVOU X3, -16(AX)(R8*1) 7526 MOVQ BX, AX 7527 7528 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: 7529 ADDL R11, CX 7530 ADDL $0x04, R11 7531 MOVL CX, 12(SP) 7532 7533 // emitRepeat 7534 MOVL R11, BX 7535 LEAL -4(R11), R11 7536 CMPL BX, $0x08 7537 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB 7538 CMPL BX, $0x0c 7539 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB 7540 CMPL DI, $0x00000800 7541 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB 7542 7543 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: 7544 CMPL R11, $0x00000104 7545 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB 7546 CMPL R11, $0x00010100 7547 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB 7548 LEAL -65536(R11), R11 7549 MOVL R11, DI 7550 MOVW $0x001d, (AX) 7551 MOVW R11, 2(AX) 7552 SARL $0x10, DI 7553 MOVB DI, 4(AX) 7554 ADDQ $0x05, AX 7555 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7556 7557 repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: 7558 LEAL -256(R11), R11 7559 MOVW $0x0019, (AX) 7560 MOVW R11, 2(AX) 7561 ADDQ $0x04, AX 7562 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7563 7564 repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: 7565 LEAL -4(R11), R11 7566 MOVW $0x0015, (AX) 7567 MOVB R11, 2(AX) 7568 ADDQ $0x03, AX 7569 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7570 7571 repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: 7572 SHLL $0x02, R11 7573 ORL $0x01, R11 7574 MOVW R11, (AX) 7575 ADDQ $0x02, AX 7576 JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 7577 7578 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: 7579 XORQ BX, BX 7580 LEAL 1(BX)(R11*4), R11 7581 MOVB DI, 1(AX) 7582 SARL $0x08, DI 7583 SHLL $0x05, DI 7584 ORL DI, R11 7585 MOVB R11, (AX) 7586 ADDQ $0x02, AX 7587 7588 match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: 7589 CMPL CX, 8(SP) 7590 JAE emit_remainder_encodeBetterBlockAsm4MB 7591 CMPQ AX, (SP) 7592 JB match_nolit_dst_ok_encodeBetterBlockAsm4MB 7593 MOVQ $0x00000000, ret+48(FP) 7594 RET 7595 7596 match_nolit_dst_ok_encodeBetterBlockAsm4MB: 7597 MOVQ $0x00cf1bbcdcbfa563, BX 7598 MOVQ $0x9e3779b1, DI 7599 LEAQ 1(SI), SI 7600 LEAQ -2(CX), R8 7601 MOVQ (DX)(SI*1), R9 7602 MOVQ 1(DX)(SI*1), R10 7603 MOVQ (DX)(R8*1), R11 7604 MOVQ 1(DX)(R8*1), R12 7605 SHLQ $0x08, R9 7606 IMULQ BX, R9 7607 SHRQ $0x2f, R9 7608 SHLQ $0x20, R10 7609 IMULQ DI, R10 7610 SHRQ $0x32, R10 7611 SHLQ $0x08, R11 7612 IMULQ BX, R11 7613 SHRQ $0x2f, R11 7614 SHLQ $0x20, R12 7615 IMULQ DI, R12 7616 SHRQ $0x32, R12 7617 LEAQ 1(SI), DI 7618 LEAQ 1(R8), R13 7619 MOVL SI, 24(SP)(R9*4) 7620 MOVL R8, 24(SP)(R11*4) 7621 MOVL DI, 524312(SP)(R10*4) 7622 MOVL R13, 524312(SP)(R12*4) 7623 ADDQ $0x01, SI 7624 SUBQ $0x01, R8 7625 7626 index_loop_encodeBetterBlockAsm4MB: 7627 CMPQ SI, R8 7628 JAE search_loop_encodeBetterBlockAsm4MB 7629 MOVQ (DX)(SI*1), DI 7630 MOVQ (DX)(R8*1), R9 7631 SHLQ $0x08, DI 7632 IMULQ BX, DI 7633 SHRQ $0x2f, DI 7634 SHLQ $0x08, R9 7635 IMULQ BX, R9 7636 SHRQ $0x2f, R9 7637 MOVL SI, 24(SP)(DI*4) 7638 MOVL R8, 24(SP)(R9*4) 7639 ADDQ $0x02, SI 7640 SUBQ $0x02, R8 7641 JMP index_loop_encodeBetterBlockAsm4MB 7642 7643 emit_remainder_encodeBetterBlockAsm4MB: 7644 MOVQ src_len+32(FP), CX 7645 SUBL 12(SP), CX 7646 LEAQ 4(AX)(CX*1), CX 7647 CMPQ CX, (SP) 7648 JB emit_remainder_ok_encodeBetterBlockAsm4MB 7649 MOVQ $0x00000000, ret+48(FP) 7650 RET 7651 7652 emit_remainder_ok_encodeBetterBlockAsm4MB: 7653 MOVQ src_len+32(FP), CX 7654 MOVL 12(SP), BX 7655 CMPL BX, CX 
7656 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB 7657 MOVL CX, SI 7658 MOVL CX, 12(SP) 7659 LEAQ (DX)(BX*1), CX 7660 SUBL BX, SI 7661 LEAL -1(SI), DX 7662 CMPL DX, $0x3c 7663 JB one_byte_emit_remainder_encodeBetterBlockAsm4MB 7664 CMPL DX, $0x00000100 7665 JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB 7666 CMPL DX, $0x00010000 7667 JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB 7668 MOVL DX, BX 7669 SHRL $0x10, BX 7670 MOVB $0xf8, (AX) 7671 MOVW DX, 1(AX) 7672 MOVB BL, 3(AX) 7673 ADDQ $0x04, AX 7674 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB 7675 7676 three_bytes_emit_remainder_encodeBetterBlockAsm4MB: 7677 MOVB $0xf4, (AX) 7678 MOVW DX, 1(AX) 7679 ADDQ $0x03, AX 7680 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB 7681 7682 two_bytes_emit_remainder_encodeBetterBlockAsm4MB: 7683 MOVB $0xf0, (AX) 7684 MOVB DL, 1(AX) 7685 ADDQ $0x02, AX 7686 CMPL DX, $0x40 7687 JB memmove_emit_remainder_encodeBetterBlockAsm4MB 7688 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB 7689 7690 one_byte_emit_remainder_encodeBetterBlockAsm4MB: 7691 SHLB $0x02, DL 7692 MOVB DL, (AX) 7693 ADDQ $0x01, AX 7694 7695 memmove_emit_remainder_encodeBetterBlockAsm4MB: 7696 LEAQ (AX)(SI*1), DX 7697 MOVL SI, BX 7698 7699 // genMemMoveShort 7700 CMPQ BX, $0x03 7701 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 7702 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 7703 CMPQ BX, $0x08 7704 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 7705 CMPQ BX, $0x10 7706 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 7707 CMPQ BX, $0x20 7708 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 7709 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 7710 7711 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: 7712 MOVB (CX), SI 7713 MOVB 
-1(CX)(BX*1), CL 7714 MOVB SI, (AX) 7715 MOVB CL, -1(AX)(BX*1) 7716 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB 7717 7718 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: 7719 MOVW (CX), SI 7720 MOVB 2(CX), CL 7721 MOVW SI, (AX) 7722 MOVB CL, 2(AX) 7723 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB 7724 7725 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: 7726 MOVL (CX), SI 7727 MOVL -4(CX)(BX*1), CX 7728 MOVL SI, (AX) 7729 MOVL CX, -4(AX)(BX*1) 7730 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB 7731 7732 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: 7733 MOVQ (CX), SI 7734 MOVQ -8(CX)(BX*1), CX 7735 MOVQ SI, (AX) 7736 MOVQ CX, -8(AX)(BX*1) 7737 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB 7738 7739 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: 7740 MOVOU (CX), X0 7741 MOVOU -16(CX)(BX*1), X1 7742 MOVOU X0, (AX) 7743 MOVOU X1, -16(AX)(BX*1) 7744 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB 7745 7746 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: 7747 MOVOU (CX), X0 7748 MOVOU 16(CX), X1 7749 MOVOU -32(CX)(BX*1), X2 7750 MOVOU -16(CX)(BX*1), X3 7751 MOVOU X0, (AX) 7752 MOVOU X1, 16(AX) 7753 MOVOU X2, -32(AX)(BX*1) 7754 MOVOU X3, -16(AX)(BX*1) 7755 7756 memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: 7757 MOVQ DX, AX 7758 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB 7759 7760 memmove_long_emit_remainder_encodeBetterBlockAsm4MB: 7761 LEAQ (AX)(SI*1), DX 7762 MOVL SI, BX 7763 7764 // genMemMoveLong 7765 MOVOU (CX), X0 7766 MOVOU 16(CX), X1 7767 MOVOU -32(CX)(BX*1), X2 7768 MOVOU -16(CX)(BX*1), X3 7769 MOVQ BX, DI 7770 SHRQ $0x05, DI 7771 MOVQ AX, SI 7772 ANDL $0x0000001f, SI 7773 MOVQ $0x00000040, R8 7774 SUBQ SI, R8 7775 DECQ DI 7776 JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 7777 LEAQ -32(CX)(R8*1), SI 7778 LEAQ -32(AX)(R8*1), R9 7779 7780 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: 7781 MOVOU (SI), X4 7782 MOVOU 16(SI), X5 7783 MOVOA X4, (R9) 7784 MOVOA X5, 16(R9) 7785 ADDQ $0x20, R9 7786 ADDQ $0x20, SI 7787 ADDQ $0x20, R8 7788 DECQ DI 7789 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back 7790 7791 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: 7792 MOVOU -32(CX)(R8*1), X4 7793 MOVOU -16(CX)(R8*1), X5 7794 MOVOA X4, -32(AX)(R8*1) 7795 MOVOA X5, -16(AX)(R8*1) 7796 ADDQ $0x20, R8 7797 CMPQ BX, R8 7798 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 7799 MOVOU X0, (AX) 7800 MOVOU X1, 16(AX) 7801 MOVOU X2, -32(AX)(BX*1) 7802 MOVOU X3, -16(AX)(BX*1) 7803 MOVQ DX, AX 7804 7805 emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: 7806 MOVQ dst_base+0(FP), CX 7807 SUBQ CX, AX 7808 MOVQ AX, ret+48(FP) 7809 RET 7810 7811 // func encodeBetterBlockAsm12B(dst []byte, src []byte) int 7812 // Requires: BMI, SSE2 7813 TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 7814 MOVQ dst_base+0(FP), AX 7815 MOVQ $0x00000280, CX 7816 LEAQ 24(SP), DX 7817 PXOR X0, X0 7818 7819 zero_loop_encodeBetterBlockAsm12B: 7820 MOVOU X0, (DX) 7821 MOVOU X0, 16(DX) 7822 MOVOU X0, 32(DX) 7823 MOVOU X0, 48(DX) 7824 MOVOU X0, 64(DX) 7825 MOVOU X0, 80(DX) 7826 MOVOU X0, 96(DX) 7827 MOVOU X0, 112(DX) 7828 ADDQ $0x80, DX 7829 DECQ CX 7830 JNZ zero_loop_encodeBetterBlockAsm12B 7831 MOVL $0x00000000, 12(SP) 7832 MOVQ src_len+32(FP), CX 7833 LEAQ -6(CX), DX 7834 LEAQ -8(CX), BX 7835 MOVL BX, 8(SP) 7836 SHRQ $0x05, CX 7837 SUBL CX, DX 7838 LEAQ (AX)(DX*1), DX 7839 MOVQ DX, (SP) 7840 MOVL $0x00000001, CX 7841 MOVL $0x00000000, 16(SP) 7842 MOVQ src_base+24(FP), DX 7843 7844 search_loop_encodeBetterBlockAsm12B: 7845 MOVL CX, BX 7846 SUBL 12(SP), BX 
7847 SHRL $0x06, BX 7848 LEAL 1(CX)(BX*1), BX 7849 CMPL BX, 8(SP) 7850 JAE emit_remainder_encodeBetterBlockAsm12B 7851 MOVQ (DX)(CX*1), SI 7852 MOVL BX, 20(SP) 7853 MOVQ $0x0000cf1bbcdcbf9b, R8 7854 MOVQ $0x9e3779b1, BX 7855 MOVQ SI, R9 7856 MOVQ SI, R10 7857 SHLQ $0x10, R9 7858 IMULQ R8, R9 7859 SHRQ $0x32, R9 7860 SHLQ $0x20, R10 7861 IMULQ BX, R10 7862 SHRQ $0x34, R10 7863 MOVL 24(SP)(R9*4), BX 7864 MOVL 65560(SP)(R10*4), DI 7865 MOVL CX, 24(SP)(R9*4) 7866 MOVL CX, 65560(SP)(R10*4) 7867 MOVQ (DX)(BX*1), R9 7868 MOVQ (DX)(DI*1), R10 7869 CMPQ R9, SI 7870 JEQ candidate_match_encodeBetterBlockAsm12B 7871 CMPQ R10, SI 7872 JNE no_short_found_encodeBetterBlockAsm12B 7873 MOVL DI, BX 7874 JMP candidate_match_encodeBetterBlockAsm12B 7875 7876 no_short_found_encodeBetterBlockAsm12B: 7877 CMPL R9, SI 7878 JEQ candidate_match_encodeBetterBlockAsm12B 7879 CMPL R10, SI 7880 JEQ candidateS_match_encodeBetterBlockAsm12B 7881 MOVL 20(SP), CX 7882 JMP search_loop_encodeBetterBlockAsm12B 7883 7884 candidateS_match_encodeBetterBlockAsm12B: 7885 SHRQ $0x08, SI 7886 MOVQ SI, R9 7887 SHLQ $0x10, R9 7888 IMULQ R8, R9 7889 SHRQ $0x32, R9 7890 MOVL 24(SP)(R9*4), BX 7891 INCL CX 7892 MOVL CX, 24(SP)(R9*4) 7893 CMPL (DX)(BX*1), SI 7894 JEQ candidate_match_encodeBetterBlockAsm12B 7895 DECL CX 7896 MOVL DI, BX 7897 7898 candidate_match_encodeBetterBlockAsm12B: 7899 MOVL 12(SP), SI 7900 TESTL BX, BX 7901 JZ match_extend_back_end_encodeBetterBlockAsm12B 7902 7903 match_extend_back_loop_encodeBetterBlockAsm12B: 7904 CMPL CX, SI 7905 JBE match_extend_back_end_encodeBetterBlockAsm12B 7906 MOVB -1(DX)(BX*1), DI 7907 MOVB -1(DX)(CX*1), R8 7908 CMPB DI, R8 7909 JNE match_extend_back_end_encodeBetterBlockAsm12B 7910 LEAL -1(CX), CX 7911 DECL BX 7912 JZ match_extend_back_end_encodeBetterBlockAsm12B 7913 JMP match_extend_back_loop_encodeBetterBlockAsm12B 7914 7915 match_extend_back_end_encodeBetterBlockAsm12B: 7916 MOVL CX, SI 7917 SUBL 12(SP), SI 7918 LEAQ 3(AX)(SI*1), SI 7919 CMPQ SI, (SP) 7920 JB 
match_dst_size_check_encodeBetterBlockAsm12B 7921 MOVQ $0x00000000, ret+48(FP) 7922 RET 7923 7924 match_dst_size_check_encodeBetterBlockAsm12B: 7925 MOVL CX, SI 7926 ADDL $0x04, CX 7927 ADDL $0x04, BX 7928 MOVQ src_len+32(FP), DI 7929 SUBL CX, DI 7930 LEAQ (DX)(CX*1), R8 7931 LEAQ (DX)(BX*1), R9 7932 7933 // matchLen 7934 XORL R11, R11 7935 CMPL DI, $0x08 7936 JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B 7937 7938 matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: 7939 MOVQ (R8)(R11*1), R10 7940 XORQ (R9)(R11*1), R10 7941 TESTQ R10, R10 7942 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B 7943 7944 #ifdef GOAMD64_v3 7945 TZCNTQ R10, R10 7946 7947 #else 7948 BSFQ R10, R10 7949 7950 #endif 7951 SARQ $0x03, R10 7952 LEAL (R11)(R10*1), R11 7953 JMP match_nolit_end_encodeBetterBlockAsm12B 7954 7955 matchlen_loop_match_nolit_encodeBetterBlockAsm12B: 7956 LEAL -8(DI), DI 7957 LEAL 8(R11), R11 7958 CMPL DI, $0x08 7959 JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B 7960 JZ match_nolit_end_encodeBetterBlockAsm12B 7961 7962 matchlen_match4_match_nolit_encodeBetterBlockAsm12B: 7963 CMPL DI, $0x04 7964 JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B 7965 MOVL (R8)(R11*1), R10 7966 CMPL (R9)(R11*1), R10 7967 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B 7968 SUBL $0x04, DI 7969 LEAL 4(R11), R11 7970 7971 matchlen_match2_match_nolit_encodeBetterBlockAsm12B: 7972 CMPL DI, $0x02 7973 JB matchlen_match1_match_nolit_encodeBetterBlockAsm12B 7974 MOVW (R8)(R11*1), R10 7975 CMPW (R9)(R11*1), R10 7976 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B 7977 SUBL $0x02, DI 7978 LEAL 2(R11), R11 7979 7980 matchlen_match1_match_nolit_encodeBetterBlockAsm12B: 7981 CMPL DI, $0x01 7982 JB match_nolit_end_encodeBetterBlockAsm12B 7983 MOVB (R8)(R11*1), R10 7984 CMPB (R9)(R11*1), R10 7985 JNE match_nolit_end_encodeBetterBlockAsm12B 7986 LEAL 1(R11), R11 7987 7988 match_nolit_end_encodeBetterBlockAsm12B: 7989 MOVL CX, DI 7990 SUBL BX, DI 7991 7992 
// Check if repeat 7993 CMPL 16(SP), DI 7994 JEQ match_is_repeat_encodeBetterBlockAsm12B 7995 MOVL DI, 16(SP) 7996 MOVL 12(SP), BX 7997 CMPL BX, SI 7998 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B 7999 MOVL SI, R8 8000 MOVL SI, 12(SP) 8001 LEAQ (DX)(BX*1), R9 8002 SUBL BX, R8 8003 LEAL -1(R8), BX 8004 CMPL BX, $0x3c 8005 JB one_byte_match_emit_encodeBetterBlockAsm12B 8006 CMPL BX, $0x00000100 8007 JB two_bytes_match_emit_encodeBetterBlockAsm12B 8008 JB three_bytes_match_emit_encodeBetterBlockAsm12B 8009 8010 three_bytes_match_emit_encodeBetterBlockAsm12B: 8011 MOVB $0xf4, (AX) 8012 MOVW BX, 1(AX) 8013 ADDQ $0x03, AX 8014 JMP memmove_long_match_emit_encodeBetterBlockAsm12B 8015 8016 two_bytes_match_emit_encodeBetterBlockAsm12B: 8017 MOVB $0xf0, (AX) 8018 MOVB BL, 1(AX) 8019 ADDQ $0x02, AX 8020 CMPL BX, $0x40 8021 JB memmove_match_emit_encodeBetterBlockAsm12B 8022 JMP memmove_long_match_emit_encodeBetterBlockAsm12B 8023 8024 one_byte_match_emit_encodeBetterBlockAsm12B: 8025 SHLB $0x02, BL 8026 MOVB BL, (AX) 8027 ADDQ $0x01, AX 8028 8029 memmove_match_emit_encodeBetterBlockAsm12B: 8030 LEAQ (AX)(R8*1), BX 8031 8032 // genMemMoveShort 8033 CMPQ R8, $0x04 8034 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 8035 CMPQ R8, $0x08 8036 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 8037 CMPQ R8, $0x10 8038 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 8039 CMPQ R8, $0x20 8040 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 8041 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 8042 8043 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: 8044 MOVL (R9), R10 8045 MOVL R10, (AX) 8046 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B 8047 8048 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: 8049 MOVL (R9), R10 8050 MOVL -4(R9)(R8*1), R9 8051 MOVL R10, (AX) 8052 MOVL 
R9, -4(AX)(R8*1) 8053 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B 8054 8055 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: 8056 MOVQ (R9), R10 8057 MOVQ -8(R9)(R8*1), R9 8058 MOVQ R10, (AX) 8059 MOVQ R9, -8(AX)(R8*1) 8060 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B 8061 8062 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: 8063 MOVOU (R9), X0 8064 MOVOU -16(R9)(R8*1), X1 8065 MOVOU X0, (AX) 8066 MOVOU X1, -16(AX)(R8*1) 8067 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B 8068 8069 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: 8070 MOVOU (R9), X0 8071 MOVOU 16(R9), X1 8072 MOVOU -32(R9)(R8*1), X2 8073 MOVOU -16(R9)(R8*1), X3 8074 MOVOU X0, (AX) 8075 MOVOU X1, 16(AX) 8076 MOVOU X2, -32(AX)(R8*1) 8077 MOVOU X3, -16(AX)(R8*1) 8078 8079 memmove_end_copy_match_emit_encodeBetterBlockAsm12B: 8080 MOVQ BX, AX 8081 JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B 8082 8083 memmove_long_match_emit_encodeBetterBlockAsm12B: 8084 LEAQ (AX)(R8*1), BX 8085 8086 // genMemMoveLong 8087 MOVOU (R9), X0 8088 MOVOU 16(R9), X1 8089 MOVOU -32(R9)(R8*1), X2 8090 MOVOU -16(R9)(R8*1), X3 8091 MOVQ R8, R12 8092 SHRQ $0x05, R12 8093 MOVQ AX, R10 8094 ANDL $0x0000001f, R10 8095 MOVQ $0x00000040, R13 8096 SUBQ R10, R13 8097 DECQ R12 8098 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 8099 LEAQ -32(R9)(R13*1), R10 8100 LEAQ -32(AX)(R13*1), R14 8101 8102 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: 8103 MOVOU (R10), X4 8104 MOVOU 16(R10), X5 8105 MOVOA X4, (R14) 8106 MOVOA X5, 16(R14) 8107 ADDQ $0x20, R14 8108 ADDQ $0x20, R10 8109 ADDQ $0x20, R13 8110 DECQ R12 8111 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back 8112 8113 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: 8114 MOVOU -32(R9)(R13*1), X4 8115 MOVOU -16(R9)(R13*1), X5 8116 
MOVOA X4, -32(AX)(R13*1) 8117 MOVOA X5, -16(AX)(R13*1) 8118 ADDQ $0x20, R13 8119 CMPQ R8, R13 8120 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 8121 MOVOU X0, (AX) 8122 MOVOU X1, 16(AX) 8123 MOVOU X2, -32(AX)(R8*1) 8124 MOVOU X3, -16(AX)(R8*1) 8125 MOVQ BX, AX 8126 8127 emit_literal_done_match_emit_encodeBetterBlockAsm12B: 8128 ADDL R11, CX 8129 ADDL $0x04, R11 8130 MOVL CX, 12(SP) 8131 8132 // emitCopy 8133 CMPL R11, $0x40 8134 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B 8135 CMPL DI, $0x00000800 8136 JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B 8137 MOVL $0x00000001, BX 8138 LEAL 16(BX), BX 8139 MOVB DI, 1(AX) 8140 SHRL $0x08, DI 8141 SHLL $0x05, DI 8142 ORL DI, BX 8143 MOVB BL, (AX) 8144 ADDQ $0x02, AX 8145 SUBL $0x08, R11 8146 8147 // emitRepeat 8148 LEAL -4(R11), R11 8149 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b 8150 MOVL R11, BX 8151 LEAL -4(R11), R11 8152 CMPL BX, $0x08 8153 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b 8154 CMPL BX, $0x0c 8155 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b 8156 CMPL DI, $0x00000800 8157 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b 8158 8159 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: 8160 CMPL R11, $0x00000104 8161 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b 8162 LEAL -256(R11), R11 8163 MOVW $0x0019, (AX) 8164 MOVW R11, 2(AX) 8165 ADDQ $0x04, AX 8166 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8167 8168 repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: 8169 LEAL -4(R11), R11 8170 MOVW $0x0015, (AX) 8171 MOVB R11, 2(AX) 8172 ADDQ $0x03, AX 8173 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8174 8175 repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: 8176 SHLL $0x02, R11 8177 ORL $0x01, R11 8178 MOVW R11, (AX) 8179 
ADDQ $0x02, AX 8180 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8181 8182 repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: 8183 XORQ BX, BX 8184 LEAL 1(BX)(R11*4), R11 8185 MOVB DI, 1(AX) 8186 SARL $0x08, DI 8187 SHLL $0x05, DI 8188 ORL DI, R11 8189 MOVB R11, (AX) 8190 ADDQ $0x02, AX 8191 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8192 8193 long_offset_short_match_nolit_encodeBetterBlockAsm12B: 8194 MOVB $0xee, (AX) 8195 MOVW DI, 1(AX) 8196 LEAL -60(R11), R11 8197 ADDQ $0x03, AX 8198 8199 // emitRepeat 8200 MOVL R11, BX 8201 LEAL -4(R11), R11 8202 CMPL BX, $0x08 8203 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short 8204 CMPL BX, $0x0c 8205 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short 8206 CMPL DI, $0x00000800 8207 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short 8208 8209 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: 8210 CMPL R11, $0x00000104 8211 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short 8212 LEAL -256(R11), R11 8213 MOVW $0x0019, (AX) 8214 MOVW R11, 2(AX) 8215 ADDQ $0x04, AX 8216 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8217 8218 repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: 8219 LEAL -4(R11), R11 8220 MOVW $0x0015, (AX) 8221 MOVB R11, 2(AX) 8222 ADDQ $0x03, AX 8223 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8224 8225 repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: 8226 SHLL $0x02, R11 8227 ORL $0x01, R11 8228 MOVW R11, (AX) 8229 ADDQ $0x02, AX 8230 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8231 8232 repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: 8233 XORQ BX, BX 8234 LEAL 1(BX)(R11*4), R11 8235 MOVB DI, 1(AX) 8236 SARL $0x08, DI 8237 SHLL $0x05, DI 8238 ORL DI, R11 8239 MOVB R11, (AX) 8240 ADDQ $0x02, AX 8241 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8242 8243 
two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: 8244 MOVL R11, BX 8245 SHLL $0x02, BX 8246 CMPL R11, $0x0c 8247 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B 8248 CMPL DI, $0x00000800 8249 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B 8250 LEAL -15(BX), BX 8251 MOVB DI, 1(AX) 8252 SHRL $0x08, DI 8253 SHLL $0x05, DI 8254 ORL DI, BX 8255 MOVB BL, (AX) 8256 ADDQ $0x02, AX 8257 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8258 8259 emit_copy_three_match_nolit_encodeBetterBlockAsm12B: 8260 LEAL -2(BX), BX 8261 MOVB BL, (AX) 8262 MOVW DI, 1(AX) 8263 ADDQ $0x03, AX 8264 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8265 8266 match_is_repeat_encodeBetterBlockAsm12B: 8267 MOVL 12(SP), BX 8268 CMPL BX, SI 8269 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B 8270 MOVL SI, R8 8271 MOVL SI, 12(SP) 8272 LEAQ (DX)(BX*1), R9 8273 SUBL BX, R8 8274 LEAL -1(R8), BX 8275 CMPL BX, $0x3c 8276 JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B 8277 CMPL BX, $0x00000100 8278 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B 8279 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B 8280 8281 three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: 8282 MOVB $0xf4, (AX) 8283 MOVW BX, 1(AX) 8284 ADDQ $0x03, AX 8285 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B 8286 8287 two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: 8288 MOVB $0xf0, (AX) 8289 MOVB BL, 1(AX) 8290 ADDQ $0x02, AX 8291 CMPL BX, $0x40 8292 JB memmove_match_emit_repeat_encodeBetterBlockAsm12B 8293 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B 8294 8295 one_byte_match_emit_repeat_encodeBetterBlockAsm12B: 8296 SHLB $0x02, BL 8297 MOVB BL, (AX) 8298 ADDQ $0x01, AX 8299 8300 memmove_match_emit_repeat_encodeBetterBlockAsm12B: 8301 LEAQ (AX)(R8*1), BX 8302 8303 // genMemMoveShort 8304 CMPQ R8, $0x04 8305 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 8306 CMPQ R8, $0x08 8307 JB 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 8308 CMPQ R8, $0x10 8309 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 8310 CMPQ R8, $0x20 8311 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 8312 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 8313 8314 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: 8315 MOVL (R9), R10 8316 MOVL R10, (AX) 8317 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B 8318 8319 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: 8320 MOVL (R9), R10 8321 MOVL -4(R9)(R8*1), R9 8322 MOVL R10, (AX) 8323 MOVL R9, -4(AX)(R8*1) 8324 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B 8325 8326 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: 8327 MOVQ (R9), R10 8328 MOVQ -8(R9)(R8*1), R9 8329 MOVQ R10, (AX) 8330 MOVQ R9, -8(AX)(R8*1) 8331 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B 8332 8333 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: 8334 MOVOU (R9), X0 8335 MOVOU -16(R9)(R8*1), X1 8336 MOVOU X0, (AX) 8337 MOVOU X1, -16(AX)(R8*1) 8338 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B 8339 8340 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: 8341 MOVOU (R9), X0 8342 MOVOU 16(R9), X1 8343 MOVOU -32(R9)(R8*1), X2 8344 MOVOU -16(R9)(R8*1), X3 8345 MOVOU X0, (AX) 8346 MOVOU X1, 16(AX) 8347 MOVOU X2, -32(AX)(R8*1) 8348 MOVOU X3, -16(AX)(R8*1) 8349 8350 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: 8351 MOVQ BX, AX 8352 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B 8353 8354 memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: 8355 LEAQ (AX)(R8*1), BX 8356 8357 // genMemMoveLong 8358 MOVOU (R9), X0 8359 MOVOU 16(R9), X1 8360 MOVOU 
-32(R9)(R8*1), X2 8361 MOVOU -16(R9)(R8*1), X3 8362 MOVQ R8, R12 8363 SHRQ $0x05, R12 8364 MOVQ AX, R10 8365 ANDL $0x0000001f, R10 8366 MOVQ $0x00000040, R13 8367 SUBQ R10, R13 8368 DECQ R12 8369 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 8370 LEAQ -32(R9)(R13*1), R10 8371 LEAQ -32(AX)(R13*1), R14 8372 8373 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: 8374 MOVOU (R10), X4 8375 MOVOU 16(R10), X5 8376 MOVOA X4, (R14) 8377 MOVOA X5, 16(R14) 8378 ADDQ $0x20, R14 8379 ADDQ $0x20, R10 8380 ADDQ $0x20, R13 8381 DECQ R12 8382 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back 8383 8384 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: 8385 MOVOU -32(R9)(R13*1), X4 8386 MOVOU -16(R9)(R13*1), X5 8387 MOVOA X4, -32(AX)(R13*1) 8388 MOVOA X5, -16(AX)(R13*1) 8389 ADDQ $0x20, R13 8390 CMPQ R8, R13 8391 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 8392 MOVOU X0, (AX) 8393 MOVOU X1, 16(AX) 8394 MOVOU X2, -32(AX)(R8*1) 8395 MOVOU X3, -16(AX)(R8*1) 8396 MOVQ BX, AX 8397 8398 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: 8399 ADDL R11, CX 8400 ADDL $0x04, R11 8401 MOVL CX, 12(SP) 8402 8403 // emitRepeat 8404 MOVL R11, BX 8405 LEAL -4(R11), R11 8406 CMPL BX, $0x08 8407 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B 8408 CMPL BX, $0x0c 8409 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B 8410 CMPL DI, $0x00000800 8411 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B 8412 8413 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: 8414 CMPL R11, $0x00000104 8415 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B 8416 LEAL -256(R11), R11 8417 MOVW $0x0019, (AX) 8418 MOVW R11, 2(AX) 8419 ADDQ $0x04, AX 8420 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8421 8422 
repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: 8423 LEAL -4(R11), R11 8424 MOVW $0x0015, (AX) 8425 MOVB R11, 2(AX) 8426 ADDQ $0x03, AX 8427 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8428 8429 repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: 8430 SHLL $0x02, R11 8431 ORL $0x01, R11 8432 MOVW R11, (AX) 8433 ADDQ $0x02, AX 8434 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B 8435 8436 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: 8437 XORQ BX, BX 8438 LEAL 1(BX)(R11*4), R11 8439 MOVB DI, 1(AX) 8440 SARL $0x08, DI 8441 SHLL $0x05, DI 8442 ORL DI, R11 8443 MOVB R11, (AX) 8444 ADDQ $0x02, AX 8445 8446 match_nolit_emitcopy_end_encodeBetterBlockAsm12B: 8447 CMPL CX, 8(SP) 8448 JAE emit_remainder_encodeBetterBlockAsm12B 8449 CMPQ AX, (SP) 8450 JB match_nolit_dst_ok_encodeBetterBlockAsm12B 8451 MOVQ $0x00000000, ret+48(FP) 8452 RET 8453 8454 match_nolit_dst_ok_encodeBetterBlockAsm12B: 8455 MOVQ $0x0000cf1bbcdcbf9b, BX 8456 MOVQ $0x9e3779b1, DI 8457 LEAQ 1(SI), SI 8458 LEAQ -2(CX), R8 8459 MOVQ (DX)(SI*1), R9 8460 MOVQ 1(DX)(SI*1), R10 8461 MOVQ (DX)(R8*1), R11 8462 MOVQ 1(DX)(R8*1), R12 8463 SHLQ $0x10, R9 8464 IMULQ BX, R9 8465 SHRQ $0x32, R9 8466 SHLQ $0x20, R10 8467 IMULQ DI, R10 8468 SHRQ $0x34, R10 8469 SHLQ $0x10, R11 8470 IMULQ BX, R11 8471 SHRQ $0x32, R11 8472 SHLQ $0x20, R12 8473 IMULQ DI, R12 8474 SHRQ $0x34, R12 8475 LEAQ 1(SI), DI 8476 LEAQ 1(R8), R13 8477 MOVL SI, 24(SP)(R9*4) 8478 MOVL R8, 24(SP)(R11*4) 8479 MOVL DI, 65560(SP)(R10*4) 8480 MOVL R13, 65560(SP)(R12*4) 8481 ADDQ $0x01, SI 8482 SUBQ $0x01, R8 8483 8484 index_loop_encodeBetterBlockAsm12B: 8485 CMPQ SI, R8 8486 JAE search_loop_encodeBetterBlockAsm12B 8487 MOVQ (DX)(SI*1), DI 8488 MOVQ (DX)(R8*1), R9 8489 SHLQ $0x10, DI 8490 IMULQ BX, DI 8491 SHRQ $0x32, DI 8492 SHLQ $0x10, R9 8493 IMULQ BX, R9 8494 SHRQ $0x32, R9 8495 MOVL SI, 24(SP)(DI*4) 8496 MOVL R8, 24(SP)(R9*4) 8497 ADDQ $0x02, SI 8498 SUBQ $0x02, R8 8499 JMP index_loop_encodeBetterBlockAsm12B 8500 
8501 emit_remainder_encodeBetterBlockAsm12B: 8502 MOVQ src_len+32(FP), CX 8503 SUBL 12(SP), CX 8504 LEAQ 3(AX)(CX*1), CX 8505 CMPQ CX, (SP) 8506 JB emit_remainder_ok_encodeBetterBlockAsm12B 8507 MOVQ $0x00000000, ret+48(FP) 8508 RET 8509 8510 emit_remainder_ok_encodeBetterBlockAsm12B: 8511 MOVQ src_len+32(FP), CX 8512 MOVL 12(SP), BX 8513 CMPL BX, CX 8514 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B 8515 MOVL CX, SI 8516 MOVL CX, 12(SP) 8517 LEAQ (DX)(BX*1), CX 8518 SUBL BX, SI 8519 LEAL -1(SI), DX 8520 CMPL DX, $0x3c 8521 JB one_byte_emit_remainder_encodeBetterBlockAsm12B 8522 CMPL DX, $0x00000100 8523 JB two_bytes_emit_remainder_encodeBetterBlockAsm12B 8524 JB three_bytes_emit_remainder_encodeBetterBlockAsm12B 8525 8526 three_bytes_emit_remainder_encodeBetterBlockAsm12B: 8527 MOVB $0xf4, (AX) 8528 MOVW DX, 1(AX) 8529 ADDQ $0x03, AX 8530 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B 8531 8532 two_bytes_emit_remainder_encodeBetterBlockAsm12B: 8533 MOVB $0xf0, (AX) 8534 MOVB DL, 1(AX) 8535 ADDQ $0x02, AX 8536 CMPL DX, $0x40 8537 JB memmove_emit_remainder_encodeBetterBlockAsm12B 8538 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B 8539 8540 one_byte_emit_remainder_encodeBetterBlockAsm12B: 8541 SHLB $0x02, DL 8542 MOVB DL, (AX) 8543 ADDQ $0x01, AX 8544 8545 memmove_emit_remainder_encodeBetterBlockAsm12B: 8546 LEAQ (AX)(SI*1), DX 8547 MOVL SI, BX 8548 8549 // genMemMoveShort 8550 CMPQ BX, $0x03 8551 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 8552 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 8553 CMPQ BX, $0x08 8554 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 8555 CMPQ BX, $0x10 8556 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 8557 CMPQ BX, $0x20 8558 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 8559 JMP 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 8560 8561 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: 8562 MOVB (CX), SI 8563 MOVB -1(CX)(BX*1), CL 8564 MOVB SI, (AX) 8565 MOVB CL, -1(AX)(BX*1) 8566 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B 8567 8568 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: 8569 MOVW (CX), SI 8570 MOVB 2(CX), CL 8571 MOVW SI, (AX) 8572 MOVB CL, 2(AX) 8573 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B 8574 8575 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: 8576 MOVL (CX), SI 8577 MOVL -4(CX)(BX*1), CX 8578 MOVL SI, (AX) 8579 MOVL CX, -4(AX)(BX*1) 8580 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B 8581 8582 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: 8583 MOVQ (CX), SI 8584 MOVQ -8(CX)(BX*1), CX 8585 MOVQ SI, (AX) 8586 MOVQ CX, -8(AX)(BX*1) 8587 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B 8588 8589 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: 8590 MOVOU (CX), X0 8591 MOVOU -16(CX)(BX*1), X1 8592 MOVOU X0, (AX) 8593 MOVOU X1, -16(AX)(BX*1) 8594 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B 8595 8596 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: 8597 MOVOU (CX), X0 8598 MOVOU 16(CX), X1 8599 MOVOU -32(CX)(BX*1), X2 8600 MOVOU -16(CX)(BX*1), X3 8601 MOVOU X0, (AX) 8602 MOVOU X1, 16(AX) 8603 MOVOU X2, -32(AX)(BX*1) 8604 MOVOU X3, -16(AX)(BX*1) 8605 8606 memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: 8607 MOVQ DX, AX 8608 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B 8609 8610 memmove_long_emit_remainder_encodeBetterBlockAsm12B: 8611 LEAQ (AX)(SI*1), DX 8612 MOVL SI, BX 8613 8614 // genMemMoveLong 8615 MOVOU (CX), X0 8616 MOVOU 16(CX), X1 8617 MOVOU -32(CX)(BX*1), X2 8618 MOVOU -16(CX)(BX*1), X3 8619 MOVQ BX, DI 8620 SHRQ 
$0x05, DI 8621 MOVQ AX, SI 8622 ANDL $0x0000001f, SI 8623 MOVQ $0x00000040, R8 8624 SUBQ SI, R8 8625 DECQ DI 8626 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 8627 LEAQ -32(CX)(R8*1), SI 8628 LEAQ -32(AX)(R8*1), R9 8629 8630 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: 8631 MOVOU (SI), X4 8632 MOVOU 16(SI), X5 8633 MOVOA X4, (R9) 8634 MOVOA X5, 16(R9) 8635 ADDQ $0x20, R9 8636 ADDQ $0x20, SI 8637 ADDQ $0x20, R8 8638 DECQ DI 8639 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back 8640 8641 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: 8642 MOVOU -32(CX)(R8*1), X4 8643 MOVOU -16(CX)(R8*1), X5 8644 MOVOA X4, -32(AX)(R8*1) 8645 MOVOA X5, -16(AX)(R8*1) 8646 ADDQ $0x20, R8 8647 CMPQ BX, R8 8648 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 8649 MOVOU X0, (AX) 8650 MOVOU X1, 16(AX) 8651 MOVOU X2, -32(AX)(BX*1) 8652 MOVOU X3, -16(AX)(BX*1) 8653 MOVQ DX, AX 8654 8655 emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: 8656 MOVQ dst_base+0(FP), CX 8657 SUBQ CX, AX 8658 MOVQ AX, ret+48(FP) 8659 RET 8660 8661 // func encodeBetterBlockAsm10B(dst []byte, src []byte) int 8662 // Requires: BMI, SSE2 8663 TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 8664 MOVQ dst_base+0(FP), AX 8665 MOVQ $0x000000a0, CX 8666 LEAQ 24(SP), DX 8667 PXOR X0, X0 8668 8669 zero_loop_encodeBetterBlockAsm10B: 8670 MOVOU X0, (DX) 8671 MOVOU X0, 16(DX) 8672 MOVOU X0, 32(DX) 8673 MOVOU X0, 48(DX) 8674 MOVOU X0, 64(DX) 8675 MOVOU X0, 80(DX) 8676 MOVOU X0, 96(DX) 8677 MOVOU X0, 112(DX) 8678 ADDQ $0x80, DX 8679 DECQ CX 8680 JNZ zero_loop_encodeBetterBlockAsm10B 8681 MOVL $0x00000000, 12(SP) 8682 MOVQ src_len+32(FP), CX 8683 LEAQ -6(CX), DX 8684 LEAQ -8(CX), BX 8685 MOVL BX, 8(SP) 8686 SHRQ $0x05, CX 8687 SUBL CX, DX 8688 LEAQ (AX)(DX*1), DX 8689 MOVQ DX, (SP) 8690 MOVL $0x00000001, CX 8691 MOVL $0x00000000, 
16(SP) 8692 MOVQ src_base+24(FP), DX 8693 8694 search_loop_encodeBetterBlockAsm10B: 8695 MOVL CX, BX 8696 SUBL 12(SP), BX 8697 SHRL $0x05, BX 8698 LEAL 1(CX)(BX*1), BX 8699 CMPL BX, 8(SP) 8700 JAE emit_remainder_encodeBetterBlockAsm10B 8701 MOVQ (DX)(CX*1), SI 8702 MOVL BX, 20(SP) 8703 MOVQ $0x0000cf1bbcdcbf9b, R8 8704 MOVQ $0x9e3779b1, BX 8705 MOVQ SI, R9 8706 MOVQ SI, R10 8707 SHLQ $0x10, R9 8708 IMULQ R8, R9 8709 SHRQ $0x34, R9 8710 SHLQ $0x20, R10 8711 IMULQ BX, R10 8712 SHRQ $0x36, R10 8713 MOVL 24(SP)(R9*4), BX 8714 MOVL 16408(SP)(R10*4), DI 8715 MOVL CX, 24(SP)(R9*4) 8716 MOVL CX, 16408(SP)(R10*4) 8717 MOVQ (DX)(BX*1), R9 8718 MOVQ (DX)(DI*1), R10 8719 CMPQ R9, SI 8720 JEQ candidate_match_encodeBetterBlockAsm10B 8721 CMPQ R10, SI 8722 JNE no_short_found_encodeBetterBlockAsm10B 8723 MOVL DI, BX 8724 JMP candidate_match_encodeBetterBlockAsm10B 8725 8726 no_short_found_encodeBetterBlockAsm10B: 8727 CMPL R9, SI 8728 JEQ candidate_match_encodeBetterBlockAsm10B 8729 CMPL R10, SI 8730 JEQ candidateS_match_encodeBetterBlockAsm10B 8731 MOVL 20(SP), CX 8732 JMP search_loop_encodeBetterBlockAsm10B 8733 8734 candidateS_match_encodeBetterBlockAsm10B: 8735 SHRQ $0x08, SI 8736 MOVQ SI, R9 8737 SHLQ $0x10, R9 8738 IMULQ R8, R9 8739 SHRQ $0x34, R9 8740 MOVL 24(SP)(R9*4), BX 8741 INCL CX 8742 MOVL CX, 24(SP)(R9*4) 8743 CMPL (DX)(BX*1), SI 8744 JEQ candidate_match_encodeBetterBlockAsm10B 8745 DECL CX 8746 MOVL DI, BX 8747 8748 candidate_match_encodeBetterBlockAsm10B: 8749 MOVL 12(SP), SI 8750 TESTL BX, BX 8751 JZ match_extend_back_end_encodeBetterBlockAsm10B 8752 8753 match_extend_back_loop_encodeBetterBlockAsm10B: 8754 CMPL CX, SI 8755 JBE match_extend_back_end_encodeBetterBlockAsm10B 8756 MOVB -1(DX)(BX*1), DI 8757 MOVB -1(DX)(CX*1), R8 8758 CMPB DI, R8 8759 JNE match_extend_back_end_encodeBetterBlockAsm10B 8760 LEAL -1(CX), CX 8761 DECL BX 8762 JZ match_extend_back_end_encodeBetterBlockAsm10B 8763 JMP match_extend_back_loop_encodeBetterBlockAsm10B 8764 8765 
match_extend_back_end_encodeBetterBlockAsm10B: 8766 MOVL CX, SI 8767 SUBL 12(SP), SI 8768 LEAQ 3(AX)(SI*1), SI 8769 CMPQ SI, (SP) 8770 JB match_dst_size_check_encodeBetterBlockAsm10B 8771 MOVQ $0x00000000, ret+48(FP) 8772 RET 8773 8774 match_dst_size_check_encodeBetterBlockAsm10B: 8775 MOVL CX, SI 8776 ADDL $0x04, CX 8777 ADDL $0x04, BX 8778 MOVQ src_len+32(FP), DI 8779 SUBL CX, DI 8780 LEAQ (DX)(CX*1), R8 8781 LEAQ (DX)(BX*1), R9 8782 8783 // matchLen 8784 XORL R11, R11 8785 CMPL DI, $0x08 8786 JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B 8787 8788 matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: 8789 MOVQ (R8)(R11*1), R10 8790 XORQ (R9)(R11*1), R10 8791 TESTQ R10, R10 8792 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B 8793 8794 #ifdef GOAMD64_v3 8795 TZCNTQ R10, R10 8796 8797 #else 8798 BSFQ R10, R10 8799 8800 #endif 8801 SARQ $0x03, R10 8802 LEAL (R11)(R10*1), R11 8803 JMP match_nolit_end_encodeBetterBlockAsm10B 8804 8805 matchlen_loop_match_nolit_encodeBetterBlockAsm10B: 8806 LEAL -8(DI), DI 8807 LEAL 8(R11), R11 8808 CMPL DI, $0x08 8809 JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B 8810 JZ match_nolit_end_encodeBetterBlockAsm10B 8811 8812 matchlen_match4_match_nolit_encodeBetterBlockAsm10B: 8813 CMPL DI, $0x04 8814 JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B 8815 MOVL (R8)(R11*1), R10 8816 CMPL (R9)(R11*1), R10 8817 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B 8818 SUBL $0x04, DI 8819 LEAL 4(R11), R11 8820 8821 matchlen_match2_match_nolit_encodeBetterBlockAsm10B: 8822 CMPL DI, $0x02 8823 JB matchlen_match1_match_nolit_encodeBetterBlockAsm10B 8824 MOVW (R8)(R11*1), R10 8825 CMPW (R9)(R11*1), R10 8826 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B 8827 SUBL $0x02, DI 8828 LEAL 2(R11), R11 8829 8830 matchlen_match1_match_nolit_encodeBetterBlockAsm10B: 8831 CMPL DI, $0x01 8832 JB match_nolit_end_encodeBetterBlockAsm10B 8833 MOVB (R8)(R11*1), R10 8834 CMPB (R9)(R11*1), R10 8835 JNE 
match_nolit_end_encodeBetterBlockAsm10B 8836 LEAL 1(R11), R11 8837 8838 match_nolit_end_encodeBetterBlockAsm10B: 8839 MOVL CX, DI 8840 SUBL BX, DI 8841 8842 // Check if repeat 8843 CMPL 16(SP), DI 8844 JEQ match_is_repeat_encodeBetterBlockAsm10B 8845 MOVL DI, 16(SP) 8846 MOVL 12(SP), BX 8847 CMPL BX, SI 8848 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B 8849 MOVL SI, R8 8850 MOVL SI, 12(SP) 8851 LEAQ (DX)(BX*1), R9 8852 SUBL BX, R8 8853 LEAL -1(R8), BX 8854 CMPL BX, $0x3c 8855 JB one_byte_match_emit_encodeBetterBlockAsm10B 8856 CMPL BX, $0x00000100 8857 JB two_bytes_match_emit_encodeBetterBlockAsm10B 8858 JB three_bytes_match_emit_encodeBetterBlockAsm10B 8859 8860 three_bytes_match_emit_encodeBetterBlockAsm10B: 8861 MOVB $0xf4, (AX) 8862 MOVW BX, 1(AX) 8863 ADDQ $0x03, AX 8864 JMP memmove_long_match_emit_encodeBetterBlockAsm10B 8865 8866 two_bytes_match_emit_encodeBetterBlockAsm10B: 8867 MOVB $0xf0, (AX) 8868 MOVB BL, 1(AX) 8869 ADDQ $0x02, AX 8870 CMPL BX, $0x40 8871 JB memmove_match_emit_encodeBetterBlockAsm10B 8872 JMP memmove_long_match_emit_encodeBetterBlockAsm10B 8873 8874 one_byte_match_emit_encodeBetterBlockAsm10B: 8875 SHLB $0x02, BL 8876 MOVB BL, (AX) 8877 ADDQ $0x01, AX 8878 8879 memmove_match_emit_encodeBetterBlockAsm10B: 8880 LEAQ (AX)(R8*1), BX 8881 8882 // genMemMoveShort 8883 CMPQ R8, $0x04 8884 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 8885 CMPQ R8, $0x08 8886 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 8887 CMPQ R8, $0x10 8888 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 8889 CMPQ R8, $0x20 8890 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 8891 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 8892 8893 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: 8894 MOVL (R9), R10 8895 MOVL R10, (AX) 8896 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B 8897 
8898 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: 8899 MOVL (R9), R10 8900 MOVL -4(R9)(R8*1), R9 8901 MOVL R10, (AX) 8902 MOVL R9, -4(AX)(R8*1) 8903 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B 8904 8905 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: 8906 MOVQ (R9), R10 8907 MOVQ -8(R9)(R8*1), R9 8908 MOVQ R10, (AX) 8909 MOVQ R9, -8(AX)(R8*1) 8910 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B 8911 8912 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: 8913 MOVOU (R9), X0 8914 MOVOU -16(R9)(R8*1), X1 8915 MOVOU X0, (AX) 8916 MOVOU X1, -16(AX)(R8*1) 8917 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B 8918 8919 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: 8920 MOVOU (R9), X0 8921 MOVOU 16(R9), X1 8922 MOVOU -32(R9)(R8*1), X2 8923 MOVOU -16(R9)(R8*1), X3 8924 MOVOU X0, (AX) 8925 MOVOU X1, 16(AX) 8926 MOVOU X2, -32(AX)(R8*1) 8927 MOVOU X3, -16(AX)(R8*1) 8928 8929 memmove_end_copy_match_emit_encodeBetterBlockAsm10B: 8930 MOVQ BX, AX 8931 JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B 8932 8933 memmove_long_match_emit_encodeBetterBlockAsm10B: 8934 LEAQ (AX)(R8*1), BX 8935 8936 // genMemMoveLong 8937 MOVOU (R9), X0 8938 MOVOU 16(R9), X1 8939 MOVOU -32(R9)(R8*1), X2 8940 MOVOU -16(R9)(R8*1), X3 8941 MOVQ R8, R12 8942 SHRQ $0x05, R12 8943 MOVQ AX, R10 8944 ANDL $0x0000001f, R10 8945 MOVQ $0x00000040, R13 8946 SUBQ R10, R13 8947 DECQ R12 8948 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 8949 LEAQ -32(R9)(R13*1), R10 8950 LEAQ -32(AX)(R13*1), R14 8951 8952 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: 8953 MOVOU (R10), X4 8954 MOVOU 16(R10), X5 8955 MOVOA X4, (R14) 8956 MOVOA X5, 16(R14) 8957 ADDQ $0x20, R14 8958 ADDQ $0x20, R10 8959 ADDQ $0x20, R13 8960 DECQ R12 8961 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back 
8962 8963 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: 8964 MOVOU -32(R9)(R13*1), X4 8965 MOVOU -16(R9)(R13*1), X5 8966 MOVOA X4, -32(AX)(R13*1) 8967 MOVOA X5, -16(AX)(R13*1) 8968 ADDQ $0x20, R13 8969 CMPQ R8, R13 8970 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 8971 MOVOU X0, (AX) 8972 MOVOU X1, 16(AX) 8973 MOVOU X2, -32(AX)(R8*1) 8974 MOVOU X3, -16(AX)(R8*1) 8975 MOVQ BX, AX 8976 8977 emit_literal_done_match_emit_encodeBetterBlockAsm10B: 8978 ADDL R11, CX 8979 ADDL $0x04, R11 8980 MOVL CX, 12(SP) 8981 8982 // emitCopy 8983 CMPL R11, $0x40 8984 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B 8985 CMPL DI, $0x00000800 8986 JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B 8987 MOVL $0x00000001, BX 8988 LEAL 16(BX), BX 8989 MOVB DI, 1(AX) 8990 SHRL $0x08, DI 8991 SHLL $0x05, DI 8992 ORL DI, BX 8993 MOVB BL, (AX) 8994 ADDQ $0x02, AX 8995 SUBL $0x08, R11 8996 8997 // emitRepeat 8998 LEAL -4(R11), R11 8999 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b 9000 MOVL R11, BX 9001 LEAL -4(R11), R11 9002 CMPL BX, $0x08 9003 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b 9004 CMPL BX, $0x0c 9005 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b 9006 CMPL DI, $0x00000800 9007 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b 9008 9009 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: 9010 CMPL R11, $0x00000104 9011 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b 9012 LEAL -256(R11), R11 9013 MOVW $0x0019, (AX) 9014 MOVW R11, 2(AX) 9015 ADDQ $0x04, AX 9016 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9017 9018 repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: 9019 LEAL -4(R11), R11 9020 MOVW $0x0015, (AX) 9021 MOVB R11, 2(AX) 9022 ADDQ $0x03, AX 9023 JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9024 9025 repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: 9026 SHLL $0x02, R11 9027 ORL $0x01, R11 9028 MOVW R11, (AX) 9029 ADDQ $0x02, AX 9030 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9031 9032 repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: 9033 XORQ BX, BX 9034 LEAL 1(BX)(R11*4), R11 9035 MOVB DI, 1(AX) 9036 SARL $0x08, DI 9037 SHLL $0x05, DI 9038 ORL DI, R11 9039 MOVB R11, (AX) 9040 ADDQ $0x02, AX 9041 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9042 9043 long_offset_short_match_nolit_encodeBetterBlockAsm10B: 9044 MOVB $0xee, (AX) 9045 MOVW DI, 1(AX) 9046 LEAL -60(R11), R11 9047 ADDQ $0x03, AX 9048 9049 // emitRepeat 9050 MOVL R11, BX 9051 LEAL -4(R11), R11 9052 CMPL BX, $0x08 9053 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short 9054 CMPL BX, $0x0c 9055 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short 9056 CMPL DI, $0x00000800 9057 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short 9058 9059 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: 9060 CMPL R11, $0x00000104 9061 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short 9062 LEAL -256(R11), R11 9063 MOVW $0x0019, (AX) 9064 MOVW R11, 2(AX) 9065 ADDQ $0x04, AX 9066 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9067 9068 repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: 9069 LEAL -4(R11), R11 9070 MOVW $0x0015, (AX) 9071 MOVB R11, 2(AX) 9072 ADDQ $0x03, AX 9073 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9074 9075 repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: 9076 SHLL $0x02, R11 9077 ORL $0x01, R11 9078 MOVW R11, (AX) 9079 ADDQ $0x02, AX 9080 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9081 9082 repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: 9083 XORQ BX, BX 9084 LEAL 1(BX)(R11*4), R11 9085 MOVB DI, 1(AX) 
9086 SARL $0x08, DI 9087 SHLL $0x05, DI 9088 ORL DI, R11 9089 MOVB R11, (AX) 9090 ADDQ $0x02, AX 9091 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9092 9093 two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: 9094 MOVL R11, BX 9095 SHLL $0x02, BX 9096 CMPL R11, $0x0c 9097 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B 9098 CMPL DI, $0x00000800 9099 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B 9100 LEAL -15(BX), BX 9101 MOVB DI, 1(AX) 9102 SHRL $0x08, DI 9103 SHLL $0x05, DI 9104 ORL DI, BX 9105 MOVB BL, (AX) 9106 ADDQ $0x02, AX 9107 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9108 9109 emit_copy_three_match_nolit_encodeBetterBlockAsm10B: 9110 LEAL -2(BX), BX 9111 MOVB BL, (AX) 9112 MOVW DI, 1(AX) 9113 ADDQ $0x03, AX 9114 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9115 9116 match_is_repeat_encodeBetterBlockAsm10B: 9117 MOVL 12(SP), BX 9118 CMPL BX, SI 9119 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B 9120 MOVL SI, R8 9121 MOVL SI, 12(SP) 9122 LEAQ (DX)(BX*1), R9 9123 SUBL BX, R8 9124 LEAL -1(R8), BX 9125 CMPL BX, $0x3c 9126 JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B 9127 CMPL BX, $0x00000100 9128 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B 9129 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B 9130 9131 three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: 9132 MOVB $0xf4, (AX) 9133 MOVW BX, 1(AX) 9134 ADDQ $0x03, AX 9135 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B 9136 9137 two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: 9138 MOVB $0xf0, (AX) 9139 MOVB BL, 1(AX) 9140 ADDQ $0x02, AX 9141 CMPL BX, $0x40 9142 JB memmove_match_emit_repeat_encodeBetterBlockAsm10B 9143 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B 9144 9145 one_byte_match_emit_repeat_encodeBetterBlockAsm10B: 9146 SHLB $0x02, BL 9147 MOVB BL, (AX) 9148 ADDQ $0x01, AX 9149 9150 memmove_match_emit_repeat_encodeBetterBlockAsm10B: 9151 LEAQ (AX)(R8*1), BX 9152 9153 // 
genMemMoveShort 9154 CMPQ R8, $0x04 9155 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 9156 CMPQ R8, $0x08 9157 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 9158 CMPQ R8, $0x10 9159 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 9160 CMPQ R8, $0x20 9161 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 9162 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 9163 9164 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: 9165 MOVL (R9), R10 9166 MOVL R10, (AX) 9167 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B 9168 9169 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: 9170 MOVL (R9), R10 9171 MOVL -4(R9)(R8*1), R9 9172 MOVL R10, (AX) 9173 MOVL R9, -4(AX)(R8*1) 9174 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B 9175 9176 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: 9177 MOVQ (R9), R10 9178 MOVQ -8(R9)(R8*1), R9 9179 MOVQ R10, (AX) 9180 MOVQ R9, -8(AX)(R8*1) 9181 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B 9182 9183 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: 9184 MOVOU (R9), X0 9185 MOVOU -16(R9)(R8*1), X1 9186 MOVOU X0, (AX) 9187 MOVOU X1, -16(AX)(R8*1) 9188 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B 9189 9190 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: 9191 MOVOU (R9), X0 9192 MOVOU 16(R9), X1 9193 MOVOU -32(R9)(R8*1), X2 9194 MOVOU -16(R9)(R8*1), X3 9195 MOVOU X0, (AX) 9196 MOVOU X1, 16(AX) 9197 MOVOU X2, -32(AX)(R8*1) 9198 MOVOU X3, -16(AX)(R8*1) 9199 9200 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: 9201 MOVQ BX, AX 9202 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B 9203 9204 
memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: 9205 LEAQ (AX)(R8*1), BX 9206 9207 // genMemMoveLong 9208 MOVOU (R9), X0 9209 MOVOU 16(R9), X1 9210 MOVOU -32(R9)(R8*1), X2 9211 MOVOU -16(R9)(R8*1), X3 9212 MOVQ R8, R12 9213 SHRQ $0x05, R12 9214 MOVQ AX, R10 9215 ANDL $0x0000001f, R10 9216 MOVQ $0x00000040, R13 9217 SUBQ R10, R13 9218 DECQ R12 9219 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 9220 LEAQ -32(R9)(R13*1), R10 9221 LEAQ -32(AX)(R13*1), R14 9222 9223 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: 9224 MOVOU (R10), X4 9225 MOVOU 16(R10), X5 9226 MOVOA X4, (R14) 9227 MOVOA X5, 16(R14) 9228 ADDQ $0x20, R14 9229 ADDQ $0x20, R10 9230 ADDQ $0x20, R13 9231 DECQ R12 9232 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back 9233 9234 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: 9235 MOVOU -32(R9)(R13*1), X4 9236 MOVOU -16(R9)(R13*1), X5 9237 MOVOA X4, -32(AX)(R13*1) 9238 MOVOA X5, -16(AX)(R13*1) 9239 ADDQ $0x20, R13 9240 CMPQ R8, R13 9241 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 9242 MOVOU X0, (AX) 9243 MOVOU X1, 16(AX) 9244 MOVOU X2, -32(AX)(R8*1) 9245 MOVOU X3, -16(AX)(R8*1) 9246 MOVQ BX, AX 9247 9248 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: 9249 ADDL R11, CX 9250 ADDL $0x04, R11 9251 MOVL CX, 12(SP) 9252 9253 // emitRepeat 9254 MOVL R11, BX 9255 LEAL -4(R11), R11 9256 CMPL BX, $0x08 9257 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B 9258 CMPL BX, $0x0c 9259 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B 9260 CMPL DI, $0x00000800 9261 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B 9262 9263 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: 9264 CMPL R11, $0x00000104 9265 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B 9266 LEAL -256(R11), R11 
9267 MOVW $0x0019, (AX) 9268 MOVW R11, 2(AX) 9269 ADDQ $0x04, AX 9270 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9271 9272 repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: 9273 LEAL -4(R11), R11 9274 MOVW $0x0015, (AX) 9275 MOVB R11, 2(AX) 9276 ADDQ $0x03, AX 9277 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9278 9279 repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: 9280 SHLL $0x02, R11 9281 ORL $0x01, R11 9282 MOVW R11, (AX) 9283 ADDQ $0x02, AX 9284 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 9285 9286 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: 9287 XORQ BX, BX 9288 LEAL 1(BX)(R11*4), R11 9289 MOVB DI, 1(AX) 9290 SARL $0x08, DI 9291 SHLL $0x05, DI 9292 ORL DI, R11 9293 MOVB R11, (AX) 9294 ADDQ $0x02, AX 9295 9296 match_nolit_emitcopy_end_encodeBetterBlockAsm10B: 9297 CMPL CX, 8(SP) 9298 JAE emit_remainder_encodeBetterBlockAsm10B 9299 CMPQ AX, (SP) 9300 JB match_nolit_dst_ok_encodeBetterBlockAsm10B 9301 MOVQ $0x00000000, ret+48(FP) 9302 RET 9303 9304 match_nolit_dst_ok_encodeBetterBlockAsm10B: 9305 MOVQ $0x0000cf1bbcdcbf9b, BX 9306 MOVQ $0x9e3779b1, DI 9307 LEAQ 1(SI), SI 9308 LEAQ -2(CX), R8 9309 MOVQ (DX)(SI*1), R9 9310 MOVQ 1(DX)(SI*1), R10 9311 MOVQ (DX)(R8*1), R11 9312 MOVQ 1(DX)(R8*1), R12 9313 SHLQ $0x10, R9 9314 IMULQ BX, R9 9315 SHRQ $0x34, R9 9316 SHLQ $0x20, R10 9317 IMULQ DI, R10 9318 SHRQ $0x36, R10 9319 SHLQ $0x10, R11 9320 IMULQ BX, R11 9321 SHRQ $0x34, R11 9322 SHLQ $0x20, R12 9323 IMULQ DI, R12 9324 SHRQ $0x36, R12 9325 LEAQ 1(SI), DI 9326 LEAQ 1(R8), R13 9327 MOVL SI, 24(SP)(R9*4) 9328 MOVL R8, 24(SP)(R11*4) 9329 MOVL DI, 16408(SP)(R10*4) 9330 MOVL R13, 16408(SP)(R12*4) 9331 ADDQ $0x01, SI 9332 SUBQ $0x01, R8 9333 9334 index_loop_encodeBetterBlockAsm10B: 9335 CMPQ SI, R8 9336 JAE search_loop_encodeBetterBlockAsm10B 9337 MOVQ (DX)(SI*1), DI 9338 MOVQ (DX)(R8*1), R9 9339 SHLQ $0x10, DI 9340 IMULQ BX, DI 9341 SHRQ $0x34, DI 9342 SHLQ $0x10, R9 9343 IMULQ BX, R9 9344 SHRQ $0x34, R9 9345 MOVL 
SI, 24(SP)(DI*4) 9346 MOVL R8, 24(SP)(R9*4) 9347 ADDQ $0x02, SI 9348 SUBQ $0x02, R8 9349 JMP index_loop_encodeBetterBlockAsm10B 9350 9351 emit_remainder_encodeBetterBlockAsm10B: 9352 MOVQ src_len+32(FP), CX 9353 SUBL 12(SP), CX 9354 LEAQ 3(AX)(CX*1), CX 9355 CMPQ CX, (SP) 9356 JB emit_remainder_ok_encodeBetterBlockAsm10B 9357 MOVQ $0x00000000, ret+48(FP) 9358 RET 9359 9360 emit_remainder_ok_encodeBetterBlockAsm10B: 9361 MOVQ src_len+32(FP), CX 9362 MOVL 12(SP), BX 9363 CMPL BX, CX 9364 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B 9365 MOVL CX, SI 9366 MOVL CX, 12(SP) 9367 LEAQ (DX)(BX*1), CX 9368 SUBL BX, SI 9369 LEAL -1(SI), DX 9370 CMPL DX, $0x3c 9371 JB one_byte_emit_remainder_encodeBetterBlockAsm10B 9372 CMPL DX, $0x00000100 9373 JB two_bytes_emit_remainder_encodeBetterBlockAsm10B 9374 JB three_bytes_emit_remainder_encodeBetterBlockAsm10B 9375 9376 three_bytes_emit_remainder_encodeBetterBlockAsm10B: 9377 MOVB $0xf4, (AX) 9378 MOVW DX, 1(AX) 9379 ADDQ $0x03, AX 9380 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B 9381 9382 two_bytes_emit_remainder_encodeBetterBlockAsm10B: 9383 MOVB $0xf0, (AX) 9384 MOVB DL, 1(AX) 9385 ADDQ $0x02, AX 9386 CMPL DX, $0x40 9387 JB memmove_emit_remainder_encodeBetterBlockAsm10B 9388 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B 9389 9390 one_byte_emit_remainder_encodeBetterBlockAsm10B: 9391 SHLB $0x02, DL 9392 MOVB DL, (AX) 9393 ADDQ $0x01, AX 9394 9395 memmove_emit_remainder_encodeBetterBlockAsm10B: 9396 LEAQ (AX)(SI*1), DX 9397 MOVL SI, BX 9398 9399 // genMemMoveShort 9400 CMPQ BX, $0x03 9401 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 9402 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 9403 CMPQ BX, $0x08 9404 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 9405 CMPQ BX, $0x10 9406 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 9407 CMPQ BX, $0x20 9408 JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 9409 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 9410 9411 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: 9412 MOVB (CX), SI 9413 MOVB -1(CX)(BX*1), CL 9414 MOVB SI, (AX) 9415 MOVB CL, -1(AX)(BX*1) 9416 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B 9417 9418 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: 9419 MOVW (CX), SI 9420 MOVB 2(CX), CL 9421 MOVW SI, (AX) 9422 MOVB CL, 2(AX) 9423 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B 9424 9425 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: 9426 MOVL (CX), SI 9427 MOVL -4(CX)(BX*1), CX 9428 MOVL SI, (AX) 9429 MOVL CX, -4(AX)(BX*1) 9430 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B 9431 9432 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: 9433 MOVQ (CX), SI 9434 MOVQ -8(CX)(BX*1), CX 9435 MOVQ SI, (AX) 9436 MOVQ CX, -8(AX)(BX*1) 9437 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B 9438 9439 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: 9440 MOVOU (CX), X0 9441 MOVOU -16(CX)(BX*1), X1 9442 MOVOU X0, (AX) 9443 MOVOU X1, -16(AX)(BX*1) 9444 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B 9445 9446 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: 9447 MOVOU (CX), X0 9448 MOVOU 16(CX), X1 9449 MOVOU -32(CX)(BX*1), X2 9450 MOVOU -16(CX)(BX*1), X3 9451 MOVOU X0, (AX) 9452 MOVOU X1, 16(AX) 9453 MOVOU X2, -32(AX)(BX*1) 9454 MOVOU X3, -16(AX)(BX*1) 9455 9456 memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: 9457 MOVQ DX, AX 9458 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B 9459 9460 memmove_long_emit_remainder_encodeBetterBlockAsm10B: 9461 LEAQ (AX)(SI*1), DX 9462 MOVL SI, BX 9463 9464 // genMemMoveLong 9465 MOVOU (CX), X0 9466 MOVOU 
16(CX), X1 9467 MOVOU -32(CX)(BX*1), X2 9468 MOVOU -16(CX)(BX*1), X3 9469 MOVQ BX, DI 9470 SHRQ $0x05, DI 9471 MOVQ AX, SI 9472 ANDL $0x0000001f, SI 9473 MOVQ $0x00000040, R8 9474 SUBQ SI, R8 9475 DECQ DI 9476 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 9477 LEAQ -32(CX)(R8*1), SI 9478 LEAQ -32(AX)(R8*1), R9 9479 9480 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: 9481 MOVOU (SI), X4 9482 MOVOU 16(SI), X5 9483 MOVOA X4, (R9) 9484 MOVOA X5, 16(R9) 9485 ADDQ $0x20, R9 9486 ADDQ $0x20, SI 9487 ADDQ $0x20, R8 9488 DECQ DI 9489 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back 9490 9491 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: 9492 MOVOU -32(CX)(R8*1), X4 9493 MOVOU -16(CX)(R8*1), X5 9494 MOVOA X4, -32(AX)(R8*1) 9495 MOVOA X5, -16(AX)(R8*1) 9496 ADDQ $0x20, R8 9497 CMPQ BX, R8 9498 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 9499 MOVOU X0, (AX) 9500 MOVOU X1, 16(AX) 9501 MOVOU X2, -32(AX)(BX*1) 9502 MOVOU X3, -16(AX)(BX*1) 9503 MOVQ DX, AX 9504 9505 emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: 9506 MOVQ dst_base+0(FP), CX 9507 SUBQ CX, AX 9508 MOVQ AX, ret+48(FP) 9509 RET 9510 9511 // func encodeBetterBlockAsm8B(dst []byte, src []byte) int 9512 // Requires: BMI, SSE2 9513 TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 9514 MOVQ dst_base+0(FP), AX 9515 MOVQ $0x00000028, CX 9516 LEAQ 24(SP), DX 9517 PXOR X0, X0 9518 9519 zero_loop_encodeBetterBlockAsm8B: 9520 MOVOU X0, (DX) 9521 MOVOU X0, 16(DX) 9522 MOVOU X0, 32(DX) 9523 MOVOU X0, 48(DX) 9524 MOVOU X0, 64(DX) 9525 MOVOU X0, 80(DX) 9526 MOVOU X0, 96(DX) 9527 MOVOU X0, 112(DX) 9528 ADDQ $0x80, DX 9529 DECQ CX 9530 JNZ zero_loop_encodeBetterBlockAsm8B 9531 MOVL $0x00000000, 12(SP) 9532 MOVQ src_len+32(FP), CX 9533 LEAQ -6(CX), DX 9534 LEAQ -8(CX), BX 9535 MOVL BX, 8(SP) 9536 SHRQ $0x05, CX 9537 SUBL CX, DX 9538 
LEAQ (AX)(DX*1), DX 9539 MOVQ DX, (SP) 9540 MOVL $0x00000001, CX 9541 MOVL $0x00000000, 16(SP) 9542 MOVQ src_base+24(FP), DX 9543 9544 search_loop_encodeBetterBlockAsm8B: 9545 MOVL CX, BX 9546 SUBL 12(SP), BX 9547 SHRL $0x04, BX 9548 LEAL 1(CX)(BX*1), BX 9549 CMPL BX, 8(SP) 9550 JAE emit_remainder_encodeBetterBlockAsm8B 9551 MOVQ (DX)(CX*1), SI 9552 MOVL BX, 20(SP) 9553 MOVQ $0x0000cf1bbcdcbf9b, R8 9554 MOVQ $0x9e3779b1, BX 9555 MOVQ SI, R9 9556 MOVQ SI, R10 9557 SHLQ $0x10, R9 9558 IMULQ R8, R9 9559 SHRQ $0x36, R9 9560 SHLQ $0x20, R10 9561 IMULQ BX, R10 9562 SHRQ $0x38, R10 9563 MOVL 24(SP)(R9*4), BX 9564 MOVL 4120(SP)(R10*4), DI 9565 MOVL CX, 24(SP)(R9*4) 9566 MOVL CX, 4120(SP)(R10*4) 9567 MOVQ (DX)(BX*1), R9 9568 MOVQ (DX)(DI*1), R10 9569 CMPQ R9, SI 9570 JEQ candidate_match_encodeBetterBlockAsm8B 9571 CMPQ R10, SI 9572 JNE no_short_found_encodeBetterBlockAsm8B 9573 MOVL DI, BX 9574 JMP candidate_match_encodeBetterBlockAsm8B 9575 9576 no_short_found_encodeBetterBlockAsm8B: 9577 CMPL R9, SI 9578 JEQ candidate_match_encodeBetterBlockAsm8B 9579 CMPL R10, SI 9580 JEQ candidateS_match_encodeBetterBlockAsm8B 9581 MOVL 20(SP), CX 9582 JMP search_loop_encodeBetterBlockAsm8B 9583 9584 candidateS_match_encodeBetterBlockAsm8B: 9585 SHRQ $0x08, SI 9586 MOVQ SI, R9 9587 SHLQ $0x10, R9 9588 IMULQ R8, R9 9589 SHRQ $0x36, R9 9590 MOVL 24(SP)(R9*4), BX 9591 INCL CX 9592 MOVL CX, 24(SP)(R9*4) 9593 CMPL (DX)(BX*1), SI 9594 JEQ candidate_match_encodeBetterBlockAsm8B 9595 DECL CX 9596 MOVL DI, BX 9597 9598 candidate_match_encodeBetterBlockAsm8B: 9599 MOVL 12(SP), SI 9600 TESTL BX, BX 9601 JZ match_extend_back_end_encodeBetterBlockAsm8B 9602 9603 match_extend_back_loop_encodeBetterBlockAsm8B: 9604 CMPL CX, SI 9605 JBE match_extend_back_end_encodeBetterBlockAsm8B 9606 MOVB -1(DX)(BX*1), DI 9607 MOVB -1(DX)(CX*1), R8 9608 CMPB DI, R8 9609 JNE match_extend_back_end_encodeBetterBlockAsm8B 9610 LEAL -1(CX), CX 9611 DECL BX 9612 JZ match_extend_back_end_encodeBetterBlockAsm8B 9613 JMP 
match_extend_back_loop_encodeBetterBlockAsm8B 9614 9615 match_extend_back_end_encodeBetterBlockAsm8B: 9616 MOVL CX, SI 9617 SUBL 12(SP), SI 9618 LEAQ 3(AX)(SI*1), SI 9619 CMPQ SI, (SP) 9620 JB match_dst_size_check_encodeBetterBlockAsm8B 9621 MOVQ $0x00000000, ret+48(FP) 9622 RET 9623 9624 match_dst_size_check_encodeBetterBlockAsm8B: 9625 MOVL CX, SI 9626 ADDL $0x04, CX 9627 ADDL $0x04, BX 9628 MOVQ src_len+32(FP), DI 9629 SUBL CX, DI 9630 LEAQ (DX)(CX*1), R8 9631 LEAQ (DX)(BX*1), R9 9632 9633 // matchLen 9634 XORL R11, R11 9635 CMPL DI, $0x08 9636 JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B 9637 9638 matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: 9639 MOVQ (R8)(R11*1), R10 9640 XORQ (R9)(R11*1), R10 9641 TESTQ R10, R10 9642 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B 9643 9644 #ifdef GOAMD64_v3 9645 TZCNTQ R10, R10 9646 9647 #else 9648 BSFQ R10, R10 9649 9650 #endif 9651 SARQ $0x03, R10 9652 LEAL (R11)(R10*1), R11 9653 JMP match_nolit_end_encodeBetterBlockAsm8B 9654 9655 matchlen_loop_match_nolit_encodeBetterBlockAsm8B: 9656 LEAL -8(DI), DI 9657 LEAL 8(R11), R11 9658 CMPL DI, $0x08 9659 JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B 9660 JZ match_nolit_end_encodeBetterBlockAsm8B 9661 9662 matchlen_match4_match_nolit_encodeBetterBlockAsm8B: 9663 CMPL DI, $0x04 9664 JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B 9665 MOVL (R8)(R11*1), R10 9666 CMPL (R9)(R11*1), R10 9667 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B 9668 SUBL $0x04, DI 9669 LEAL 4(R11), R11 9670 9671 matchlen_match2_match_nolit_encodeBetterBlockAsm8B: 9672 CMPL DI, $0x02 9673 JB matchlen_match1_match_nolit_encodeBetterBlockAsm8B 9674 MOVW (R8)(R11*1), R10 9675 CMPW (R9)(R11*1), R10 9676 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B 9677 SUBL $0x02, DI 9678 LEAL 2(R11), R11 9679 9680 matchlen_match1_match_nolit_encodeBetterBlockAsm8B: 9681 CMPL DI, $0x01 9682 JB match_nolit_end_encodeBetterBlockAsm8B 9683 MOVB (R8)(R11*1), R10 9684 CMPB 
(R9)(R11*1), R10 9685 JNE match_nolit_end_encodeBetterBlockAsm8B 9686 LEAL 1(R11), R11 9687 9688 match_nolit_end_encodeBetterBlockAsm8B: 9689 MOVL CX, DI 9690 SUBL BX, DI 9691 9692 // Check if repeat 9693 CMPL 16(SP), DI 9694 JEQ match_is_repeat_encodeBetterBlockAsm8B 9695 MOVL DI, 16(SP) 9696 MOVL 12(SP), BX 9697 CMPL BX, SI 9698 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B 9699 MOVL SI, R8 9700 MOVL SI, 12(SP) 9701 LEAQ (DX)(BX*1), R9 9702 SUBL BX, R8 9703 LEAL -1(R8), BX 9704 CMPL BX, $0x3c 9705 JB one_byte_match_emit_encodeBetterBlockAsm8B 9706 CMPL BX, $0x00000100 9707 JB two_bytes_match_emit_encodeBetterBlockAsm8B 9708 JB three_bytes_match_emit_encodeBetterBlockAsm8B 9709 9710 three_bytes_match_emit_encodeBetterBlockAsm8B: 9711 MOVB $0xf4, (AX) 9712 MOVW BX, 1(AX) 9713 ADDQ $0x03, AX 9714 JMP memmove_long_match_emit_encodeBetterBlockAsm8B 9715 9716 two_bytes_match_emit_encodeBetterBlockAsm8B: 9717 MOVB $0xf0, (AX) 9718 MOVB BL, 1(AX) 9719 ADDQ $0x02, AX 9720 CMPL BX, $0x40 9721 JB memmove_match_emit_encodeBetterBlockAsm8B 9722 JMP memmove_long_match_emit_encodeBetterBlockAsm8B 9723 9724 one_byte_match_emit_encodeBetterBlockAsm8B: 9725 SHLB $0x02, BL 9726 MOVB BL, (AX) 9727 ADDQ $0x01, AX 9728 9729 memmove_match_emit_encodeBetterBlockAsm8B: 9730 LEAQ (AX)(R8*1), BX 9731 9732 // genMemMoveShort 9733 CMPQ R8, $0x04 9734 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 9735 CMPQ R8, $0x08 9736 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 9737 CMPQ R8, $0x10 9738 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 9739 CMPQ R8, $0x20 9740 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 9741 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 9742 9743 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: 9744 MOVL (R9), R10 9745 MOVL R10, (AX) 9746 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B 
9747 9748 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: 9749 MOVL (R9), R10 9750 MOVL -4(R9)(R8*1), R9 9751 MOVL R10, (AX) 9752 MOVL R9, -4(AX)(R8*1) 9753 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B 9754 9755 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: 9756 MOVQ (R9), R10 9757 MOVQ -8(R9)(R8*1), R9 9758 MOVQ R10, (AX) 9759 MOVQ R9, -8(AX)(R8*1) 9760 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B 9761 9762 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: 9763 MOVOU (R9), X0 9764 MOVOU -16(R9)(R8*1), X1 9765 MOVOU X0, (AX) 9766 MOVOU X1, -16(AX)(R8*1) 9767 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B 9768 9769 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: 9770 MOVOU (R9), X0 9771 MOVOU 16(R9), X1 9772 MOVOU -32(R9)(R8*1), X2 9773 MOVOU -16(R9)(R8*1), X3 9774 MOVOU X0, (AX) 9775 MOVOU X1, 16(AX) 9776 MOVOU X2, -32(AX)(R8*1) 9777 MOVOU X3, -16(AX)(R8*1) 9778 9779 memmove_end_copy_match_emit_encodeBetterBlockAsm8B: 9780 MOVQ BX, AX 9781 JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B 9782 9783 memmove_long_match_emit_encodeBetterBlockAsm8B: 9784 LEAQ (AX)(R8*1), BX 9785 9786 // genMemMoveLong 9787 MOVOU (R9), X0 9788 MOVOU 16(R9), X1 9789 MOVOU -32(R9)(R8*1), X2 9790 MOVOU -16(R9)(R8*1), X3 9791 MOVQ R8, R12 9792 SHRQ $0x05, R12 9793 MOVQ AX, R10 9794 ANDL $0x0000001f, R10 9795 MOVQ $0x00000040, R13 9796 SUBQ R10, R13 9797 DECQ R12 9798 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 9799 LEAQ -32(R9)(R13*1), R10 9800 LEAQ -32(AX)(R13*1), R14 9801 9802 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: 9803 MOVOU (R10), X4 9804 MOVOU 16(R10), X5 9805 MOVOA X4, (R14) 9806 MOVOA X5, 16(R14) 9807 ADDQ $0x20, R14 9808 ADDQ $0x20, R10 9809 ADDQ $0x20, R13 9810 DECQ R12 9811 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back 9812 9813 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: 9814 MOVOU -32(R9)(R13*1), X4 9815 MOVOU -16(R9)(R13*1), X5 9816 MOVOA X4, -32(AX)(R13*1) 9817 MOVOA X5, -16(AX)(R13*1) 9818 ADDQ $0x20, R13 9819 CMPQ R8, R13 9820 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 9821 MOVOU X0, (AX) 9822 MOVOU X1, 16(AX) 9823 MOVOU X2, -32(AX)(R8*1) 9824 MOVOU X3, -16(AX)(R8*1) 9825 MOVQ BX, AX 9826 9827 emit_literal_done_match_emit_encodeBetterBlockAsm8B: 9828 ADDL R11, CX 9829 ADDL $0x04, R11 9830 MOVL CX, 12(SP) 9831 9832 // emitCopy 9833 CMPL R11, $0x40 9834 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B 9835 CMPL DI, $0x00000800 9836 JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B 9837 MOVL $0x00000001, BX 9838 LEAL 16(BX), BX 9839 MOVB DI, 1(AX) 9840 SHRL $0x08, DI 9841 SHLL $0x05, DI 9842 ORL DI, BX 9843 MOVB BL, (AX) 9844 ADDQ $0x02, AX 9845 SUBL $0x08, R11 9846 9847 // emitRepeat 9848 LEAL -4(R11), R11 9849 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b 9850 MOVL R11, BX 9851 LEAL -4(R11), R11 9852 CMPL BX, $0x08 9853 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b 9854 CMPL BX, $0x0c 9855 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b 9856 9857 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: 9858 CMPL R11, $0x00000104 9859 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b 9860 LEAL -256(R11), R11 9861 MOVW $0x0019, (AX) 9862 MOVW R11, 2(AX) 9863 ADDQ $0x04, AX 9864 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9865 9866 repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: 9867 LEAL -4(R11), R11 9868 MOVW $0x0015, (AX) 9869 MOVB R11, 2(AX) 9870 ADDQ $0x03, AX 9871 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9872 9873 repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: 9874 SHLL $0x02, R11 9875 ORL $0x01, 
R11 9876 MOVW R11, (AX) 9877 ADDQ $0x02, AX 9878 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9879 XORQ BX, BX 9880 LEAL 1(BX)(R11*4), R11 9881 MOVB DI, 1(AX) 9882 SARL $0x08, DI 9883 SHLL $0x05, DI 9884 ORL DI, R11 9885 MOVB R11, (AX) 9886 ADDQ $0x02, AX 9887 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9888 9889 long_offset_short_match_nolit_encodeBetterBlockAsm8B: 9890 MOVB $0xee, (AX) 9891 MOVW DI, 1(AX) 9892 LEAL -60(R11), R11 9893 ADDQ $0x03, AX 9894 9895 // emitRepeat 9896 MOVL R11, BX 9897 LEAL -4(R11), R11 9898 CMPL BX, $0x08 9899 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short 9900 CMPL BX, $0x0c 9901 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short 9902 9903 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: 9904 CMPL R11, $0x00000104 9905 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short 9906 LEAL -256(R11), R11 9907 MOVW $0x0019, (AX) 9908 MOVW R11, 2(AX) 9909 ADDQ $0x04, AX 9910 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9911 9912 repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: 9913 LEAL -4(R11), R11 9914 MOVW $0x0015, (AX) 9915 MOVB R11, 2(AX) 9916 ADDQ $0x03, AX 9917 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9918 9919 repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: 9920 SHLL $0x02, R11 9921 ORL $0x01, R11 9922 MOVW R11, (AX) 9923 ADDQ $0x02, AX 9924 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9925 XORQ BX, BX 9926 LEAL 1(BX)(R11*4), R11 9927 MOVB DI, 1(AX) 9928 SARL $0x08, DI 9929 SHLL $0x05, DI 9930 ORL DI, R11 9931 MOVB R11, (AX) 9932 ADDQ $0x02, AX 9933 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9934 9935 two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: 9936 MOVL R11, BX 9937 SHLL $0x02, BX 9938 CMPL R11, $0x0c 9939 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B 9940 LEAL -15(BX), BX 9941 MOVB DI, 1(AX) 9942 SHRL $0x08, DI 9943 SHLL $0x05, DI 9944 ORL DI, BX 9945 
MOVB BL, (AX) 9946 ADDQ $0x02, AX 9947 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9948 9949 emit_copy_three_match_nolit_encodeBetterBlockAsm8B: 9950 LEAL -2(BX), BX 9951 MOVB BL, (AX) 9952 MOVW DI, 1(AX) 9953 ADDQ $0x03, AX 9954 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 9955 9956 match_is_repeat_encodeBetterBlockAsm8B: 9957 MOVL 12(SP), BX 9958 CMPL BX, SI 9959 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B 9960 MOVL SI, DI 9961 MOVL SI, 12(SP) 9962 LEAQ (DX)(BX*1), R8 9963 SUBL BX, DI 9964 LEAL -1(DI), BX 9965 CMPL BX, $0x3c 9966 JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B 9967 CMPL BX, $0x00000100 9968 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B 9969 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B 9970 9971 three_bytes_match_emit_repeat_encodeBetterBlockAsm8B: 9972 MOVB $0xf4, (AX) 9973 MOVW BX, 1(AX) 9974 ADDQ $0x03, AX 9975 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B 9976 9977 two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: 9978 MOVB $0xf0, (AX) 9979 MOVB BL, 1(AX) 9980 ADDQ $0x02, AX 9981 CMPL BX, $0x40 9982 JB memmove_match_emit_repeat_encodeBetterBlockAsm8B 9983 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B 9984 9985 one_byte_match_emit_repeat_encodeBetterBlockAsm8B: 9986 SHLB $0x02, BL 9987 MOVB BL, (AX) 9988 ADDQ $0x01, AX 9989 9990 memmove_match_emit_repeat_encodeBetterBlockAsm8B: 9991 LEAQ (AX)(DI*1), BX 9992 9993 // genMemMoveShort 9994 CMPQ DI, $0x04 9995 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 9996 CMPQ DI, $0x08 9997 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 9998 CMPQ DI, $0x10 9999 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 10000 CMPQ DI, $0x20 10001 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 10002 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 10003 10004 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: 10005 MOVL (R8), R9 10006 MOVL R9, (AX) 10007 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B 10008 10009 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: 10010 MOVL (R8), R9 10011 MOVL -4(R8)(DI*1), R8 10012 MOVL R9, (AX) 10013 MOVL R8, -4(AX)(DI*1) 10014 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B 10015 10016 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: 10017 MOVQ (R8), R9 10018 MOVQ -8(R8)(DI*1), R8 10019 MOVQ R9, (AX) 10020 MOVQ R8, -8(AX)(DI*1) 10021 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B 10022 10023 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: 10024 MOVOU (R8), X0 10025 MOVOU -16(R8)(DI*1), X1 10026 MOVOU X0, (AX) 10027 MOVOU X1, -16(AX)(DI*1) 10028 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B 10029 10030 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: 10031 MOVOU (R8), X0 10032 MOVOU 16(R8), X1 10033 MOVOU -32(R8)(DI*1), X2 10034 MOVOU -16(R8)(DI*1), X3 10035 MOVOU X0, (AX) 10036 MOVOU X1, 16(AX) 10037 MOVOU X2, -32(AX)(DI*1) 10038 MOVOU X3, -16(AX)(DI*1) 10039 10040 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: 10041 MOVQ BX, AX 10042 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B 10043 10044 memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: 10045 LEAQ (AX)(DI*1), BX 10046 10047 // genMemMoveLong 10048 MOVOU (R8), X0 10049 MOVOU 16(R8), X1 10050 MOVOU -32(R8)(DI*1), X2 10051 MOVOU -16(R8)(DI*1), X3 10052 MOVQ DI, R10 10053 SHRQ $0x05, R10 10054 MOVQ AX, R9 10055 ANDL $0x0000001f, R9 10056 MOVQ $0x00000040, R12 10057 SUBQ R9, R12 10058 DECQ R10 10059 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 10060 LEAQ -32(R8)(R12*1), R9 10061 LEAQ -32(AX)(R12*1), R13 10062 10063 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: 10064 MOVOU (R9), X4 10065 MOVOU 16(R9), X5 10066 MOVOA X4, (R13) 10067 MOVOA X5, 16(R13) 10068 ADDQ $0x20, R13 10069 ADDQ $0x20, R9 10070 ADDQ $0x20, R12 10071 DECQ R10 10072 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back 10073 10074 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: 10075 MOVOU -32(R8)(R12*1), X4 10076 MOVOU -16(R8)(R12*1), X5 10077 MOVOA X4, -32(AX)(R12*1) 10078 MOVOA X5, -16(AX)(R12*1) 10079 ADDQ $0x20, R12 10080 CMPQ DI, R12 10081 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 10082 MOVOU X0, (AX) 10083 MOVOU X1, 16(AX) 10084 MOVOU X2, -32(AX)(DI*1) 10085 MOVOU X3, -16(AX)(DI*1) 10086 MOVQ BX, AX 10087 10088 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: 10089 ADDL R11, CX 10090 ADDL $0x04, R11 10091 MOVL CX, 12(SP) 10092 10093 // emitRepeat 10094 MOVL R11, BX 10095 LEAL -4(R11), R11 10096 CMPL BX, $0x08 10097 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B 10098 CMPL BX, $0x0c 10099 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B 10100 10101 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: 10102 CMPL R11, $0x00000104 10103 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B 10104 LEAL -256(R11), R11 10105 MOVW $0x0019, (AX) 10106 MOVW R11, 2(AX) 10107 ADDQ $0x04, AX 10108 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 10109 10110 repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: 10111 LEAL -4(R11), R11 10112 MOVW $0x0015, (AX) 10113 MOVB R11, 2(AX) 10114 ADDQ $0x03, AX 10115 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 10116 10117 repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: 10118 SHLL $0x02, R11 10119 ORL $0x01, R11 10120 MOVW R11, (AX) 10121 ADDQ $0x02, AX 10122 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B 10123 XORQ BX, BX 10124 LEAL 
1(BX)(R11*4), R11 10125 MOVB DI, 1(AX) 10126 SARL $0x08, DI 10127 SHLL $0x05, DI 10128 ORL DI, R11 10129 MOVB R11, (AX) 10130 ADDQ $0x02, AX 10131 10132 match_nolit_emitcopy_end_encodeBetterBlockAsm8B: 10133 CMPL CX, 8(SP) 10134 JAE emit_remainder_encodeBetterBlockAsm8B 10135 CMPQ AX, (SP) 10136 JB match_nolit_dst_ok_encodeBetterBlockAsm8B 10137 MOVQ $0x00000000, ret+48(FP) 10138 RET 10139 10140 match_nolit_dst_ok_encodeBetterBlockAsm8B: 10141 MOVQ $0x0000cf1bbcdcbf9b, BX 10142 MOVQ $0x9e3779b1, DI 10143 LEAQ 1(SI), SI 10144 LEAQ -2(CX), R8 10145 MOVQ (DX)(SI*1), R9 10146 MOVQ 1(DX)(SI*1), R10 10147 MOVQ (DX)(R8*1), R11 10148 MOVQ 1(DX)(R8*1), R12 10149 SHLQ $0x10, R9 10150 IMULQ BX, R9 10151 SHRQ $0x36, R9 10152 SHLQ $0x20, R10 10153 IMULQ DI, R10 10154 SHRQ $0x38, R10 10155 SHLQ $0x10, R11 10156 IMULQ BX, R11 10157 SHRQ $0x36, R11 10158 SHLQ $0x20, R12 10159 IMULQ DI, R12 10160 SHRQ $0x38, R12 10161 LEAQ 1(SI), DI 10162 LEAQ 1(R8), R13 10163 MOVL SI, 24(SP)(R9*4) 10164 MOVL R8, 24(SP)(R11*4) 10165 MOVL DI, 4120(SP)(R10*4) 10166 MOVL R13, 4120(SP)(R12*4) 10167 ADDQ $0x01, SI 10168 SUBQ $0x01, R8 10169 10170 index_loop_encodeBetterBlockAsm8B: 10171 CMPQ SI, R8 10172 JAE search_loop_encodeBetterBlockAsm8B 10173 MOVQ (DX)(SI*1), DI 10174 MOVQ (DX)(R8*1), R9 10175 SHLQ $0x10, DI 10176 IMULQ BX, DI 10177 SHRQ $0x36, DI 10178 SHLQ $0x10, R9 10179 IMULQ BX, R9 10180 SHRQ $0x36, R9 10181 MOVL SI, 24(SP)(DI*4) 10182 MOVL R8, 24(SP)(R9*4) 10183 ADDQ $0x02, SI 10184 SUBQ $0x02, R8 10185 JMP index_loop_encodeBetterBlockAsm8B 10186 10187 emit_remainder_encodeBetterBlockAsm8B: 10188 MOVQ src_len+32(FP), CX 10189 SUBL 12(SP), CX 10190 LEAQ 3(AX)(CX*1), CX 10191 CMPQ CX, (SP) 10192 JB emit_remainder_ok_encodeBetterBlockAsm8B 10193 MOVQ $0x00000000, ret+48(FP) 10194 RET 10195 10196 emit_remainder_ok_encodeBetterBlockAsm8B: 10197 MOVQ src_len+32(FP), CX 10198 MOVL 12(SP), BX 10199 CMPL BX, CX 10200 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B 10201 MOVL CX, SI 10202 
MOVL CX, 12(SP) 10203 LEAQ (DX)(BX*1), CX 10204 SUBL BX, SI 10205 LEAL -1(SI), DX 10206 CMPL DX, $0x3c 10207 JB one_byte_emit_remainder_encodeBetterBlockAsm8B 10208 CMPL DX, $0x00000100 10209 JB two_bytes_emit_remainder_encodeBetterBlockAsm8B 10210 JB three_bytes_emit_remainder_encodeBetterBlockAsm8B 10211 10212 three_bytes_emit_remainder_encodeBetterBlockAsm8B: 10213 MOVB $0xf4, (AX) 10214 MOVW DX, 1(AX) 10215 ADDQ $0x03, AX 10216 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B 10217 10218 two_bytes_emit_remainder_encodeBetterBlockAsm8B: 10219 MOVB $0xf0, (AX) 10220 MOVB DL, 1(AX) 10221 ADDQ $0x02, AX 10222 CMPL DX, $0x40 10223 JB memmove_emit_remainder_encodeBetterBlockAsm8B 10224 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B 10225 10226 one_byte_emit_remainder_encodeBetterBlockAsm8B: 10227 SHLB $0x02, DL 10228 MOVB DL, (AX) 10229 ADDQ $0x01, AX 10230 10231 memmove_emit_remainder_encodeBetterBlockAsm8B: 10232 LEAQ (AX)(SI*1), DX 10233 MOVL SI, BX 10234 10235 // genMemMoveShort 10236 CMPQ BX, $0x03 10237 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 10238 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 10239 CMPQ BX, $0x08 10240 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 10241 CMPQ BX, $0x10 10242 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 10243 CMPQ BX, $0x20 10244 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 10245 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 10246 10247 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: 10248 MOVB (CX), SI 10249 MOVB -1(CX)(BX*1), CL 10250 MOVB SI, (AX) 10251 MOVB CL, -1(AX)(BX*1) 10252 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B 10253 10254 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: 10255 MOVW (CX), SI 10256 MOVB 2(CX), CL 10257 MOVW SI, 
(AX) 10258 MOVB CL, 2(AX) 10259 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B 10260 10261 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: 10262 MOVL (CX), SI 10263 MOVL -4(CX)(BX*1), CX 10264 MOVL SI, (AX) 10265 MOVL CX, -4(AX)(BX*1) 10266 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B 10267 10268 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: 10269 MOVQ (CX), SI 10270 MOVQ -8(CX)(BX*1), CX 10271 MOVQ SI, (AX) 10272 MOVQ CX, -8(AX)(BX*1) 10273 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B 10274 10275 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: 10276 MOVOU (CX), X0 10277 MOVOU -16(CX)(BX*1), X1 10278 MOVOU X0, (AX) 10279 MOVOU X1, -16(AX)(BX*1) 10280 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B 10281 10282 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: 10283 MOVOU (CX), X0 10284 MOVOU 16(CX), X1 10285 MOVOU -32(CX)(BX*1), X2 10286 MOVOU -16(CX)(BX*1), X3 10287 MOVOU X0, (AX) 10288 MOVOU X1, 16(AX) 10289 MOVOU X2, -32(AX)(BX*1) 10290 MOVOU X3, -16(AX)(BX*1) 10291 10292 memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: 10293 MOVQ DX, AX 10294 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B 10295 10296 memmove_long_emit_remainder_encodeBetterBlockAsm8B: 10297 LEAQ (AX)(SI*1), DX 10298 MOVL SI, BX 10299 10300 // genMemMoveLong 10301 MOVOU (CX), X0 10302 MOVOU 16(CX), X1 10303 MOVOU -32(CX)(BX*1), X2 10304 MOVOU -16(CX)(BX*1), X3 10305 MOVQ BX, DI 10306 SHRQ $0x05, DI 10307 MOVQ AX, SI 10308 ANDL $0x0000001f, SI 10309 MOVQ $0x00000040, R8 10310 SUBQ SI, R8 10311 DECQ DI 10312 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 10313 LEAQ -32(CX)(R8*1), SI 10314 LEAQ -32(AX)(R8*1), R9 10315 10316 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: 10317 MOVOU (SI), X4 10318 MOVOU 16(SI), X5 10319 MOVOA X4, 
(R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// encodeSnappyBlockAsm encodes src into dst as a sequence of literal and
// copy elements and returns the number of bytes written (final dst pointer
// minus dst_base). It returns 0 as soon as the output would reach the dst
// limit computed in the prologue.
// NOTE(review): the length/capacity of dst is never read here, so the caller
// presumably guarantees dst is large enough - confirm against the Go stub.
//
// Frame layout ($65560 bytes):
//   0(SP)   8 bytes      dst limit: dst_base + len(src) - 9 - len(src)>>5
//   8(SP)   4 bytes      len(src) - 8, position at which searching stops
//   12(SP)  4 bytes      nextEmit: src position of the first pending literal
//   16(SP)  4 bytes      current repeat offset (starts at 1)
//   20(SP)  4 bytes      position to resume from when no candidate matches
//   24(SP)  65536 bytes  hash table, 16384 entries of 4-byte src positions
//
// In the search/match code AX is the dst write pointer, CX the current src
// position and DX the src base pointer.
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	// Clear the hash table: 0x200 iterations of 0x80 bytes = 65536 bytes.
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBlockAsm
	// nextEmit = 0.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	// Search limit = len(src) - 8.
	MOVL BX, 8(SP)
	// dst limit = dst_base + (len(src) - 9 - len(src)>>5).
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	// Start at position 1 with a repeat offset of 1.
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm:
	// Next position = s + 4 + (s - nextEmit)>>6, so the stride grows with the
	// amount of input seen since the last emitted element.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x06, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm
	// SI = 8 bytes of src at s.
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	// Hash: ((x << 16) * 0xcf1bbcdcbf9b) >> 50, i.e. a 14-bit table index.
	// R9 hashes the bytes at s, R10 the bytes at s+1.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x32, R9
	SHLQ $0x10, R10
	IMULQ R8, R10
	SHRQ $0x32, R10
	// BX = candidate for s, DI = candidate for s+1; then record s and s+1.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)
	// R9 = hash of the bytes at s+2 (used in no_repeat_found).
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x32, R9
	// Repeat check: compare 4 bytes at s+1 with 4 bytes at s+1-repeat.
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_encodeSnappyBlockAsm
	// Repeat found. SI = base (s+1), BX = nextEmit, DI = base - repeat.
	LEAL 1(CX), SI
	MOVL 12(SP), BX
	MOVL SI, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_encodeSnappyBlockAsm

repeat_extend_back_loop_encodeSnappyBlockAsm:
	// Extend the repeat backwards while base > nextEmit, the preceding bytes
	// are equal and the reference position has not reached 0.
	CMPL SI, BX
	JBE repeat_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(SI), SI
	DECL DI
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm

repeat_extend_back_end_encodeSnappyBlockAsm:
	// Emit src[nextEmit:base] as a literal (skipped when empty).
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
	MOVL SI, DI
	MOVL SI, 12(SP)
	// R8 = literal source pointer, DI = literal length, BX = length - 1.
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	// Literal header, selected on BX = length - 1:
	//   < 60       one byte: (length-1)<<2
	//   < 1<<8     0xf0 followed by 1 length byte
	//   < 1<<16    0xf4 followed by 2 length bytes
	//   < 1<<24    0xf8 followed by 3 length bytes
	//   otherwise  0xfc followed by 4 length bytes
	CMPL BX, $0x3c
	JB one_byte_repeat_emit_encodeSnappyBlockAsm
	CMPL BX, $0x00000100
	JB two_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL BX, $0x00010000
	JB three_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL BX, $0x01000000
	JB four_bytes_repeat_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm

four_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVL BX, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm

three_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm

two_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 still fits the short copy; anything longer uses the long one.
	CMPL BX, $0x40
	JB memmove_repeat_emit_encodeSnappyBlockAsm
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm

one_byte_repeat_emit_encodeSnappyBlockAsm:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	// Copies DI (<= 64) bytes using overlapping head/tail moves.
	CMPQ DI, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
	// length <= 8: always moves a full 8 bytes; AX is reset to BX afterwards,
	// so only DI bytes are counted as written.
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
	MOVQ BX, AX
	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm

memmove_long_repeat_emit_encodeSnappyBlockAsm:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	// The first and last 32 source bytes are saved in X0-X3 and stored last
	// with unaligned moves; the loops in between use aligned (MOVOA) stores.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	// R10 = length / 32. R11 = 64 - (AX & 31), which makes -32(AX)(R11*1)
	// a 32-byte aligned dst address.
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length >= R11.
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ DI, R11
	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
	// Skip the 4 bytes already compared at s+1 (s += 5) and extend forward.
	// DI = bytes left in src, R8 = src + s, BX = src + s - repeat.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 accumulates the number of equal bytes.
	XORL R10, R10
	CMPL DI, $0x08
	JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
	// Compare 8 bytes at a time; a non-zero XOR marks the first difference.
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm

#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	// Index of the lowest differing bit / 8 = number of equal bytes.
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
	JZ repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
	SUBL $0x04, DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
	CMPL DI, $0x02
	JB matchlen_match1_repeat_extend_encodeSnappyBlockAsm
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
	SUBL $0x02, DI
	LEAL 2(R10), R10

matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
	CMPL DI, $0x01
	JB repeat_extend_forward_end_encodeSnappyBlockAsm
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE repeat_extend_forward_end_encodeSnappyBlockAsm
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm:
	// s += matched bytes; BX = s - base = copy length; SI = repeat offset.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	// Offsets below 1<<16 use the 2- or 3-byte copy forms, larger offsets the
	// 5-byte form with a 32-bit offset.
	CMPL SI, $0x00010000
	JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL BX, $0x40
	JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	// More than 64 bytes: write tag 0xff + 32-bit offset and drop 64 bytes
	// from the remaining length.
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
	TESTL BX, BX
	JZ repeat_end_emit_encodeSnappyBlockAsm
	// Tag = length*4 - 1, followed by the 32-bit offset.
	XORL DI, DI
	LEAL -1(DI)(BX*4), BX
	MOVB BL, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeSnappyBlockAsm

two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL BX, $0x40
	JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
	// More than 64 bytes: write tag 0xee + 16-bit offset and drop 60 bytes
	// from the remaining length.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
	// DI = length*4. The 2-byte form is used only when length < 12 and
	// offset < 2048; everything else takes the 3-byte form.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	CMPL SI, $0x00000800
	JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	// 2-byte form: tag = (length*4 - 15) | (offset>>8)<<5, then low offset byte.
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeSnappyBlockAsm

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
	// 3-byte form: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm:
	// nextEmit = s, then keep searching.
	MOVL CX, 12(SP)
	JMP search_loop_encodeSnappyBlockAsm

no_repeat_found_encodeSnappyBlockAsm:
	// Try the candidate for s, then for s+1, then for s+2, comparing 4 bytes
	// each time; SI is shifted right one byte per step.
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBlockAsm
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_encodeSnappyBlockAsm
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_encodeSnappyBlockAsm
	// Nothing matched: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBlockAsm

candidate3_match_encodeSnappyBlockAsm:
	ADDL $0x02, CX
	JMP candidate_match_encodeSnappyBlockAsm

candidate2_match_encodeSnappyBlockAsm:
	// Record s+2 in the table, advance s by one and use the s+1 candidate.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm:
	// CX = s, BX = candidate position, SI = nextEmit.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBlockAsm

match_extend_back_loop_encodeSnappyBlockAsm:
	// Extend the match backwards while s > nextEmit, the preceding bytes are
	// equal and the candidate position has not reached 0.
	CMPL CX, SI
	JBE match_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBlockAsm
	JMP match_extend_back_loop_encodeSnappyBlockAsm

match_extend_back_end_encodeSnappyBlockAsm:
	// Return 0 unless AX + 5 + (s - nextEmit) is below the dst limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB match_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm:
	// Emit src[nextEmit:s] as a literal (skipped when empty).
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	// SI = literal source pointer, R8 = literal length, DI = length - 1.
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Literal header, selected on DI = length - 1:
	//   < 60       one byte: (length-1)<<2
	//   < 1<<8     0xf0 followed by 1 length byte
	//   < 1<<16    0xf4 followed by 2 length bytes
	//   < 1<<24    0xf8 followed by 3 length bytes
	//   otherwise  0xfc followed by 4 length bytes
	CMPL DI, $0x3c
	JB one_byte_match_emit_encodeSnappyBlockAsm
	CMPL DI, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBlockAsm
	CMPL DI, $0x00010000
	JB three_bytes_match_emit_encodeSnappyBlockAsm
	CMPL DI, $0x01000000
	JB four_bytes_match_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm

four_bytes_match_emit_encodeSnappyBlockAsm:
	MOVL DI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW DI, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm

three_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm

two_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 still fits the short copy; anything longer uses the long one.
	CMPL DI, $0x40
	JB memmove_match_emit_encodeSnappyBlockAsm
	JMP memmove_long_match_emit_encodeSnappyBlockAsm

one_byte_match_emit_encodeSnappyBlockAsm:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm:
	// DI = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Copies R8 (<= 64) bytes using overlapping head/tail moves.
	CMPQ R8, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
	// length <= 8: always moves a full 8 bytes; AX is reset to DI afterwards.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm:
	MOVQ DI, AX
	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm

memmove_long_match_emit_encodeSnappyBlockAsm:
	// DI = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 source bytes are saved in X0-X3 and stored last
	// with unaligned moves; the loops in between use aligned (MOVOA) stores.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	// R10 = length / 32. R11 = 64 - (AX & 31), which makes -32(AX)(R11*1)
	// a 32-byte aligned dst address.
	MOVQ R8, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(SI)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ R8, R11
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
	// The offset s - candidate becomes the new repeat offset. The first
	// 4 bytes are already known to match, so both positions skip ahead by 4.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	// SI = bytes left in src, DI = src + s, BX = src + candidate.
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 accumulates the number of equal bytes.
	XORL R9, R9
	CMPL SI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBlockAsm

matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
	// Compare 8 bytes at a time; a non-zero XOR marks the first difference.
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm

#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	// Index of the lowest differing bit / 8 = number of equal bytes.
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_encodeSnappyBlockAsm

matchlen_loop_match_nolit_encodeSnappyBlockAsm:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
	JZ match_nolit_end_encodeSnappyBlockAsm

matchlen_match4_match_nolit_encodeSnappyBlockAsm:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL SI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBlockAsm
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm:
	CMPL SI, $0x02
	JB matchlen_match1_match_nolit_encodeSnappyBlockAsm
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_encodeSnappyBlockAsm:
	CMPL SI, $0x01
	JB match_nolit_end_encodeSnappyBlockAsm
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_encodeSnappyBlockAsm
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm:
	// s += extra matched bytes; BX = offset; R9 = total length (extra + 4);
	// nextEmit = s.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Offsets below 1<<16 use the 2- or 3-byte copy forms, larger offsets the
	// 5-byte form with a 32-bit offset.
	CMPL BX, $0x00010000
	JB two_byte_offset_match_nolit_encodeSnappyBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
	CMPL R9, $0x40
	JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	// More than 64 bytes: write tag 0xff + 32-bit offset and drop 64 bytes
	// from the remaining length.
	MOVB $0xff, (AX)
	MOVL BX, 1(AX)
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
	// Tag = length*4 - 1, followed by the 32-bit offset.
	XORL SI, SI
	LEAL -1(SI)(R9*4), R9
	MOVB R9, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm

two_byte_offset_match_nolit_encodeSnappyBlockAsm:
	CMPL R9, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
	// More than 64 bytes: write tag 0xee + 16-bit offset and drop 60 bytes
	// from the remaining length.
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
	// SI = length*4. The 2-byte form is used only when length < 12 and
	// offset < 2048; everything else takes the 3-byte form.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
	CMPL BX, $0x00000800
	JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
	// 2-byte form: tag = (length*4 - 15) | (offset>>8)<<5, then low offset byte.
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm

emit_copy_three_match_nolit_encodeSnappyBlockAsm:
	// 3-byte form: tag = length*4 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm:
	// Stop searching once s has reached the search limit.
	CMPL CX, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm
	// SI = 8 bytes of src at s-2.
	MOVQ -2(DX)(CX*1), SI
	// Return 0 if the dst pointer has reached the dst limit.
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm:
	// Hash the bytes at s-2 (DI) and at s (BX), record both positions, and
	// check whether the previous table entry for s matches immediately.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x10, DI
	IMULQ R8, DI
	SHRQ $0x32, DI
	SHLQ $0x10, BX
	IMULQ R8, BX
	SHRQ $0x32, BX
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	// BX = previous table entry for s (the new candidate).
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)
	// 4 equal bytes at the candidate: emit another copy without literals.
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_encodeSnappyBlockAsm
	INCL CX
	JMP search_loop_encodeSnappyBlockAsm

emit_remainder_encodeSnappyBlockAsm:
	// Return 0 unless AX + 5 + (len(src) - nextEmit) is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB emit_remainder_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm:
	// Emit src[nextEmit:len(src)] as a final literal (skipped when empty).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	// From here CX = literal source pointer, SI = literal length and
	// DX = length - 1 (the src base in DX is no longer needed).
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Literal header, selected on DX = length - 1:
	//   < 60       one byte: (length-1)<<2
	//   < 1<<8     0xf0 followed by 1 length byte
	//   < 1<<16    0xf4 followed by 2 length bytes
	//   < 1<<24    0xf8 followed by 3 length bytes
	//   otherwise  0xfc followed by 4 length bytes
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00010000
	JB three_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x01000000
	JB four_bytes_emit_remainder_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm

four_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm

three_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm

two_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 still fits the short copy; anything longer uses the long one.
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBlockAsm
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm

one_byte_emit_remainder_encodeSnappyBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies above, this variant has exact-size cases for
	// 1-2, 3 and 4-7 bytes instead of a single 8-byte move.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
	// First and last byte (the same byte when length is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm

memmove_long_emit_remainder_encodeSnappyBlockAsm:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 source bytes are saved in X0-X3 and stored last
	// with unaligned moves; the loops in between use aligned (MOVOA) stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32. R8 = 64 - (AX & 31), which makes -32(AX)(R8*1)
	// a 32-byte aligned dst address.
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while length >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0,
96(DX) 11243 MOVOU X0, 112(DX) 11244 ADDQ $0x80, DX 11245 DECQ CX 11246 JNZ zero_loop_encodeSnappyBlockAsm64K 11247 MOVL $0x00000000, 12(SP) 11248 MOVQ src_len+32(FP), CX 11249 LEAQ -9(CX), DX 11250 LEAQ -8(CX), BX 11251 MOVL BX, 8(SP) 11252 SHRQ $0x05, CX 11253 SUBL CX, DX 11254 LEAQ (AX)(DX*1), DX 11255 MOVQ DX, (SP) 11256 MOVL $0x00000001, CX 11257 MOVL CX, 16(SP) 11258 MOVQ src_base+24(FP), DX 11259 11260 search_loop_encodeSnappyBlockAsm64K: 11261 MOVL CX, BX 11262 SUBL 12(SP), BX 11263 SHRL $0x06, BX 11264 LEAL 4(CX)(BX*1), BX 11265 CMPL BX, 8(SP) 11266 JAE emit_remainder_encodeSnappyBlockAsm64K 11267 MOVQ (DX)(CX*1), SI 11268 MOVL BX, 20(SP) 11269 MOVQ $0x0000cf1bbcdcbf9b, R8 11270 MOVQ SI, R9 11271 MOVQ SI, R10 11272 SHRQ $0x08, R10 11273 SHLQ $0x10, R9 11274 IMULQ R8, R9 11275 SHRQ $0x32, R9 11276 SHLQ $0x10, R10 11277 IMULQ R8, R10 11278 SHRQ $0x32, R10 11279 MOVL 24(SP)(R9*4), BX 11280 MOVL 24(SP)(R10*4), DI 11281 MOVL CX, 24(SP)(R9*4) 11282 LEAL 1(CX), R9 11283 MOVL R9, 24(SP)(R10*4) 11284 MOVQ SI, R9 11285 SHRQ $0x10, R9 11286 SHLQ $0x10, R9 11287 IMULQ R8, R9 11288 SHRQ $0x32, R9 11289 MOVL CX, R8 11290 SUBL 16(SP), R8 11291 MOVL 1(DX)(R8*1), R10 11292 MOVQ SI, R8 11293 SHRQ $0x08, R8 11294 CMPL R8, R10 11295 JNE no_repeat_found_encodeSnappyBlockAsm64K 11296 LEAL 1(CX), SI 11297 MOVL 12(SP), BX 11298 MOVL SI, DI 11299 SUBL 16(SP), DI 11300 JZ repeat_extend_back_end_encodeSnappyBlockAsm64K 11301 11302 repeat_extend_back_loop_encodeSnappyBlockAsm64K: 11303 CMPL SI, BX 11304 JBE repeat_extend_back_end_encodeSnappyBlockAsm64K 11305 MOVB -1(DX)(DI*1), R8 11306 MOVB -1(DX)(SI*1), R9 11307 CMPB R8, R9 11308 JNE repeat_extend_back_end_encodeSnappyBlockAsm64K 11309 LEAL -1(SI), SI 11310 DECL DI 11311 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K 11312 11313 repeat_extend_back_end_encodeSnappyBlockAsm64K: 11314 MOVL 12(SP), BX 11315 CMPL BX, SI 11316 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K 11317 MOVL SI, DI 11318 MOVL SI, 12(SP) 11319 
LEAQ (DX)(BX*1), R8 11320 SUBL BX, DI 11321 LEAL -1(DI), BX 11322 CMPL BX, $0x3c 11323 JB one_byte_repeat_emit_encodeSnappyBlockAsm64K 11324 CMPL BX, $0x00000100 11325 JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K 11326 JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K 11327 11328 three_bytes_repeat_emit_encodeSnappyBlockAsm64K: 11329 MOVB $0xf4, (AX) 11330 MOVW BX, 1(AX) 11331 ADDQ $0x03, AX 11332 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K 11333 11334 two_bytes_repeat_emit_encodeSnappyBlockAsm64K: 11335 MOVB $0xf0, (AX) 11336 MOVB BL, 1(AX) 11337 ADDQ $0x02, AX 11338 CMPL BX, $0x40 11339 JB memmove_repeat_emit_encodeSnappyBlockAsm64K 11340 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K 11341 11342 one_byte_repeat_emit_encodeSnappyBlockAsm64K: 11343 SHLB $0x02, BL 11344 MOVB BL, (AX) 11345 ADDQ $0x01, AX 11346 11347 memmove_repeat_emit_encodeSnappyBlockAsm64K: 11348 LEAQ (AX)(DI*1), BX 11349 11350 // genMemMoveShort 11351 CMPQ DI, $0x08 11352 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 11353 CMPQ DI, $0x10 11354 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 11355 CMPQ DI, $0x20 11356 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 11357 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 11358 11359 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: 11360 MOVQ (R8), R9 11361 MOVQ R9, (AX) 11362 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K 11363 11364 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: 11365 MOVQ (R8), R9 11366 MOVQ -8(R8)(DI*1), R8 11367 MOVQ R9, (AX) 11368 MOVQ R8, -8(AX)(DI*1) 11369 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K 11370 11371 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: 11372 MOVOU (R8), X0 11373 MOVOU -16(R8)(DI*1), X1 11374 MOVOU X0, (AX) 11375 MOVOU X1, -16(AX)(DI*1) 11376 JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K 11377 11378 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: 11379 MOVOU (R8), X0 11380 MOVOU 16(R8), X1 11381 MOVOU -32(R8)(DI*1), X2 11382 MOVOU -16(R8)(DI*1), X3 11383 MOVOU X0, (AX) 11384 MOVOU X1, 16(AX) 11385 MOVOU X2, -32(AX)(DI*1) 11386 MOVOU X3, -16(AX)(DI*1) 11387 11388 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: 11389 MOVQ BX, AX 11390 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K 11391 11392 memmove_long_repeat_emit_encodeSnappyBlockAsm64K: 11393 LEAQ (AX)(DI*1), BX 11394 11395 // genMemMoveLong 11396 MOVOU (R8), X0 11397 MOVOU 16(R8), X1 11398 MOVOU -32(R8)(DI*1), X2 11399 MOVOU -16(R8)(DI*1), X3 11400 MOVQ DI, R10 11401 SHRQ $0x05, R10 11402 MOVQ AX, R9 11403 ANDL $0x0000001f, R9 11404 MOVQ $0x00000040, R11 11405 SUBQ R9, R11 11406 DECQ R10 11407 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 11408 LEAQ -32(R8)(R11*1), R9 11409 LEAQ -32(AX)(R11*1), R12 11410 11411 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: 11412 MOVOU (R9), X4 11413 MOVOU 16(R9), X5 11414 MOVOA X4, (R12) 11415 MOVOA X5, 16(R12) 11416 ADDQ $0x20, R12 11417 ADDQ $0x20, R9 11418 ADDQ $0x20, R11 11419 DECQ R10 11420 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back 11421 11422 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: 11423 MOVOU -32(R8)(R11*1), X4 11424 MOVOU -16(R8)(R11*1), X5 11425 MOVOA X4, -32(AX)(R11*1) 11426 MOVOA X5, -16(AX)(R11*1) 11427 ADDQ $0x20, R11 11428 CMPQ DI, R11 11429 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 11430 MOVOU X0, (AX) 11431 MOVOU X1, 16(AX) 11432 MOVOU X2, -32(AX)(DI*1) 11433 MOVOU X3, -16(AX)(DI*1) 11434 MOVQ BX, AX 11435 11436 emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: 11437 ADDL $0x05, CX 11438 MOVL CX, BX 11439 SUBL 16(SP), BX 11440 MOVQ 
src_len+32(FP), DI 11441 SUBL CX, DI 11442 LEAQ (DX)(CX*1), R8 11443 LEAQ (DX)(BX*1), BX 11444 11445 // matchLen 11446 XORL R10, R10 11447 CMPL DI, $0x08 11448 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K 11449 11450 matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: 11451 MOVQ (R8)(R10*1), R9 11452 XORQ (BX)(R10*1), R9 11453 TESTQ R9, R9 11454 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K 11455 11456 #ifdef GOAMD64_v3 11457 TZCNTQ R9, R9 11458 11459 #else 11460 BSFQ R9, R9 11461 11462 #endif 11463 SARQ $0x03, R9 11464 LEAL (R10)(R9*1), R10 11465 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K 11466 11467 matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: 11468 LEAL -8(DI), DI 11469 LEAL 8(R10), R10 11470 CMPL DI, $0x08 11471 JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K 11472 JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K 11473 11474 matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: 11475 CMPL DI, $0x04 11476 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K 11477 MOVL (R8)(R10*1), R9 11478 CMPL (BX)(R10*1), R9 11479 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K 11480 SUBL $0x04, DI 11481 LEAL 4(R10), R10 11482 11483 matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: 11484 CMPL DI, $0x02 11485 JB matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K 11486 MOVW (R8)(R10*1), R9 11487 CMPW (BX)(R10*1), R9 11488 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K 11489 SUBL $0x02, DI 11490 LEAL 2(R10), R10 11491 11492 matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: 11493 CMPL DI, $0x01 11494 JB repeat_extend_forward_end_encodeSnappyBlockAsm64K 11495 MOVB (R8)(R10*1), R9 11496 CMPB (BX)(R10*1), R9 11497 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K 11498 LEAL 1(R10), R10 11499 11500 repeat_extend_forward_end_encodeSnappyBlockAsm64K: 11501 ADDL R10, CX 11502 MOVL CX, BX 11503 SUBL SI, BX 11504 MOVL 16(SP), SI 11505 11506 // emitCopy 11507 
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: 11508 CMPL BX, $0x40 11509 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K 11510 MOVB $0xee, (AX) 11511 MOVW SI, 1(AX) 11512 LEAL -60(BX), BX 11513 ADDQ $0x03, AX 11514 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K 11515 11516 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: 11517 MOVL BX, DI 11518 SHLL $0x02, DI 11519 CMPL BX, $0x0c 11520 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K 11521 CMPL SI, $0x00000800 11522 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K 11523 LEAL -15(DI), DI 11524 MOVB SI, 1(AX) 11525 SHRL $0x08, SI 11526 SHLL $0x05, SI 11527 ORL SI, DI 11528 MOVB DI, (AX) 11529 ADDQ $0x02, AX 11530 JMP repeat_end_emit_encodeSnappyBlockAsm64K 11531 11532 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: 11533 LEAL -2(DI), DI 11534 MOVB DI, (AX) 11535 MOVW SI, 1(AX) 11536 ADDQ $0x03, AX 11537 11538 repeat_end_emit_encodeSnappyBlockAsm64K: 11539 MOVL CX, 12(SP) 11540 JMP search_loop_encodeSnappyBlockAsm64K 11541 11542 no_repeat_found_encodeSnappyBlockAsm64K: 11543 CMPL (DX)(BX*1), SI 11544 JEQ candidate_match_encodeSnappyBlockAsm64K 11545 SHRQ $0x08, SI 11546 MOVL 24(SP)(R9*4), BX 11547 LEAL 2(CX), R8 11548 CMPL (DX)(DI*1), SI 11549 JEQ candidate2_match_encodeSnappyBlockAsm64K 11550 MOVL R8, 24(SP)(R9*4) 11551 SHRQ $0x08, SI 11552 CMPL (DX)(BX*1), SI 11553 JEQ candidate3_match_encodeSnappyBlockAsm64K 11554 MOVL 20(SP), CX 11555 JMP search_loop_encodeSnappyBlockAsm64K 11556 11557 candidate3_match_encodeSnappyBlockAsm64K: 11558 ADDL $0x02, CX 11559 JMP candidate_match_encodeSnappyBlockAsm64K 11560 11561 candidate2_match_encodeSnappyBlockAsm64K: 11562 MOVL R8, 24(SP)(R9*4) 11563 INCL CX 11564 MOVL DI, BX 11565 11566 candidate_match_encodeSnappyBlockAsm64K: 11567 MOVL 12(SP), SI 11568 TESTL BX, BX 11569 JZ match_extend_back_end_encodeSnappyBlockAsm64K 11570 11571 match_extend_back_loop_encodeSnappyBlockAsm64K: 11572 CMPL CX, SI 
11573 JBE match_extend_back_end_encodeSnappyBlockAsm64K 11574 MOVB -1(DX)(BX*1), DI 11575 MOVB -1(DX)(CX*1), R8 11576 CMPB DI, R8 11577 JNE match_extend_back_end_encodeSnappyBlockAsm64K 11578 LEAL -1(CX), CX 11579 DECL BX 11580 JZ match_extend_back_end_encodeSnappyBlockAsm64K 11581 JMP match_extend_back_loop_encodeSnappyBlockAsm64K 11582 11583 match_extend_back_end_encodeSnappyBlockAsm64K: 11584 MOVL CX, SI 11585 SUBL 12(SP), SI 11586 LEAQ 3(AX)(SI*1), SI 11587 CMPQ SI, (SP) 11588 JB match_dst_size_check_encodeSnappyBlockAsm64K 11589 MOVQ $0x00000000, ret+48(FP) 11590 RET 11591 11592 match_dst_size_check_encodeSnappyBlockAsm64K: 11593 MOVL CX, SI 11594 MOVL 12(SP), DI 11595 CMPL DI, SI 11596 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K 11597 MOVL SI, R8 11598 MOVL SI, 12(SP) 11599 LEAQ (DX)(DI*1), SI 11600 SUBL DI, R8 11601 LEAL -1(R8), DI 11602 CMPL DI, $0x3c 11603 JB one_byte_match_emit_encodeSnappyBlockAsm64K 11604 CMPL DI, $0x00000100 11605 JB two_bytes_match_emit_encodeSnappyBlockAsm64K 11606 JB three_bytes_match_emit_encodeSnappyBlockAsm64K 11607 11608 three_bytes_match_emit_encodeSnappyBlockAsm64K: 11609 MOVB $0xf4, (AX) 11610 MOVW DI, 1(AX) 11611 ADDQ $0x03, AX 11612 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K 11613 11614 two_bytes_match_emit_encodeSnappyBlockAsm64K: 11615 MOVB $0xf0, (AX) 11616 MOVB DI, 1(AX) 11617 ADDQ $0x02, AX 11618 CMPL DI, $0x40 11619 JB memmove_match_emit_encodeSnappyBlockAsm64K 11620 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K 11621 11622 one_byte_match_emit_encodeSnappyBlockAsm64K: 11623 SHLB $0x02, DI 11624 MOVB DI, (AX) 11625 ADDQ $0x01, AX 11626 11627 memmove_match_emit_encodeSnappyBlockAsm64K: 11628 LEAQ (AX)(R8*1), DI 11629 11630 // genMemMoveShort 11631 CMPQ R8, $0x08 11632 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 11633 CMPQ R8, $0x10 11634 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 11635 CMPQ R8, $0x20 11636 JBE 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 11637 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 11638 11639 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: 11640 MOVQ (SI), R9 11641 MOVQ R9, (AX) 11642 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K 11643 11644 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: 11645 MOVQ (SI), R9 11646 MOVQ -8(SI)(R8*1), SI 11647 MOVQ R9, (AX) 11648 MOVQ SI, -8(AX)(R8*1) 11649 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K 11650 11651 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: 11652 MOVOU (SI), X0 11653 MOVOU -16(SI)(R8*1), X1 11654 MOVOU X0, (AX) 11655 MOVOU X1, -16(AX)(R8*1) 11656 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K 11657 11658 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: 11659 MOVOU (SI), X0 11660 MOVOU 16(SI), X1 11661 MOVOU -32(SI)(R8*1), X2 11662 MOVOU -16(SI)(R8*1), X3 11663 MOVOU X0, (AX) 11664 MOVOU X1, 16(AX) 11665 MOVOU X2, -32(AX)(R8*1) 11666 MOVOU X3, -16(AX)(R8*1) 11667 11668 memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: 11669 MOVQ DI, AX 11670 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K 11671 11672 memmove_long_match_emit_encodeSnappyBlockAsm64K: 11673 LEAQ (AX)(R8*1), DI 11674 11675 // genMemMoveLong 11676 MOVOU (SI), X0 11677 MOVOU 16(SI), X1 11678 MOVOU -32(SI)(R8*1), X2 11679 MOVOU -16(SI)(R8*1), X3 11680 MOVQ R8, R10 11681 SHRQ $0x05, R10 11682 MOVQ AX, R9 11683 ANDL $0x0000001f, R9 11684 MOVQ $0x00000040, R11 11685 SUBQ R9, R11 11686 DECQ R10 11687 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 11688 LEAQ -32(SI)(R11*1), R9 11689 LEAQ -32(AX)(R11*1), R12 11690 11691 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: 11692 MOVOU (R9), X4 11693 MOVOU 16(R9), X5 11694 MOVOA X4, (R12) 11695 MOVOA X5, 16(R12) 11696 
ADDQ $0x20, R12 11697 ADDQ $0x20, R9 11698 ADDQ $0x20, R11 11699 DECQ R10 11700 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back 11701 11702 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: 11703 MOVOU -32(SI)(R11*1), X4 11704 MOVOU -16(SI)(R11*1), X5 11705 MOVOA X4, -32(AX)(R11*1) 11706 MOVOA X5, -16(AX)(R11*1) 11707 ADDQ $0x20, R11 11708 CMPQ R8, R11 11709 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 11710 MOVOU X0, (AX) 11711 MOVOU X1, 16(AX) 11712 MOVOU X2, -32(AX)(R8*1) 11713 MOVOU X3, -16(AX)(R8*1) 11714 MOVQ DI, AX 11715 11716 emit_literal_done_match_emit_encodeSnappyBlockAsm64K: 11717 match_nolit_loop_encodeSnappyBlockAsm64K: 11718 MOVL CX, SI 11719 SUBL BX, SI 11720 MOVL SI, 16(SP) 11721 ADDL $0x04, CX 11722 ADDL $0x04, BX 11723 MOVQ src_len+32(FP), SI 11724 SUBL CX, SI 11725 LEAQ (DX)(CX*1), DI 11726 LEAQ (DX)(BX*1), BX 11727 11728 // matchLen 11729 XORL R9, R9 11730 CMPL SI, $0x08 11731 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K 11732 11733 matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: 11734 MOVQ (DI)(R9*1), R8 11735 XORQ (BX)(R9*1), R8 11736 TESTQ R8, R8 11737 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K 11738 11739 #ifdef GOAMD64_v3 11740 TZCNTQ R8, R8 11741 11742 #else 11743 BSFQ R8, R8 11744 11745 #endif 11746 SARQ $0x03, R8 11747 LEAL (R9)(R8*1), R9 11748 JMP match_nolit_end_encodeSnappyBlockAsm64K 11749 11750 matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: 11751 LEAL -8(SI), SI 11752 LEAL 8(R9), R9 11753 CMPL SI, $0x08 11754 JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K 11755 JZ match_nolit_end_encodeSnappyBlockAsm64K 11756 11757 matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: 11758 CMPL SI, $0x04 11759 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K 11760 MOVL (DI)(R9*1), R8 11761 CMPL (BX)(R9*1), R8 11762 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K 11763 SUBL $0x04, SI 11764 
LEAL 4(R9), R9 11765 11766 matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: 11767 CMPL SI, $0x02 11768 JB matchlen_match1_match_nolit_encodeSnappyBlockAsm64K 11769 MOVW (DI)(R9*1), R8 11770 CMPW (BX)(R9*1), R8 11771 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K 11772 SUBL $0x02, SI 11773 LEAL 2(R9), R9 11774 11775 matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: 11776 CMPL SI, $0x01 11777 JB match_nolit_end_encodeSnappyBlockAsm64K 11778 MOVB (DI)(R9*1), R8 11779 CMPB (BX)(R9*1), R8 11780 JNE match_nolit_end_encodeSnappyBlockAsm64K 11781 LEAL 1(R9), R9 11782 11783 match_nolit_end_encodeSnappyBlockAsm64K: 11784 ADDL R9, CX 11785 MOVL 16(SP), BX 11786 ADDL $0x04, R9 11787 MOVL CX, 12(SP) 11788 11789 // emitCopy 11790 two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: 11791 CMPL R9, $0x40 11792 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K 11793 MOVB $0xee, (AX) 11794 MOVW BX, 1(AX) 11795 LEAL -60(R9), R9 11796 ADDQ $0x03, AX 11797 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K 11798 11799 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: 11800 MOVL R9, SI 11801 SHLL $0x02, SI 11802 CMPL R9, $0x0c 11803 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K 11804 CMPL BX, $0x00000800 11805 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K 11806 LEAL -15(SI), SI 11807 MOVB BL, 1(AX) 11808 SHRL $0x08, BX 11809 SHLL $0x05, BX 11810 ORL BX, SI 11811 MOVB SI, (AX) 11812 ADDQ $0x02, AX 11813 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K 11814 11815 emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: 11816 LEAL -2(SI), SI 11817 MOVB SI, (AX) 11818 MOVW BX, 1(AX) 11819 ADDQ $0x03, AX 11820 11821 match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: 11822 CMPL CX, 8(SP) 11823 JAE emit_remainder_encodeSnappyBlockAsm64K 11824 MOVQ -2(DX)(CX*1), SI 11825 CMPQ AX, (SP) 11826 JB match_nolit_dst_ok_encodeSnappyBlockAsm64K 11827 MOVQ $0x00000000, ret+48(FP) 11828 RET 11829 11830 
match_nolit_dst_ok_encodeSnappyBlockAsm64K: 11831 MOVQ $0x0000cf1bbcdcbf9b, R8 11832 MOVQ SI, DI 11833 SHRQ $0x10, SI 11834 MOVQ SI, BX 11835 SHLQ $0x10, DI 11836 IMULQ R8, DI 11837 SHRQ $0x32, DI 11838 SHLQ $0x10, BX 11839 IMULQ R8, BX 11840 SHRQ $0x32, BX 11841 LEAL -2(CX), R8 11842 LEAQ 24(SP)(BX*4), R9 11843 MOVL (R9), BX 11844 MOVL R8, 24(SP)(DI*4) 11845 MOVL CX, (R9) 11846 CMPL (DX)(BX*1), SI 11847 JEQ match_nolit_loop_encodeSnappyBlockAsm64K 11848 INCL CX 11849 JMP search_loop_encodeSnappyBlockAsm64K 11850 11851 emit_remainder_encodeSnappyBlockAsm64K: 11852 MOVQ src_len+32(FP), CX 11853 SUBL 12(SP), CX 11854 LEAQ 3(AX)(CX*1), CX 11855 CMPQ CX, (SP) 11856 JB emit_remainder_ok_encodeSnappyBlockAsm64K 11857 MOVQ $0x00000000, ret+48(FP) 11858 RET 11859 11860 emit_remainder_ok_encodeSnappyBlockAsm64K: 11861 MOVQ src_len+32(FP), CX 11862 MOVL 12(SP), BX 11863 CMPL BX, CX 11864 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K 11865 MOVL CX, SI 11866 MOVL CX, 12(SP) 11867 LEAQ (DX)(BX*1), CX 11868 SUBL BX, SI 11869 LEAL -1(SI), DX 11870 CMPL DX, $0x3c 11871 JB one_byte_emit_remainder_encodeSnappyBlockAsm64K 11872 CMPL DX, $0x00000100 11873 JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K 11874 JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K 11875 11876 three_bytes_emit_remainder_encodeSnappyBlockAsm64K: 11877 MOVB $0xf4, (AX) 11878 MOVW DX, 1(AX) 11879 ADDQ $0x03, AX 11880 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K 11881 11882 two_bytes_emit_remainder_encodeSnappyBlockAsm64K: 11883 MOVB $0xf0, (AX) 11884 MOVB DL, 1(AX) 11885 ADDQ $0x02, AX 11886 CMPL DX, $0x40 11887 JB memmove_emit_remainder_encodeSnappyBlockAsm64K 11888 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K 11889 11890 one_byte_emit_remainder_encodeSnappyBlockAsm64K: 11891 SHLB $0x02, DL 11892 MOVB DL, (AX) 11893 ADDQ $0x01, AX 11894 11895 memmove_emit_remainder_encodeSnappyBlockAsm64K: 11896 LEAQ (AX)(SI*1), DX 11897 MOVL SI, BX 11898 11899 // genMemMoveShort 
11900 CMPQ BX, $0x03 11901 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 11902 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 11903 CMPQ BX, $0x08 11904 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 11905 CMPQ BX, $0x10 11906 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 11907 CMPQ BX, $0x20 11908 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 11909 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 11910 11911 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: 11912 MOVB (CX), SI 11913 MOVB -1(CX)(BX*1), CL 11914 MOVB SI, (AX) 11915 MOVB CL, -1(AX)(BX*1) 11916 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K 11917 11918 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: 11919 MOVW (CX), SI 11920 MOVB 2(CX), CL 11921 MOVW SI, (AX) 11922 MOVB CL, 2(AX) 11923 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K 11924 11925 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: 11926 MOVL (CX), SI 11927 MOVL -4(CX)(BX*1), CX 11928 MOVL SI, (AX) 11929 MOVL CX, -4(AX)(BX*1) 11930 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K 11931 11932 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: 11933 MOVQ (CX), SI 11934 MOVQ -8(CX)(BX*1), CX 11935 MOVQ SI, (AX) 11936 MOVQ CX, -8(AX)(BX*1) 11937 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K 11938 11939 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: 11940 MOVOU (CX), X0 11941 MOVOU -16(CX)(BX*1), X1 11942 MOVOU X0, (AX) 11943 MOVOU X1, -16(AX)(BX*1) 11944 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K 11945 11946 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: 11947 MOVOU (CX), X0 11948 MOVOU 16(CX), X1 
11949 MOVOU -32(CX)(BX*1), X2 11950 MOVOU -16(CX)(BX*1), X3 11951 MOVOU X0, (AX) 11952 MOVOU X1, 16(AX) 11953 MOVOU X2, -32(AX)(BX*1) 11954 MOVOU X3, -16(AX)(BX*1) 11955 11956 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: 11957 MOVQ DX, AX 11958 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K 11959 11960 memmove_long_emit_remainder_encodeSnappyBlockAsm64K: 11961 LEAQ (AX)(SI*1), DX 11962 MOVL SI, BX 11963 11964 // genMemMoveLong 11965 MOVOU (CX), X0 11966 MOVOU 16(CX), X1 11967 MOVOU -32(CX)(BX*1), X2 11968 MOVOU -16(CX)(BX*1), X3 11969 MOVQ BX, DI 11970 SHRQ $0x05, DI 11971 MOVQ AX, SI 11972 ANDL $0x0000001f, SI 11973 MOVQ $0x00000040, R8 11974 SUBQ SI, R8 11975 DECQ DI 11976 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 11977 LEAQ -32(CX)(R8*1), SI 11978 LEAQ -32(AX)(R8*1), R9 11979 11980 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: 11981 MOVOU (SI), X4 11982 MOVOU 16(SI), X5 11983 MOVOA X4, (R9) 11984 MOVOA X5, 16(R9) 11985 ADDQ $0x20, R9 11986 ADDQ $0x20, SI 11987 ADDQ $0x20, R8 11988 DECQ DI 11989 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back 11990 11991 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: 11992 MOVOU -32(CX)(R8*1), X4 11993 MOVOU -16(CX)(R8*1), X5 11994 MOVOA X4, -32(AX)(R8*1) 11995 MOVOA X5, -16(AX)(R8*1) 11996 ADDQ $0x20, R8 11997 CMPQ BX, R8 11998 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 11999 MOVOU X0, (AX) 12000 MOVOU X1, 16(AX) 12001 MOVOU X2, -32(AX)(BX*1) 12002 MOVOU X3, -16(AX)(BX*1) 12003 MOVQ DX, AX 12004 12005 emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: 12006 MOVQ dst_base+0(FP), CX 12007 SUBQ CX, AX 12008 MOVQ AX, ret+48(FP) 12009 RET 12010 12011 // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int 12012 // Requires: BMI, SSE2 12013 TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 12014 MOVQ 
dst_base+0(FP), AX 12015 MOVQ $0x00000080, CX 12016 LEAQ 24(SP), DX 12017 PXOR X0, X0 12018 12019 zero_loop_encodeSnappyBlockAsm12B: 12020 MOVOU X0, (DX) 12021 MOVOU X0, 16(DX) 12022 MOVOU X0, 32(DX) 12023 MOVOU X0, 48(DX) 12024 MOVOU X0, 64(DX) 12025 MOVOU X0, 80(DX) 12026 MOVOU X0, 96(DX) 12027 MOVOU X0, 112(DX) 12028 ADDQ $0x80, DX 12029 DECQ CX 12030 JNZ zero_loop_encodeSnappyBlockAsm12B 12031 MOVL $0x00000000, 12(SP) 12032 MOVQ src_len+32(FP), CX 12033 LEAQ -9(CX), DX 12034 LEAQ -8(CX), BX 12035 MOVL BX, 8(SP) 12036 SHRQ $0x05, CX 12037 SUBL CX, DX 12038 LEAQ (AX)(DX*1), DX 12039 MOVQ DX, (SP) 12040 MOVL $0x00000001, CX 12041 MOVL CX, 16(SP) 12042 MOVQ src_base+24(FP), DX 12043 12044 search_loop_encodeSnappyBlockAsm12B: 12045 MOVL CX, BX 12046 SUBL 12(SP), BX 12047 SHRL $0x05, BX 12048 LEAL 4(CX)(BX*1), BX 12049 CMPL BX, 8(SP) 12050 JAE emit_remainder_encodeSnappyBlockAsm12B 12051 MOVQ (DX)(CX*1), SI 12052 MOVL BX, 20(SP) 12053 MOVQ $0x000000cf1bbcdcbb, R8 12054 MOVQ SI, R9 12055 MOVQ SI, R10 12056 SHRQ $0x08, R10 12057 SHLQ $0x18, R9 12058 IMULQ R8, R9 12059 SHRQ $0x34, R9 12060 SHLQ $0x18, R10 12061 IMULQ R8, R10 12062 SHRQ $0x34, R10 12063 MOVL 24(SP)(R9*4), BX 12064 MOVL 24(SP)(R10*4), DI 12065 MOVL CX, 24(SP)(R9*4) 12066 LEAL 1(CX), R9 12067 MOVL R9, 24(SP)(R10*4) 12068 MOVQ SI, R9 12069 SHRQ $0x10, R9 12070 SHLQ $0x18, R9 12071 IMULQ R8, R9 12072 SHRQ $0x34, R9 12073 MOVL CX, R8 12074 SUBL 16(SP), R8 12075 MOVL 1(DX)(R8*1), R10 12076 MOVQ SI, R8 12077 SHRQ $0x08, R8 12078 CMPL R8, R10 12079 JNE no_repeat_found_encodeSnappyBlockAsm12B 12080 LEAL 1(CX), SI 12081 MOVL 12(SP), BX 12082 MOVL SI, DI 12083 SUBL 16(SP), DI 12084 JZ repeat_extend_back_end_encodeSnappyBlockAsm12B 12085 12086 repeat_extend_back_loop_encodeSnappyBlockAsm12B: 12087 CMPL SI, BX 12088 JBE repeat_extend_back_end_encodeSnappyBlockAsm12B 12089 MOVB -1(DX)(DI*1), R8 12090 MOVB -1(DX)(SI*1), R9 12091 CMPB R8, R9 12092 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B 12093 LEAL -1(SI), SI 
12094 DECL DI 12095 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B 12096 12097 repeat_extend_back_end_encodeSnappyBlockAsm12B: 12098 MOVL 12(SP), BX 12099 CMPL BX, SI 12100 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B 12101 MOVL SI, DI 12102 MOVL SI, 12(SP) 12103 LEAQ (DX)(BX*1), R8 12104 SUBL BX, DI 12105 LEAL -1(DI), BX 12106 CMPL BX, $0x3c 12107 JB one_byte_repeat_emit_encodeSnappyBlockAsm12B 12108 CMPL BX, $0x00000100 12109 JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B 12110 JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B 12111 12112 three_bytes_repeat_emit_encodeSnappyBlockAsm12B: 12113 MOVB $0xf4, (AX) 12114 MOVW BX, 1(AX) 12115 ADDQ $0x03, AX 12116 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B 12117 12118 two_bytes_repeat_emit_encodeSnappyBlockAsm12B: 12119 MOVB $0xf0, (AX) 12120 MOVB BL, 1(AX) 12121 ADDQ $0x02, AX 12122 CMPL BX, $0x40 12123 JB memmove_repeat_emit_encodeSnappyBlockAsm12B 12124 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B 12125 12126 one_byte_repeat_emit_encodeSnappyBlockAsm12B: 12127 SHLB $0x02, BL 12128 MOVB BL, (AX) 12129 ADDQ $0x01, AX 12130 12131 memmove_repeat_emit_encodeSnappyBlockAsm12B: 12132 LEAQ (AX)(DI*1), BX 12133 12134 // genMemMoveShort 12135 CMPQ DI, $0x08 12136 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 12137 CMPQ DI, $0x10 12138 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 12139 CMPQ DI, $0x20 12140 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 12141 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 12142 12143 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: 12144 MOVQ (R8), R9 12145 MOVQ R9, (AX) 12146 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B 12147 12148 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: 12149 MOVQ (R8), R9 12150 MOVQ -8(R8)(DI*1), R8 12151 MOVQ R9, (AX) 12152 MOVQ R8, 
-8(AX)(DI*1) 12153 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B 12154 12155 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: 12156 MOVOU (R8), X0 12157 MOVOU -16(R8)(DI*1), X1 12158 MOVOU X0, (AX) 12159 MOVOU X1, -16(AX)(DI*1) 12160 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B 12161 12162 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: 12163 MOVOU (R8), X0 12164 MOVOU 16(R8), X1 12165 MOVOU -32(R8)(DI*1), X2 12166 MOVOU -16(R8)(DI*1), X3 12167 MOVOU X0, (AX) 12168 MOVOU X1, 16(AX) 12169 MOVOU X2, -32(AX)(DI*1) 12170 MOVOU X3, -16(AX)(DI*1) 12171 12172 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: 12173 MOVQ BX, AX 12174 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B 12175 12176 memmove_long_repeat_emit_encodeSnappyBlockAsm12B: 12177 LEAQ (AX)(DI*1), BX 12178 12179 // genMemMoveLong 12180 MOVOU (R8), X0 12181 MOVOU 16(R8), X1 12182 MOVOU -32(R8)(DI*1), X2 12183 MOVOU -16(R8)(DI*1), X3 12184 MOVQ DI, R10 12185 SHRQ $0x05, R10 12186 MOVQ AX, R9 12187 ANDL $0x0000001f, R9 12188 MOVQ $0x00000040, R11 12189 SUBQ R9, R11 12190 DECQ R10 12191 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 12192 LEAQ -32(R8)(R11*1), R9 12193 LEAQ -32(AX)(R11*1), R12 12194 12195 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: 12196 MOVOU (R9), X4 12197 MOVOU 16(R9), X5 12198 MOVOA X4, (R12) 12199 MOVOA X5, 16(R12) 12200 ADDQ $0x20, R12 12201 ADDQ $0x20, R9 12202 ADDQ $0x20, R11 12203 DECQ R10 12204 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back 12205 12206 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: 12207 MOVOU -32(R8)(R11*1), X4 12208 MOVOU -16(R8)(R11*1), X5 12209 MOVOA X4, -32(AX)(R11*1) 12210 MOVOA X5, -16(AX)(R11*1) 12211 ADDQ $0x20, R11 12212 CMPQ DI, R11 12213 JAE 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 12214 MOVOU X0, (AX) 12215 MOVOU X1, 16(AX) 12216 MOVOU X2, -32(AX)(DI*1) 12217 MOVOU X3, -16(AX)(DI*1) 12218 MOVQ BX, AX 12219 12220 emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: 12221 ADDL $0x05, CX 12222 MOVL CX, BX 12223 SUBL 16(SP), BX 12224 MOVQ src_len+32(FP), DI 12225 SUBL CX, DI 12226 LEAQ (DX)(CX*1), R8 12227 LEAQ (DX)(BX*1), BX 12228 12229 // matchLen 12230 XORL R10, R10 12231 CMPL DI, $0x08 12232 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B 12233 12234 matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: 12235 MOVQ (R8)(R10*1), R9 12236 XORQ (BX)(R10*1), R9 12237 TESTQ R9, R9 12238 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B 12239 12240 #ifdef GOAMD64_v3 12241 TZCNTQ R9, R9 12242 12243 #else 12244 BSFQ R9, R9 12245 12246 #endif 12247 SARQ $0x03, R9 12248 LEAL (R10)(R9*1), R10 12249 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B 12250 12251 matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: 12252 LEAL -8(DI), DI 12253 LEAL 8(R10), R10 12254 CMPL DI, $0x08 12255 JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B 12256 JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B 12257 12258 matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: 12259 CMPL DI, $0x04 12260 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B 12261 MOVL (R8)(R10*1), R9 12262 CMPL (BX)(R10*1), R9 12263 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B 12264 SUBL $0x04, DI 12265 LEAL 4(R10), R10 12266 12267 matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: 12268 CMPL DI, $0x02 12269 JB matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B 12270 MOVW (R8)(R10*1), R9 12271 CMPW (BX)(R10*1), R9 12272 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B 12273 SUBL $0x02, DI 12274 LEAL 2(R10), R10 12275 12276 matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: 12277 CMPL DI, $0x01 12278 JB 
repeat_extend_forward_end_encodeSnappyBlockAsm12B 12279 MOVB (R8)(R10*1), R9 12280 CMPB (BX)(R10*1), R9 12281 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B 12282 LEAL 1(R10), R10 12283 12284 repeat_extend_forward_end_encodeSnappyBlockAsm12B: 12285 ADDL R10, CX 12286 MOVL CX, BX 12287 SUBL SI, BX 12288 MOVL 16(SP), SI 12289 12290 // emitCopy 12291 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: 12292 CMPL BX, $0x40 12293 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B 12294 MOVB $0xee, (AX) 12295 MOVW SI, 1(AX) 12296 LEAL -60(BX), BX 12297 ADDQ $0x03, AX 12298 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B 12299 12300 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: 12301 MOVL BX, DI 12302 SHLL $0x02, DI 12303 CMPL BX, $0x0c 12304 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B 12305 CMPL SI, $0x00000800 12306 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B 12307 LEAL -15(DI), DI 12308 MOVB SI, 1(AX) 12309 SHRL $0x08, SI 12310 SHLL $0x05, SI 12311 ORL SI, DI 12312 MOVB DI, (AX) 12313 ADDQ $0x02, AX 12314 JMP repeat_end_emit_encodeSnappyBlockAsm12B 12315 12316 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: 12317 LEAL -2(DI), DI 12318 MOVB DI, (AX) 12319 MOVW SI, 1(AX) 12320 ADDQ $0x03, AX 12321 12322 repeat_end_emit_encodeSnappyBlockAsm12B: 12323 MOVL CX, 12(SP) 12324 JMP search_loop_encodeSnappyBlockAsm12B 12325 12326 no_repeat_found_encodeSnappyBlockAsm12B: 12327 CMPL (DX)(BX*1), SI 12328 JEQ candidate_match_encodeSnappyBlockAsm12B 12329 SHRQ $0x08, SI 12330 MOVL 24(SP)(R9*4), BX 12331 LEAL 2(CX), R8 12332 CMPL (DX)(DI*1), SI 12333 JEQ candidate2_match_encodeSnappyBlockAsm12B 12334 MOVL R8, 24(SP)(R9*4) 12335 SHRQ $0x08, SI 12336 CMPL (DX)(BX*1), SI 12337 JEQ candidate3_match_encodeSnappyBlockAsm12B 12338 MOVL 20(SP), CX 12339 JMP search_loop_encodeSnappyBlockAsm12B 12340 12341 candidate3_match_encodeSnappyBlockAsm12B: 12342 ADDL $0x02, CX 12343 JMP 
candidate_match_encodeSnappyBlockAsm12B 12344 12345 candidate2_match_encodeSnappyBlockAsm12B: 12346 MOVL R8, 24(SP)(R9*4) 12347 INCL CX 12348 MOVL DI, BX 12349 12350 candidate_match_encodeSnappyBlockAsm12B: 12351 MOVL 12(SP), SI 12352 TESTL BX, BX 12353 JZ match_extend_back_end_encodeSnappyBlockAsm12B 12354 12355 match_extend_back_loop_encodeSnappyBlockAsm12B: 12356 CMPL CX, SI 12357 JBE match_extend_back_end_encodeSnappyBlockAsm12B 12358 MOVB -1(DX)(BX*1), DI 12359 MOVB -1(DX)(CX*1), R8 12360 CMPB DI, R8 12361 JNE match_extend_back_end_encodeSnappyBlockAsm12B 12362 LEAL -1(CX), CX 12363 DECL BX 12364 JZ match_extend_back_end_encodeSnappyBlockAsm12B 12365 JMP match_extend_back_loop_encodeSnappyBlockAsm12B 12366 12367 match_extend_back_end_encodeSnappyBlockAsm12B: 12368 MOVL CX, SI 12369 SUBL 12(SP), SI 12370 LEAQ 3(AX)(SI*1), SI 12371 CMPQ SI, (SP) 12372 JB match_dst_size_check_encodeSnappyBlockAsm12B 12373 MOVQ $0x00000000, ret+48(FP) 12374 RET 12375 12376 match_dst_size_check_encodeSnappyBlockAsm12B: 12377 MOVL CX, SI 12378 MOVL 12(SP), DI 12379 CMPL DI, SI 12380 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B 12381 MOVL SI, R8 12382 MOVL SI, 12(SP) 12383 LEAQ (DX)(DI*1), SI 12384 SUBL DI, R8 12385 LEAL -1(R8), DI 12386 CMPL DI, $0x3c 12387 JB one_byte_match_emit_encodeSnappyBlockAsm12B 12388 CMPL DI, $0x00000100 12389 JB two_bytes_match_emit_encodeSnappyBlockAsm12B 12390 JB three_bytes_match_emit_encodeSnappyBlockAsm12B 12391 12392 three_bytes_match_emit_encodeSnappyBlockAsm12B: 12393 MOVB $0xf4, (AX) 12394 MOVW DI, 1(AX) 12395 ADDQ $0x03, AX 12396 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B 12397 12398 two_bytes_match_emit_encodeSnappyBlockAsm12B: 12399 MOVB $0xf0, (AX) 12400 MOVB DI, 1(AX) 12401 ADDQ $0x02, AX 12402 CMPL DI, $0x40 12403 JB memmove_match_emit_encodeSnappyBlockAsm12B 12404 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B 12405 12406 one_byte_match_emit_encodeSnappyBlockAsm12B: 12407 SHLB $0x02, DI 12408 MOVB DI, (AX) 12409 
ADDQ $0x01, AX 12410 12411 memmove_match_emit_encodeSnappyBlockAsm12B: 12412 LEAQ (AX)(R8*1), DI 12413 12414 // genMemMoveShort 12415 CMPQ R8, $0x08 12416 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 12417 CMPQ R8, $0x10 12418 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 12419 CMPQ R8, $0x20 12420 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 12421 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 12422 12423 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: 12424 MOVQ (SI), R9 12425 MOVQ R9, (AX) 12426 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B 12427 12428 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: 12429 MOVQ (SI), R9 12430 MOVQ -8(SI)(R8*1), SI 12431 MOVQ R9, (AX) 12432 MOVQ SI, -8(AX)(R8*1) 12433 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B 12434 12435 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: 12436 MOVOU (SI), X0 12437 MOVOU -16(SI)(R8*1), X1 12438 MOVOU X0, (AX) 12439 MOVOU X1, -16(AX)(R8*1) 12440 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B 12441 12442 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: 12443 MOVOU (SI), X0 12444 MOVOU 16(SI), X1 12445 MOVOU -32(SI)(R8*1), X2 12446 MOVOU -16(SI)(R8*1), X3 12447 MOVOU X0, (AX) 12448 MOVOU X1, 16(AX) 12449 MOVOU X2, -32(AX)(R8*1) 12450 MOVOU X3, -16(AX)(R8*1) 12451 12452 memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: 12453 MOVQ DI, AX 12454 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B 12455 12456 memmove_long_match_emit_encodeSnappyBlockAsm12B: 12457 LEAQ (AX)(R8*1), DI 12458 12459 // genMemMoveLong 12460 MOVOU (SI), X0 12461 MOVOU 16(SI), X1 12462 MOVOU -32(SI)(R8*1), X2 12463 MOVOU -16(SI)(R8*1), X3 12464 MOVQ R8, R10 12465 SHRQ $0x05, R10 12466 MOVQ AX, R9 12467 ANDL $0x0000001f, R9 12468 MOVQ $0x00000040, R11 12469 
SUBQ R9, R11 12470 DECQ R10 12471 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 12472 LEAQ -32(SI)(R11*1), R9 12473 LEAQ -32(AX)(R11*1), R12 12474 12475 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: 12476 MOVOU (R9), X4 12477 MOVOU 16(R9), X5 12478 MOVOA X4, (R12) 12479 MOVOA X5, 16(R12) 12480 ADDQ $0x20, R12 12481 ADDQ $0x20, R9 12482 ADDQ $0x20, R11 12483 DECQ R10 12484 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back 12485 12486 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: 12487 MOVOU -32(SI)(R11*1), X4 12488 MOVOU -16(SI)(R11*1), X5 12489 MOVOA X4, -32(AX)(R11*1) 12490 MOVOA X5, -16(AX)(R11*1) 12491 ADDQ $0x20, R11 12492 CMPQ R8, R11 12493 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 12494 MOVOU X0, (AX) 12495 MOVOU X1, 16(AX) 12496 MOVOU X2, -32(AX)(R8*1) 12497 MOVOU X3, -16(AX)(R8*1) 12498 MOVQ DI, AX 12499 12500 emit_literal_done_match_emit_encodeSnappyBlockAsm12B: 12501 match_nolit_loop_encodeSnappyBlockAsm12B: 12502 MOVL CX, SI 12503 SUBL BX, SI 12504 MOVL SI, 16(SP) 12505 ADDL $0x04, CX 12506 ADDL $0x04, BX 12507 MOVQ src_len+32(FP), SI 12508 SUBL CX, SI 12509 LEAQ (DX)(CX*1), DI 12510 LEAQ (DX)(BX*1), BX 12511 12512 // matchLen 12513 XORL R9, R9 12514 CMPL SI, $0x08 12515 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B 12516 12517 matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: 12518 MOVQ (DI)(R9*1), R8 12519 XORQ (BX)(R9*1), R8 12520 TESTQ R8, R8 12521 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B 12522 12523 #ifdef GOAMD64_v3 12524 TZCNTQ R8, R8 12525 12526 #else 12527 BSFQ R8, R8 12528 12529 #endif 12530 SARQ $0x03, R8 12531 LEAL (R9)(R8*1), R9 12532 JMP match_nolit_end_encodeSnappyBlockAsm12B 12533 12534 matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: 12535 LEAL -8(SI), SI 12536 LEAL 8(R9), R9 12537 CMPL SI, $0x08 12538 JAE 
matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B 12539 JZ match_nolit_end_encodeSnappyBlockAsm12B 12540 12541 matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: 12542 CMPL SI, $0x04 12543 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B 12544 MOVL (DI)(R9*1), R8 12545 CMPL (BX)(R9*1), R8 12546 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B 12547 SUBL $0x04, SI 12548 LEAL 4(R9), R9 12549 12550 matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: 12551 CMPL SI, $0x02 12552 JB matchlen_match1_match_nolit_encodeSnappyBlockAsm12B 12553 MOVW (DI)(R9*1), R8 12554 CMPW (BX)(R9*1), R8 12555 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B 12556 SUBL $0x02, SI 12557 LEAL 2(R9), R9 12558 12559 matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: 12560 CMPL SI, $0x01 12561 JB match_nolit_end_encodeSnappyBlockAsm12B 12562 MOVB (DI)(R9*1), R8 12563 CMPB (BX)(R9*1), R8 12564 JNE match_nolit_end_encodeSnappyBlockAsm12B 12565 LEAL 1(R9), R9 12566 12567 match_nolit_end_encodeSnappyBlockAsm12B: 12568 ADDL R9, CX 12569 MOVL 16(SP), BX 12570 ADDL $0x04, R9 12571 MOVL CX, 12(SP) 12572 12573 // emitCopy 12574 two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: 12575 CMPL R9, $0x40 12576 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B 12577 MOVB $0xee, (AX) 12578 MOVW BX, 1(AX) 12579 LEAL -60(R9), R9 12580 ADDQ $0x03, AX 12581 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B 12582 12583 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: 12584 MOVL R9, SI 12585 SHLL $0x02, SI 12586 CMPL R9, $0x0c 12587 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B 12588 CMPL BX, $0x00000800 12589 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B 12590 LEAL -15(SI), SI 12591 MOVB BL, 1(AX) 12592 SHRL $0x08, BX 12593 SHLL $0x05, BX 12594 ORL BX, SI 12595 MOVB SI, (AX) 12596 ADDQ $0x02, AX 12597 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B 12598 12599 emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: 12600 LEAL -2(SI), SI 
12601 MOVB SI, (AX) 12602 MOVW BX, 1(AX) 12603 ADDQ $0x03, AX 12604 12605 match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: 12606 CMPL CX, 8(SP) 12607 JAE emit_remainder_encodeSnappyBlockAsm12B 12608 MOVQ -2(DX)(CX*1), SI 12609 CMPQ AX, (SP) 12610 JB match_nolit_dst_ok_encodeSnappyBlockAsm12B 12611 MOVQ $0x00000000, ret+48(FP) 12612 RET 12613 12614 match_nolit_dst_ok_encodeSnappyBlockAsm12B: 12615 MOVQ $0x000000cf1bbcdcbb, R8 12616 MOVQ SI, DI 12617 SHRQ $0x10, SI 12618 MOVQ SI, BX 12619 SHLQ $0x18, DI 12620 IMULQ R8, DI 12621 SHRQ $0x34, DI 12622 SHLQ $0x18, BX 12623 IMULQ R8, BX 12624 SHRQ $0x34, BX 12625 LEAL -2(CX), R8 12626 LEAQ 24(SP)(BX*4), R9 12627 MOVL (R9), BX 12628 MOVL R8, 24(SP)(DI*4) 12629 MOVL CX, (R9) 12630 CMPL (DX)(BX*1), SI 12631 JEQ match_nolit_loop_encodeSnappyBlockAsm12B 12632 INCL CX 12633 JMP search_loop_encodeSnappyBlockAsm12B 12634 12635 emit_remainder_encodeSnappyBlockAsm12B: 12636 MOVQ src_len+32(FP), CX 12637 SUBL 12(SP), CX 12638 LEAQ 3(AX)(CX*1), CX 12639 CMPQ CX, (SP) 12640 JB emit_remainder_ok_encodeSnappyBlockAsm12B 12641 MOVQ $0x00000000, ret+48(FP) 12642 RET 12643 12644 emit_remainder_ok_encodeSnappyBlockAsm12B: 12645 MOVQ src_len+32(FP), CX 12646 MOVL 12(SP), BX 12647 CMPL BX, CX 12648 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B 12649 MOVL CX, SI 12650 MOVL CX, 12(SP) 12651 LEAQ (DX)(BX*1), CX 12652 SUBL BX, SI 12653 LEAL -1(SI), DX 12654 CMPL DX, $0x3c 12655 JB one_byte_emit_remainder_encodeSnappyBlockAsm12B 12656 CMPL DX, $0x00000100 12657 JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B 12658 JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B 12659 12660 three_bytes_emit_remainder_encodeSnappyBlockAsm12B: 12661 MOVB $0xf4, (AX) 12662 MOVW DX, 1(AX) 12663 ADDQ $0x03, AX 12664 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B 12665 12666 two_bytes_emit_remainder_encodeSnappyBlockAsm12B: 12667 MOVB $0xf0, (AX) 12668 MOVB DL, 1(AX) 12669 ADDQ $0x02, AX 12670 CMPL DX, $0x40 12671 JB 
memmove_emit_remainder_encodeSnappyBlockAsm12B 12672 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B 12673 12674 one_byte_emit_remainder_encodeSnappyBlockAsm12B: 12675 SHLB $0x02, DL 12676 MOVB DL, (AX) 12677 ADDQ $0x01, AX 12678 12679 memmove_emit_remainder_encodeSnappyBlockAsm12B: 12680 LEAQ (AX)(SI*1), DX 12681 MOVL SI, BX 12682 12683 // genMemMoveShort 12684 CMPQ BX, $0x03 12685 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 12686 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 12687 CMPQ BX, $0x08 12688 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 12689 CMPQ BX, $0x10 12690 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 12691 CMPQ BX, $0x20 12692 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 12693 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 12694 12695 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: 12696 MOVB (CX), SI 12697 MOVB -1(CX)(BX*1), CL 12698 MOVB SI, (AX) 12699 MOVB CL, -1(AX)(BX*1) 12700 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B 12701 12702 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: 12703 MOVW (CX), SI 12704 MOVB 2(CX), CL 12705 MOVW SI, (AX) 12706 MOVB CL, 2(AX) 12707 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B 12708 12709 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: 12710 MOVL (CX), SI 12711 MOVL -4(CX)(BX*1), CX 12712 MOVL SI, (AX) 12713 MOVL CX, -4(AX)(BX*1) 12714 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B 12715 12716 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: 12717 MOVQ (CX), SI 12718 MOVQ -8(CX)(BX*1), CX 12719 MOVQ SI, (AX) 12720 MOVQ CX, -8(AX)(BX*1) 12721 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B 12722 12723 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: 12724 MOVOU (CX), X0 12725 MOVOU -16(CX)(BX*1), X1 12726 MOVOU X0, (AX) 12727 MOVOU X1, -16(AX)(BX*1) 12728 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B 12729 12730 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: 12731 MOVOU (CX), X0 12732 MOVOU 16(CX), X1 12733 MOVOU -32(CX)(BX*1), X2 12734 MOVOU -16(CX)(BX*1), X3 12735 MOVOU X0, (AX) 12736 MOVOU X1, 16(AX) 12737 MOVOU X2, -32(AX)(BX*1) 12738 MOVOU X3, -16(AX)(BX*1) 12739 12740 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: 12741 MOVQ DX, AX 12742 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B 12743 12744 memmove_long_emit_remainder_encodeSnappyBlockAsm12B: 12745 LEAQ (AX)(SI*1), DX 12746 MOVL SI, BX 12747 12748 // genMemMoveLong 12749 MOVOU (CX), X0 12750 MOVOU 16(CX), X1 12751 MOVOU -32(CX)(BX*1), X2 12752 MOVOU -16(CX)(BX*1), X3 12753 MOVQ BX, DI 12754 SHRQ $0x05, DI 12755 MOVQ AX, SI 12756 ANDL $0x0000001f, SI 12757 MOVQ $0x00000040, R8 12758 SUBQ SI, R8 12759 DECQ DI 12760 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 12761 LEAQ -32(CX)(R8*1), SI 12762 LEAQ -32(AX)(R8*1), R9 12763 12764 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: 12765 MOVOU (SI), X4 12766 MOVOU 16(SI), X5 12767 MOVOA X4, (R9) 12768 MOVOA X5, 16(R9) 12769 ADDQ $0x20, R9 12770 ADDQ $0x20, SI 12771 ADDQ $0x20, R8 12772 DECQ DI 12773 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back 12774 12775 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: 12776 MOVOU -32(CX)(R8*1), X4 12777 MOVOU -16(CX)(R8*1), X5 12778 MOVOA X4, -32(AX)(R8*1) 12779 MOVOA X5, -16(AX)(R8*1) 12780 ADDQ $0x20, R8 12781 CMPQ BX, R8 12782 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 12783 MOVOU X0, (AX) 12784 MOVOU X1, 16(AX) 12785 
MOVOU X2, -32(AX)(BX*1) 12786 MOVOU X3, -16(AX)(BX*1) 12787 MOVQ DX, AX 12788 12789 emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: 12790 MOVQ dst_base+0(FP), CX 12791 SUBQ CX, AX 12792 MOVQ AX, ret+48(FP) 12793 RET 12794 12795 // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int 12796 // Requires: BMI, SSE2 12797 TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 12798 MOVQ dst_base+0(FP), AX 12799 MOVQ $0x00000020, CX 12800 LEAQ 24(SP), DX 12801 PXOR X0, X0 12802 12803 zero_loop_encodeSnappyBlockAsm10B: 12804 MOVOU X0, (DX) 12805 MOVOU X0, 16(DX) 12806 MOVOU X0, 32(DX) 12807 MOVOU X0, 48(DX) 12808 MOVOU X0, 64(DX) 12809 MOVOU X0, 80(DX) 12810 MOVOU X0, 96(DX) 12811 MOVOU X0, 112(DX) 12812 ADDQ $0x80, DX 12813 DECQ CX 12814 JNZ zero_loop_encodeSnappyBlockAsm10B 12815 MOVL $0x00000000, 12(SP) 12816 MOVQ src_len+32(FP), CX 12817 LEAQ -9(CX), DX 12818 LEAQ -8(CX), BX 12819 MOVL BX, 8(SP) 12820 SHRQ $0x05, CX 12821 SUBL CX, DX 12822 LEAQ (AX)(DX*1), DX 12823 MOVQ DX, (SP) 12824 MOVL $0x00000001, CX 12825 MOVL CX, 16(SP) 12826 MOVQ src_base+24(FP), DX 12827 12828 search_loop_encodeSnappyBlockAsm10B: 12829 MOVL CX, BX 12830 SUBL 12(SP), BX 12831 SHRL $0x05, BX 12832 LEAL 4(CX)(BX*1), BX 12833 CMPL BX, 8(SP) 12834 JAE emit_remainder_encodeSnappyBlockAsm10B 12835 MOVQ (DX)(CX*1), SI 12836 MOVL BX, 20(SP) 12837 MOVQ $0x9e3779b1, R8 12838 MOVQ SI, R9 12839 MOVQ SI, R10 12840 SHRQ $0x08, R10 12841 SHLQ $0x20, R9 12842 IMULQ R8, R9 12843 SHRQ $0x36, R9 12844 SHLQ $0x20, R10 12845 IMULQ R8, R10 12846 SHRQ $0x36, R10 12847 MOVL 24(SP)(R9*4), BX 12848 MOVL 24(SP)(R10*4), DI 12849 MOVL CX, 24(SP)(R9*4) 12850 LEAL 1(CX), R9 12851 MOVL R9, 24(SP)(R10*4) 12852 MOVQ SI, R9 12853 SHRQ $0x10, R9 12854 SHLQ $0x20, R9 12855 IMULQ R8, R9 12856 SHRQ $0x36, R9 12857 MOVL CX, R8 12858 SUBL 16(SP), R8 12859 MOVL 1(DX)(R8*1), R10 12860 MOVQ SI, R8 12861 SHRQ $0x08, R8 12862 CMPL R8, R10 12863 JNE no_repeat_found_encodeSnappyBlockAsm10B 12864 LEAL 1(CX), SI 12865 MOVL 12(SP), BX 12866 
MOVL SI, DI 12867 SUBL 16(SP), DI 12868 JZ repeat_extend_back_end_encodeSnappyBlockAsm10B 12869 12870 repeat_extend_back_loop_encodeSnappyBlockAsm10B: 12871 CMPL SI, BX 12872 JBE repeat_extend_back_end_encodeSnappyBlockAsm10B 12873 MOVB -1(DX)(DI*1), R8 12874 MOVB -1(DX)(SI*1), R9 12875 CMPB R8, R9 12876 JNE repeat_extend_back_end_encodeSnappyBlockAsm10B 12877 LEAL -1(SI), SI 12878 DECL DI 12879 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B 12880 12881 repeat_extend_back_end_encodeSnappyBlockAsm10B: 12882 MOVL 12(SP), BX 12883 CMPL BX, SI 12884 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B 12885 MOVL SI, DI 12886 MOVL SI, 12(SP) 12887 LEAQ (DX)(BX*1), R8 12888 SUBL BX, DI 12889 LEAL -1(DI), BX 12890 CMPL BX, $0x3c 12891 JB one_byte_repeat_emit_encodeSnappyBlockAsm10B 12892 CMPL BX, $0x00000100 12893 JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B 12894 JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B 12895 12896 three_bytes_repeat_emit_encodeSnappyBlockAsm10B: 12897 MOVB $0xf4, (AX) 12898 MOVW BX, 1(AX) 12899 ADDQ $0x03, AX 12900 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B 12901 12902 two_bytes_repeat_emit_encodeSnappyBlockAsm10B: 12903 MOVB $0xf0, (AX) 12904 MOVB BL, 1(AX) 12905 ADDQ $0x02, AX 12906 CMPL BX, $0x40 12907 JB memmove_repeat_emit_encodeSnappyBlockAsm10B 12908 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B 12909 12910 one_byte_repeat_emit_encodeSnappyBlockAsm10B: 12911 SHLB $0x02, BL 12912 MOVB BL, (AX) 12913 ADDQ $0x01, AX 12914 12915 memmove_repeat_emit_encodeSnappyBlockAsm10B: 12916 LEAQ (AX)(DI*1), BX 12917 12918 // genMemMoveShort 12919 CMPQ DI, $0x08 12920 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 12921 CMPQ DI, $0x10 12922 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 12923 CMPQ DI, $0x20 12924 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 12925 JMP 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 12926 12927 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: 12928 MOVQ (R8), R9 12929 MOVQ R9, (AX) 12930 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B 12931 12932 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: 12933 MOVQ (R8), R9 12934 MOVQ -8(R8)(DI*1), R8 12935 MOVQ R9, (AX) 12936 MOVQ R8, -8(AX)(DI*1) 12937 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B 12938 12939 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: 12940 MOVOU (R8), X0 12941 MOVOU -16(R8)(DI*1), X1 12942 MOVOU X0, (AX) 12943 MOVOU X1, -16(AX)(DI*1) 12944 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B 12945 12946 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: 12947 MOVOU (R8), X0 12948 MOVOU 16(R8), X1 12949 MOVOU -32(R8)(DI*1), X2 12950 MOVOU -16(R8)(DI*1), X3 12951 MOVOU X0, (AX) 12952 MOVOU X1, 16(AX) 12953 MOVOU X2, -32(AX)(DI*1) 12954 MOVOU X3, -16(AX)(DI*1) 12955 12956 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: 12957 MOVQ BX, AX 12958 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B 12959 12960 memmove_long_repeat_emit_encodeSnappyBlockAsm10B: 12961 LEAQ (AX)(DI*1), BX 12962 12963 // genMemMoveLong 12964 MOVOU (R8), X0 12965 MOVOU 16(R8), X1 12966 MOVOU -32(R8)(DI*1), X2 12967 MOVOU -16(R8)(DI*1), X3 12968 MOVQ DI, R10 12969 SHRQ $0x05, R10 12970 MOVQ AX, R9 12971 ANDL $0x0000001f, R9 12972 MOVQ $0x00000040, R11 12973 SUBQ R9, R11 12974 DECQ R10 12975 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 12976 LEAQ -32(R8)(R11*1), R9 12977 LEAQ -32(AX)(R11*1), R12 12978 12979 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: 12980 MOVOU (R9), X4 12981 MOVOU 16(R9), X5 12982 MOVOA X4, (R12) 12983 MOVOA X5, 16(R12) 12984 ADDQ $0x20, R12 12985 ADDQ $0x20, R9 12986 ADDQ $0x20, R11 12987 DECQ R10 
12988 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back 12989 12990 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: 12991 MOVOU -32(R8)(R11*1), X4 12992 MOVOU -16(R8)(R11*1), X5 12993 MOVOA X4, -32(AX)(R11*1) 12994 MOVOA X5, -16(AX)(R11*1) 12995 ADDQ $0x20, R11 12996 CMPQ DI, R11 12997 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 12998 MOVOU X0, (AX) 12999 MOVOU X1, 16(AX) 13000 MOVOU X2, -32(AX)(DI*1) 13001 MOVOU X3, -16(AX)(DI*1) 13002 MOVQ BX, AX 13003 13004 emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: 13005 ADDL $0x05, CX 13006 MOVL CX, BX 13007 SUBL 16(SP), BX 13008 MOVQ src_len+32(FP), DI 13009 SUBL CX, DI 13010 LEAQ (DX)(CX*1), R8 13011 LEAQ (DX)(BX*1), BX 13012 13013 // matchLen 13014 XORL R10, R10 13015 CMPL DI, $0x08 13016 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B 13017 13018 matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: 13019 MOVQ (R8)(R10*1), R9 13020 XORQ (BX)(R10*1), R9 13021 TESTQ R9, R9 13022 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B 13023 13024 #ifdef GOAMD64_v3 13025 TZCNTQ R9, R9 13026 13027 #else 13028 BSFQ R9, R9 13029 13030 #endif 13031 SARQ $0x03, R9 13032 LEAL (R10)(R9*1), R10 13033 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B 13034 13035 matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: 13036 LEAL -8(DI), DI 13037 LEAL 8(R10), R10 13038 CMPL DI, $0x08 13039 JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B 13040 JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B 13041 13042 matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: 13043 CMPL DI, $0x04 13044 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B 13045 MOVL (R8)(R10*1), R9 13046 CMPL (BX)(R10*1), R9 13047 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B 13048 SUBL $0x04, DI 13049 LEAL 4(R10), R10 13050 13051 matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: 13052 CMPL DI, $0x02 13053 
JB matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B 13054 MOVW (R8)(R10*1), R9 13055 CMPW (BX)(R10*1), R9 13056 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B 13057 SUBL $0x02, DI 13058 LEAL 2(R10), R10 13059 13060 matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: 13061 CMPL DI, $0x01 13062 JB repeat_extend_forward_end_encodeSnappyBlockAsm10B 13063 MOVB (R8)(R10*1), R9 13064 CMPB (BX)(R10*1), R9 13065 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B 13066 LEAL 1(R10), R10 13067 13068 repeat_extend_forward_end_encodeSnappyBlockAsm10B: 13069 ADDL R10, CX 13070 MOVL CX, BX 13071 SUBL SI, BX 13072 MOVL 16(SP), SI 13073 13074 // emitCopy 13075 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: 13076 CMPL BX, $0x40 13077 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B 13078 MOVB $0xee, (AX) 13079 MOVW SI, 1(AX) 13080 LEAL -60(BX), BX 13081 ADDQ $0x03, AX 13082 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B 13083 13084 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: 13085 MOVL BX, DI 13086 SHLL $0x02, DI 13087 CMPL BX, $0x0c 13088 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B 13089 CMPL SI, $0x00000800 13090 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B 13091 LEAL -15(DI), DI 13092 MOVB SI, 1(AX) 13093 SHRL $0x08, SI 13094 SHLL $0x05, SI 13095 ORL SI, DI 13096 MOVB DI, (AX) 13097 ADDQ $0x02, AX 13098 JMP repeat_end_emit_encodeSnappyBlockAsm10B 13099 13100 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: 13101 LEAL -2(DI), DI 13102 MOVB DI, (AX) 13103 MOVW SI, 1(AX) 13104 ADDQ $0x03, AX 13105 13106 repeat_end_emit_encodeSnappyBlockAsm10B: 13107 MOVL CX, 12(SP) 13108 JMP search_loop_encodeSnappyBlockAsm10B 13109 13110 no_repeat_found_encodeSnappyBlockAsm10B: 13111 CMPL (DX)(BX*1), SI 13112 JEQ candidate_match_encodeSnappyBlockAsm10B 13113 SHRQ $0x08, SI 13114 MOVL 24(SP)(R9*4), BX 13115 LEAL 2(CX), R8 13116 CMPL (DX)(DI*1), SI 13117 JEQ 
candidate2_match_encodeSnappyBlockAsm10B 13118 MOVL R8, 24(SP)(R9*4) 13119 SHRQ $0x08, SI 13120 CMPL (DX)(BX*1), SI 13121 JEQ candidate3_match_encodeSnappyBlockAsm10B 13122 MOVL 20(SP), CX 13123 JMP search_loop_encodeSnappyBlockAsm10B 13124 13125 candidate3_match_encodeSnappyBlockAsm10B: 13126 ADDL $0x02, CX 13127 JMP candidate_match_encodeSnappyBlockAsm10B 13128 13129 candidate2_match_encodeSnappyBlockAsm10B: 13130 MOVL R8, 24(SP)(R9*4) 13131 INCL CX 13132 MOVL DI, BX 13133 13134 candidate_match_encodeSnappyBlockAsm10B: 13135 MOVL 12(SP), SI 13136 TESTL BX, BX 13137 JZ match_extend_back_end_encodeSnappyBlockAsm10B 13138 13139 match_extend_back_loop_encodeSnappyBlockAsm10B: 13140 CMPL CX, SI 13141 JBE match_extend_back_end_encodeSnappyBlockAsm10B 13142 MOVB -1(DX)(BX*1), DI 13143 MOVB -1(DX)(CX*1), R8 13144 CMPB DI, R8 13145 JNE match_extend_back_end_encodeSnappyBlockAsm10B 13146 LEAL -1(CX), CX 13147 DECL BX 13148 JZ match_extend_back_end_encodeSnappyBlockAsm10B 13149 JMP match_extend_back_loop_encodeSnappyBlockAsm10B 13150 13151 match_extend_back_end_encodeSnappyBlockAsm10B: 13152 MOVL CX, SI 13153 SUBL 12(SP), SI 13154 LEAQ 3(AX)(SI*1), SI 13155 CMPQ SI, (SP) 13156 JB match_dst_size_check_encodeSnappyBlockAsm10B 13157 MOVQ $0x00000000, ret+48(FP) 13158 RET 13159 13160 match_dst_size_check_encodeSnappyBlockAsm10B: 13161 MOVL CX, SI 13162 MOVL 12(SP), DI 13163 CMPL DI, SI 13164 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B 13165 MOVL SI, R8 13166 MOVL SI, 12(SP) 13167 LEAQ (DX)(DI*1), SI 13168 SUBL DI, R8 13169 LEAL -1(R8), DI 13170 CMPL DI, $0x3c 13171 JB one_byte_match_emit_encodeSnappyBlockAsm10B 13172 CMPL DI, $0x00000100 13173 JB two_bytes_match_emit_encodeSnappyBlockAsm10B 13174 JB three_bytes_match_emit_encodeSnappyBlockAsm10B 13175 13176 three_bytes_match_emit_encodeSnappyBlockAsm10B: 13177 MOVB $0xf4, (AX) 13178 MOVW DI, 1(AX) 13179 ADDQ $0x03, AX 13180 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B 13181 13182 
two_bytes_match_emit_encodeSnappyBlockAsm10B: 13183 MOVB $0xf0, (AX) 13184 MOVB DI, 1(AX) 13185 ADDQ $0x02, AX 13186 CMPL DI, $0x40 13187 JB memmove_match_emit_encodeSnappyBlockAsm10B 13188 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B 13189 13190 one_byte_match_emit_encodeSnappyBlockAsm10B: 13191 SHLB $0x02, DI 13192 MOVB DI, (AX) 13193 ADDQ $0x01, AX 13194 13195 memmove_match_emit_encodeSnappyBlockAsm10B: 13196 LEAQ (AX)(R8*1), DI 13197 13198 // genMemMoveShort 13199 CMPQ R8, $0x08 13200 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 13201 CMPQ R8, $0x10 13202 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 13203 CMPQ R8, $0x20 13204 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 13205 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 13206 13207 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: 13208 MOVQ (SI), R9 13209 MOVQ R9, (AX) 13210 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B 13211 13212 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: 13213 MOVQ (SI), R9 13214 MOVQ -8(SI)(R8*1), SI 13215 MOVQ R9, (AX) 13216 MOVQ SI, -8(AX)(R8*1) 13217 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B 13218 13219 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: 13220 MOVOU (SI), X0 13221 MOVOU -16(SI)(R8*1), X1 13222 MOVOU X0, (AX) 13223 MOVOU X1, -16(AX)(R8*1) 13224 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B 13225 13226 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: 13227 MOVOU (SI), X0 13228 MOVOU 16(SI), X1 13229 MOVOU -32(SI)(R8*1), X2 13230 MOVOU -16(SI)(R8*1), X3 13231 MOVOU X0, (AX) 13232 MOVOU X1, 16(AX) 13233 MOVOU X2, -32(AX)(R8*1) 13234 MOVOU X3, -16(AX)(R8*1) 13235 13236 memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: 13237 MOVQ DI, AX 13238 JMP 
emit_literal_done_match_emit_encodeSnappyBlockAsm10B 13239 13240 memmove_long_match_emit_encodeSnappyBlockAsm10B: 13241 LEAQ (AX)(R8*1), DI 13242 13243 // genMemMoveLong 13244 MOVOU (SI), X0 13245 MOVOU 16(SI), X1 13246 MOVOU -32(SI)(R8*1), X2 13247 MOVOU -16(SI)(R8*1), X3 13248 MOVQ R8, R10 13249 SHRQ $0x05, R10 13250 MOVQ AX, R9 13251 ANDL $0x0000001f, R9 13252 MOVQ $0x00000040, R11 13253 SUBQ R9, R11 13254 DECQ R10 13255 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 13256 LEAQ -32(SI)(R11*1), R9 13257 LEAQ -32(AX)(R11*1), R12 13258 13259 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: 13260 MOVOU (R9), X4 13261 MOVOU 16(R9), X5 13262 MOVOA X4, (R12) 13263 MOVOA X5, 16(R12) 13264 ADDQ $0x20, R12 13265 ADDQ $0x20, R9 13266 ADDQ $0x20, R11 13267 DECQ R10 13268 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back 13269 13270 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: 13271 MOVOU -32(SI)(R11*1), X4 13272 MOVOU -16(SI)(R11*1), X5 13273 MOVOA X4, -32(AX)(R11*1) 13274 MOVOA X5, -16(AX)(R11*1) 13275 ADDQ $0x20, R11 13276 CMPQ R8, R11 13277 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 13278 MOVOU X0, (AX) 13279 MOVOU X1, 16(AX) 13280 MOVOU X2, -32(AX)(R8*1) 13281 MOVOU X3, -16(AX)(R8*1) 13282 MOVQ DI, AX 13283 13284 emit_literal_done_match_emit_encodeSnappyBlockAsm10B: 13285 match_nolit_loop_encodeSnappyBlockAsm10B: 13286 MOVL CX, SI 13287 SUBL BX, SI 13288 MOVL SI, 16(SP) 13289 ADDL $0x04, CX 13290 ADDL $0x04, BX 13291 MOVQ src_len+32(FP), SI 13292 SUBL CX, SI 13293 LEAQ (DX)(CX*1), DI 13294 LEAQ (DX)(BX*1), BX 13295 13296 // matchLen 13297 XORL R9, R9 13298 CMPL SI, $0x08 13299 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B 13300 13301 matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: 13302 MOVQ (DI)(R9*1), R8 13303 XORQ (BX)(R9*1), R8 13304 TESTQ R8, R8 13305 JZ 
matchlen_loop_match_nolit_encodeSnappyBlockAsm10B 13306 13307 #ifdef GOAMD64_v3 13308 TZCNTQ R8, R8 13309 13310 #else 13311 BSFQ R8, R8 13312 13313 #endif 13314 SARQ $0x03, R8 13315 LEAL (R9)(R8*1), R9 13316 JMP match_nolit_end_encodeSnappyBlockAsm10B 13317 13318 matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: 13319 LEAL -8(SI), SI 13320 LEAL 8(R9), R9 13321 CMPL SI, $0x08 13322 JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B 13323 JZ match_nolit_end_encodeSnappyBlockAsm10B 13324 13325 matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: 13326 CMPL SI, $0x04 13327 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B 13328 MOVL (DI)(R9*1), R8 13329 CMPL (BX)(R9*1), R8 13330 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B 13331 SUBL $0x04, SI 13332 LEAL 4(R9), R9 13333 13334 matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: 13335 CMPL SI, $0x02 13336 JB matchlen_match1_match_nolit_encodeSnappyBlockAsm10B 13337 MOVW (DI)(R9*1), R8 13338 CMPW (BX)(R9*1), R8 13339 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B 13340 SUBL $0x02, SI 13341 LEAL 2(R9), R9 13342 13343 matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: 13344 CMPL SI, $0x01 13345 JB match_nolit_end_encodeSnappyBlockAsm10B 13346 MOVB (DI)(R9*1), R8 13347 CMPB (BX)(R9*1), R8 13348 JNE match_nolit_end_encodeSnappyBlockAsm10B 13349 LEAL 1(R9), R9 13350 13351 match_nolit_end_encodeSnappyBlockAsm10B: 13352 ADDL R9, CX 13353 MOVL 16(SP), BX 13354 ADDL $0x04, R9 13355 MOVL CX, 12(SP) 13356 13357 // emitCopy 13358 two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: 13359 CMPL R9, $0x40 13360 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B 13361 MOVB $0xee, (AX) 13362 MOVW BX, 1(AX) 13363 LEAL -60(R9), R9 13364 ADDQ $0x03, AX 13365 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B 13366 13367 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: 13368 MOVL R9, SI 13369 SHLL $0x02, SI 13370 CMPL R9, $0x0c 13371 JAE 
emit_copy_three_match_nolit_encodeSnappyBlockAsm10B 13372 CMPL BX, $0x00000800 13373 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B 13374 LEAL -15(SI), SI 13375 MOVB BL, 1(AX) 13376 SHRL $0x08, BX 13377 SHLL $0x05, BX 13378 ORL BX, SI 13379 MOVB SI, (AX) 13380 ADDQ $0x02, AX 13381 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B 13382 13383 emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: 13384 LEAL -2(SI), SI 13385 MOVB SI, (AX) 13386 MOVW BX, 1(AX) 13387 ADDQ $0x03, AX 13388 13389 match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: 13390 CMPL CX, 8(SP) 13391 JAE emit_remainder_encodeSnappyBlockAsm10B 13392 MOVQ -2(DX)(CX*1), SI 13393 CMPQ AX, (SP) 13394 JB match_nolit_dst_ok_encodeSnappyBlockAsm10B 13395 MOVQ $0x00000000, ret+48(FP) 13396 RET 13397 13398 match_nolit_dst_ok_encodeSnappyBlockAsm10B: 13399 MOVQ $0x9e3779b1, R8 13400 MOVQ SI, DI 13401 SHRQ $0x10, SI 13402 MOVQ SI, BX 13403 SHLQ $0x20, DI 13404 IMULQ R8, DI 13405 SHRQ $0x36, DI 13406 SHLQ $0x20, BX 13407 IMULQ R8, BX 13408 SHRQ $0x36, BX 13409 LEAL -2(CX), R8 13410 LEAQ 24(SP)(BX*4), R9 13411 MOVL (R9), BX 13412 MOVL R8, 24(SP)(DI*4) 13413 MOVL CX, (R9) 13414 CMPL (DX)(BX*1), SI 13415 JEQ match_nolit_loop_encodeSnappyBlockAsm10B 13416 INCL CX 13417 JMP search_loop_encodeSnappyBlockAsm10B 13418 13419 emit_remainder_encodeSnappyBlockAsm10B: 13420 MOVQ src_len+32(FP), CX 13421 SUBL 12(SP), CX 13422 LEAQ 3(AX)(CX*1), CX 13423 CMPQ CX, (SP) 13424 JB emit_remainder_ok_encodeSnappyBlockAsm10B 13425 MOVQ $0x00000000, ret+48(FP) 13426 RET 13427 13428 emit_remainder_ok_encodeSnappyBlockAsm10B: 13429 MOVQ src_len+32(FP), CX 13430 MOVL 12(SP), BX 13431 CMPL BX, CX 13432 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B 13433 MOVL CX, SI 13434 MOVL CX, 12(SP) 13435 LEAQ (DX)(BX*1), CX 13436 SUBL BX, SI 13437 LEAL -1(SI), DX 13438 CMPL DX, $0x3c 13439 JB one_byte_emit_remainder_encodeSnappyBlockAsm10B 13440 CMPL DX, $0x00000100 13441 JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B 
13442 JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B 13443 13444 three_bytes_emit_remainder_encodeSnappyBlockAsm10B: 13445 MOVB $0xf4, (AX) 13446 MOVW DX, 1(AX) 13447 ADDQ $0x03, AX 13448 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B 13449 13450 two_bytes_emit_remainder_encodeSnappyBlockAsm10B: 13451 MOVB $0xf0, (AX) 13452 MOVB DL, 1(AX) 13453 ADDQ $0x02, AX 13454 CMPL DX, $0x40 13455 JB memmove_emit_remainder_encodeSnappyBlockAsm10B 13456 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B 13457 13458 one_byte_emit_remainder_encodeSnappyBlockAsm10B: 13459 SHLB $0x02, DL 13460 MOVB DL, (AX) 13461 ADDQ $0x01, AX 13462 13463 memmove_emit_remainder_encodeSnappyBlockAsm10B: 13464 LEAQ (AX)(SI*1), DX 13465 MOVL SI, BX 13466 13467 // genMemMoveShort 13468 CMPQ BX, $0x03 13469 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 13470 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 13471 CMPQ BX, $0x08 13472 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 13473 CMPQ BX, $0x10 13474 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 13475 CMPQ BX, $0x20 13476 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 13477 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 13478 13479 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: 13480 MOVB (CX), SI 13481 MOVB -1(CX)(BX*1), CL 13482 MOVB SI, (AX) 13483 MOVB CL, -1(AX)(BX*1) 13484 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B 13485 13486 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: 13487 MOVW (CX), SI 13488 MOVB 2(CX), CL 13489 MOVW SI, (AX) 13490 MOVB CL, 2(AX) 13491 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B 13492 13493 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: 13494 MOVL (CX), SI 13495 MOVL -4(CX)(BX*1), 
CX 13496 MOVL SI, (AX) 13497 MOVL CX, -4(AX)(BX*1) 13498 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B 13499 13500 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: 13501 MOVQ (CX), SI 13502 MOVQ -8(CX)(BX*1), CX 13503 MOVQ SI, (AX) 13504 MOVQ CX, -8(AX)(BX*1) 13505 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B 13506 13507 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: 13508 MOVOU (CX), X0 13509 MOVOU -16(CX)(BX*1), X1 13510 MOVOU X0, (AX) 13511 MOVOU X1, -16(AX)(BX*1) 13512 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B 13513 13514 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: 13515 MOVOU (CX), X0 13516 MOVOU 16(CX), X1 13517 MOVOU -32(CX)(BX*1), X2 13518 MOVOU -16(CX)(BX*1), X3 13519 MOVOU X0, (AX) 13520 MOVOU X1, 16(AX) 13521 MOVOU X2, -32(AX)(BX*1) 13522 MOVOU X3, -16(AX)(BX*1) 13523 13524 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: 13525 MOVQ DX, AX 13526 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B 13527 13528 memmove_long_emit_remainder_encodeSnappyBlockAsm10B: 13529 LEAQ (AX)(SI*1), DX 13530 MOVL SI, BX 13531 13532 // genMemMoveLong 13533 MOVOU (CX), X0 13534 MOVOU 16(CX), X1 13535 MOVOU -32(CX)(BX*1), X2 13536 MOVOU -16(CX)(BX*1), X3 13537 MOVQ BX, DI 13538 SHRQ $0x05, DI 13539 MOVQ AX, SI 13540 ANDL $0x0000001f, SI 13541 MOVQ $0x00000040, R8 13542 SUBQ SI, R8 13543 DECQ DI 13544 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 13545 LEAQ -32(CX)(R8*1), SI 13546 LEAQ -32(AX)(R8*1), R9 13547 13548 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: 13549 MOVOU (SI), X4 13550 MOVOU 16(SI), X5 13551 MOVOA X4, (R9) 13552 MOVOA X5, 16(R9) 13553 ADDQ $0x20, R9 13554 ADDQ $0x20, SI 13555 ADDQ $0x20, R8 13556 DECQ DI 13557 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back 13558 13559 
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// encodeSnappyBlockAsm8B encodes src into dst and returns the number of bytes
// written to dst. It returns 0 as soon as one of the destination-limit checks
// below fails, i.e. when the output would reach the limit stored in (SP).
//
// Register roles throughout the routine:
//   AX  current write pointer into dst
//   CX  current position in src (an offset, not a pointer)
//   DX  src base pointer
//
// Stack frame slots, as used by the code below:
//   (SP)       dst limit pointer: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)      src_len - 8; positions at or beyond it go to emit_remainder
//   12(SP)     src offset where the not-yet-emitted literals begin
//   16(SP)     current repeat offset (initialised to 1)
//   20(SP)     position to resume searching at when no candidate matches
//   24(SP)...  1024-byte table: 256 32-bit src offsets, indexed by an 8-bit hash
//
// NOTE(review): this file is generated ("DO NOT EDIT"); comments added here are
// lost on regeneration unless they are also added to the generator.
//
// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the table: 8 iterations of 128 bytes = 1024 bytes starting at 24(SP).
zero_loop_encodeSnappyBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBlockAsm8B
	// Initialise the frame slots: literal start = 0, search bound, dst limit,
	// then start at position 1 with repeat offset 1.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

	// Main search loop. The next position is CX + 4 + (CX - literal start)>>4,
	// so the step grows with the length of the pending literal run.
search_loop_encodeSnappyBlockAsm8B:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x04, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm8B
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	// Hash the 4 bytes at CX (R9) and at CX+1 (R10): shifting left by 32 drops
	// the upper four bytes, the product's top 8 bits become the table index.
	MOVQ $0x9e3779b1, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x38, R9
	SHLQ $0x20, R10
	IMULQ R8, R10
	SHRQ $0x38, R10
	// Fetch the two candidates (BX, DI) and record CX and CX+1 in their slots.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)
	// R9 = hash of the 4 bytes at CX+2, used by the later candidate checks.
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x38, R9
	// Repeat check: compare the 4 bytes at CX+1 with those at CX+1-repeat.
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_encodeSnappyBlockAsm8B
	// Repeat found at CX+1: SI = match start, DI = the position it repeats.
	LEAL 1(CX), SI
	MOVL 12(SP), BX
	MOVL SI, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_encodeSnappyBlockAsm8B

	// Extend the repeat backwards while the preceding bytes match, stopping at
	// the literal start (BX) or when the earlier position reaches 0.
repeat_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL SI, BX
	JBE repeat_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(SI), SI
	DECL DI
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B

	// Emit the pending literals src[12(SP):SI] (skipped when empty).
	// R8 = literal source pointer, DI = literal length, BX = length - 1.
	// Header size is chosen from length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes.
repeat_extend_back_end_encodeSnappyBlockAsm8B:
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB one_byte_repeat_emit_encodeSnappyBlockAsm8B
	CMPL BX, $0x00000100
	JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B
	JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B

	// Tag 0xf4 followed by length-1 as a 16-bit value.
three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B

	// Tag 0xf0 followed by length-1 as one byte; the copy routine is picked by
	// whether length-1 is below 64.
two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB memmove_repeat_emit_encodeSnappyBlockAsm8B
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B

	// Single tag byte: (length-1) << 2.
one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

	// Short literal copy (at most 64 bytes). BX = dst pointer after the copy.
	// Lengths up to 8 are served by one full 8-byte load/store.
	// NOTE(review): that touches bytes beyond a literal shorter than 8;
	// presumably covered by the src_len-8 search bound and the dst limit - confirm.
memmove_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	CMPQ DI, $0x08
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
	MOVQ BX, AX
	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B

	// Long literal copy (reached only when length-1 >= 64). The first and last
	// 32 bytes are loaded into X0-X3 up front and stored after the loops; the
	// middle is moved in 32-byte steps using MOVOA stores at dst offsets that
	// start from 64 - (AX & 31).
memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ DI, R11
	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ BX, AX

	// Extend the repeat forwards. CX advances by 5 (the repeat was found at
	// CX+1 and its first 4 bytes are already known to match). R8 points at the
	// current data, BX at the data one repeat offset earlier, DI = bytes left.
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 accumulates the count of equal bytes: 8 at a time via XOR, with the
	// first differing byte located by trailing-zero count / 8, then 4/2/1-byte
	// tail compares when fewer than 8 bytes remain.
	XORL R10, R10
	CMPL DI, $0x08
	JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B

#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
	JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
	SUBL $0x04, DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL DI, $0x02
	JB matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
	SUBL $0x02, DI
	LEAL 2(R10), R10

matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
	CMPL DI, $0x01
	JB repeat_extend_forward_end_encodeSnappyBlockAsm8B
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
	LEAL 1(R10), R10

	// BX = total match length (new CX minus match start SI), SI = repeat offset.
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	// While more than 64 bytes remain, emit a 3-byte op (tag 0xee + 16-bit
	// offset) and reduce the remaining length by 60.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL BX, $0x40
	JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B

	// Final op. Length >= 12 takes the 3-byte form below; otherwise a 2-byte op:
	// tag = length*4 - 15, OR-ed with (offset>>8)<<5, then the low offset byte.
	// NOTE(review): no offset-range test guards the 2-byte form here; presumably
	// the caller bounds the input so the offset always fits - confirm.
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeSnappyBlockAsm8B

	// 3-byte op: tag = length*4 - 2, followed by the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

	// Everything up to CX is now emitted; resume searching from CX.
repeat_end_emit_encodeSnappyBlockAsm8B:
	MOVL CX, 12(SP)
	JMP search_loop_encodeSnappyBlockAsm8B

	// No repeat. Test the table candidates in turn: BX against the bytes at CX,
	// DI against the bytes at CX+1, then the CX+2 slot's previous entry against
	// the bytes at CX+2. The CX+2 slot is updated to CX+2 on either path.
	// If none match, continue at the position saved in 20(SP).
no_repeat_found_encodeSnappyBlockAsm8B:
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBlockAsm8B
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_encodeSnappyBlockAsm8B
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_encodeSnappyBlockAsm8B
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBlockAsm8B

	// Match at CX+2.
candidate3_match_encodeSnappyBlockAsm8B:
	ADDL $0x02, CX
	JMP candidate_match_encodeSnappyBlockAsm8B

	// Match at CX+1 against candidate DI.
candidate2_match_encodeSnappyBlockAsm8B:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

	// CX = match position, BX = candidate position. Extend both backwards while
	// the preceding bytes are equal, bounded by the literal start (SI) and by
	// the candidate reaching 0.
candidate_match_encodeSnappyBlockAsm8B:
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBlockAsm8B

match_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL CX, SI
	JBE match_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBlockAsm8B
	JMP match_extend_back_loop_encodeSnappyBlockAsm8B

	// Destination check: AX + pending literal length + 3 must stay below the
	// limit in (SP); otherwise return 0.
match_extend_back_end_encodeSnappyBlockAsm8B:
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB match_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Emit the pending literals src[12(SP):CX] (skipped when empty).
	// SI = literal source pointer, R8 = literal length, DI = length - 1.
	// Header selection mirrors the repeat path above.
match_dst_size_check_encodeSnappyBlockAsm8B:
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	CMPL DI, $0x3c
	JB one_byte_match_emit_encodeSnappyBlockAsm8B
	CMPL DI, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBlockAsm8B
	JB three_bytes_match_emit_encodeSnappyBlockAsm8B

three_bytes_match_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm8B

two_bytes_match_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JB memmove_match_emit_encodeSnappyBlockAsm8B
	JMP memmove_long_match_emit_encodeSnappyBlockAsm8B

one_byte_match_emit_encodeSnappyBlockAsm8B:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

	// Short literal copy (at most 64 bytes). DI = dst pointer after the copy.
memmove_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
	MOVQ DI, AX
	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B

	// Long literal copy (reached only when length-1 >= 64); same scheme as the
	// long copy in the repeat path.
memmove_long_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ R8, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(SI)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ R8, R11
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ DI, AX

	// Match with no pending literals. The offset CX - BX is stored as the new
	// repeat offset, both positions skip the 4 bytes already known to match,
	// and matchLen counts how far the match continues (result in R9).
emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R9, R9
	CMPL SI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B

matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B

#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_encodeSnappyBlockAsm8B

matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
	JZ match_nolit_end_encodeSnappyBlockAsm8B

matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
	CMPL SI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
	CMPL SI, $0x02
	JB matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
	CMPL SI, $0x01
	JB match_nolit_end_encodeSnappyBlockAsm8B
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_encodeSnappyBlockAsm8B
	LEAL 1(R9), R9

	// CX moves past the match; R9 = total match length (extension + 4),
	// BX = offset; the literal start is set to the new CX.
match_nolit_end_encodeSnappyBlockAsm8B:
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Same encoding as the copy emitted on the repeat path, with the length in
	// R9 and the offset in BX.
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R9, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

	// After the copy: stop searching once CX reaches the bound in 8(SP), and
	// return 0 if the write pointer has reached the dst limit. SI is loaded
	// with the 8 bytes starting at CX-2 for the table update that follows.
match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
	CMPL CX, 8(SP)
	JAE emit_remainder_encodeSnappyBlockAsm8B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Record CX-2 and CX in the table (hashes DI and BX respectively). If the
	// previous entry for CX's hash holds the same 4 bytes as at CX, go straight
	// to another match; otherwise advance one byte and search again.
match_nolit_dst_ok_encodeSnappyBlockAsm8B:
	MOVQ $0x9e3779b1, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x20, DI
	IMULQ R8, DI
	SHRQ $0x38, DI
	SHLQ $0x20, BX
	IMULQ R8, BX
	SHRQ $0x38, BX
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_encodeSnappyBlockAsm8B
	INCL CX
	JMP search_loop_encodeSnappyBlockAsm8B

	// Tail: check that AX + remaining length + 3 stays below the dst limit
	// (return 0 otherwise), then emit src[12(SP):src_len] as literals.
emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB emit_remainder_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// CX = literal source pointer, SI = literal length, DX = length - 1
	// (the src base in DX is no longer needed from here on).
emit_remainder_ok_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBlockAsm8B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B
	JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBlockAsm8B
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B

one_byte_emit_remainder_encodeSnappyBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

	// Short copy for the tail. Unlike the in-loop copies, this variant has
	// separate cases for 1-2, 3 and 4-7 bytes, each of which loads and stores
	// only bytes inside the literal. DX = dst pointer after the copy.
memmove_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B

	// Long copy for the tail (length-1 >= 64); same scheme as the long copies
	// above, with BX holding the length.
memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

	// Return the number of bytes written: final write pointer minus dst base.
emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00001200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBetterBlockAsm
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	CMPL BX, $0x63
	JBE check_maxskip_ok_encodeSnappyBetterBlockAsm
	LEAL 100(CX), BX
	JMP check_maxskip_cont_encodeSnappyBetterBlockAsm

check_maxskip_ok_encodeSnappyBetterBlockAsm:
	LEAL 1(CX)(BX*1), BX

check_maxskip_cont_encodeSnappyBetterBlockAsm:
	CMPL BX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm
	MOVQ (DX)(CX*1), SI
14408 MOVL BX, 20(SP) 14409 MOVQ $0x00cf1bbcdcbfa563, R8 14410 MOVQ $0x9e3779b1, BX 14411 MOVQ SI, R9 14412 MOVQ SI, R10 14413 SHLQ $0x08, R9 14414 IMULQ R8, R9 14415 SHRQ $0x2f, R9 14416 SHLQ $0x20, R10 14417 IMULQ BX, R10 14418 SHRQ $0x32, R10 14419 MOVL 24(SP)(R9*4), BX 14420 MOVL 524312(SP)(R10*4), DI 14421 MOVL CX, 24(SP)(R9*4) 14422 MOVL CX, 524312(SP)(R10*4) 14423 MOVQ (DX)(BX*1), R9 14424 MOVQ (DX)(DI*1), R10 14425 CMPQ R9, SI 14426 JEQ candidate_match_encodeSnappyBetterBlockAsm 14427 CMPQ R10, SI 14428 JNE no_short_found_encodeSnappyBetterBlockAsm 14429 MOVL DI, BX 14430 JMP candidate_match_encodeSnappyBetterBlockAsm 14431 14432 no_short_found_encodeSnappyBetterBlockAsm: 14433 CMPL R9, SI 14434 JEQ candidate_match_encodeSnappyBetterBlockAsm 14435 CMPL R10, SI 14436 JEQ candidateS_match_encodeSnappyBetterBlockAsm 14437 MOVL 20(SP), CX 14438 JMP search_loop_encodeSnappyBetterBlockAsm 14439 14440 candidateS_match_encodeSnappyBetterBlockAsm: 14441 SHRQ $0x08, SI 14442 MOVQ SI, R9 14443 SHLQ $0x08, R9 14444 IMULQ R8, R9 14445 SHRQ $0x2f, R9 14446 MOVL 24(SP)(R9*4), BX 14447 INCL CX 14448 MOVL CX, 24(SP)(R9*4) 14449 CMPL (DX)(BX*1), SI 14450 JEQ candidate_match_encodeSnappyBetterBlockAsm 14451 DECL CX 14452 MOVL DI, BX 14453 14454 candidate_match_encodeSnappyBetterBlockAsm: 14455 MOVL 12(SP), SI 14456 TESTL BX, BX 14457 JZ match_extend_back_end_encodeSnappyBetterBlockAsm 14458 14459 match_extend_back_loop_encodeSnappyBetterBlockAsm: 14460 CMPL CX, SI 14461 JBE match_extend_back_end_encodeSnappyBetterBlockAsm 14462 MOVB -1(DX)(BX*1), DI 14463 MOVB -1(DX)(CX*1), R8 14464 CMPB DI, R8 14465 JNE match_extend_back_end_encodeSnappyBetterBlockAsm 14466 LEAL -1(CX), CX 14467 DECL BX 14468 JZ match_extend_back_end_encodeSnappyBetterBlockAsm 14469 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm 14470 14471 match_extend_back_end_encodeSnappyBetterBlockAsm: 14472 MOVL CX, SI 14473 SUBL 12(SP), SI 14474 LEAQ 5(AX)(SI*1), SI 14475 CMPQ SI, (SP) 14476 JB 
match_dst_size_check_encodeSnappyBetterBlockAsm 14477 MOVQ $0x00000000, ret+48(FP) 14478 RET 14479 14480 match_dst_size_check_encodeSnappyBetterBlockAsm: 14481 MOVL CX, SI 14482 ADDL $0x04, CX 14483 ADDL $0x04, BX 14484 MOVQ src_len+32(FP), DI 14485 SUBL CX, DI 14486 LEAQ (DX)(CX*1), R8 14487 LEAQ (DX)(BX*1), R9 14488 14489 // matchLen 14490 XORL R11, R11 14491 CMPL DI, $0x08 14492 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm 14493 14494 matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: 14495 MOVQ (R8)(R11*1), R10 14496 XORQ (R9)(R11*1), R10 14497 TESTQ R10, R10 14498 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm 14499 14500 #ifdef GOAMD64_v3 14501 TZCNTQ R10, R10 14502 14503 #else 14504 BSFQ R10, R10 14505 14506 #endif 14507 SARQ $0x03, R10 14508 LEAL (R11)(R10*1), R11 14509 JMP match_nolit_end_encodeSnappyBetterBlockAsm 14510 14511 matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: 14512 LEAL -8(DI), DI 14513 LEAL 8(R11), R11 14514 CMPL DI, $0x08 14515 JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm 14516 JZ match_nolit_end_encodeSnappyBetterBlockAsm 14517 14518 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: 14519 CMPL DI, $0x04 14520 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm 14521 MOVL (R8)(R11*1), R10 14522 CMPL (R9)(R11*1), R10 14523 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm 14524 SUBL $0x04, DI 14525 LEAL 4(R11), R11 14526 14527 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: 14528 CMPL DI, $0x02 14529 JB matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm 14530 MOVW (R8)(R11*1), R10 14531 CMPW (R9)(R11*1), R10 14532 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm 14533 SUBL $0x02, DI 14534 LEAL 2(R11), R11 14535 14536 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: 14537 CMPL DI, $0x01 14538 JB match_nolit_end_encodeSnappyBetterBlockAsm 14539 MOVB (R8)(R11*1), R10 14540 CMPB (R9)(R11*1), R10 14541 JNE 
match_nolit_end_encodeSnappyBetterBlockAsm 14542 LEAL 1(R11), R11 14543 14544 match_nolit_end_encodeSnappyBetterBlockAsm: 14545 MOVL CX, DI 14546 SUBL BX, DI 14547 14548 // Check if repeat 14549 CMPL R11, $0x01 14550 JA match_length_ok_encodeSnappyBetterBlockAsm 14551 CMPL DI, $0x0000ffff 14552 JBE match_length_ok_encodeSnappyBetterBlockAsm 14553 MOVL 20(SP), CX 14554 INCL CX 14555 JMP search_loop_encodeSnappyBetterBlockAsm 14556 14557 match_length_ok_encodeSnappyBetterBlockAsm: 14558 MOVL DI, 16(SP) 14559 MOVL 12(SP), BX 14560 CMPL BX, SI 14561 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm 14562 MOVL SI, R8 14563 MOVL SI, 12(SP) 14564 LEAQ (DX)(BX*1), R9 14565 SUBL BX, R8 14566 LEAL -1(R8), BX 14567 CMPL BX, $0x3c 14568 JB one_byte_match_emit_encodeSnappyBetterBlockAsm 14569 CMPL BX, $0x00000100 14570 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm 14571 CMPL BX, $0x00010000 14572 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm 14573 CMPL BX, $0x01000000 14574 JB four_bytes_match_emit_encodeSnappyBetterBlockAsm 14575 MOVB $0xfc, (AX) 14576 MOVL BX, 1(AX) 14577 ADDQ $0x05, AX 14578 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm 14579 14580 four_bytes_match_emit_encodeSnappyBetterBlockAsm: 14581 MOVL BX, R10 14582 SHRL $0x10, R10 14583 MOVB $0xf8, (AX) 14584 MOVW BX, 1(AX) 14585 MOVB R10, 3(AX) 14586 ADDQ $0x04, AX 14587 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm 14588 14589 three_bytes_match_emit_encodeSnappyBetterBlockAsm: 14590 MOVB $0xf4, (AX) 14591 MOVW BX, 1(AX) 14592 ADDQ $0x03, AX 14593 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm 14594 14595 two_bytes_match_emit_encodeSnappyBetterBlockAsm: 14596 MOVB $0xf0, (AX) 14597 MOVB BL, 1(AX) 14598 ADDQ $0x02, AX 14599 CMPL BX, $0x40 14600 JB memmove_match_emit_encodeSnappyBetterBlockAsm 14601 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm 14602 14603 one_byte_match_emit_encodeSnappyBetterBlockAsm: 14604 SHLB $0x02, BL 14605 MOVB BL, (AX) 14606 ADDQ $0x01, 
AX 14607 14608 memmove_match_emit_encodeSnappyBetterBlockAsm: 14609 LEAQ (AX)(R8*1), BX 14610 14611 // genMemMoveShort 14612 CMPQ R8, $0x08 14613 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 14614 CMPQ R8, $0x10 14615 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 14616 CMPQ R8, $0x20 14617 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 14618 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 14619 14620 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: 14621 MOVQ (R9), R10 14622 MOVQ R10, (AX) 14623 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm 14624 14625 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: 14626 MOVQ (R9), R10 14627 MOVQ -8(R9)(R8*1), R9 14628 MOVQ R10, (AX) 14629 MOVQ R9, -8(AX)(R8*1) 14630 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm 14631 14632 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: 14633 MOVOU (R9), X0 14634 MOVOU -16(R9)(R8*1), X1 14635 MOVOU X0, (AX) 14636 MOVOU X1, -16(AX)(R8*1) 14637 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm 14638 14639 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: 14640 MOVOU (R9), X0 14641 MOVOU 16(R9), X1 14642 MOVOU -32(R9)(R8*1), X2 14643 MOVOU -16(R9)(R8*1), X3 14644 MOVOU X0, (AX) 14645 MOVOU X1, 16(AX) 14646 MOVOU X2, -32(AX)(R8*1) 14647 MOVOU X3, -16(AX)(R8*1) 14648 14649 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: 14650 MOVQ BX, AX 14651 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm 14652 14653 memmove_long_match_emit_encodeSnappyBetterBlockAsm: 14654 LEAQ (AX)(R8*1), BX 14655 14656 // genMemMoveLong 14657 MOVOU (R9), X0 14658 MOVOU 16(R9), X1 14659 MOVOU -32(R9)(R8*1), X2 14660 MOVOU -16(R9)(R8*1), X3 14661 MOVQ R8, R12 14662 SHRQ $0x05, R12 14663 MOVQ AX, R10 14664 ANDL $0x0000001f, 
R10 14665 MOVQ $0x00000040, R13 14666 SUBQ R10, R13 14667 DECQ R12 14668 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 14669 LEAQ -32(R9)(R13*1), R10 14670 LEAQ -32(AX)(R13*1), R14 14671 14672 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: 14673 MOVOU (R10), X4 14674 MOVOU 16(R10), X5 14675 MOVOA X4, (R14) 14676 MOVOA X5, 16(R14) 14677 ADDQ $0x20, R14 14678 ADDQ $0x20, R10 14679 ADDQ $0x20, R13 14680 DECQ R12 14681 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back 14682 14683 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: 14684 MOVOU -32(R9)(R13*1), X4 14685 MOVOU -16(R9)(R13*1), X5 14686 MOVOA X4, -32(AX)(R13*1) 14687 MOVOA X5, -16(AX)(R13*1) 14688 ADDQ $0x20, R13 14689 CMPQ R8, R13 14690 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 14691 MOVOU X0, (AX) 14692 MOVOU X1, 16(AX) 14693 MOVOU X2, -32(AX)(R8*1) 14694 MOVOU X3, -16(AX)(R8*1) 14695 MOVQ BX, AX 14696 14697 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: 14698 ADDL R11, CX 14699 ADDL $0x04, R11 14700 MOVL CX, 12(SP) 14701 14702 // emitCopy 14703 CMPL DI, $0x00010000 14704 JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm 14705 14706 four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: 14707 CMPL R11, $0x40 14708 JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm 14709 MOVB $0xff, (AX) 14710 MOVL DI, 1(AX) 14711 LEAL -64(R11), R11 14712 ADDQ $0x05, AX 14713 CMPL R11, $0x04 14714 JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm 14715 JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm 14716 14717 four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: 14718 TESTL R11, R11 14719 JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm 14720 XORL BX, BX 14721 LEAL -1(BX)(R11*4), R11 14722 MOVB R11, (AX) 14723 MOVL DI, 1(AX) 14724 ADDQ $0x05, AX 14725 JMP 
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm 14726 14727 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: 14728 CMPL R11, $0x40 14729 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm 14730 MOVB $0xee, (AX) 14731 MOVW DI, 1(AX) 14732 LEAL -60(R11), R11 14733 ADDQ $0x03, AX 14734 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm 14735 14736 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: 14737 MOVL R11, BX 14738 SHLL $0x02, BX 14739 CMPL R11, $0x0c 14740 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm 14741 CMPL DI, $0x00000800 14742 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm 14743 LEAL -15(BX), BX 14744 MOVB DI, 1(AX) 14745 SHRL $0x08, DI 14746 SHLL $0x05, DI 14747 ORL DI, BX 14748 MOVB BL, (AX) 14749 ADDQ $0x02, AX 14750 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm 14751 14752 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: 14753 LEAL -2(BX), BX 14754 MOVB BL, (AX) 14755 MOVW DI, 1(AX) 14756 ADDQ $0x03, AX 14757 14758 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: 14759 CMPL CX, 8(SP) 14760 JAE emit_remainder_encodeSnappyBetterBlockAsm 14761 CMPQ AX, (SP) 14762 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm 14763 MOVQ $0x00000000, ret+48(FP) 14764 RET 14765 14766 match_nolit_dst_ok_encodeSnappyBetterBlockAsm: 14767 MOVQ $0x00cf1bbcdcbfa563, BX 14768 MOVQ $0x9e3779b1, DI 14769 LEAQ 1(SI), SI 14770 LEAQ -2(CX), R8 14771 MOVQ (DX)(SI*1), R9 14772 MOVQ 1(DX)(SI*1), R10 14773 MOVQ (DX)(R8*1), R11 14774 MOVQ 1(DX)(R8*1), R12 14775 SHLQ $0x08, R9 14776 IMULQ BX, R9 14777 SHRQ $0x2f, R9 14778 SHLQ $0x20, R10 14779 IMULQ DI, R10 14780 SHRQ $0x32, R10 14781 SHLQ $0x08, R11 14782 IMULQ BX, R11 14783 SHRQ $0x2f, R11 14784 SHLQ $0x20, R12 14785 IMULQ DI, R12 14786 SHRQ $0x32, R12 14787 LEAQ 1(SI), DI 14788 LEAQ 1(R8), R13 14789 MOVL SI, 24(SP)(R9*4) 14790 MOVL R8, 24(SP)(R11*4) 14791 MOVL DI, 524312(SP)(R10*4) 14792 MOVL R13, 524312(SP)(R12*4) 14793 ADDQ $0x01, SI 14794 
SUBQ $0x01, R8 14795 14796 index_loop_encodeSnappyBetterBlockAsm: 14797 CMPQ SI, R8 14798 JAE search_loop_encodeSnappyBetterBlockAsm 14799 MOVQ (DX)(SI*1), DI 14800 MOVQ (DX)(R8*1), R9 14801 SHLQ $0x08, DI 14802 IMULQ BX, DI 14803 SHRQ $0x2f, DI 14804 SHLQ $0x08, R9 14805 IMULQ BX, R9 14806 SHRQ $0x2f, R9 14807 MOVL SI, 24(SP)(DI*4) 14808 MOVL R8, 24(SP)(R9*4) 14809 ADDQ $0x02, SI 14810 SUBQ $0x02, R8 14811 JMP index_loop_encodeSnappyBetterBlockAsm 14812 14813 emit_remainder_encodeSnappyBetterBlockAsm: 14814 MOVQ src_len+32(FP), CX 14815 SUBL 12(SP), CX 14816 LEAQ 5(AX)(CX*1), CX 14817 CMPQ CX, (SP) 14818 JB emit_remainder_ok_encodeSnappyBetterBlockAsm 14819 MOVQ $0x00000000, ret+48(FP) 14820 RET 14821 14822 emit_remainder_ok_encodeSnappyBetterBlockAsm: 14823 MOVQ src_len+32(FP), CX 14824 MOVL 12(SP), BX 14825 CMPL BX, CX 14826 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm 14827 MOVL CX, SI 14828 MOVL CX, 12(SP) 14829 LEAQ (DX)(BX*1), CX 14830 SUBL BX, SI 14831 LEAL -1(SI), DX 14832 CMPL DX, $0x3c 14833 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm 14834 CMPL DX, $0x00000100 14835 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm 14836 CMPL DX, $0x00010000 14837 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm 14838 CMPL DX, $0x01000000 14839 JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm 14840 MOVB $0xfc, (AX) 14841 MOVL DX, 1(AX) 14842 ADDQ $0x05, AX 14843 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm 14844 14845 four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: 14846 MOVL DX, BX 14847 SHRL $0x10, BX 14848 MOVB $0xf8, (AX) 14849 MOVW DX, 1(AX) 14850 MOVB BL, 3(AX) 14851 ADDQ $0x04, AX 14852 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm 14853 14854 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: 14855 MOVB $0xf4, (AX) 14856 MOVW DX, 1(AX) 14857 ADDQ $0x03, AX 14858 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm 14859 14860 
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: 14861 MOVB $0xf0, (AX) 14862 MOVB DL, 1(AX) 14863 ADDQ $0x02, AX 14864 CMPL DX, $0x40 14865 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm 14866 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm 14867 14868 one_byte_emit_remainder_encodeSnappyBetterBlockAsm: 14869 SHLB $0x02, DL 14870 MOVB DL, (AX) 14871 ADDQ $0x01, AX 14872 14873 memmove_emit_remainder_encodeSnappyBetterBlockAsm: 14874 LEAQ (AX)(SI*1), DX 14875 MOVL SI, BX 14876 14877 // genMemMoveShort 14878 CMPQ BX, $0x03 14879 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 14880 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 14881 CMPQ BX, $0x08 14882 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 14883 CMPQ BX, $0x10 14884 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 14885 CMPQ BX, $0x20 14886 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 14887 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 14888 14889 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: 14890 MOVB (CX), SI 14891 MOVB -1(CX)(BX*1), CL 14892 MOVB SI, (AX) 14893 MOVB CL, -1(AX)(BX*1) 14894 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm 14895 14896 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: 14897 MOVW (CX), SI 14898 MOVB 2(CX), CL 14899 MOVW SI, (AX) 14900 MOVB CL, 2(AX) 14901 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm 14902 14903 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: 14904 MOVL (CX), SI 14905 MOVL -4(CX)(BX*1), CX 14906 MOVL SI, (AX) 14907 MOVL CX, -4(AX)(BX*1) 14908 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm 14909 14910 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: 14911 MOVQ (CX), SI 14912 MOVQ -8(CX)(BX*1), CX 14913 MOVQ SI, (AX) 14914 MOVQ CX, -8(AX)(BX*1) 14915 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm 14916 14917 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: 14918 MOVOU (CX), X0 14919 MOVOU -16(CX)(BX*1), X1 14920 MOVOU X0, (AX) 14921 MOVOU X1, -16(AX)(BX*1) 14922 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm 14923 14924 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: 14925 MOVOU (CX), X0 14926 MOVOU 16(CX), X1 14927 MOVOU -32(CX)(BX*1), X2 14928 MOVOU -16(CX)(BX*1), X3 14929 MOVOU X0, (AX) 14930 MOVOU X1, 16(AX) 14931 MOVOU X2, -32(AX)(BX*1) 14932 MOVOU X3, -16(AX)(BX*1) 14933 14934 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: 14935 MOVQ DX, AX 14936 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm 14937 14938 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: 14939 LEAQ (AX)(SI*1), DX 14940 MOVL SI, BX 14941 14942 // genMemMoveLong 14943 MOVOU (CX), X0 14944 MOVOU 16(CX), X1 14945 MOVOU -32(CX)(BX*1), X2 14946 MOVOU -16(CX)(BX*1), X3 14947 MOVQ BX, DI 14948 SHRQ $0x05, DI 14949 MOVQ AX, SI 14950 ANDL $0x0000001f, SI 14951 MOVQ $0x00000040, R8 14952 SUBQ SI, R8 14953 DECQ DI 14954 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 14955 LEAQ -32(CX)(R8*1), SI 14956 LEAQ -32(AX)(R8*1), R9 14957 14958 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: 14959 MOVOU (SI), X4 14960 MOVOU 16(SI), X5 14961 MOVOA X4, (R9) 14962 MOVOA X5, 16(R9) 14963 ADDQ $0x20, R9 14964 ADDQ $0x20, SI 14965 ADDQ $0x20, R8 14966 DECQ DI 14967 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back 14968 14969 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: 14970 MOVOU 
-32(CX)(R8*1), X4 14971 MOVOU -16(CX)(R8*1), X5 14972 MOVOA X4, -32(AX)(R8*1) 14973 MOVOA X5, -16(AX)(R8*1) 14974 ADDQ $0x20, R8 14975 CMPQ BX, R8 14976 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 14977 MOVOU X0, (AX) 14978 MOVOU X1, 16(AX) 14979 MOVOU X2, -32(AX)(BX*1) 14980 MOVOU X3, -16(AX)(BX*1) 14981 MOVQ DX, AX 14982 14983 emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: 14984 MOVQ dst_base+0(FP), CX 14985 SUBQ CX, AX 14986 MOVQ AX, ret+48(FP) 14987 RET 14988

// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written to dst
// (AX - dst_base at the end), or 0 when one of the destination-space checks
// below fails.
//
// Stack frame as used by the code in this function:
//   0(SP)       8 bytes       dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)       4 bytes       src_len - 8; search positions at or past it go to emit_remainder
//   12(SP)      4 bytes       src offset of the first byte that has not been emitted yet
//   16(SP)      4 bytes       offset of the latest match (stored, never loaded in this function)
//   20(SP)      4 bytes       next search position, used when the current one has no candidate
//   24(SP)      262144 bytes  table A: 65536 4-byte src positions, 16-bit hash index
//   262168(SP)  65536 bytes   table B: 16384 4-byte src positions, 14-bit hash index
//
// Register roles in the main loop: AX = dst write pointer, DX = src base,
// CX = current src position.
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	// 0xa00 iterations x 128 bytes = 327680 bytes: clears both tables starting at 24(SP).
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBetterBlockAsm64K
	// Nothing emitted yet.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	// 8(SP) = src_len - 8.
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	// (SP) = dst_base + src_len - 9 - (src_len >> 5).
	MOVQ DX, (SP)
	// Searching starts at src position 1.
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm64K:
	// BX = CX + 1 + ((CX - 12(SP)) >> 7): the step grows by one for every
	// 128 bytes scanned since the last emitted position.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	LEAL 1(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm64K
	// SI = 8 bytes of src at the current position.
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	MOVQ $0x00cf1bbcdcbfa563, R8
	MOVQ $0x9e3779b1, BX
	MOVQ SI, R9
	MOVQ SI, R10
	// R9 = ((SI << 8) * R8) >> 48: 16-bit index; the shift drops the top byte of the load.
	SHLQ $0x08, R9
	IMULQ R8, R9
	SHRQ $0x30, R9
	// R10 = ((SI << 32) * BX) >> 50: 14-bit index computed from the low 4 bytes of the load.
	SHLQ $0x20, R10
	IMULQ BX, R10
	SHRQ $0x32, R10
	// Fetch the previous positions (BX from table A, DI from table B), then
	// record the current position in both tables.
	MOVL 24(SP)(R9*4), BX
	MOVL 262168(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	MOVL CX, 262168(SP)(R10*4)
	MOVQ (DX)(BX*1), R9
	MOVQ (DX)(DI*1), R10
	// Full 8-byte equality with the table A candidate, then with the table B candidate.
	CMPQ R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
	CMPQ R10, SI
	JNE no_short_found_encodeSnappyBetterBlockAsm64K
	MOVL DI, BX
	JMP candidate_match_encodeSnappyBetterBlockAsm64K

no_short_found_encodeSnappyBetterBlockAsm64K:
	// Fall back to comparing only the low 4 bytes of each candidate.
	CMPL R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
	CMPL R10, SI
	JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
	// No candidate: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBetterBlockAsm64K

candidateS_match_encodeSnappyBetterBlockAsm64K:
	// Only the table B candidate matched 4 bytes. Probe table A for position
	// CX+1 (SI shifted down one byte) before settling for it.
	SHRQ $0x08, SI
	MOVQ SI, R9
	SHLQ $0x08, R9
	IMULQ R8, R9
	SHRQ $0x30, R9
	MOVL 24(SP)(R9*4), BX
	INCL CX
	MOVL CX, 24(SP)(R9*4)
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
	// Probe failed: undo the increment and use the table B candidate kept in DI.
	DECL CX
	MOVL DI, BX

candidate_match_encodeSnappyBetterBlockAsm64K:
	// BX = candidate position, CX = current position, SI = first unemitted position.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K

match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
	// Step both positions back while the preceding bytes are equal, stopping at
	// the first unemitted position or when the candidate reaches 0.
	CMPL CX, SI
	JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K

match_extend_back_end_encodeSnappyBetterBlockAsm64K:
	// Return 0 unless AX + pending literal bytes + 3 is below the dst limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm64K:
	// SI = match start. Both positions advance by 4 before the length count;
	// every path into candidate_match compared at least 4 bytes as equal.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	// DI = bytes of src remaining after CX.
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 counts the equal bytes found at R8 / R9; DI is the remaining budget.
	XORL R11, R11
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
	// Compare 8 bytes at a time; a zero XOR means all 8 are equal.
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K

#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	// Index of the lowest differing bit, divided by 8, is the count of equal bytes.
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	CMPL DI, $0x08
	JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
	// NOTE(review): when the JAE above falls through DI is below 8, so ZF is
	// clear and this JZ is never taken; looks like a generator artifact.
	JZ match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
	SUBL $0x04, DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL DI, $0x02
	JB matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
	SUBL $0x02, DI
	LEAL 2(R11), R11

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
	CMPL DI, $0x01
	JB match_nolit_end_encodeSnappyBetterBlockAsm64K
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm64K:
	// DI = CX - BX, the distance back to the candidate (the copy offset).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// (No repeat test follows in this variant; the offset is only stored.)
	MOVL DI, 16(SP)
	// Emit the pending literals [12(SP), SI) unless the range is empty.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
	// R8 = literal length, R9 = literal source pointer, BX = length - 1.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
	CMPL BX, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
	// NOTE(review): flags are unchanged since the CMPL above, so this second JB
	// cannot be taken once the first fell through; control always falls into
	// the label below. Looks like a generator artifact.
	JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K

three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	// Header byte 0xf4 followed by the low 16 bits of length-1.
	// presumably length-1 always fits in 16 bits for this 64K variant - TODO confirm against caller
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	// Header byte 0xf0 followed by length-1 in one byte.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	// length-1 < 64 takes the short copy, anything larger the long copy.
	CMPL BX, $0x40
	JB memmove_match_emit_encodeSnappyBetterBlockAsm64K
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
	// Single header byte: (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm64K:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Copies R8 bytes from (R9) to (AX) using overlapping head/tail moves.
	CMPQ R8, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes, even when R8 is smaller.
	// NOTE(review): relies on readable/writable slack past the literal - confirm the margins guarantee it.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVQ BX, AX
	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K

memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Save the first and last 32 source bytes in X0-X3; they are stored last,
	// with unaligned moves, after the aligned middle has been copied.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R12 = length / 32, R13 = 64 - (AX & 31).
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	// NOTE(review): DECQ does not write CF, so this JA sees ZF from the DECQ
	// and the CF left by the SUBQ above.
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	// R14 = AX + 32 - (AX & 31), a 32-byte aligned dst address; R10 is the
	// matching source address.
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	// 32 bytes per iteration with aligned stores, while R8 >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	// Store the saved head and tail blocks.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
	// CX = end of the match; R11 = total match length (counted bytes + the
	// initial 4); everything before CX is now accounted for.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// R11 = remaining copy length, DI = offset (written as 16 bits below).
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
	// While more than 64 bytes remain, emit 0xee plus the 16-bit offset and
	// reduce the remaining length by 60.
	CMPL R11, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
	// BX = length * 4. The 2-byte form is used only when length < 12 and
	// offset < 2048; otherwise the 3-byte form.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	CMPL DI, $0x00000800
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	// 2-byte form: first byte = (length*4 - 15) | ((offset >> 8) << 5),
	// second byte = low 8 bits of the offset.
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
	// 3-byte form: first byte = length*4 - 2, then the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
	// Finish with the remainder once CX reaches src_len - 8; return 0 when the
	// dst pointer has reached the limit.
	CMPL CX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
	// Index positions inside the match just emitted. SI = match start + 1 and
	// R8 = match end - 2 go into table A; SI+1 and R8+1 go into table B.
	MOVQ $0x00cf1bbcdcbfa563, BX
	MOVQ $0x9e3779b1, DI
	LEAQ 1(SI), SI
	LEAQ -2(CX), R8
	MOVQ (DX)(SI*1), R9
	MOVQ 1(DX)(SI*1), R10
	MOVQ (DX)(R8*1), R11
	MOVQ 1(DX)(R8*1), R12
	SHLQ $0x08, R9
	IMULQ BX, R9
	SHRQ $0x30, R9
	SHLQ $0x20, R10
	IMULQ DI, R10
	SHRQ $0x32, R10
	SHLQ $0x08, R11
	IMULQ BX, R11
	SHRQ $0x30, R11
	SHLQ $0x20, R12
	IMULQ DI, R12
	SHRQ $0x32, R12
	LEAQ 1(SI), DI
	LEAQ 1(R8), R13
	MOVL SI, 24(SP)(R9*4)
	MOVL R8, 24(SP)(R11*4)
	MOVL DI, 262168(SP)(R10*4)
	MOVL R13, 262168(SP)(R12*4)
	ADDQ $0x01, SI
	SUBQ $0x01, R8

index_loop_encodeSnappyBetterBlockAsm64K:
	// Walk inward from both ends in steps of 2, recording positions in table A
	// only, until the two cursors meet; then resume searching at CX.
	CMPQ SI, R8
	JAE search_loop_encodeSnappyBetterBlockAsm64K
	MOVQ (DX)(SI*1), DI
	MOVQ (DX)(R8*1), R9
	SHLQ $0x08, DI
	IMULQ BX, DI
	SHRQ $0x30, DI
	SHLQ $0x08, R9
	IMULQ BX, R9
	SHRQ $0x30, R9
	MOVL SI, 24(SP)(DI*4)
	MOVL R8, 24(SP)(R9*4)
	ADDQ $0x02, SI
	SUBQ $0x02, R8
	JMP index_loop_encodeSnappyBetterBlockAsm64K

emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return 0 unless AX + (src_len - 12(SP)) + 3 is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
	// Emit everything from 12(SP) to src_len as one literal, if non-empty.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
	// SI = literal length, CX = literal source pointer. DX (src base until now)
	// is reused for length - 1.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
	// NOTE(review): same flags as the JB above, so this branch is never taken;
	// control falls through into the label below.
	JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Header byte 0xf4 followed by the low 16 bits of length-1.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Header byte 0xf0 followed by length-1 in one byte.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Single header byte: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Size-class dispatch: 1-2, 3, 4-7, 8-16, 17-32, otherwise 33-64 bytes.
	// Each class moves only bytes inside [0, BX).
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy above, with CX as source and BX as length:
	// head and tail saved in X0-X3, aligned 32-byte stores for the middle.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32, R8 = 64 - (AX & 31).
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	// Store the saved head and tail blocks.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

15546 // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int 15547 // Requires: BMI, SSE2 15548 TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 15549 MOVQ dst_base+0(FP), AX 15550 MOVQ $0x00000280, CX 15551 LEAQ 24(SP), DX 15552 PXOR X0, X0 15553 15554 zero_loop_encodeSnappyBetterBlockAsm12B: 15555 MOVOU X0, (DX) 15556 MOVOU X0, 16(DX) 15557 MOVOU X0, 32(DX) 15558 MOVOU X0, 48(DX) 15559 MOVOU X0, 64(DX) 15560 MOVOU X0, 80(DX) 15561 MOVOU X0, 96(DX) 15562 MOVOU X0, 112(DX) 15563 ADDQ $0x80, DX 15564 DECQ CX 15565 JNZ zero_loop_encodeSnappyBetterBlockAsm12B 15566 MOVL $0x00000000, 12(SP) 15567 MOVQ src_len+32(FP), CX 15568 LEAQ -9(CX), DX 15569 LEAQ -8(CX), BX 15570 MOVL BX, 8(SP) 15571 SHRQ $0x05, CX 15572 SUBL CX, DX 15573 LEAQ (AX)(DX*1), DX 15574 MOVQ DX, (SP) 15575 MOVL $0x00000001, CX 15576 MOVL $0x00000000, 16(SP) 15577 MOVQ src_base+24(FP), DX 15578 15579 search_loop_encodeSnappyBetterBlockAsm12B: 15580 MOVL CX, BX 15581 SUBL 12(SP), BX 15582 SHRL $0x06, BX 15583 LEAL 1(CX)(BX*1), BX 15584 CMPL BX, 8(SP) 15585 JAE emit_remainder_encodeSnappyBetterBlockAsm12B 15586 MOVQ (DX)(CX*1), SI 15587 MOVL BX, 20(SP) 15588 MOVQ $0x0000cf1bbcdcbf9b, R8 15589 MOVQ $0x9e3779b1, BX 15590 MOVQ SI, R9 15591 MOVQ SI, R10 15592 SHLQ $0x10, R9 15593 IMULQ R8, R9 15594 SHRQ $0x32, R9 15595 SHLQ $0x20, R10 15596 IMULQ BX, R10 15597 SHRQ $0x34, R10 15598 MOVL 24(SP)(R9*4), BX 15599 MOVL 65560(SP)(R10*4), DI 15600 MOVL CX, 24(SP)(R9*4) 15601 MOVL CX, 65560(SP)(R10*4) 15602 MOVQ (DX)(BX*1), R9 15603 MOVQ (DX)(DI*1), R10 15604 CMPQ R9, SI 15605 JEQ candidate_match_encodeSnappyBetterBlockAsm12B 15606
CMPQ R10, SI 15607 JNE no_short_found_encodeSnappyBetterBlockAsm12B 15608 MOVL DI, BX 15609 JMP candidate_match_encodeSnappyBetterBlockAsm12B 15610 15611 no_short_found_encodeSnappyBetterBlockAsm12B: 15612 CMPL R9, SI 15613 JEQ candidate_match_encodeSnappyBetterBlockAsm12B 15614 CMPL R10, SI 15615 JEQ candidateS_match_encodeSnappyBetterBlockAsm12B 15616 MOVL 20(SP), CX 15617 JMP search_loop_encodeSnappyBetterBlockAsm12B 15618 15619 candidateS_match_encodeSnappyBetterBlockAsm12B: 15620 SHRQ $0x08, SI 15621 MOVQ SI, R9 15622 SHLQ $0x10, R9 15623 IMULQ R8, R9 15624 SHRQ $0x32, R9 15625 MOVL 24(SP)(R9*4), BX 15626 INCL CX 15627 MOVL CX, 24(SP)(R9*4) 15628 CMPL (DX)(BX*1), SI 15629 JEQ candidate_match_encodeSnappyBetterBlockAsm12B 15630 DECL CX 15631 MOVL DI, BX 15632 15633 candidate_match_encodeSnappyBetterBlockAsm12B: 15634 MOVL 12(SP), SI 15635 TESTL BX, BX 15636 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B 15637 15638 match_extend_back_loop_encodeSnappyBetterBlockAsm12B: 15639 CMPL CX, SI 15640 JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B 15641 MOVB -1(DX)(BX*1), DI 15642 MOVB -1(DX)(CX*1), R8 15643 CMPB DI, R8 15644 JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B 15645 LEAL -1(CX), CX 15646 DECL BX 15647 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B 15648 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B 15649 15650 match_extend_back_end_encodeSnappyBetterBlockAsm12B: 15651 MOVL CX, SI 15652 SUBL 12(SP), SI 15653 LEAQ 3(AX)(SI*1), SI 15654 CMPQ SI, (SP) 15655 JB match_dst_size_check_encodeSnappyBetterBlockAsm12B 15656 MOVQ $0x00000000, ret+48(FP) 15657 RET 15658 15659 match_dst_size_check_encodeSnappyBetterBlockAsm12B: 15660 MOVL CX, SI 15661 ADDL $0x04, CX 15662 ADDL $0x04, BX 15663 MOVQ src_len+32(FP), DI 15664 SUBL CX, DI 15665 LEAQ (DX)(CX*1), R8 15666 LEAQ (DX)(BX*1), R9 15667 15668 // matchLen 15669 XORL R11, R11 15670 CMPL DI, $0x08 15671 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B 15672 15673 
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: 15674 MOVQ (R8)(R11*1), R10 15675 XORQ (R9)(R11*1), R10 15676 TESTQ R10, R10 15677 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B 15678 15679 #ifdef GOAMD64_v3 15680 TZCNTQ R10, R10 15681 15682 #else 15683 BSFQ R10, R10 15684 15685 #endif 15686 SARQ $0x03, R10 15687 LEAL (R11)(R10*1), R11 15688 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B 15689 15690 matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: 15691 LEAL -8(DI), DI 15692 LEAL 8(R11), R11 15693 CMPL DI, $0x08 15694 JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B 15695 JZ match_nolit_end_encodeSnappyBetterBlockAsm12B 15696 15697 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: 15698 CMPL DI, $0x04 15699 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B 15700 MOVL (R8)(R11*1), R10 15701 CMPL (R9)(R11*1), R10 15702 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B 15703 SUBL $0x04, DI 15704 LEAL 4(R11), R11 15705 15706 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: 15707 CMPL DI, $0x02 15708 JB matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B 15709 MOVW (R8)(R11*1), R10 15710 CMPW (R9)(R11*1), R10 15711 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B 15712 SUBL $0x02, DI 15713 LEAL 2(R11), R11 15714 15715 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: 15716 CMPL DI, $0x01 15717 JB match_nolit_end_encodeSnappyBetterBlockAsm12B 15718 MOVB (R8)(R11*1), R10 15719 CMPB (R9)(R11*1), R10 15720 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B 15721 LEAL 1(R11), R11 15722 15723 match_nolit_end_encodeSnappyBetterBlockAsm12B: 15724 MOVL CX, DI 15725 SUBL BX, DI 15726 15727 // Check if repeat 15728 MOVL DI, 16(SP) 15729 MOVL 12(SP), BX 15730 CMPL BX, SI 15731 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B 15732 MOVL SI, R8 15733 MOVL SI, 12(SP) 15734 LEAQ (DX)(BX*1), R9 15735 SUBL BX, R8 15736 LEAL -1(R8), BX 15737 CMPL BX, $0x3c 
15738 JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B 15739 CMPL BX, $0x00000100 15740 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B 15741 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B 15742 15743 three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: 15744 MOVB $0xf4, (AX) 15745 MOVW BX, 1(AX) 15746 ADDQ $0x03, AX 15747 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B 15748 15749 two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: 15750 MOVB $0xf0, (AX) 15751 MOVB BL, 1(AX) 15752 ADDQ $0x02, AX 15753 CMPL BX, $0x40 15754 JB memmove_match_emit_encodeSnappyBetterBlockAsm12B 15755 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B 15756 15757 one_byte_match_emit_encodeSnappyBetterBlockAsm12B: 15758 SHLB $0x02, BL 15759 MOVB BL, (AX) 15760 ADDQ $0x01, AX 15761 15762 memmove_match_emit_encodeSnappyBetterBlockAsm12B: 15763 LEAQ (AX)(R8*1), BX 15764 15765 // genMemMoveShort 15766 CMPQ R8, $0x08 15767 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 15768 CMPQ R8, $0x10 15769 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 15770 CMPQ R8, $0x20 15771 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 15772 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 15773 15774 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: 15775 MOVQ (R9), R10 15776 MOVQ R10, (AX) 15777 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B 15778 15779 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: 15780 MOVQ (R9), R10 15781 MOVQ -8(R9)(R8*1), R9 15782 MOVQ R10, (AX) 15783 MOVQ R9, -8(AX)(R8*1) 15784 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B 15785 15786 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: 15787 MOVOU (R9), X0 15788 MOVOU -16(R9)(R8*1), X1 15789 MOVOU X0, (AX) 15790 MOVOU X1, -16(AX)(R8*1) 
15791 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B 15792 15793 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: 15794 MOVOU (R9), X0 15795 MOVOU 16(R9), X1 15796 MOVOU -32(R9)(R8*1), X2 15797 MOVOU -16(R9)(R8*1), X3 15798 MOVOU X0, (AX) 15799 MOVOU X1, 16(AX) 15800 MOVOU X2, -32(AX)(R8*1) 15801 MOVOU X3, -16(AX)(R8*1) 15802 15803 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: 15804 MOVQ BX, AX 15805 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B 15806 15807 memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: 15808 LEAQ (AX)(R8*1), BX 15809 15810 // genMemMoveLong 15811 MOVOU (R9), X0 15812 MOVOU 16(R9), X1 15813 MOVOU -32(R9)(R8*1), X2 15814 MOVOU -16(R9)(R8*1), X3 15815 MOVQ R8, R12 15816 SHRQ $0x05, R12 15817 MOVQ AX, R10 15818 ANDL $0x0000001f, R10 15819 MOVQ $0x00000040, R13 15820 SUBQ R10, R13 15821 DECQ R12 15822 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 15823 LEAQ -32(R9)(R13*1), R10 15824 LEAQ -32(AX)(R13*1), R14 15825 15826 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: 15827 MOVOU (R10), X4 15828 MOVOU 16(R10), X5 15829 MOVOA X4, (R14) 15830 MOVOA X5, 16(R14) 15831 ADDQ $0x20, R14 15832 ADDQ $0x20, R10 15833 ADDQ $0x20, R13 15834 DECQ R12 15835 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back 15836 15837 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: 15838 MOVOU -32(R9)(R13*1), X4 15839 MOVOU -16(R9)(R13*1), X5 15840 MOVOA X4, -32(AX)(R13*1) 15841 MOVOA X5, -16(AX)(R13*1) 15842 ADDQ $0x20, R13 15843 CMPQ R8, R13 15844 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 15845 MOVOU X0, (AX) 15846 MOVOU X1, 16(AX) 15847 MOVOU X2, -32(AX)(R8*1) 15848 MOVOU X3, -16(AX)(R8*1) 15849 MOVQ BX, AX 15850 15851 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: 15852 ADDL 
R11, CX 15853 ADDL $0x04, R11 15854 MOVL CX, 12(SP) 15855 15856 // emitCopy 15857 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: 15858 CMPL R11, $0x40 15859 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B 15860 MOVB $0xee, (AX) 15861 MOVW DI, 1(AX) 15862 LEAL -60(R11), R11 15863 ADDQ $0x03, AX 15864 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B 15865 15866 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: 15867 MOVL R11, BX 15868 SHLL $0x02, BX 15869 CMPL R11, $0x0c 15870 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B 15871 CMPL DI, $0x00000800 15872 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B 15873 LEAL -15(BX), BX 15874 MOVB DI, 1(AX) 15875 SHRL $0x08, DI 15876 SHLL $0x05, DI 15877 ORL DI, BX 15878 MOVB BL, (AX) 15879 ADDQ $0x02, AX 15880 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B 15881 15882 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: 15883 LEAL -2(BX), BX 15884 MOVB BL, (AX) 15885 MOVW DI, 1(AX) 15886 ADDQ $0x03, AX 15887 15888 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: 15889 CMPL CX, 8(SP) 15890 JAE emit_remainder_encodeSnappyBetterBlockAsm12B 15891 CMPQ AX, (SP) 15892 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B 15893 MOVQ $0x00000000, ret+48(FP) 15894 RET 15895 15896 match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: 15897 MOVQ $0x0000cf1bbcdcbf9b, BX 15898 MOVQ $0x9e3779b1, DI 15899 LEAQ 1(SI), SI 15900 LEAQ -2(CX), R8 15901 MOVQ (DX)(SI*1), R9 15902 MOVQ 1(DX)(SI*1), R10 15903 MOVQ (DX)(R8*1), R11 15904 MOVQ 1(DX)(R8*1), R12 15905 SHLQ $0x10, R9 15906 IMULQ BX, R9 15907 SHRQ $0x32, R9 15908 SHLQ $0x20, R10 15909 IMULQ DI, R10 15910 SHRQ $0x34, R10 15911 SHLQ $0x10, R11 15912 IMULQ BX, R11 15913 SHRQ $0x32, R11 15914 SHLQ $0x20, R12 15915 IMULQ DI, R12 15916 SHRQ $0x34, R12 15917 LEAQ 1(SI), DI 15918 LEAQ 1(R8), R13 15919 MOVL SI, 24(SP)(R9*4) 15920 MOVL R8, 24(SP)(R11*4) 15921 MOVL DI, 65560(SP)(R10*4) 15922 MOVL 
R13, 65560(SP)(R12*4) 15923 ADDQ $0x01, SI 15924 SUBQ $0x01, R8 15925 15926 index_loop_encodeSnappyBetterBlockAsm12B: 15927 CMPQ SI, R8 15928 JAE search_loop_encodeSnappyBetterBlockAsm12B 15929 MOVQ (DX)(SI*1), DI 15930 MOVQ (DX)(R8*1), R9 15931 SHLQ $0x10, DI 15932 IMULQ BX, DI 15933 SHRQ $0x32, DI 15934 SHLQ $0x10, R9 15935 IMULQ BX, R9 15936 SHRQ $0x32, R9 15937 MOVL SI, 24(SP)(DI*4) 15938 MOVL R8, 24(SP)(R9*4) 15939 ADDQ $0x02, SI 15940 SUBQ $0x02, R8 15941 JMP index_loop_encodeSnappyBetterBlockAsm12B 15942 15943 emit_remainder_encodeSnappyBetterBlockAsm12B: 15944 MOVQ src_len+32(FP), CX 15945 SUBL 12(SP), CX 15946 LEAQ 3(AX)(CX*1), CX 15947 CMPQ CX, (SP) 15948 JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B 15949 MOVQ $0x00000000, ret+48(FP) 15950 RET 15951 15952 emit_remainder_ok_encodeSnappyBetterBlockAsm12B: 15953 MOVQ src_len+32(FP), CX 15954 MOVL 12(SP), BX 15955 CMPL BX, CX 15956 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B 15957 MOVL CX, SI 15958 MOVL CX, 12(SP) 15959 LEAQ (DX)(BX*1), CX 15960 SUBL BX, SI 15961 LEAL -1(SI), DX 15962 CMPL DX, $0x3c 15963 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B 15964 CMPL DX, $0x00000100 15965 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B 15966 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B 15967 15968 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: 15969 MOVB $0xf4, (AX) 15970 MOVW DX, 1(AX) 15971 ADDQ $0x03, AX 15972 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B 15973 15974 two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: 15975 MOVB $0xf0, (AX) 15976 MOVB DL, 1(AX) 15977 ADDQ $0x02, AX 15978 CMPL DX, $0x40 15979 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B 15980 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B 15981 15982 one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: 15983 SHLB $0x02, DL 15984 MOVB DL, (AX) 15985 ADDQ $0x01, AX 15986 15987 
memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: 15988 LEAQ (AX)(SI*1), DX 15989 MOVL SI, BX 15990 15991 // genMemMoveShort 15992 CMPQ BX, $0x03 15993 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 15994 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 15995 CMPQ BX, $0x08 15996 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 15997 CMPQ BX, $0x10 15998 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 15999 CMPQ BX, $0x20 16000 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 16001 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 16002 16003 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: 16004 MOVB (CX), SI 16005 MOVB -1(CX)(BX*1), CL 16006 MOVB SI, (AX) 16007 MOVB CL, -1(AX)(BX*1) 16008 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B 16009 16010 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: 16011 MOVW (CX), SI 16012 MOVB 2(CX), CL 16013 MOVW SI, (AX) 16014 MOVB CL, 2(AX) 16015 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B 16016 16017 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: 16018 MOVL (CX), SI 16019 MOVL -4(CX)(BX*1), CX 16020 MOVL SI, (AX) 16021 MOVL CX, -4(AX)(BX*1) 16022 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B 16023 16024 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: 16025 MOVQ (CX), SI 16026 MOVQ -8(CX)(BX*1), CX 16027 MOVQ SI, (AX) 16028 MOVQ CX, -8(AX)(BX*1) 16029 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B 16030 16031 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: 16032 MOVOU (CX), X0 16033 MOVOU -16(CX)(BX*1), X1 16034 MOVOU X0, (AX) 16035 MOVOU X1, 
-16(AX)(BX*1) 16036 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B 16037 16038 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: 16039 MOVOU (CX), X0 16040 MOVOU 16(CX), X1 16041 MOVOU -32(CX)(BX*1), X2 16042 MOVOU -16(CX)(BX*1), X3 16043 MOVOU X0, (AX) 16044 MOVOU X1, 16(AX) 16045 MOVOU X2, -32(AX)(BX*1) 16046 MOVOU X3, -16(AX)(BX*1) 16047 16048 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: 16049 MOVQ DX, AX 16050 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B 16051 16052 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: 16053 LEAQ (AX)(SI*1), DX 16054 MOVL SI, BX 16055 16056 // genMemMoveLong 16057 MOVOU (CX), X0 16058 MOVOU 16(CX), X1 16059 MOVOU -32(CX)(BX*1), X2 16060 MOVOU -16(CX)(BX*1), X3 16061 MOVQ BX, DI 16062 SHRQ $0x05, DI 16063 MOVQ AX, SI 16064 ANDL $0x0000001f, SI 16065 MOVQ $0x00000040, R8 16066 SUBQ SI, R8 16067 DECQ DI 16068 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 16069 LEAQ -32(CX)(R8*1), SI 16070 LEAQ -32(AX)(R8*1), R9 16071 16072 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: 16073 MOVOU (SI), X4 16074 MOVOU 16(SI), X5 16075 MOVOA X4, (R9) 16076 MOVOA X5, 16(R9) 16077 ADDQ $0x20, R9 16078 ADDQ $0x20, SI 16079 ADDQ $0x20, R8 16080 DECQ DI 16081 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back 16082 16083 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: 16084 MOVOU -32(CX)(R8*1), X4 16085 MOVOU -16(CX)(R8*1), X5 16086 MOVOA X4, -32(AX)(R8*1) 16087 MOVOA X5, -16(AX)(R8*1) 16088 ADDQ $0x20, R8 16089 CMPQ BX, R8 16090 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 16091 MOVOU X0, (AX) 16092 MOVOU X1, 16(AX) 16093 MOVOU X2, -32(AX)(BX*1) 16094 MOVOU X3, -16(AX)(BX*1) 16095 MOVQ DX, AX 16096 16097 
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
	// Epilogue of encodeSnappyBetterBlockAsm12B: result = AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Stack frame as used by the code below (byte offsets from SP):
//   0(SP)      8 bytes: dst limit = dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)      4 bytes: src_len - 8; searching stops at this position
//   12(SP)     4 bytes: source index where the not-yet-emitted bytes begin
//   16(SP)     4 bytes: offset of the latest match (written, never read here)
//   20(SP)     4 bytes: position at which to resume searching after a miss
//   24(SP)     4096 x 4 bytes: position table indexed by a 12-bit hash
//   16408(SP)  1024 x 4 bytes: position table indexed by a 10-bit hash
// Register roles in the main loop: AX = dst write pointer, DX = src base,
// CX = current source index.
// The value stored to ret+48(FP) is AX - dst_base on the normal exit and 0
// whenever a size check against 0(SP) fails.
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Zero both tables: 0xa0 iterations x 128 bytes = 20480 bytes from 24(SP).
zero_loop_encodeSnappyBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBetterBlockAsm10B
	// Initialise the scalar slots described in the header comment.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	// Searching starts at source index 1.
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm10B:
	// Next position to try on a miss: CX + 1 + ((CX - 12(SP)) >> 5). Stop when
	// it reaches the limit in 8(SP).
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x05, BX
	LEAL 1(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm10B
	// SI = 8 source bytes at CX.
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	// R9  = 12-bit hash of the low 6 bytes of SI.
	// R10 = 10-bit hash of the low 4 bytes of SI.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ $0x9e3779b1, BX
	MOVQ SI, R9
	MOVQ SI, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x34, R9
	SHLQ $0x20, R10
	IMULQ BX, R10
	SHRQ $0x36, R10
	// Fetch the previous positions (BX from the first table, DI from the
	// second) and record CX in both slots.
	MOVL 24(SP)(R9*4), BX
	MOVL 16408(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	MOVL CX, 16408(SP)(R10*4)
	// Load 8 bytes at each candidate; a full 8-byte equality is tried first.
	MOVQ (DX)(BX*1), R9
	MOVQ (DX)(DI*1), R10
	CMPQ R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm10B
	CMPQ R10, SI
	JNE no_short_found_encodeSnappyBetterBlockAsm10B
	MOVL DI, BX
	JMP candidate_match_encodeSnappyBetterBlockAsm10B

no_short_found_encodeSnappyBetterBlockAsm10B:
	// Neither candidate matched on 8 bytes; retry comparing only the low 4.
	CMPL R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm10B
	CMPL R10, SI
	JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
	// Miss: continue at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBetterBlockAsm10B

candidateS_match_encodeSnappyBetterBlockAsm10B:
	// Only the second-table candidate (DI) matched on 4 bytes. Probe the first
	// table for position CX+1 (SI shifted right by one byte); use that entry
	// if its 4 bytes match, otherwise undo the increment and fall back to DI.
	SHRQ $0x08, SI
	MOVQ SI, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x34, R9
	MOVL 24(SP)(R9*4), BX
	INCL CX
	MOVL CX, 24(SP)(R9*4)
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm10B
	DECL CX
	MOVL DI, BX

candidate_match_encodeSnappyBetterBlockAsm10B:
	// BX = candidate index, CX = current index. Extend the match backwards
	// while the preceding bytes agree, without going below 12(SP) or index 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B

match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
	CMPL CX, SI
	JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B

match_extend_back_end_encodeSnappyBetterBlockAsm10B:
	// Size check: AX + (CX - 12(SP)) + 3 must stay below the limit in 0(SP);
	// otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB match_dst_size_check_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm10B:
	// SI keeps the match start. Skip 4 bytes on both sides, then count the
	// further equal bytes: R8 = &src[CX], R9 = &src[BX], DI = bytes left in
	// src after CX, R11 = running count.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Compare 8 bytes at a time; a non-zero XOR marks the first difference.
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B

	// Index of the lowest set bit, divided by 8, is the count of equal bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	CMPL DI, $0x08
	JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
	// NOTE(review): ZF is set only when DI == 8, which the JAE above already
	// takes, so this JZ looks unreachable - confirm against the generator.
	JZ match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
	SUBL $0x04, DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL DI, $0x02
	JB matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
	SUBL $0x02, DI
	LEAL 2(R11), R11

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL DI, $0x01
	JB match_nolit_end_encodeSnappyBetterBlockAsm10B
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm10B:
	// DI = match offset (CX - BX).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// The offset is only stored to 16(SP); nothing in this function reads it.
	MOVL DI, 16(SP)
	// If 12(SP) equals the match start there are no pending bytes to emit.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
	// R9 = &src[12(SP)], R8 = literal length, BX = length - 1.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	// Header size depends on length-1: < 60 one byte, < 256 two, else three.
	CMPL BX, $0x3c
	JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B
	CMPL BX, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
	// NOTE(review): same flags as the JB above, so this branch is never taken;
	// execution falls through to the label that follows either way.
	JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B

three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	// Byte 0xf4 followed by the 16-bit value length-1.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	// Byte 0xf0 followed by the 8-bit value length-1.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB memmove_match_emit_encodeSnappyBetterBlockAsm10B
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
	// Single byte (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm10B:
	// BX = dst pointer after the copy of R8 bytes from R9.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on length; each case copies with overlapping head/tail moves.
	CMPQ R8, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	// Lengths up to 8 always move a full 8 bytes; AX is then set from BX, so
	// only R8 bytes count as written.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVQ BX, AX
	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B

memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// X0..X3 capture the first and last 32 source bytes up front and are
	// stored last, after the 32-byte block loop.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R12 = length / 32. R13 = 64 - (AX & 31), so AX + R13 - 32 is 32-byte
	// aligned, as the aligned MOVOA stores below require.
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration at offset R13-32 while R8 >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
	// CX = end of the match; R11 = full match length (counted bytes + the 4
	// skipped before counting); 12(SP) advances to the end of the match.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
	// While more than 64 bytes remain: write 0xee (== (60-1)<<2 | 2) plus the
	// 16-bit offset, and consume 60 bytes of the match.
	CMPL R11, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
	// BX = length << 2. The 2-byte form is used only when length < 12 and
	// offset < 2048; everything else takes the 3-byte form.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	CMPL DI, $0x00000800
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	// First byte = (length-4)<<2 | 1 | (offset>>8)<<5 (note 4*length - 15 ==
	// (length-4)<<2 | 1); second byte = low 8 bits of the offset.
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
	// First byte = 4*length - 2 == (length-1)<<2 | 2, then the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
	// Finish if the search limit is reached; return 0 if AX reached 0(SP).
	CMPL CX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
	// Record positions inside the match just emitted. SI = match start + 1 and
	// R8 = match end - 2 go into the first table; SI+1 and R8+1 go into the
	// second table, each under the hash of the bytes at that position.
	MOVQ $0x0000cf1bbcdcbf9b, BX
	MOVQ $0x9e3779b1, DI
	LEAQ 1(SI), SI
	LEAQ -2(CX), R8
	MOVQ (DX)(SI*1), R9
	MOVQ 1(DX)(SI*1), R10
	MOVQ (DX)(R8*1), R11
	MOVQ 1(DX)(R8*1), R12
	SHLQ $0x10, R9
	IMULQ BX, R9
	SHRQ $0x34, R9
	SHLQ $0x20, R10
	IMULQ DI, R10
	SHRQ $0x36, R10
	SHLQ $0x10, R11
	IMULQ BX, R11
	SHRQ $0x34, R11
	SHLQ $0x20, R12
	IMULQ DI, R12
	SHRQ $0x36, R12
	LEAQ 1(SI), DI
	LEAQ 1(R8), R13
	MOVL SI, 24(SP)(R9*4)
	MOVL R8, 24(SP)(R11*4)
	MOVL DI, 16408(SP)(R10*4)
	MOVL R13, 16408(SP)(R12*4)
	ADDQ $0x01, SI
	SUBQ $0x01, R8

index_loop_encodeSnappyBetterBlockAsm10B:
	// Walk inwards from both ends in steps of 2, adding entries to the first
	// table only, until the two indices meet; then resume searching.
	CMPQ SI, R8
	JAE search_loop_encodeSnappyBetterBlockAsm10B
	MOVQ (DX)(SI*1), DI
	MOVQ (DX)(R8*1), R9
	SHLQ $0x10, DI
	IMULQ BX, DI
	SHRQ $0x34, DI
	SHLQ $0x10, R9
	IMULQ BX, R9
	SHRQ $0x34, R9
	MOVL SI, 24(SP)(DI*4)
	MOVL R8, 24(SP)(R9*4)
	ADDQ $0x02, SI
	SUBQ $0x02, R8
	JMP index_loop_encodeSnappyBetterBlockAsm10B

emit_remainder_encodeSnappyBetterBlockAsm10B:
	// Size check for the trailing bytes: AX + (src_len - 12(SP)) + 3 must be
	// below the limit in 0(SP); otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
	// Emit src[12(SP):src_len] as one literal, if non-empty.
	// CX = source pointer, SI = length, DX = length - 1 (DX no longer holds
	// the src base from here on).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
	// NOTE(review): redundant with the JB above (same flags); falls through.
	JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = dst pointer after the copy; BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop literal copy, this one moves exactly BX bytes, with
	// separate cases for 1-2, 3 and 4-7 bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the in-loop long copy: head and tail are captured in
	// X0..X3 and stored last; the middle is moved in 32-byte blocks with
	// aligned destination stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// Epilogue: AX - dst_base is about to be stored as the result.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	// NOTE: the wrapped source line ends inside the next instruction; its
	// final operand continues on the following line, outside this block.
	MOVQ AX,
ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written
// (AX - dst_base), or 0 as soon as the output would reach the limit
// stored at 0(SP).
//
// Stack frame as used below:
//   0(SP)    dst_base + src_len - 9 - (src_len >> 5): output pointer limit
//   8(SP)    src_len - 8: upper bound for search positions
//   12(SP)   src offset up to which input has already been emitted
//   16(SP)   offset (distance) of the most recent match
//   20(SP)   src position at which the search resumes when nothing matches
//   24(SP)   1024 x uint32 positions, indexed by the 10-bit hash
//   4120(SP) 256 x uint32 positions, indexed by the 8-bit hash
//
// Registers held for most of the function: AX = current dst pointer,
// CX = current src position, DX = src_base.
TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
	MOVQ dst_base+0(FP), AX
	// Clear both tables: 0x28 iterations x 128 bytes = 5120 bytes from 24(SP).
	MOVQ $0x00000028, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBetterBlockAsm8B
	// Initialise the frame slots described in the header comment.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm8B:
	// Next position to try = CX + 1 + ((CX - 12(SP)) >> 4); once it reaches
	// 8(SP) the remaining input is emitted as literals.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x04, BX
	LEAL 1(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm8B
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	// R9  = ((x << 16) * 0xcf1bbcdcbf9b) >> 54: 10-bit hash of the low 6 bytes.
	// R10 = ((x << 32) * 0x9e3779b1) >> 56:     8-bit hash of the low 4 bytes.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ $0x9e3779b1, BX
	MOVQ SI, R9
	MOVQ SI, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x36, R9
	SHLQ $0x20, R10
	IMULQ BX, R10
	SHRQ $0x38, R10
	// Fetch the previous position from each table, then record CX in both.
	MOVL 24(SP)(R9*4), BX
	MOVL 4120(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	MOVL CX, 4120(SP)(R10*4)
	// Compare all 8 bytes first: candidate from the 10-bit table, then the
	// candidate from the 8-bit table.
	MOVQ (DX)(BX*1), R9
	MOVQ (DX)(DI*1), R10
	CMPQ R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm8B
	CMPQ R10, SI
	JNE no_short_found_encodeSnappyBetterBlockAsm8B
	MOVL DI, BX
	JMP candidate_match_encodeSnappyBetterBlockAsm8B

no_short_found_encodeSnappyBetterBlockAsm8B:
	// No 8-byte match; fall back to comparing only the low 4 bytes.
	CMPL R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm8B
	CMPL R10, SI
	JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBetterBlockAsm8B

candidateS_match_encodeSnappyBetterBlockAsm8B:
	// The 8-bit-table candidate matched 4 bytes. Before using it, hash the
	// bytes at CX+1 (x >> 8), record CX+1 in the 10-bit table and try the
	// entry that was there; if that fails, undo the increment of CX and use
	// the 8-bit-table candidate (DI).
	SHRQ $0x08, SI
	MOVQ SI, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x36, R9
	MOVL 24(SP)(R9*4), BX
	INCL CX
	MOVL CX, 24(SP)(R9*4)
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm8B
	DECL CX
	MOVL DI, BX

candidate_match_encodeSnappyBetterBlockAsm8B:
	// Extend the match backwards while the preceding bytes are equal,
	// CX stays above 12(SP) and the candidate position BX stays above 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B

match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
	CMPL CX, SI
	JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B

match_extend_back_end_encodeSnappyBetterBlockAsm8B:
	// Return 0 unless AX + 3 + (CX - 12(SP)) is below the limit in 0(SP).
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB match_dst_size_check_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm8B:
	// SI keeps the match start. Skip 4 bytes on both sides and count further
	// equal bytes in R11; DI = src_len - CX is the number of bytes left.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11
	CMPL DI, $0x08
	JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
	// XOR 8 bytes from each side; a non-zero result locates the first
	// differing byte via the index of its lowest set bit (bit index >> 3).
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B

#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	CMPL DI, $0x08
	JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
	// NOTE(review): when JAE falls through DI < 8, so ZF is clear and this
	// JZ is never taken; execution continues into the 4/2/1-byte tail checks.
	JZ match_nolit_end_encodeSnappyBetterBlockAsm8B

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
	CMPL DI, $0x04
	JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
	SUBL $0x04, DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
	CMPL DI, $0x02
	JB matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
	SUBL $0x02, DI
	LEAL 2(R11), R11

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
	CMPL DI, $0x01
	JB match_nolit_end_encodeSnappyBetterBlockAsm8B
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm8B:
	// DI = CX - BX is the match offset.
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	MOVL DI, 16(SP)
	// Emit src[12(SP):SI] as a literal run unless it is empty.
	// R8 = run length, R9 = source pointer, BX = length - 1.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B
	CMPL BX, $0x00000100
	JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
	// NOTE(review): same flags as the JB above, so this branch is never
	// taken; control falls through to the label it names.
	JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B

three_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: 0xf4 followed by the 16-bit value length-1.
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: 0xf0 followed by the 8-bit value length-1.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB memmove_match_emit_encodeSnappyBetterBlockAsm8B
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B

one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
	// Header: the single byte (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm8B:
	// BX = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes, even when R8 < 8; AX is still advanced
	// by R8 only (via BX).
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
	MOVQ BX, AX
	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B

memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// after the loops; in between, 32 bytes are copied per iteration with
	// MOVOA stores. R13 = 64 - (AX & 31) is the running offset.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
	// CX moves past the match and becomes the new 12(SP); R11 becomes the
	// full match length (the 4 bytes skipped before matchLen are added back).
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
	// While the length exceeds 64: write 0xee plus the 16-bit offset and
	// reduce the length by 60.
	CMPL R11, $0x40
	JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Length below 12: two bytes, tag = ((length << 2) - 15) | ((offset >> 8) << 5)
	// followed by the low offset byte. Otherwise the three-byte form below.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
	// Three bytes: tag = (length << 2) - 2, then the 16-bit offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
	// Stop searching once CX reaches 8(SP); return 0 once AX reaches 0(SP).
	CMPL CX, 8(SP)
	JAE emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
	// Record positions inside the matched range. With s = match start + 1 and
	// e = CX - 2: the 10-bit table gets s and e, the 8-bit table gets s+1
	// and e+1.
	MOVQ $0x0000cf1bbcdcbf9b, BX
	MOVQ $0x9e3779b1, DI
	LEAQ 1(SI), SI
	LEAQ -2(CX), R8
	MOVQ (DX)(SI*1), R9
	MOVQ 1(DX)(SI*1), R10
	MOVQ (DX)(R8*1), R11
	MOVQ 1(DX)(R8*1), R12
	SHLQ $0x10, R9
	IMULQ BX, R9
	SHRQ $0x36, R9
	SHLQ $0x20, R10
	IMULQ DI, R10
	SHRQ $0x38, R10
	SHLQ $0x10, R11
	IMULQ BX, R11
	SHRQ $0x36, R11
	SHLQ $0x20, R12
	IMULQ DI, R12
	SHRQ $0x38, R12
	LEAQ 1(SI), DI
	LEAQ 1(R8), R13
	MOVL SI, 24(SP)(R9*4)
	MOVL R8, 24(SP)(R11*4)
	MOVL DI, 4120(SP)(R10*4)
	MOVL R13, 4120(SP)(R12*4)
	ADDQ $0x01, SI
	SUBQ $0x01, R8

index_loop_encodeSnappyBetterBlockAsm8B:
	// Walk inwards from both ends (SI += 2, R8 -= 2), recording each pair of
	// positions in the 10-bit table until the two cursors meet.
	CMPQ SI, R8
	JAE search_loop_encodeSnappyBetterBlockAsm8B
	MOVQ (DX)(SI*1), DI
	MOVQ (DX)(R8*1), R9
	SHLQ $0x10, DI
	IMULQ BX, DI
	SHRQ $0x36, DI
	SHLQ $0x10, R9
	IMULQ BX, R9
	SHRQ $0x36, R9
	MOVL SI, 24(SP)(DI*4)
	MOVL R8, 24(SP)(R9*4)
	ADDQ $0x02, SI
	SUBQ $0x02, R8
	JMP index_loop_encodeSnappyBetterBlockAsm8B

emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Return 0 unless AX + 3 + (src_len - 12(SP)) is below the limit in 0(SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
	// Emit src[12(SP):src_len] as a final literal run unless it is empty.
	// From here CX = source pointer, SI = run length, DX = length - 1.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
	CMPL DX, $0x00000100
	JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
	// NOTE(review): same flags as the JB above, so this branch is never
	// taken; control falls through to the label it names.
	JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B

one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// DX = dst pointer after the literal bytes, BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy used for match literals: first and last
	// 32 bytes are held in X0-X3 and stored last, the middle is copied 32
	// bytes per iteration; R8 = 64 - (AX & 31) is the running offset.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
	// Result = bytes written = AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func calcBlockSize(src []byte) int
// Requires: BMI, SSE2
TEXT ·calcBlockSize(SB), $32792-32
	XORQ AX, AX
	MOVQ $0x00000100, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_calcBlockSize:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0,
80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_calcBlockSize
	// calcBlockSize: AX starts at 0 and only ever has sizes added to it; no
	// store through AX occurs in this function. The value returned in
	// ret+24(FP) is AX, or 0 once the running size reaches the limit in 0(SP).
	//
	// Frame slots used from here on:
	//   0(SP)  src_len - 9 - (src_len >> 5): size limit
	//   8(SP)  src_len - 8: upper bound for search positions
	//   12(SP) src offset up to which input has already been accounted for
	//   16(SP) current repeat offset (initialised to 1)
	//   20(SP) src position at which the search resumes when nothing matches
	//   24(SP) 8192 x uint32 positions, indexed by the 13-bit hash
	// Registers: CX = current src position, DX = src_base.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+8(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+0(FP), DX

search_loop_calcBlockSize:
	// Next position to try = CX + 4 + ((CX - 12(SP)) >> 5); once it reaches
	// 8(SP) the rest of the input is counted as literals.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x05, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_calcBlockSize
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	// hash(v) = ((v << 16) * 0xcf1bbcdcbf9b) >> 51, a 13-bit value.
	// R9 = hash(x) and R10 = hash(x >> 8), i.e. positions CX and CX+1.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x33, R9
	SHLQ $0x10, R10
	IMULQ R8, R10
	SHRQ $0x33, R10
	// Fetch both candidates (BX, DI), then record CX and CX+1.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)
	// R9 = hash(x >> 16), kept for the CX+2 probe in no_repeat_found.
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x33, R9
	// Repeat check: 4 bytes at CX+1 against 4 bytes at CX+1-16(SP).
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_calcBlockSize
	// Repeat found at SI = CX+1; extend it backwards while the preceding
	// bytes match, SI stays above 12(SP) and the source position DI stays
	// above 0.
	LEAL 1(CX), SI
	MOVL 12(SP), BX
	MOVL SI, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_calcBlockSize

repeat_extend_back_loop_calcBlockSize:
	CMPL SI, BX
	JBE repeat_extend_back_end_calcBlockSize
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_calcBlockSize
	LEAL -1(SI), SI
	DECL DI
	JNZ repeat_extend_back_loop_calcBlockSize

repeat_extend_back_end_calcBlockSize:
	// Account for the literal run src[12(SP):SI]: a header of 1 to 5 bytes
	// chosen by length-1 (< 60, < 1<<8, < 1<<16, < 1<<24, otherwise 5), plus
	// the run length DI itself.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_calcBlockSize
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	CMPL BX, $0x3c
	JB one_byte_repeat_emit_calcBlockSize
	CMPL BX, $0x00000100
	JB two_bytes_repeat_emit_calcBlockSize
	CMPL BX, $0x00010000
	JB three_bytes_repeat_emit_calcBlockSize
	CMPL BX, $0x01000000
	JB four_bytes_repeat_emit_calcBlockSize
	ADDQ $0x05, AX
	JMP memmove_long_repeat_emit_calcBlockSize

four_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x04, AX
	JMP memmove_long_repeat_emit_calcBlockSize

three_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_calcBlockSize

two_bytes_repeat_emit_calcBlockSize:
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JB memmove_repeat_emit_calcBlockSize
	JMP memmove_long_repeat_emit_calcBlockSize

one_byte_repeat_emit_calcBlockSize:
	ADDQ $0x01, AX

memmove_repeat_emit_calcBlockSize:
	// Both "memmove" labels only add the run length to AX.
	LEAQ (AX)(DI*1), AX
	JMP emit_literal_done_repeat_emit_calcBlockSize

memmove_long_repeat_emit_calcBlockSize:
	LEAQ (AX)(DI*1), AX

emit_literal_done_repeat_emit_calcBlockSize:
	// Skip ahead 5 bytes, then count further bytes that equal the data
	// 16(SP) bytes earlier. DI = src_len - CX bytes remain; R10 = count.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+8(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JB matchlen_match4_repeat_extend_calcBlockSize

matchlen_loopback_repeat_extend_calcBlockSize:
	// XOR 8 bytes from each side; a non-zero result locates the first
	// differing byte via the index of its lowest set bit (bit index >> 3).
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_repeat_extend_calcBlockSize

#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_calcBlockSize

matchlen_loop_repeat_extend_calcBlockSize:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JAE matchlen_loopback_repeat_extend_calcBlockSize
	// NOTE(review): when JAE falls through DI < 8, so ZF is clear and this
	// JZ is never taken.
	JZ repeat_extend_forward_end_calcBlockSize

matchlen_match4_repeat_extend_calcBlockSize:
	CMPL DI, $0x04
	JB matchlen_match2_repeat_extend_calcBlockSize
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE matchlen_match2_repeat_extend_calcBlockSize
	SUBL $0x04, DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_calcBlockSize:
	CMPL DI, $0x02
	JB matchlen_match1_repeat_extend_calcBlockSize
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE matchlen_match1_repeat_extend_calcBlockSize
	SUBL $0x02, DI
	LEAL 2(R10), R10

matchlen_match1_repeat_extend_calcBlockSize:
	CMPL DI, $0x01
	JB repeat_extend_forward_end_calcBlockSize
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE repeat_extend_forward_end_calcBlockSize
	LEAL 1(R10), R10

repeat_extend_forward_end_calcBlockSize:
	// BX = total repeat length (new CX minus repeat start SI);
	// SI = offset reloaded from 16(SP).
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	// Size of the copy: offsets of 1<<16 or more cost 5 bytes per chunk of
	// 64; smaller offsets cost 3 bytes per chunk of 60 while the length
	// exceeds 64, then 2 or 3 bytes for the rest.
	CMPL SI, $0x00010000
	JB two_byte_offset_repeat_as_copy_calcBlockSize

four_bytes_loop_back_repeat_as_copy_calcBlockSize:
	CMPL BX, $0x40
	JBE four_bytes_remain_repeat_as_copy_calcBlockSize
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JB four_bytes_remain_repeat_as_copy_calcBlockSize
	JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize

four_bytes_remain_repeat_as_copy_calcBlockSize:
	TESTL BX, BX
	JZ repeat_end_emit_calcBlockSize
	XORL BX, BX
	ADDQ $0x05, AX
	JMP repeat_end_emit_calcBlockSize

two_byte_offset_repeat_as_copy_calcBlockSize:
	CMPL BX, $0x40
	JBE two_byte_offset_short_repeat_as_copy_calcBlockSize
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP two_byte_offset_repeat_as_copy_calcBlockSize

two_byte_offset_short_repeat_as_copy_calcBlockSize:
	// 2 bytes when length < 12 and offset < 2048, otherwise 3 bytes.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JAE emit_copy_three_repeat_as_copy_calcBlockSize
	CMPL SI, $0x00000800
	JAE emit_copy_three_repeat_as_copy_calcBlockSize
	ADDQ $0x02, AX
	JMP repeat_end_emit_calcBlockSize

emit_copy_three_repeat_as_copy_calcBlockSize:
	ADDQ $0x03, AX

repeat_end_emit_calcBlockSize:
	MOVL CX, 12(SP)
	JMP search_loop_calcBlockSize

no_repeat_found_calcBlockSize:
	// Probe three positions in turn, comparing 4 bytes each time:
	// CX against candidate BX, CX+1 against candidate DI, then CX+2 against
	// the entry fetched with the third hash (R9). CX+2 is recorded in that
	// slot either way. If none match, resume at 20(SP).
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_calcBlockSize
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_calcBlockSize
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_calcBlockSize
	MOVL 20(SP), CX
	JMP search_loop_calcBlockSize

candidate3_match_calcBlockSize:
	ADDL $0x02, CX
	JMP candidate_match_calcBlockSize

candidate2_match_calcBlockSize:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_calcBlockSize:
	// Extend the match backwards while the preceding bytes are equal,
	// CX stays above 12(SP) and the candidate position BX stays above 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_calcBlockSize

match_extend_back_loop_calcBlockSize:
	CMPL CX, SI
	JBE match_extend_back_end_calcBlockSize
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_calcBlockSize
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_calcBlockSize
	JMP match_extend_back_loop_calcBlockSize

match_extend_back_end_calcBlockSize:
	// Return 0 unless AX + 5 + (CX - 12(SP)) is below the limit in 0(SP).
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JB match_dst_size_check_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

match_dst_size_check_calcBlockSize:
	// Account for the literal run src[12(SP):CX]; R8 = run length,
	// SI = length - 1 selects the header size.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_calcBlockSize
	MOVL SI, R8
	MOVL SI, 12(SP)
	// NOTE(review): the SI computed by this LEAQ is overwritten by the LEAL
	// two instructions below without being read in between.
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JB one_byte_match_emit_calcBlockSize
	CMPL SI, $0x00000100
	JB two_bytes_match_emit_calcBlockSize
	CMPL SI, $0x00010000
	JB three_bytes_match_emit_calcBlockSize
	CMPL SI, $0x01000000
	JB four_bytes_match_emit_calcBlockSize
	ADDQ $0x05, AX
	JMP memmove_long_match_emit_calcBlockSize

four_bytes_match_emit_calcBlockSize:
	ADDQ $0x04, AX
	JMP memmove_long_match_emit_calcBlockSize

three_bytes_match_emit_calcBlockSize:
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_calcBlockSize

two_bytes_match_emit_calcBlockSize:
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JB memmove_match_emit_calcBlockSize
	JMP memmove_long_match_emit_calcBlockSize

one_byte_match_emit_calcBlockSize:
	ADDQ $0x01, AX

memmove_match_emit_calcBlockSize:
	LEAQ (AX)(R8*1), AX
	JMP emit_literal_done_match_emit_calcBlockSize

memmove_long_match_emit_calcBlockSize:
	LEAQ (AX)(R8*1), AX

emit_literal_done_match_emit_calcBlockSize:
match_nolit_loop_calcBlockSize:
	// Store the match offset CX - BX in 16(SP), skip 4 bytes on both sides
	// and count further equal bytes in R9; SI = src_len - CX bytes remain.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+8(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	XORL R9, R9
	CMPL SI, $0x08
	JB matchlen_match4_match_nolit_calcBlockSize

matchlen_loopback_match_nolit_calcBlockSize:
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_calcBlockSize

#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_calcBlockSize

matchlen_loop_match_nolit_calcBlockSize:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JAE matchlen_loopback_match_nolit_calcBlockSize
	// NOTE(review): when JAE falls through SI < 8, so ZF is clear and this
	// JZ is never taken.
	JZ match_nolit_end_calcBlockSize

matchlen_match4_match_nolit_calcBlockSize:
	CMPL SI, $0x04
	JB matchlen_match2_match_nolit_calcBlockSize
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_calcBlockSize
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_calcBlockSize:
	CMPL SI, $0x02
	JB matchlen_match1_match_nolit_calcBlockSize
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_calcBlockSize
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_calcBlockSize:
	CMPL SI, $0x01
	JB match_nolit_end_calcBlockSize
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_calcBlockSize
	LEAL 1(R9), R9

match_nolit_end_calcBlockSize:
	// CX moves past the match and becomes the new 12(SP); R9 becomes the
	// full match length, BX the offset.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Same size rules as the repeat path: 5 bytes per chunk of 64 when the
	// offset is 1<<16 or more, otherwise 3 bytes per chunk of 60 followed
	// by a final 2 or 3 bytes.
	CMPL BX, $0x00010000
	JB two_byte_offset_match_nolit_calcBlockSize

four_bytes_loop_back_match_nolit_calcBlockSize:
	CMPL R9, $0x40
	JBE four_bytes_remain_match_nolit_calcBlockSize
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JB four_bytes_remain_match_nolit_calcBlockSize
	JMP four_bytes_loop_back_match_nolit_calcBlockSize

four_bytes_remain_match_nolit_calcBlockSize:
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_calcBlockSize
	XORL BX, BX
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_calcBlockSize

two_byte_offset_match_nolit_calcBlockSize:
	CMPL R9, $0x40
	JBE two_byte_offset_short_match_nolit_calcBlockSize
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_calcBlockSize

two_byte_offset_short_match_nolit_calcBlockSize:
	// 2 bytes when length < 12 and offset < 2048, otherwise 3 bytes.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JAE emit_copy_three_match_nolit_calcBlockSize
	CMPL BX, $0x00000800
	JAE emit_copy_three_match_nolit_calcBlockSize
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_calcBlockSize

emit_copy_three_match_nolit_calcBlockSize:
	ADDQ $0x03, AX

match_nolit_emitcopy_end_calcBlockSize:
	// Stop searching once CX reaches 8(SP); return 0 once AX reaches 0(SP).
	// SI is loaded with the 8 bytes at CX-2 for the hashing below.
	CMPL CX, 8(SP)
	JAE emit_remainder_calcBlockSize
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JB match_nolit_dst_ok_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

match_nolit_dst_ok_calcBlockSize:
	// Record CX-2 under the hash of the bytes at CX-2, and CX under the hash
	// of the bytes at CX (SI >> 16). If the entry previously stored for CX
	// matches 4 bytes, continue directly with another match; otherwise
	// advance CX by one and search again.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x10, DI
	IMULQ R8, DI
	SHRQ $0x33, DI
	SHLQ $0x10, BX
	IMULQ R8, BX
	SHRQ $0x33, BX
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_calcBlockSize
	INCL CX
	JMP search_loop_calcBlockSize

emit_remainder_calcBlockSize:
	// Return 0 unless AX + 5 + (src_len - 12(SP)) is below the limit in 0(SP).
	MOVQ src_len+8(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JB emit_remainder_ok_calcBlockSize
	MOVQ $0x00000000, ret+24(FP)
	RET

emit_remainder_ok_calcBlockSize:
	// Account for the final literal run src[12(SP):src_len] unless it is
	// empty; SI = run length, CX = length - 1 selects the header size.
	MOVQ src_len+8(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_calcBlockSize
	MOVL CX, SI
	MOVL CX, 12(SP)
	// NOTE(review): the CX computed by this LEAQ is overwritten by the LEAL
	// two instructions below without being read in between.
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), CX
	CMPL CX, $0x3c
	JB one_byte_emit_remainder_calcBlockSize
	CMPL CX, $0x00000100
	JB two_bytes_emit_remainder_calcBlockSize
	CMPL CX, $0x00010000
	JB three_bytes_emit_remainder_calcBlockSize
	CMPL CX, $0x01000000
	JB four_bytes_emit_remainder_calcBlockSize
	ADDQ $0x05, AX
	JMP memmove_long_emit_remainder_calcBlockSize

four_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x04, AX
	JMP memmove_long_emit_remainder_calcBlockSize

three_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_calcBlockSize

two_bytes_emit_remainder_calcBlockSize:
	ADDQ $0x02, AX
	CMPL CX, $0x40
	JB memmove_emit_remainder_calcBlockSize
	JMP memmove_long_emit_remainder_calcBlockSize

one_byte_emit_remainder_calcBlockSize:
	ADDQ $0x01, AX

memmove_emit_remainder_calcBlockSize:
	LEAQ (AX)(SI*1), AX
	JMP emit_literal_done_emit_remainder_calcBlockSize

memmove_long_emit_remainder_calcBlockSize:
	LEAQ (AX)(SI*1), AX

emit_literal_done_emit_remainder_calcBlockSize:
	// Result = accumulated size in AX.
	MOVQ AX, ret+24(FP)
	RET

// func calcBlockSizeSmall(src []byte) int
// Requires: BMI, SSE2
TEXT ·calcBlockSizeSmall(SB), $2072-32
	XORQ AX, AX
	MOVQ $0x00000010, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_calcBlockSizeSmall:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_calcBlockSizeSmall
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+8(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+0(FP), DX

search_loop_calcBlockSizeSmall:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x04, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JAE emit_remainder_calcBlockSizeSmall
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	MOVQ $0x9e3779b1, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x37, R9
	SHLQ $0x20, R10
	IMULQ R8, R10
	SHRQ $0x37, R10
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)
	MOVQ SI, R9
	SHRQ $0x10, R9
SHLQ $0x20, R9 17815 IMULQ R8, R9 17816 SHRQ $0x37, R9 17817 MOVL CX, R8 17818 SUBL 16(SP), R8 17819 MOVL 1(DX)(R8*1), R10 17820 MOVQ SI, R8 17821 SHRQ $0x08, R8 17822 CMPL R8, R10 17823 JNE no_repeat_found_calcBlockSizeSmall 17824 LEAL 1(CX), SI 17825 MOVL 12(SP), BX 17826 MOVL SI, DI 17827 SUBL 16(SP), DI 17828 JZ repeat_extend_back_end_calcBlockSizeSmall 17829 17830 repeat_extend_back_loop_calcBlockSizeSmall: 17831 CMPL SI, BX 17832 JBE repeat_extend_back_end_calcBlockSizeSmall 17833 MOVB -1(DX)(DI*1), R8 17834 MOVB -1(DX)(SI*1), R9 17835 CMPB R8, R9 17836 JNE repeat_extend_back_end_calcBlockSizeSmall 17837 LEAL -1(SI), SI 17838 DECL DI 17839 JNZ repeat_extend_back_loop_calcBlockSizeSmall 17840 17841 repeat_extend_back_end_calcBlockSizeSmall: 17842 MOVL 12(SP), BX 17843 CMPL BX, SI 17844 JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall 17845 MOVL SI, DI 17846 MOVL SI, 12(SP) 17847 LEAQ (DX)(BX*1), R8 17848 SUBL BX, DI 17849 LEAL -1(DI), BX 17850 CMPL BX, $0x3c 17851 JB one_byte_repeat_emit_calcBlockSizeSmall 17852 CMPL BX, $0x00000100 17853 JB two_bytes_repeat_emit_calcBlockSizeSmall 17854 JB three_bytes_repeat_emit_calcBlockSizeSmall 17855 17856 three_bytes_repeat_emit_calcBlockSizeSmall: 17857 ADDQ $0x03, AX 17858 JMP memmove_long_repeat_emit_calcBlockSizeSmall 17859 17860 two_bytes_repeat_emit_calcBlockSizeSmall: 17861 ADDQ $0x02, AX 17862 CMPL BX, $0x40 17863 JB memmove_repeat_emit_calcBlockSizeSmall 17864 JMP memmove_long_repeat_emit_calcBlockSizeSmall 17865 17866 one_byte_repeat_emit_calcBlockSizeSmall: 17867 ADDQ $0x01, AX 17868 17869 memmove_repeat_emit_calcBlockSizeSmall: 17870 LEAQ (AX)(DI*1), AX 17871 JMP emit_literal_done_repeat_emit_calcBlockSizeSmall 17872 17873 memmove_long_repeat_emit_calcBlockSizeSmall: 17874 LEAQ (AX)(DI*1), AX 17875 17876 emit_literal_done_repeat_emit_calcBlockSizeSmall: 17877 ADDL $0x05, CX 17878 MOVL CX, BX 17879 SUBL 16(SP), BX 17880 MOVQ src_len+8(FP), DI 17881 SUBL CX, DI 17882 LEAQ (DX)(CX*1), R8 17883 LEAQ 
(DX)(BX*1), BX 17884 17885 // matchLen 17886 XORL R10, R10 17887 CMPL DI, $0x08 17888 JB matchlen_match4_repeat_extend_calcBlockSizeSmall 17889 17890 matchlen_loopback_repeat_extend_calcBlockSizeSmall: 17891 MOVQ (R8)(R10*1), R9 17892 XORQ (BX)(R10*1), R9 17893 TESTQ R9, R9 17894 JZ matchlen_loop_repeat_extend_calcBlockSizeSmall 17895 17896 #ifdef GOAMD64_v3 17897 TZCNTQ R9, R9 17898 17899 #else 17900 BSFQ R9, R9 17901 17902 #endif 17903 SARQ $0x03, R9 17904 LEAL (R10)(R9*1), R10 17905 JMP repeat_extend_forward_end_calcBlockSizeSmall 17906 17907 matchlen_loop_repeat_extend_calcBlockSizeSmall: 17908 LEAL -8(DI), DI 17909 LEAL 8(R10), R10 17910 CMPL DI, $0x08 17911 JAE matchlen_loopback_repeat_extend_calcBlockSizeSmall 17912 JZ repeat_extend_forward_end_calcBlockSizeSmall 17913 17914 matchlen_match4_repeat_extend_calcBlockSizeSmall: 17915 CMPL DI, $0x04 17916 JB matchlen_match2_repeat_extend_calcBlockSizeSmall 17917 MOVL (R8)(R10*1), R9 17918 CMPL (BX)(R10*1), R9 17919 JNE matchlen_match2_repeat_extend_calcBlockSizeSmall 17920 SUBL $0x04, DI 17921 LEAL 4(R10), R10 17922 17923 matchlen_match2_repeat_extend_calcBlockSizeSmall: 17924 CMPL DI, $0x02 17925 JB matchlen_match1_repeat_extend_calcBlockSizeSmall 17926 MOVW (R8)(R10*1), R9 17927 CMPW (BX)(R10*1), R9 17928 JNE matchlen_match1_repeat_extend_calcBlockSizeSmall 17929 SUBL $0x02, DI 17930 LEAL 2(R10), R10 17931 17932 matchlen_match1_repeat_extend_calcBlockSizeSmall: 17933 CMPL DI, $0x01 17934 JB repeat_extend_forward_end_calcBlockSizeSmall 17935 MOVB (R8)(R10*1), R9 17936 CMPB (BX)(R10*1), R9 17937 JNE repeat_extend_forward_end_calcBlockSizeSmall 17938 LEAL 1(R10), R10 17939 17940 repeat_extend_forward_end_calcBlockSizeSmall: 17941 ADDL R10, CX 17942 MOVL CX, BX 17943 SUBL SI, BX 17944 MOVL 16(SP), SI 17945 17946 // emitCopy 17947 two_byte_offset_repeat_as_copy_calcBlockSizeSmall: 17948 CMPL BX, $0x40 17949 JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall 17950 LEAL -60(BX), BX 17951 ADDQ $0x03, AX 17952 
JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall 17953 17954 two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: 17955 MOVL BX, SI 17956 SHLL $0x02, SI 17957 CMPL BX, $0x0c 17958 JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall 17959 ADDQ $0x02, AX 17960 JMP repeat_end_emit_calcBlockSizeSmall 17961 17962 emit_copy_three_repeat_as_copy_calcBlockSizeSmall: 17963 ADDQ $0x03, AX 17964 17965 repeat_end_emit_calcBlockSizeSmall: 17966 MOVL CX, 12(SP) 17967 JMP search_loop_calcBlockSizeSmall 17968 17969 no_repeat_found_calcBlockSizeSmall: 17970 CMPL (DX)(BX*1), SI 17971 JEQ candidate_match_calcBlockSizeSmall 17972 SHRQ $0x08, SI 17973 MOVL 24(SP)(R9*4), BX 17974 LEAL 2(CX), R8 17975 CMPL (DX)(DI*1), SI 17976 JEQ candidate2_match_calcBlockSizeSmall 17977 MOVL R8, 24(SP)(R9*4) 17978 SHRQ $0x08, SI 17979 CMPL (DX)(BX*1), SI 17980 JEQ candidate3_match_calcBlockSizeSmall 17981 MOVL 20(SP), CX 17982 JMP search_loop_calcBlockSizeSmall 17983 17984 candidate3_match_calcBlockSizeSmall: 17985 ADDL $0x02, CX 17986 JMP candidate_match_calcBlockSizeSmall 17987 17988 candidate2_match_calcBlockSizeSmall: 17989 MOVL R8, 24(SP)(R9*4) 17990 INCL CX 17991 MOVL DI, BX 17992 17993 candidate_match_calcBlockSizeSmall: 17994 MOVL 12(SP), SI 17995 TESTL BX, BX 17996 JZ match_extend_back_end_calcBlockSizeSmall 17997 17998 match_extend_back_loop_calcBlockSizeSmall: 17999 CMPL CX, SI 18000 JBE match_extend_back_end_calcBlockSizeSmall 18001 MOVB -1(DX)(BX*1), DI 18002 MOVB -1(DX)(CX*1), R8 18003 CMPB DI, R8 18004 JNE match_extend_back_end_calcBlockSizeSmall 18005 LEAL -1(CX), CX 18006 DECL BX 18007 JZ match_extend_back_end_calcBlockSizeSmall 18008 JMP match_extend_back_loop_calcBlockSizeSmall 18009 18010 match_extend_back_end_calcBlockSizeSmall: 18011 MOVL CX, SI 18012 SUBL 12(SP), SI 18013 LEAQ 3(AX)(SI*1), SI 18014 CMPQ SI, (SP) 18015 JB match_dst_size_check_calcBlockSizeSmall 18016 MOVQ $0x00000000, ret+24(FP) 18017 RET 18018 18019 match_dst_size_check_calcBlockSizeSmall: 18020 MOVL 
CX, SI 18021 MOVL 12(SP), DI 18022 CMPL DI, SI 18023 JEQ emit_literal_done_match_emit_calcBlockSizeSmall 18024 MOVL SI, R8 18025 MOVL SI, 12(SP) 18026 LEAQ (DX)(DI*1), SI 18027 SUBL DI, R8 18028 LEAL -1(R8), SI 18029 CMPL SI, $0x3c 18030 JB one_byte_match_emit_calcBlockSizeSmall 18031 CMPL SI, $0x00000100 18032 JB two_bytes_match_emit_calcBlockSizeSmall 18033 JB three_bytes_match_emit_calcBlockSizeSmall 18034 18035 three_bytes_match_emit_calcBlockSizeSmall: 18036 ADDQ $0x03, AX 18037 JMP memmove_long_match_emit_calcBlockSizeSmall 18038 18039 two_bytes_match_emit_calcBlockSizeSmall: 18040 ADDQ $0x02, AX 18041 CMPL SI, $0x40 18042 JB memmove_match_emit_calcBlockSizeSmall 18043 JMP memmove_long_match_emit_calcBlockSizeSmall 18044 18045 one_byte_match_emit_calcBlockSizeSmall: 18046 ADDQ $0x01, AX 18047 18048 memmove_match_emit_calcBlockSizeSmall: 18049 LEAQ (AX)(R8*1), AX 18050 JMP emit_literal_done_match_emit_calcBlockSizeSmall 18051 18052 memmove_long_match_emit_calcBlockSizeSmall: 18053 LEAQ (AX)(R8*1), AX 18054 18055 emit_literal_done_match_emit_calcBlockSizeSmall: 18056 match_nolit_loop_calcBlockSizeSmall: 18057 MOVL CX, SI 18058 SUBL BX, SI 18059 MOVL SI, 16(SP) 18060 ADDL $0x04, CX 18061 ADDL $0x04, BX 18062 MOVQ src_len+8(FP), SI 18063 SUBL CX, SI 18064 LEAQ (DX)(CX*1), DI 18065 LEAQ (DX)(BX*1), BX 18066 18067 // matchLen 18068 XORL R9, R9 18069 CMPL SI, $0x08 18070 JB matchlen_match4_match_nolit_calcBlockSizeSmall 18071 18072 matchlen_loopback_match_nolit_calcBlockSizeSmall: 18073 MOVQ (DI)(R9*1), R8 18074 XORQ (BX)(R9*1), R8 18075 TESTQ R8, R8 18076 JZ matchlen_loop_match_nolit_calcBlockSizeSmall 18077 18078 #ifdef GOAMD64_v3 18079 TZCNTQ R8, R8 18080 18081 #else 18082 BSFQ R8, R8 18083 18084 #endif 18085 SARQ $0x03, R8 18086 LEAL (R9)(R8*1), R9 18087 JMP match_nolit_end_calcBlockSizeSmall 18088 18089 matchlen_loop_match_nolit_calcBlockSizeSmall: 18090 LEAL -8(SI), SI 18091 LEAL 8(R9), R9 18092 CMPL SI, $0x08 18093 JAE 
matchlen_loopback_match_nolit_calcBlockSizeSmall 18094 JZ match_nolit_end_calcBlockSizeSmall 18095 18096 matchlen_match4_match_nolit_calcBlockSizeSmall: 18097 CMPL SI, $0x04 18098 JB matchlen_match2_match_nolit_calcBlockSizeSmall 18099 MOVL (DI)(R9*1), R8 18100 CMPL (BX)(R9*1), R8 18101 JNE matchlen_match2_match_nolit_calcBlockSizeSmall 18102 SUBL $0x04, SI 18103 LEAL 4(R9), R9 18104 18105 matchlen_match2_match_nolit_calcBlockSizeSmall: 18106 CMPL SI, $0x02 18107 JB matchlen_match1_match_nolit_calcBlockSizeSmall 18108 MOVW (DI)(R9*1), R8 18109 CMPW (BX)(R9*1), R8 18110 JNE matchlen_match1_match_nolit_calcBlockSizeSmall 18111 SUBL $0x02, SI 18112 LEAL 2(R9), R9 18113 18114 matchlen_match1_match_nolit_calcBlockSizeSmall: 18115 CMPL SI, $0x01 18116 JB match_nolit_end_calcBlockSizeSmall 18117 MOVB (DI)(R9*1), R8 18118 CMPB (BX)(R9*1), R8 18119 JNE match_nolit_end_calcBlockSizeSmall 18120 LEAL 1(R9), R9 18121 18122 match_nolit_end_calcBlockSizeSmall: 18123 ADDL R9, CX 18124 MOVL 16(SP), BX 18125 ADDL $0x04, R9 18126 MOVL CX, 12(SP) 18127 18128 // emitCopy 18129 two_byte_offset_match_nolit_calcBlockSizeSmall: 18130 CMPL R9, $0x40 18131 JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall 18132 LEAL -60(R9), R9 18133 ADDQ $0x03, AX 18134 JMP two_byte_offset_match_nolit_calcBlockSizeSmall 18135 18136 two_byte_offset_short_match_nolit_calcBlockSizeSmall: 18137 MOVL R9, BX 18138 SHLL $0x02, BX 18139 CMPL R9, $0x0c 18140 JAE emit_copy_three_match_nolit_calcBlockSizeSmall 18141 ADDQ $0x02, AX 18142 JMP match_nolit_emitcopy_end_calcBlockSizeSmall 18143 18144 emit_copy_three_match_nolit_calcBlockSizeSmall: 18145 ADDQ $0x03, AX 18146 18147 match_nolit_emitcopy_end_calcBlockSizeSmall: 18148 CMPL CX, 8(SP) 18149 JAE emit_remainder_calcBlockSizeSmall 18150 MOVQ -2(DX)(CX*1), SI 18151 CMPQ AX, (SP) 18152 JB match_nolit_dst_ok_calcBlockSizeSmall 18153 MOVQ $0x00000000, ret+24(FP) 18154 RET 18155 18156 match_nolit_dst_ok_calcBlockSizeSmall: 18157 MOVQ $0x9e3779b1, R8 18158 MOVQ SI, 
DI 18159 SHRQ $0x10, SI 18160 MOVQ SI, BX 18161 SHLQ $0x20, DI 18162 IMULQ R8, DI 18163 SHRQ $0x37, DI 18164 SHLQ $0x20, BX 18165 IMULQ R8, BX 18166 SHRQ $0x37, BX 18167 LEAL -2(CX), R8 18168 LEAQ 24(SP)(BX*4), R9 18169 MOVL (R9), BX 18170 MOVL R8, 24(SP)(DI*4) 18171 MOVL CX, (R9) 18172 CMPL (DX)(BX*1), SI 18173 JEQ match_nolit_loop_calcBlockSizeSmall 18174 INCL CX 18175 JMP search_loop_calcBlockSizeSmall 18176 18177 emit_remainder_calcBlockSizeSmall: 18178 MOVQ src_len+8(FP), CX 18179 SUBL 12(SP), CX 18180 LEAQ 3(AX)(CX*1), CX 18181 CMPQ CX, (SP) 18182 JB emit_remainder_ok_calcBlockSizeSmall 18183 MOVQ $0x00000000, ret+24(FP) 18184 RET 18185 18186 emit_remainder_ok_calcBlockSizeSmall: 18187 MOVQ src_len+8(FP), CX 18188 MOVL 12(SP), BX 18189 CMPL BX, CX 18190 JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall 18191 MOVL CX, SI 18192 MOVL CX, 12(SP) 18193 LEAQ (DX)(BX*1), CX 18194 SUBL BX, SI 18195 LEAL -1(SI), CX 18196 CMPL CX, $0x3c 18197 JB one_byte_emit_remainder_calcBlockSizeSmall 18198 CMPL CX, $0x00000100 18199 JB two_bytes_emit_remainder_calcBlockSizeSmall 18200 JB three_bytes_emit_remainder_calcBlockSizeSmall 18201 18202 three_bytes_emit_remainder_calcBlockSizeSmall: 18203 ADDQ $0x03, AX 18204 JMP memmove_long_emit_remainder_calcBlockSizeSmall 18205 18206 two_bytes_emit_remainder_calcBlockSizeSmall: 18207 ADDQ $0x02, AX 18208 CMPL CX, $0x40 18209 JB memmove_emit_remainder_calcBlockSizeSmall 18210 JMP memmove_long_emit_remainder_calcBlockSizeSmall 18211 18212 one_byte_emit_remainder_calcBlockSizeSmall: 18213 ADDQ $0x01, AX 18214 18215 memmove_emit_remainder_calcBlockSizeSmall: 18216 LEAQ (AX)(SI*1), AX 18217 JMP emit_literal_done_emit_remainder_calcBlockSizeSmall 18218 18219 memmove_long_emit_remainder_calcBlockSizeSmall: 18220 LEAQ (AX)(SI*1), AX 18221 18222 emit_literal_done_emit_remainder_calcBlockSizeSmall: 18223 MOVQ AX, ret+24(FP) 18224 RET 18225 18226 // func emitLiteral(dst []byte, lit []byte) int 18227 // Requires: SSE2 18228 TEXT 
·emitLiteral(SB), NOSPLIT, $0-56 18229 MOVQ lit_len+32(FP), DX 18230 MOVQ dst_base+0(FP), AX 18231 MOVQ lit_base+24(FP), CX 18232 TESTQ DX, DX 18233 JZ emit_literal_end_standalone_skip 18234 MOVL DX, BX 18235 LEAL -1(DX), SI 18236 CMPL SI, $0x3c 18237 JB one_byte_standalone 18238 CMPL SI, $0x00000100 18239 JB two_bytes_standalone 18240 CMPL SI, $0x00010000 18241 JB three_bytes_standalone 18242 CMPL SI, $0x01000000 18243 JB four_bytes_standalone 18244 MOVB $0xfc, (AX) 18245 MOVL SI, 1(AX) 18246 ADDQ $0x05, BX 18247 ADDQ $0x05, AX 18248 JMP memmove_long_standalone 18249 18250 four_bytes_standalone: 18251 MOVL SI, DI 18252 SHRL $0x10, DI 18253 MOVB $0xf8, (AX) 18254 MOVW SI, 1(AX) 18255 MOVB DI, 3(AX) 18256 ADDQ $0x04, BX 18257 ADDQ $0x04, AX 18258 JMP memmove_long_standalone 18259 18260 three_bytes_standalone: 18261 MOVB $0xf4, (AX) 18262 MOVW SI, 1(AX) 18263 ADDQ $0x03, BX 18264 ADDQ $0x03, AX 18265 JMP memmove_long_standalone 18266 18267 two_bytes_standalone: 18268 MOVB $0xf0, (AX) 18269 MOVB SI, 1(AX) 18270 ADDQ $0x02, BX 18271 ADDQ $0x02, AX 18272 CMPL SI, $0x40 18273 JB memmove_standalone 18274 JMP memmove_long_standalone 18275 18276 one_byte_standalone: 18277 SHLB $0x02, SI 18278 MOVB SI, (AX) 18279 ADDQ $0x01, BX 18280 ADDQ $0x01, AX 18281 18282 memmove_standalone: 18283 // genMemMoveShort 18284 CMPQ DX, $0x03 18285 JB emit_lit_memmove_standalone_memmove_move_1or2 18286 JE emit_lit_memmove_standalone_memmove_move_3 18287 CMPQ DX, $0x08 18288 JB emit_lit_memmove_standalone_memmove_move_4through7 18289 CMPQ DX, $0x10 18290 JBE emit_lit_memmove_standalone_memmove_move_8through16 18291 CMPQ DX, $0x20 18292 JBE emit_lit_memmove_standalone_memmove_move_17through32 18293 JMP emit_lit_memmove_standalone_memmove_move_33through64 18294 18295 emit_lit_memmove_standalone_memmove_move_1or2: 18296 MOVB (CX), SI 18297 MOVB -1(CX)(DX*1), CL 18298 MOVB SI, (AX) 18299 MOVB CL, -1(AX)(DX*1) 18300 JMP emit_literal_end_standalone 18301 18302 
emit_lit_memmove_standalone_memmove_move_3: 18303 MOVW (CX), SI 18304 MOVB 2(CX), CL 18305 MOVW SI, (AX) 18306 MOVB CL, 2(AX) 18307 JMP emit_literal_end_standalone 18308 18309 emit_lit_memmove_standalone_memmove_move_4through7: 18310 MOVL (CX), SI 18311 MOVL -4(CX)(DX*1), CX 18312 MOVL SI, (AX) 18313 MOVL CX, -4(AX)(DX*1) 18314 JMP emit_literal_end_standalone 18315 18316 emit_lit_memmove_standalone_memmove_move_8through16: 18317 MOVQ (CX), SI 18318 MOVQ -8(CX)(DX*1), CX 18319 MOVQ SI, (AX) 18320 MOVQ CX, -8(AX)(DX*1) 18321 JMP emit_literal_end_standalone 18322 18323 emit_lit_memmove_standalone_memmove_move_17through32: 18324 MOVOU (CX), X0 18325 MOVOU -16(CX)(DX*1), X1 18326 MOVOU X0, (AX) 18327 MOVOU X1, -16(AX)(DX*1) 18328 JMP emit_literal_end_standalone 18329 18330 emit_lit_memmove_standalone_memmove_move_33through64: 18331 MOVOU (CX), X0 18332 MOVOU 16(CX), X1 18333 MOVOU -32(CX)(DX*1), X2 18334 MOVOU -16(CX)(DX*1), X3 18335 MOVOU X0, (AX) 18336 MOVOU X1, 16(AX) 18337 MOVOU X2, -32(AX)(DX*1) 18338 MOVOU X3, -16(AX)(DX*1) 18339 JMP emit_literal_end_standalone 18340 JMP emit_literal_end_standalone 18341 18342 memmove_long_standalone: 18343 // genMemMoveLong 18344 MOVOU (CX), X0 18345 MOVOU 16(CX), X1 18346 MOVOU -32(CX)(DX*1), X2 18347 MOVOU -16(CX)(DX*1), X3 18348 MOVQ DX, DI 18349 SHRQ $0x05, DI 18350 MOVQ AX, SI 18351 ANDL $0x0000001f, SI 18352 MOVQ $0x00000040, R8 18353 SUBQ SI, R8 18354 DECQ DI 18355 JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 18356 LEAQ -32(CX)(R8*1), SI 18357 LEAQ -32(AX)(R8*1), R9 18358 18359 emit_lit_memmove_long_standalonelarge_big_loop_back: 18360 MOVOU (SI), X4 18361 MOVOU 16(SI), X5 18362 MOVOA X4, (R9) 18363 MOVOA X5, 16(R9) 18364 ADDQ $0x20, R9 18365 ADDQ $0x20, SI 18366 ADDQ $0x20, R8 18367 DECQ DI 18368 JNA emit_lit_memmove_long_standalonelarge_big_loop_back 18369 18370 emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: 18371 MOVOU -32(CX)(R8*1), X4 18372 MOVOU -16(CX)(R8*1), X5 18373 MOVOA X4, 
-32(AX)(R8*1) 18374 MOVOA X5, -16(AX)(R8*1) 18375 ADDQ $0x20, R8 18376 CMPQ DX, R8 18377 JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 18378 MOVOU X0, (AX) 18379 MOVOU X1, 16(AX) 18380 MOVOU X2, -32(AX)(DX*1) 18381 MOVOU X3, -16(AX)(DX*1) 18382 JMP emit_literal_end_standalone 18383 JMP emit_literal_end_standalone 18384 18385 emit_literal_end_standalone_skip: 18386 XORQ BX, BX 18387 18388 emit_literal_end_standalone: 18389 MOVQ BX, ret+48(FP) 18390 RET 18391 18392 // func emitRepeat(dst []byte, offset int, length int) int 18393 TEXT ·emitRepeat(SB), NOSPLIT, $0-48 18394 XORQ BX, BX 18395 MOVQ dst_base+0(FP), AX 18396 MOVQ offset+24(FP), CX 18397 MOVQ length+32(FP), DX 18398 18399 // emitRepeat 18400 emit_repeat_again_standalone: 18401 MOVL DX, SI 18402 LEAL -4(DX), DX 18403 CMPL SI, $0x08 18404 JBE repeat_two_standalone 18405 CMPL SI, $0x0c 18406 JAE cant_repeat_two_offset_standalone 18407 CMPL CX, $0x00000800 18408 JB repeat_two_offset_standalone 18409 18410 cant_repeat_two_offset_standalone: 18411 CMPL DX, $0x00000104 18412 JB repeat_three_standalone 18413 CMPL DX, $0x00010100 18414 JB repeat_four_standalone 18415 CMPL DX, $0x0100ffff 18416 JB repeat_five_standalone 18417 LEAL -16842747(DX), DX 18418 MOVL $0xfffb001d, (AX) 18419 MOVB $0xff, 4(AX) 18420 ADDQ $0x05, AX 18421 ADDQ $0x05, BX 18422 JMP emit_repeat_again_standalone 18423 18424 repeat_five_standalone: 18425 LEAL -65536(DX), DX 18426 MOVL DX, CX 18427 MOVW $0x001d, (AX) 18428 MOVW DX, 2(AX) 18429 SARL $0x10, CX 18430 MOVB CL, 4(AX) 18431 ADDQ $0x05, BX 18432 ADDQ $0x05, AX 18433 JMP gen_emit_repeat_end 18434 18435 repeat_four_standalone: 18436 LEAL -256(DX), DX 18437 MOVW $0x0019, (AX) 18438 MOVW DX, 2(AX) 18439 ADDQ $0x04, BX 18440 ADDQ $0x04, AX 18441 JMP gen_emit_repeat_end 18442 18443 repeat_three_standalone: 18444 LEAL -4(DX), DX 18445 MOVW $0x0015, (AX) 18446 MOVB DL, 2(AX) 18447 ADDQ $0x03, BX 18448 ADDQ $0x03, AX 18449 JMP gen_emit_repeat_end 18450 18451 repeat_two_standalone: 
// Continuation of emitRepeat: body of its repeat_two_standalone label, which
// sits at the end of the preceding physical line. DX holds length-4 here.
	// Two-byte form: 16-bit word (length-4)<<2 | 1.
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_repeat_end

repeat_two_offset_standalone:
	// Two-byte form carrying the offset: low offset byte at 1(AX); the tag
	// byte is 1 + (length-4)*4, OR-ed with (offset>>8)<<5.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

gen_emit_repeat_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy writes a copy operation for (offset, length) to dst, following it
// with repeat operations when one copy cannot cover the whole length, and
// returns the number of bytes written (BX).
//
// Registers: AX = write pointer, CX = offset, DX = remaining length.
//
// offset >= 65536:
//   length > 64: write 0xff + 32-bit offset (5 bytes), subtract 64; if at
//   least 4 bytes remain continue with the repeat encoding, otherwise fall
//   to the remainder case. Remainder (also used directly when length <= 64):
//   write tag length*4-1 + 32-bit offset, or nothing if length is 0.
// offset < 65536:
//   length <= 64: 2 bytes when length < 12 and offset < 2048, else 3 bytes.
//   length > 64, offset < 2048: a 2-byte copy that consumes 8 bytes, then
//   the repeat encoding for the rest.
//   length > 64, offset >= 2048: 0xee + 16-bit offset (3 bytes) consuming
//   60 bytes, then the repeat encoding for the rest.
//
// The three repeat sections repeat the selection logic of emitRepeat with
// different label suffixes.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JB two_byte_offset_standalone
	CMPL DX, $0x40
	JBE four_bytes_remain_standalone
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JB four_bytes_remain_standalone

	// emitRepeat
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

four_bytes_remain_standalone:
	// Remainder with a 32-bit offset: tag = length*4 - 1; nothing is
	// written when no length remains.
	TESTL DX, DX
	JZ gen_emit_copy_end
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

two_byte_offset_standalone:
	CMPL DX, $0x40
	JBE two_byte_offset_short_standalone
	CMPL CX, $0x00000800
	JAE long_offset_short_standalone
	// offset < 2048: 2-byte copy with tag 17 | (offset>>8)<<5 and the low
	// offset byte; it accounts for 8 bytes of the length.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB CL, 1(AX)
	MOVL CX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x08, DX

	// emitRepeat
	// Enters the repeat selection past the two-byte cases.
	LEAL -4(DX), DX
	JMP cant_repeat_two_offset_standalone_emit_copy_short_2b

emit_repeat_again_standalone_emit_copy_short_2b:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy_short_2b
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy_short_2b
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy_short_2b

cant_repeat_two_offset_standalone_emit_copy_short_2b:
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy_short_2b
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy_short_2b
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy_short_2b
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short_2b

repeat_five_standalone_emit_copy_short_2b:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short_2b:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short_2b:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short_2b:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

long_offset_short_standalone:
	// offset in [2048, 65536): 0xee + 16-bit offset, consuming 60 bytes.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JBE repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JAE cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JB repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JB repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JB repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JB repeat_five_standalone_emit_copy_short
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

two_byte_offset_short_standalone:
	// length <= 64. SI = length*4. The 2-byte form (tag length*4-15 OR-ed
	// with (offset>>8)<<5, then the low offset byte) needs length < 12 and
	// offset < 2048; anything else takes the 3-byte form.
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE emit_copy_three_standalone
	CMPL CX, $0x00000800
	JAE emit_copy_three_standalone
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

emit_copy_three_standalone:
	// 3-byte form: tag length*4-2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat writes copy operations for (offset, length) to dst using
// copy encodings only, and returns the number of bytes written (BX). Long
// lengths are covered by emitting several copies in a loop.
//
// Registers: AX = write pointer, CX = offset, DX = remaining length.
//
// offset >= 65536: while length > 64 write 0xff + 32-bit offset and subtract
//   64 (stopping early if fewer than 4 bytes remain); then write the
//   remainder as tag length*4-1 + 32-bit offset, or nothing if it is 0.
// offset < 65536: while length > 64 write 0xee + 16-bit offset and subtract
//   60; then write the 2-byte form (length < 12 and offset < 2048) or the
//   3-byte form.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JB two_byte_offset_standalone_snappy

four_bytes_loop_back_standalone_snappy:
	CMPL DX, $0x40
	JBE four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JB four_bytes_remain_standalone_snappy
	JMP four_bytes_loop_back_standalone_snappy

four_bytes_remain_standalone_snappy:
	TESTL DX, DX
	JZ gen_emit_copy_end_snappy
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end_snappy

two_byte_offset_standalone_snappy:
	CMPL DX, $0x40
	JBE two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP two_byte_offset_standalone_snappy

two_byte_offset_short_standalone_snappy:
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JAE emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JAE emit_copy_three_standalone_snappy
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end_snappy

emit_copy_three_standalone_snappy:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end_snappy:
// Final two instructions of emitCopyNoRepeat; its gen_emit_copy_end_snappy
// label sits at the end of the preceding physical line. BX = bytes written.
	MOVQ BX, ret+40(FP)
	RET

// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// matchLen returns the number of leading bytes that a and b have in common.
//
// Only a's length is loaded; b's length is never read, so b must be at least
// as long as a. The length is compared with 32-bit instructions.
//
// Registers: AX = a base, CX = b base, DX = bytes left, SI = matched count.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI
	CMPL DX, $0x08
	JB matchlen_match4_standalone

	// Compare 8 bytes at a time; a non-zero XOR marks the first difference.
matchlen_loopback_standalone:
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	TESTQ BX, BX
	JZ matchlen_loop_standalone

	// Index of the lowest differing bit, divided by 8, is the number of
	// equal bytes within this group.
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP gen_match_len_end

matchlen_loop_standalone:
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	CMPL DX, $0x08
	JAE matchlen_loopback_standalone
	// NOTE(review): same flags as the JAE above, which already covers
	// equality, so this branch does not appear to be taken.
	JZ gen_match_len_end

	// Fewer than 8 bytes left: try 4, then 2, then 1. A failed wider
	// compare falls through to the next narrower one at the same position.
matchlen_match4_standalone:
	CMPL DX, $0x04
	JB matchlen_match2_standalone
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE matchlen_match2_standalone
	SUBL $0x04, DX
	LEAL 4(SI), SI

matchlen_match2_standalone:
	CMPL DX, $0x02
	JB matchlen_match1_standalone
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE matchlen_match1_standalone
	SUBL $0x02, DX
	LEAL 2(SI), SI

matchlen_match1_standalone:
	CMPL DX, $0x01
	JB gen_match_len_end
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE gen_match_len_end
	LEAL 1(SI), SI

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET

// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

lz4_s2_loop:
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	CMPQ AX, CX
18930 JAE lz4_s2_dstfull 18931 MOVBQZX (DX), R8 18932 MOVQ R8, R9 18933 MOVQ R8, R10 18934 SHRQ $0x04, R9 18935 ANDQ $0x0f, R10 18936 CMPQ R8, $0xf0 18937 JB lz4_s2_ll_end 18938 18939 lz4_s2_ll_loop: 18940 INCQ DX 18941 CMPQ DX, BX 18942 JAE lz4_s2_corrupt 18943 MOVBQZX (DX), R8 18944 ADDQ R8, R9 18945 CMPQ R8, $0xff 18946 JEQ lz4_s2_ll_loop 18947 18948 lz4_s2_ll_end: 18949 LEAQ (DX)(R9*1), R8 18950 ADDQ $0x04, R10 18951 CMPQ R8, BX 18952 JAE lz4_s2_corrupt 18953 INCQ DX 18954 INCQ R8 18955 TESTQ R9, R9 18956 JZ lz4_s2_lits_done 18957 LEAQ (AX)(R9*1), R11 18958 CMPQ R11, CX 18959 JAE lz4_s2_dstfull 18960 ADDQ R9, SI 18961 LEAL -1(R9), R11 18962 CMPL R11, $0x3c 18963 JB one_byte_lz4_s2 18964 CMPL R11, $0x00000100 18965 JB two_bytes_lz4_s2 18966 CMPL R11, $0x00010000 18967 JB three_bytes_lz4_s2 18968 CMPL R11, $0x01000000 18969 JB four_bytes_lz4_s2 18970 MOVB $0xfc, (AX) 18971 MOVL R11, 1(AX) 18972 ADDQ $0x05, AX 18973 JMP memmove_long_lz4_s2 18974 18975 four_bytes_lz4_s2: 18976 MOVL R11, R12 18977 SHRL $0x10, R12 18978 MOVB $0xf8, (AX) 18979 MOVW R11, 1(AX) 18980 MOVB R12, 3(AX) 18981 ADDQ $0x04, AX 18982 JMP memmove_long_lz4_s2 18983 18984 three_bytes_lz4_s2: 18985 MOVB $0xf4, (AX) 18986 MOVW R11, 1(AX) 18987 ADDQ $0x03, AX 18988 JMP memmove_long_lz4_s2 18989 18990 two_bytes_lz4_s2: 18991 MOVB $0xf0, (AX) 18992 MOVB R11, 1(AX) 18993 ADDQ $0x02, AX 18994 CMPL R11, $0x40 18995 JB memmove_lz4_s2 18996 JMP memmove_long_lz4_s2 18997 18998 one_byte_lz4_s2: 18999 SHLB $0x02, R11 19000 MOVB R11, (AX) 19001 ADDQ $0x01, AX 19002 19003 memmove_lz4_s2: 19004 LEAQ (AX)(R9*1), R11 19005 19006 // genMemMoveShort 19007 CMPQ R9, $0x08 19008 JBE emit_lit_memmove_lz4_s2_memmove_move_8 19009 CMPQ R9, $0x10 19010 JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 19011 CMPQ R9, $0x20 19012 JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 19013 JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 19014 19015 emit_lit_memmove_lz4_s2_memmove_move_8: 19016 MOVQ (DX), R12 19017 MOVQ 
R12, (AX) 19018 JMP memmove_end_copy_lz4_s2 19019 19020 emit_lit_memmove_lz4_s2_memmove_move_8through16: 19021 MOVQ (DX), R12 19022 MOVQ -8(DX)(R9*1), DX 19023 MOVQ R12, (AX) 19024 MOVQ DX, -8(AX)(R9*1) 19025 JMP memmove_end_copy_lz4_s2 19026 19027 emit_lit_memmove_lz4_s2_memmove_move_17through32: 19028 MOVOU (DX), X0 19029 MOVOU -16(DX)(R9*1), X1 19030 MOVOU X0, (AX) 19031 MOVOU X1, -16(AX)(R9*1) 19032 JMP memmove_end_copy_lz4_s2 19033 19034 emit_lit_memmove_lz4_s2_memmove_move_33through64: 19035 MOVOU (DX), X0 19036 MOVOU 16(DX), X1 19037 MOVOU -32(DX)(R9*1), X2 19038 MOVOU -16(DX)(R9*1), X3 19039 MOVOU X0, (AX) 19040 MOVOU X1, 16(AX) 19041 MOVOU X2, -32(AX)(R9*1) 19042 MOVOU X3, -16(AX)(R9*1) 19043 19044 memmove_end_copy_lz4_s2: 19045 MOVQ R11, AX 19046 JMP lz4_s2_lits_emit_done 19047 19048 memmove_long_lz4_s2: 19049 LEAQ (AX)(R9*1), R11 19050 19051 // genMemMoveLong 19052 MOVOU (DX), X0 19053 MOVOU 16(DX), X1 19054 MOVOU -32(DX)(R9*1), X2 19055 MOVOU -16(DX)(R9*1), X3 19056 MOVQ R9, R13 19057 SHRQ $0x05, R13 19058 MOVQ AX, R12 19059 ANDL $0x0000001f, R12 19060 MOVQ $0x00000040, R14 19061 SUBQ R12, R14 19062 DECQ R13 19063 JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 19064 LEAQ -32(DX)(R14*1), R12 19065 LEAQ -32(AX)(R14*1), R15 19066 19067 emit_lit_memmove_long_lz4_s2large_big_loop_back: 19068 MOVOU (R12), X4 19069 MOVOU 16(R12), X5 19070 MOVOA X4, (R15) 19071 MOVOA X5, 16(R15) 19072 ADDQ $0x20, R15 19073 ADDQ $0x20, R12 19074 ADDQ $0x20, R14 19075 DECQ R13 19076 JNA emit_lit_memmove_long_lz4_s2large_big_loop_back 19077 19078 emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: 19079 MOVOU -32(DX)(R14*1), X4 19080 MOVOU -16(DX)(R14*1), X5 19081 MOVOA X4, -32(AX)(R14*1) 19082 MOVOA X5, -16(AX)(R14*1) 19083 ADDQ $0x20, R14 19084 CMPQ R9, R14 19085 JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 19086 MOVOU X0, (AX) 19087 MOVOU X1, 16(AX) 19088 MOVOU X2, -32(AX)(R9*1) 19089 MOVOU X3, -16(AX)(R9*1) 19090 MOVQ R11, AX 19091 19092 
lz4_s2_lits_emit_done: 19093 MOVQ R8, DX 19094 19095 lz4_s2_lits_done: 19096 CMPQ DX, BX 19097 JNE lz4_s2_match 19098 CMPQ R10, $0x04 19099 JEQ lz4_s2_done 19100 JMP lz4_s2_corrupt 19101 19102 lz4_s2_match: 19103 LEAQ 2(DX), R8 19104 CMPQ R8, BX 19105 JAE lz4_s2_corrupt 19106 MOVWQZX (DX), R9 19107 MOVQ R8, DX 19108 TESTQ R9, R9 19109 JZ lz4_s2_corrupt 19110 CMPQ R9, SI 19111 JA lz4_s2_corrupt 19112 CMPQ R10, $0x13 19113 JNE lz4_s2_ml_done 19114 19115 lz4_s2_ml_loop: 19116 MOVBQZX (DX), R8 19117 INCQ DX 19118 ADDQ R8, R10 19119 CMPQ DX, BX 19120 JAE lz4_s2_corrupt 19121 CMPQ R8, $0xff 19122 JEQ lz4_s2_ml_loop 19123 19124 lz4_s2_ml_done: 19125 ADDQ R10, SI 19126 CMPQ R9, DI 19127 JNE lz4_s2_docopy 19128 19129 // emitRepeat 19130 emit_repeat_again_lz4_s2: 19131 MOVL R10, R8 19132 LEAL -4(R10), R10 19133 CMPL R8, $0x08 19134 JBE repeat_two_lz4_s2 19135 CMPL R8, $0x0c 19136 JAE cant_repeat_two_offset_lz4_s2 19137 CMPL R9, $0x00000800 19138 JB repeat_two_offset_lz4_s2 19139 19140 cant_repeat_two_offset_lz4_s2: 19141 CMPL R10, $0x00000104 19142 JB repeat_three_lz4_s2 19143 CMPL R10, $0x00010100 19144 JB repeat_four_lz4_s2 19145 CMPL R10, $0x0100ffff 19146 JB repeat_five_lz4_s2 19147 LEAL -16842747(R10), R10 19148 MOVL $0xfffb001d, (AX) 19149 MOVB $0xff, 4(AX) 19150 ADDQ $0x05, AX 19151 JMP emit_repeat_again_lz4_s2 19152 19153 repeat_five_lz4_s2: 19154 LEAL -65536(R10), R10 19155 MOVL R10, R9 19156 MOVW $0x001d, (AX) 19157 MOVW R10, 2(AX) 19158 SARL $0x10, R9 19159 MOVB R9, 4(AX) 19160 ADDQ $0x05, AX 19161 JMP lz4_s2_loop 19162 19163 repeat_four_lz4_s2: 19164 LEAL -256(R10), R10 19165 MOVW $0x0019, (AX) 19166 MOVW R10, 2(AX) 19167 ADDQ $0x04, AX 19168 JMP lz4_s2_loop 19169 19170 repeat_three_lz4_s2: 19171 LEAL -4(R10), R10 19172 MOVW $0x0015, (AX) 19173 MOVB R10, 2(AX) 19174 ADDQ $0x03, AX 19175 JMP lz4_s2_loop 19176 19177 repeat_two_lz4_s2: 19178 SHLL $0x02, R10 19179 ORL $0x01, R10 19180 MOVW R10, (AX) 19181 ADDQ $0x02, AX 19182 JMP lz4_s2_loop 19183 19184 
repeat_two_offset_lz4_s2: 19185 XORQ R8, R8 19186 LEAL 1(R8)(R10*4), R10 19187 MOVB R9, 1(AX) 19188 SARL $0x08, R9 19189 SHLL $0x05, R9 19190 ORL R9, R10 19191 MOVB R10, (AX) 19192 ADDQ $0x02, AX 19193 JMP lz4_s2_loop 19194 19195 lz4_s2_docopy: 19196 MOVQ R9, DI 19197 19198 // emitCopy 19199 CMPL R10, $0x40 19200 JBE two_byte_offset_short_lz4_s2 19201 CMPL R9, $0x00000800 19202 JAE long_offset_short_lz4_s2 19203 MOVL $0x00000001, R8 19204 LEAL 16(R8), R8 19205 MOVB R9, 1(AX) 19206 MOVL R9, R11 19207 SHRL $0x08, R11 19208 SHLL $0x05, R11 19209 ORL R11, R8 19210 MOVB R8, (AX) 19211 ADDQ $0x02, AX 19212 SUBL $0x08, R10 19213 19214 // emitRepeat 19215 LEAL -4(R10), R10 19216 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b 19217 19218 emit_repeat_again_lz4_s2_emit_copy_short_2b: 19219 MOVL R10, R8 19220 LEAL -4(R10), R10 19221 CMPL R8, $0x08 19222 JBE repeat_two_lz4_s2_emit_copy_short_2b 19223 CMPL R8, $0x0c 19224 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b 19225 CMPL R9, $0x00000800 19226 JB repeat_two_offset_lz4_s2_emit_copy_short_2b 19227 19228 cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: 19229 CMPL R10, $0x00000104 19230 JB repeat_three_lz4_s2_emit_copy_short_2b 19231 CMPL R10, $0x00010100 19232 JB repeat_four_lz4_s2_emit_copy_short_2b 19233 CMPL R10, $0x0100ffff 19234 JB repeat_five_lz4_s2_emit_copy_short_2b 19235 LEAL -16842747(R10), R10 19236 MOVL $0xfffb001d, (AX) 19237 MOVB $0xff, 4(AX) 19238 ADDQ $0x05, AX 19239 JMP emit_repeat_again_lz4_s2_emit_copy_short_2b 19240 19241 repeat_five_lz4_s2_emit_copy_short_2b: 19242 LEAL -65536(R10), R10 19243 MOVL R10, R9 19244 MOVW $0x001d, (AX) 19245 MOVW R10, 2(AX) 19246 SARL $0x10, R9 19247 MOVB R9, 4(AX) 19248 ADDQ $0x05, AX 19249 JMP lz4_s2_loop 19250 19251 repeat_four_lz4_s2_emit_copy_short_2b: 19252 LEAL -256(R10), R10 19253 MOVW $0x0019, (AX) 19254 MOVW R10, 2(AX) 19255 ADDQ $0x04, AX 19256 JMP lz4_s2_loop 19257 19258 repeat_three_lz4_s2_emit_copy_short_2b: 19259 LEAL -4(R10), R10 19260 MOVW 
$0x0015, (AX) 19261 MOVB R10, 2(AX) 19262 ADDQ $0x03, AX 19263 JMP lz4_s2_loop 19264 19265 repeat_two_lz4_s2_emit_copy_short_2b: 19266 SHLL $0x02, R10 19267 ORL $0x01, R10 19268 MOVW R10, (AX) 19269 ADDQ $0x02, AX 19270 JMP lz4_s2_loop 19271 19272 repeat_two_offset_lz4_s2_emit_copy_short_2b: 19273 XORQ R8, R8 19274 LEAL 1(R8)(R10*4), R10 19275 MOVB R9, 1(AX) 19276 SARL $0x08, R9 19277 SHLL $0x05, R9 19278 ORL R9, R10 19279 MOVB R10, (AX) 19280 ADDQ $0x02, AX 19281 JMP lz4_s2_loop 19282 19283 long_offset_short_lz4_s2: 19284 MOVB $0xee, (AX) 19285 MOVW R9, 1(AX) 19286 LEAL -60(R10), R10 19287 ADDQ $0x03, AX 19288 19289 // emitRepeat 19290 emit_repeat_again_lz4_s2_emit_copy_short: 19291 MOVL R10, R8 19292 LEAL -4(R10), R10 19293 CMPL R8, $0x08 19294 JBE repeat_two_lz4_s2_emit_copy_short 19295 CMPL R8, $0x0c 19296 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short 19297 CMPL R9, $0x00000800 19298 JB repeat_two_offset_lz4_s2_emit_copy_short 19299 19300 cant_repeat_two_offset_lz4_s2_emit_copy_short: 19301 CMPL R10, $0x00000104 19302 JB repeat_three_lz4_s2_emit_copy_short 19303 CMPL R10, $0x00010100 19304 JB repeat_four_lz4_s2_emit_copy_short 19305 CMPL R10, $0x0100ffff 19306 JB repeat_five_lz4_s2_emit_copy_short 19307 LEAL -16842747(R10), R10 19308 MOVL $0xfffb001d, (AX) 19309 MOVB $0xff, 4(AX) 19310 ADDQ $0x05, AX 19311 JMP emit_repeat_again_lz4_s2_emit_copy_short 19312 19313 repeat_five_lz4_s2_emit_copy_short: 19314 LEAL -65536(R10), R10 19315 MOVL R10, R9 19316 MOVW $0x001d, (AX) 19317 MOVW R10, 2(AX) 19318 SARL $0x10, R9 19319 MOVB R9, 4(AX) 19320 ADDQ $0x05, AX 19321 JMP lz4_s2_loop 19322 19323 repeat_four_lz4_s2_emit_copy_short: 19324 LEAL -256(R10), R10 19325 MOVW $0x0019, (AX) 19326 MOVW R10, 2(AX) 19327 ADDQ $0x04, AX 19328 JMP lz4_s2_loop 19329 19330 repeat_three_lz4_s2_emit_copy_short: 19331 LEAL -4(R10), R10 19332 MOVW $0x0015, (AX) 19333 MOVB R10, 2(AX) 19334 ADDQ $0x03, AX 19335 JMP lz4_s2_loop 19336 19337 repeat_two_lz4_s2_emit_copy_short: 19338 SHLL $0x02, 
R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

two_byte_offset_short_lz4_s2:
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE emit_copy_three_lz4_s2
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

emit_copy_three_lz4_s2:
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

lz4_s2_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_s2_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4_s2_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4sBlockAsm reads src from src_base up to src_base+src_len and writes
// re-encoded literal, copy and repeat operations starting at dst_base.
//
// Registers, as used throughout the body:
//   AX  dst write pointer
//   CX  dst_base + dst_len - 10; output is refused once AX (or the end of a
//       pending literal run) reaches this limit
//   DX  src read pointer
//   BX  src_base + src_len (end of input)
//   SI  running sum of literal and match lengths, returned as uncompressed
//   DI  offset of the most recent copy; a match whose offset equals DI takes
//       the emitRepeat path instead of emitCopy
//
// Results:
//   success        uncompressed = SI, dstUsed = AX - dst_base
//   corrupt input  uncompressed = -1 (dstUsed is not written)
//   dst too small  uncompressed = -2 (dstUsed is not written)
TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

	// Main loop: bounds-check both pointers, then read one token byte and
	// split it into the high nibble (R9) and the low nibble (R10).
lz4s_s2_loop:
	CMPQ DX, BX
	JAE lz4s_s2_corrupt
	CMPQ AX, CX
	JAE lz4s_s2_dstfull
	MOVBQZX (DX), R8
	MOVQ R8, R9
	MOVQ R8, R10
	SHRQ $0x04, R9
	ANDQ $0x0f, R10
	CMPQ R8, $0xf0
	JB lz4s_s2_ll_end

	// Token >= 0xf0 (high nibble 15): add each following byte to R9 and keep
	// going while the byte just read equals 0xff.
lz4s_s2_ll_loop:
	INCQ DX
	CMPQ DX, BX
	JAE lz4s_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ R8, R9
	CMPQ R8, $0xff
	JEQ lz4s_s2_ll_loop

	// R9 is the literal count. After the two INCQs DX points at the first
	// literal byte and R8 just past the last one. R10 becomes low nibble + 3.
lz4s_s2_ll_end:
	LEAQ (DX)(R9*1), R8
	ADDQ $0x03, R10
	CMPQ R8, BX
	JAE lz4s_s2_corrupt
	INCQ DX
	INCQ R8
	TESTQ R9, R9
	JZ lz4s_s2_lits_done
	LEAQ (AX)(R9*1), R11
	CMPQ R11, CX
	JAE lz4s_s2_dstfull
	ADDQ R9, SI

	// Literal header, sized by R11 = R9-1:
	//   < 0x3c       1 byte:  R11<<2
	//   < 0x100      2 bytes: 0xf0, R11
	//   < 0x10000    3 bytes: 0xf4, 16-bit R11
	//   < 0x1000000  4 bytes: 0xf8, low 24 bits of R11
	//   otherwise    5 bytes: 0xfc, 32-bit R11
	LEAL -1(R9), R11
	CMPL R11, $0x3c
	JB one_byte_lz4s_s2
	CMPL R11, $0x00000100
	JB two_bytes_lz4s_s2
	CMPL R11, $0x00010000
	JB three_bytes_lz4s_s2
	CMPL R11, $0x01000000
	JB four_bytes_lz4s_s2
	MOVB $0xfc, (AX)
	MOVL R11, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4s_s2

four_bytes_lz4s_s2:
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4s_s2

three_bytes_lz4s_s2:
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4s_s2

	// Two-byte header: runs with R11 < 0x40 (at most 64 bytes) use the short
	// copy, longer ones the long copy.
two_bytes_lz4s_s2:
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX
	CMPL R11, $0x40
	JB memmove_lz4s_s2
	JMP memmove_long_lz4s_s2

one_byte_lz4s_s2:
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

	// Short literal copy of R9 bytes from DX to AX. R11 holds the new write
	// pointer (AX+R9). Each size class uses fixed-width, possibly overlapping
	// loads and stores.
memmove_lz4s_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	CMPQ R9, $0x08
	JBE emit_lit_memmove_lz4s_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
	JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64

	// R9 <= 8: always moves a full 8 bytes; AX still advances by R9 only.
emit_lit_memmove_lz4s_s2_memmove_move_8:
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP memmove_end_copy_lz4s_s2

	// DX is overwritten with data here; it is reloaded from R8 at
	// lz4s_s2_lits_emit_done.
emit_lit_memmove_lz4s_s2_memmove_move_8through16:
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_lz4s_s2

emit_lit_memmove_lz4s_s2_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4s_s2:
	MOVQ R11, AX
	JMP lz4s_s2_lits_emit_done

	// Long literal copy of R9 bytes. The first and last 32 bytes are loaded
	// up front (X0-X3) and stored last; the middle is moved 32 bytes per
	// iteration using aligned stores (MOVOA) to the destination.
memmove_long_lz4s_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ AX, R12
	ANDL $0x0000001f, R12
	MOVQ $0x00000040, R14
	SUBQ R12, R14
	DECQ R13
	JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	LEAQ -32(DX)(R14*1), R12
	LEAQ -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4s_s2large_big_loop_back:
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R12
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back

emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ R11, AX

	// Resume reading just past the literals.
lz4s_s2_lits_emit_done:
	MOVQ R8, DX

	// Input exhausted: a clean end requires R10 == 3 (low nibble 0),
	// anything else is reported as corrupt.
lz4s_s2_lits_done:
	CMPQ DX, BX
	JNE lz4s_s2_match
	CMPQ R10, $0x03
	JEQ lz4s_s2_done
	JMP lz4s_s2_corrupt

	// R10 == 3 (low nibble 0): nothing to copy for this token, go on to the
	// next one. Otherwise read the 16-bit offset into R9; an offset of zero
	// or one larger than SI is rejected.
lz4s_s2_match:
	CMPQ R10, $0x03
	JEQ lz4s_s2_loop
	LEAQ 2(DX), R8
	CMPQ R8, BX
	JAE lz4s_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ R8, DX
	TESTQ R9, R9
	JZ lz4s_s2_corrupt
	CMPQ R9, SI
	JA lz4s_s2_corrupt
	CMPQ R10, $0x12
	JNE lz4s_s2_ml_done

	// R10 == 0x12 (low nibble 15): add each following byte to R10 and keep
	// going while the byte just read equals 0xff.
lz4s_s2_ml_loop:
	MOVBQZX (DX), R8
	INCQ DX
	ADDQ R8, R10
	CMPQ DX, BX
	JAE lz4s_s2_corrupt
	CMPQ R8, $0xff
	JEQ lz4s_s2_ml_loop

	// R10 is the final match length. Same offset as the previous copy falls
	// through to emitRepeat; a different offset goes to lz4s_s2_docopy.
lz4s_s2_ml_done:
	ADDQ R10, SI
	CMPQ R9, DI
	JNE lz4s_s2_docopy

	// emitRepeat
emit_repeat_again_lz4_s2:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2

repeat_five_lz4_s2:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4s_s2_loop

repeat_four_lz4_s2:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4s_s2_loop

repeat_three_lz4_s2:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

repeat_two_lz4_s2:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

repeat_two_offset_lz4_s2:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

	// New offset: remember it in DI, then emit a copy.
	//   R10 <= 64                a single 2- or 3-byte copy
	//   R10 > 64, R9 <  0x800    2-byte copy, R10 -= 8, rest via emitRepeat
	//   R10 > 64, R9 >= 0x800    3-byte 0xee copy, R10 -= 60, rest via emitRepeat
lz4s_s2_docopy:
	MOVQ R9, DI

	// emitCopy
	CMPL R10, $0x40
	JBE two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE long_offset_short_lz4_s2
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	LEAL -4(R10), R10
	JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

emit_repeat_again_lz4_s2_emit_copy_short_2b:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2_emit_copy_short_2b
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

long_offset_short_lz4_s2:
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JBE repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JB repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	CMPL R10, $0x00000104
	JB repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JB repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JB repeat_five_lz4_s2_emit_copy_short
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4s_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4s_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

	// Copy of at most 64 bytes. The 2-byte form is used only when
	// R10 < 12 and R9 < 0x800; everything else takes the 3-byte form.
two_byte_offset_short_lz4_s2:
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JAE emit_copy_three_lz4_s2
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP lz4s_s2_loop

emit_copy_three_lz4_s2:
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP lz4s_s2_loop

	// Success: dstUsed = AX - dst_base, uncompressed = SI.
lz4s_s2_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Corrupt input: uncompressed = -1. dstUsed is left unwritten.
lz4s_s2_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// Destination too small: uncompressed = -2. dstUsed is left unwritten.
lz4s_s2_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4BlockSnappyAsm reads src from src_base up to src_base+src_len and
// writes re-encoded literal and copy operations starting at dst_base. This
// variant keeps no previous offset and has no emitRepeat path.
//
// Registers, as used throughout the body:
//   AX  dst write pointer
//   CX  dst_base + dst_len - 10 (output limit)
//   DX  src read pointer
//   BX  src_base + src_len (end of input)
//   SI  running sum of literal and match lengths, returned as uncompressed
//   R8  literal count, later the match offset
//   R9  match length (low token nibble + 4, plus extension bytes)
//
// Results:
//   success        uncompressed = SI, dstUsed = AX - dst_base
//   corrupt input  uncompressed = -1 (dstUsed is not written)
//   dst too small  uncompressed = -2 (dstUsed is not written)
TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX

	// Main loop: bounds-check both pointers, then read one token byte and
	// split it into the high nibble (R8) and the low nibble (R9).
lz4_snappy_loop:
	CMPQ DX, BX
	JAE lz4_snappy_corrupt
	CMPQ AX, CX
	JAE lz4_snappy_dstfull
	MOVBQZX (DX), DI
	MOVQ DI, R8
	MOVQ DI, R9
	SHRQ $0x04, R8
	ANDQ $0x0f, R9
	CMPQ DI, $0xf0
	JB lz4_snappy_ll_end

	// Token >= 0xf0: add each following byte to R8 while it equals 0xff.
lz4_snappy_ll_loop:
	INCQ DX
	CMPQ DX, BX
	JAE lz4_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ DI, R8
	CMPQ DI, $0xff
	JEQ lz4_snappy_ll_loop

	// After the two INCQs DX points at the first literal byte and DI just
	// past the last one. R9 becomes low nibble + 4.
lz4_snappy_ll_end:
	LEAQ (DX)(R8*1), DI
	ADDQ $0x04, R9
	CMPQ DI, BX
	JAE lz4_snappy_corrupt
	INCQ DX
	INCQ DI
	TESTQ R8, R8
	JZ lz4_snappy_lits_done
	LEAQ (AX)(R8*1), R10
	CMPQ R10, CX
	JAE lz4_snappy_dstfull
	ADDQ R8, SI

	// Literal header sized by R10 = R8-1: 1 byte below 0x3c, otherwise tag
	// 0xf0/0xf4/0xf8/0xfc followed by 1/2/3/4 length bytes.
	LEAL -1(R8), R10
	CMPL R10, $0x3c
	JB one_byte_lz4_snappy
	CMPL R10, $0x00000100
	JB two_bytes_lz4_snappy
	CMPL R10, $0x00010000
	JB three_bytes_lz4_snappy
	CMPL R10, $0x01000000
	JB four_bytes_lz4_snappy
	MOVB $0xfc, (AX)
	MOVL R10, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4_snappy

four_bytes_lz4_snappy:
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4_snappy

three_bytes_lz4_snappy:
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4_snappy

two_bytes_lz4_snappy:
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	CMPL R10, $0x40
	JB memmove_lz4_snappy
	JMP memmove_long_lz4_snappy

one_byte_lz4_snappy:
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

	// Short literal copy of R8 (at most 64) bytes; R10 = AX+R8 is the new
	// write pointer.
memmove_lz4_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE emit_lit_memmove_lz4_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
	JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64

	// R8 <= 8: always moves a full 8 bytes; AX still advances by R8 only.
emit_lit_memmove_lz4_snappy_memmove_move_8:
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP memmove_end_copy_lz4_snappy

	// DX is overwritten with data here; it is reloaded from DI at
	// lz4_snappy_lits_emit_done.
emit_lit_memmove_lz4_snappy_memmove_move_8through16:
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4_snappy:
	MOVQ R10, AX
	JMP lz4_snappy_lits_emit_done

	// Long literal copy of R8 bytes: head and tail 32 bytes are loaded first
	// (X0-X3) and stored last; the middle moves 32 bytes per iteration.
memmove_long_lz4_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	DECQ R12
	JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	LEAQ -32(DX)(R13*1), R11
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back

emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ R10, AX

	// Resume reading just past the literals.
lz4_snappy_lits_emit_done:
	MOVQ DI, DX

	// Input exhausted: a clean end requires R9 == 4 (low nibble 0).
lz4_snappy_lits_done:
	CMPQ DX, BX
	JNE lz4_snappy_match
	CMPQ R9, $0x04
	JEQ lz4_snappy_done
	JMP lz4_snappy_corrupt

	// Read the 16-bit offset into R8; zero or larger than SI is rejected.
lz4_snappy_match:
	LEAQ 2(DX), DI
	CMPQ DI, BX
	JAE lz4_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ DI, DX
	TESTQ R8, R8
	JZ lz4_snappy_corrupt
	CMPQ R8, SI
	JA lz4_snappy_corrupt
	CMPQ R9, $0x13
	JNE lz4_snappy_ml_done

	// R9 == 0x13 (low nibble 15): add each following byte to R9 while it
	// equals 0xff.
lz4_snappy_ml_loop:
	MOVBQZX (DX), DI
	INCQ DX
	ADDQ DI, R9
	CMPQ DX, BX
	JAE lz4_snappy_corrupt
	CMPQ DI, $0xff
	JEQ lz4_snappy_ml_loop

lz4_snappy_ml_done:
	ADDQ R9, SI

	// While more than 64 bytes remain, emit a 3-byte 0xee copy and subtract
	// 60 from R9. If AX reaches the limit in between, control returns to the
	// main loop, whose entry checks run next.
	// emitCopy
two_byte_offset_lz4_s2:
	CMPL R9, $0x40
	JBE two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	CMPQ AX, CX
	JAE lz4_snappy_loop
	JMP two_byte_offset_lz4_s2

	// Final piece (at most 64 bytes): 2-byte form only when R9 < 12 and
	// R8 < 0x800, otherwise the 3-byte form.
two_byte_offset_short_lz4_s2:
	MOVL R9, DI
	SHLL $0x02, DI
	CMPL R9, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE emit_copy_three_lz4_s2
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP lz4_snappy_loop

emit_copy_three_lz4_s2:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_snappy_loop

	// Success: dstUsed = AX - dst_base, uncompressed = SI.
lz4_snappy_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Corrupt input: uncompressed = -1. dstUsed is left unwritten.
lz4_snappy_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// Destination too small: uncompressed = -2. dstUsed is left unwritten.
lz4_snappy_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4sBlockSnappyAsm has the same structure as cvtLZ4BlockSnappyAsm with
// two visible differences: the low token nibble is biased by 3 instead of 4
// (so the extension threshold is 0x12 instead of 0x13), and a token whose
// biased match length equals 3 carries no match and goes straight back to
// the main loop.
//
// Registers, as used throughout the body:
//   AX  dst write pointer
//   CX  dst_base + dst_len - 10 (output limit)
//   DX  src read pointer
//   BX  src_base + src_len (end of input)
//   SI  running sum of literal and match lengths, returned as uncompressed
//   R8  literal count, later the match offset
//   R9  match length
//
// Results:
//   success        uncompressed = SI, dstUsed = AX - dst_base
//   corrupt input  uncompressed = -1 (dstUsed is not written)
//   dst too small  uncompressed = -2 (dstUsed is not written)
TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX

	// Main loop: bounds-check both pointers, then read one token byte and
	// split it into the high nibble (R8) and the low nibble (R9).
lz4s_snappy_loop:
	CMPQ DX, BX
	JAE lz4s_snappy_corrupt
	CMPQ AX, CX
	JAE lz4s_snappy_dstfull
	MOVBQZX (DX), DI
	MOVQ DI, R8
	MOVQ DI, R9
	SHRQ $0x04, R8
	ANDQ $0x0f, R9
	CMPQ DI, $0xf0
	JB lz4s_snappy_ll_end

	// Token >= 0xf0: add each following byte to R8 while it equals 0xff.
lz4s_snappy_ll_loop:
	INCQ DX
	CMPQ DX, BX
	JAE lz4s_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ DI, R8
	CMPQ DI, $0xff
	JEQ lz4s_snappy_ll_loop

	// After the two INCQs DX points at the first literal byte and DI just
	// past the last one. R9 becomes low nibble + 3.
lz4s_snappy_ll_end:
	LEAQ (DX)(R8*1), DI
	ADDQ $0x03, R9
	CMPQ DI, BX
	JAE lz4s_snappy_corrupt
	INCQ DX
	INCQ DI
	TESTQ R8, R8
	JZ lz4s_snappy_lits_done
	LEAQ (AX)(R8*1), R10
	CMPQ R10, CX
	JAE lz4s_snappy_dstfull
	ADDQ R8, SI

	// Literal header sized by R10 = R8-1: 1 byte below 0x3c, otherwise tag
	// 0xf0/0xf4/0xf8/0xfc followed by 1/2/3/4 length bytes.
	LEAL -1(R8), R10
	CMPL R10, $0x3c
	JB one_byte_lz4s_snappy
	CMPL R10, $0x00000100
	JB two_bytes_lz4s_snappy
	CMPL R10, $0x00010000
	JB three_bytes_lz4s_snappy
	CMPL R10, $0x01000000
	JB four_bytes_lz4s_snappy
	MOVB $0xfc, (AX)
	MOVL R10, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4s_snappy

four_bytes_lz4s_snappy:
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4s_snappy

three_bytes_lz4s_snappy:
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4s_snappy

two_bytes_lz4s_snappy:
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	CMPL R10, $0x40
	JB memmove_lz4s_snappy
	JMP memmove_long_lz4s_snappy

one_byte_lz4s_snappy:
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

	// Short literal copy of R8 (at most 64) bytes; R10 = AX+R8 is the new
	// write pointer.
memmove_lz4s_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	CMPQ R8, $0x08
	JBE emit_lit_memmove_lz4s_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
	JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64

	// R8 <= 8: always moves a full 8 bytes; AX still advances by R8 only.
emit_lit_memmove_lz4s_snappy_memmove_move_8:
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP memmove_end_copy_lz4s_snappy

	// DX is overwritten with data here; it is reloaded from DI at
	// lz4s_snappy_lits_emit_done.
emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_lz4s_snappy

emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4s_snappy:
	MOVQ R10, AX
	JMP lz4s_snappy_lits_emit_done

	// Long literal copy of R8 bytes: head and tail 32 bytes are loaded first
	// (X0-X3) and stored last; the middle moves 32 bytes per iteration.
memmove_long_lz4s_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	DECQ R12
	JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	LEAQ -32(DX)(R13*1), R11
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back

emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ R10, AX

	// Resume reading just past the literals.
lz4s_snappy_lits_emit_done:
	MOVQ DI, DX

	// Input exhausted: a clean end requires R9 == 3 (low nibble 0).
lz4s_snappy_lits_done:
	CMPQ DX, BX
	JNE lz4s_snappy_match
	CMPQ R9, $0x03
	JEQ lz4s_snappy_done
	JMP lz4s_snappy_corrupt

	// R9 == 3 (low nibble 0): no match for this token, continue with the
	// next one. Otherwise read the 16-bit offset into R8; zero or larger
	// than SI is rejected.
lz4s_snappy_match:
	CMPQ R9, $0x03
	JEQ lz4s_snappy_loop
	LEAQ 2(DX), DI
	CMPQ DI, BX
	JAE lz4s_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ DI, DX
	TESTQ R8, R8
	JZ lz4s_snappy_corrupt
	CMPQ R8, SI
	JA lz4s_snappy_corrupt
	CMPQ R9, $0x12
	JNE lz4s_snappy_ml_done

	// R9 == 0x12 (low nibble 15): add each following byte to R9 while it
	// equals 0xff.
lz4s_snappy_ml_loop:
	MOVBQZX (DX), DI
	INCQ DX
	ADDQ DI, R9
	CMPQ DX, BX
	JAE lz4s_snappy_corrupt
	CMPQ DI, $0xff
	JEQ lz4s_snappy_ml_loop

lz4s_snappy_ml_done:
	ADDQ R9, SI

	// While more than 64 bytes remain, emit a 3-byte 0xee copy and subtract
	// 60 from R9. If AX reaches the limit in between, control returns to the
	// main loop, whose entry checks run next.
	// emitCopy
two_byte_offset_lz4_s2:
	CMPL R9, $0x40
	JBE two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	CMPQ AX, CX
	JAE lz4s_snappy_loop
	JMP two_byte_offset_lz4_s2

	// Final piece (at most 64 bytes): 2-byte form only when R9 < 12 and
	// R8 < 0x800, otherwise the 3-byte form.
two_byte_offset_short_lz4_s2:
	MOVL R9, DI
	SHLL $0x02, DI
	CMPL R9, $0x0c
	JAE emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JAE emit_copy_three_lz4_s2
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP lz4s_snappy_loop

emit_copy_three_lz4_s2:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP lz4s_snappy_loop

	// Success: dstUsed = AX - dst_base, uncompressed = SI.
lz4s_snappy_done:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

	// Corrupt input: uncompressed = -1. dstUsed is left unwritten.
lz4s_snappy_corrupt:
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

	// Destination too small: uncompressed = -2. dstUsed is left unwritten.
lz4s_snappy_dstfull:
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET