# Calling convention:
#
# %rdi contains the first argument: HUF_DecompressAsmArgs*.
# %rbp is not maintained (no frame pointer).
# %rsp contains the stack pointer that grows down.
# No red-zone is assumed, only addresses >= %rsp are used.
# All register contents are preserved.
#
# TODO: Support Windows calling convention.

# Stack marking.
# Hand-written assembly has to state explicitly that it does not need an
# executable stack: without an empty .note.GNU-stack section, an ELF link
# that includes this object can end up with an executable program stack.
# Emitted outside the main guard so the object is marked even when the
# rest of the file is compiled out.
#if defined(__ELF__) && defined(__GNUC__)
.section .note.GNU-stack,"",%progbits
#endif

#if !defined(HUF_DISABLE_ASM) && defined(__x86_64__)
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop

# Everything below is code; this also switches back from the note section.
.text

# Sets up register mappings for clarity.
# op[], bits[], dtable & ip[0] each get their own register.
# ip[1,2,3] & olimit alias var[].
# %rax is a scratch register.

#define op0    rsi
#define op1    rbx
#define op2    rcx
#define op3    rdi

#define ip0    r8
#define ip1    r9
#define ip2    r10
#define ip3    r11

#define bits0  rbp
#define bits1  rdx
#define bits2  r12
#define bits3  r13
#define dtable r14
#define olimit r15

# var[] aliases ip[1,2,3] & olimit
# ip[1,2,3] are saved every iteration.
# olimit is only used in compute_olimit.
#define var0   r15
#define var1   r9
#define var2   r10
#define var3   r11

# 32-bit var registers
#define vard0  r15d
#define vard1  r9d
#define vard2  r10d
#define vard3  r11d

# Helper macro: IF_NOT_4(idx, ...) expands to its trailing arguments when
# idx is 0, 1, 2 or 3 and to nothing when idx is 4.
# The extra IF_NOT_4_ level lets idx be macro-expanded before it is pasted
# onto the IF_NOT_4_ prefix.
#define IF_NOT_4_0(...) __VA_ARGS__
#define IF_NOT_4_1(...) __VA_ARGS__
#define IF_NOT_4_2(...) __VA_ARGS__
#define IF_NOT_4_3(...) __VA_ARGS__
#define IF_NOT_4_4(...)
#define IF_NOT_4_(idx, ...) IF_NOT_4_##idx(__VA_ARGS__)
#define IF_NOT_4(idx, ...) IF_NOT_4_(idx, __VA_ARGS__)

# Calls X(N) for each stream 0, 1, 2, 3.
#define FOR_EACH_STREAM(X) \
    X(0);                  \
    X(1);                  \
    X(2);                  \
    X(3)

# Calls X(N, idx) for each stream 0, 1, 2, 3.
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
    X(0, idx);                             \
    X(1, idx);                             \
    X(2, idx);                             \
    X(3, idx)

# Define both _HUF_* & HUF_* symbols because MacOS
# C symbols are prefixed with an underscore & Linux symbols are not.
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
    # Fast loop of the 4-stream X1 decoder.
    #
    # In:  %rdi = HUF_DecompressAsmArgs*. Fields read here, by byte offset:
    #        0/8/16/24   ip[0..3]      32/40/48/56  op[0..3]
    #        64/72/80/88 bits[0..3]    96 dtable    104 ilimit    112 oend
    # Out: ip[], op[] and bits[] are written back to the same offsets.
    #      Every general-purpose register is restored before returning.
    #
    # Each loop iteration stores 5 bytes through each of the four op[]
    # cursors (20 bytes total). The loop runs in batches: compute_olimit
    # derives how far op3 may advance, the loop body runs until op3 reaches
    # that limit, then the limit is recomputed. The function leaves once
    # the computed limit allows less than one full iteration or the ip[]
    # cursors are no longer ordered ip0 <= ip1 <= ip2 <= ip3.
    #
    # Stack layout after the prologue below (offsets from %rsp):
    #    0, 8, 16   scratch: ip1, ip2, ip3 inside the loop;
    #               rbx and rdx (slots 0 and 8) inside compute_olimit
    #   24          olimit
    #   32          oend
    #   40          ilimit
    #   48          HUF_DecompressAsmArgs*
    #   56...       the 15 saved registers, then the return address

    # Save every general-purpose register except %rsp, including the
    # caller-saved ones, for simplicity.
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    # Read HUF_DecompressAsmArgs* args from %rax
    # (the pointer is copied out of %rdi first because %rdi is op3 and is
    # overwritten by the loads below).
    movq %rdi, %rax
    movq 0(%rax), %ip0
    movq 8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax # argument
    push 104(%rax) # ilimit
    push 112(%rax) # oend
    push %olimit # olimit space

    # Three 8-byte scratch slots at 0, 8 and 16(%rsp).
    subq $24, %rsp

.L_4X1_compute_olimit:
    # Computes how many iterations we can do safely
    # %r15, %rax may be clobbered
    # rbx, rdx must be saved
    # op3 & ip0 must not be clobbered
    #
    # Both divisions below are multiplications by a fixed-point reciprocal:
    # mulq leaves the high 64 bits of %rax * operand in %rdx.
    movq %rbx, 0(%rsp)
    movq %rdx, 8(%rsp)

    movq 32(%rsp), %rax # rax = oend
    subq %op3, %rax # rax = oend - op3

    # r15 = (oend - op3) / 5
    # The constant is 0xCCCCCCCCCCCCCCCD = (2^66 + 1) / 5, so the high half
    # of the product shifted right by 2 is the quotient.
    movabsq $-3689348814741910323, %rdx
    mulq %rdx
    movq %rdx, %r15
    shrq $2, %r15

    movq %ip0, %rax # rax = ip0
    movq 40(%rsp), %rdx # rdx = ilimit
    subq %rdx, %rax # rax = ip0 - ilimit
    movq %rax, %rbx # rbx = ip0 - ilimit

    # rdx = (ip0 - ilimit) / 7
    # The constant is 0x2492492492492493. With n = ip0 - ilimit and
    # q = high half of the product, the quotient is (((n - q) >> 1) + q) >> 2.
    # NOTE(review): presumably 7 is the most input bytes one stream can
    # consume per iteration (5 symbols of at most 11 bits) - confirm.
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %rbx
    shrq %rbx
    addq %rbx, %rdx
    shrq $2, %rdx

    # r15 = min(%rdx, %r15)
    cmpq %rdx, %r15
    cmova %rdx, %r15

    # r15 = r15 * 5
    leaq (%r15, %r15, 4), %r15

    # olimit = op3 + r15
    # (olimit is %r15, so this adds op3 in place)
    addq %op3, %olimit

    movq 8(%rsp), %rdx
    movq 0(%rsp), %rbx

    # If (op3 + 20 > olimit)
    movq %op3, %rax # rax = op3
    addq $20, %rax # rax = op3 + 20
    cmpq %rax, %olimit # op3 + 20 > olimit
    jb .L_4X1_exit

    # If (ip1 < ip0) go to exit
    cmpq %ip0, %ip1
    jb .L_4X1_exit

    # If (ip2 < ip1) go to exit
    cmpq %ip1, %ip2
    jb .L_4X1_exit

    # If (ip3 < ip2) go to exit
    cmpq %ip2, %ip3
    jb .L_4X1_exit

# Reads top 11 bits from bits[n]
# Loads dt[bits[n]] into var[n]
# (dt entries are 16 bits wide; the load zero-extends into var[n])
#define GET_NEXT_DELT(n)                  \
    movq $53, %var##n;                    \
    shrxq %var##n, %bits##n, %var##n;     \
    movzwl (%dtable,%var##n,2),%vard##n

# var[n] must contain the DTable entry computed with GET_NEXT_DELT
# Moves var[n] to %rax
# bits[n] <<= var[n] & 63
# op[n][idx] = %rax >> 8
# %ah is a way to access bits [8, 16) of %rax
# %ah cannot be encoded together with a REX prefix, which is why the
# store goes through op[] registers that are all legacy registers.
#define DECODE_FROM_DELT(n, idx)          \
    movq %var##n, %rax;                   \
    shlxq %var##n, %bits##n, %bits##n;    \
    movb %ah, idx(%op##n)

# Assumes GET_NEXT_DELT has been called.
# Calls DECODE_FROM_DELT then GET_NEXT_DELT unless idx is 4
# (after the last symbol of an iteration the next entry is fetched
# by the loop body once the stream has been reloaded).
#define DECODE(n, idx)                    \
    DECODE_FROM_DELT(n, idx);             \
    IF_NOT_4(idx, GET_NEXT_DELT(n))

# Pseudo-code (ctz & nbBytes are stored in bits[n], nbBits in %rax):
#   ctz = CTZ[bits[n]]
#   nbBits = ctz & 7
#   nbBytes = ctz >> 3
#   op[n] += 5
#   ip[n] -= nbBytes
#   Note: x86-64 is little-endian ==> no bswap
#   bits[n] = MEM_readST(ip[n]) | 1
#   bits[n] <<= nbBits
# The OR-ed in low bit is a marker: its position at the next reload is
# the total left shift applied to bits[n] since this load.
#define RELOAD_BITS(n)                    \
    bsfq %bits##n, %bits##n;              \
    movq %bits##n, %rax;                  \
    andq $7, %rax;                        \
    shrq $3, %bits##n;                    \
    leaq 5(%op##n), %op##n;               \
    subq %bits##n, %ip##n;                \
    movq (%ip##n), %bits##n;              \
    orq $1, %bits##n;                     \
    shlx %rax, %bits##n, %bits##n;

    # Store clobbered variables on the stack
    # (var[] shares registers with olimit and ip[1,2,3])
    movq %olimit, 24(%rsp)
    movq %ip1, 0(%rsp)
    movq %ip2, 8(%rsp)
    movq %ip3, 16(%rsp)

    # Call GET_NEXT_DELT for each stream
    FOR_EACH_STREAM(GET_NEXT_DELT)

    .p2align 6

.L_4X1_loop_body:
    # LLVM-MCA-BEGIN decode-4X1

    # Decode 5 symbols in each of the 4 streams (20 total)
    # Must have called GET_NEXT_DELT for each stream
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)

    # Load ip[1,2,3] from stack (var[] aliases them)
    # ip[] is needed for RELOAD_BITS
    # Each will be stored back to the stack after RELOAD
    movq 0(%rsp), %ip1
    movq 8(%rsp), %ip2
    movq 16(%rsp), %ip3

    # Reload each stream & fetch the next table entry
    # to prepare for the next iteration
    # (each ip[n] is stored before GET_NEXT_DELT(n) overwrites its register;
    # ip0 has a register of its own and needs no store)
    RELOAD_BITS(0)
    GET_NEXT_DELT(0)

    RELOAD_BITS(1)
    movq %ip1, 0(%rsp)
    GET_NEXT_DELT(1)

    RELOAD_BITS(2)
    movq %ip2, 8(%rsp)
    GET_NEXT_DELT(2)

    RELOAD_BITS(3)
    movq %ip3, 16(%rsp)
    GET_NEXT_DELT(3)

    # If op3 < olimit: continue the loop
    # (olimit lives in its stack slot here because %r15 holds var0)
    cmp %op3, 24(%rsp)
    ja .L_4X1_loop_body

    # Reload ip[1,2,3] from stack
    movq 0(%rsp), %ip1
    movq 8(%rsp), %ip2
    movq 16(%rsp), %ip3

    # Re-compute olimit
    jmp .L_4X1_compute_olimit

#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE
#undef RELOAD_BITS
    # LLVM-MCA-END

.L_4X1_exit:
    # Drop the three scratch slots.
    addq $24, %rsp

    # Restore stack (oend & olimit)
    # The last pop leaves the HUF_DecompressAsmArgs* in %rax.
    pop %rax # olimit
    pop %rax # oend
    pop %rax # ilimit
    pop %rax # arg

    # Save ip / op / bits
    movq %ip0, 0(%rax)
    movq %ip1, 8(%rax)
    movq %ip2, 16(%rax)
    movq %ip3, 24(%rax)
    movq %op0, 32(%rax)
    movq %op1, 40(%rax)
    movq %op2, 48(%rax)
    movq %op3, 56(%rax)
    movq %bits0, 64(%rax)
    movq %bits1, 72(%rax)
    movq %bits2, 80(%rax)
    movq %bits3, 88(%rax)

    # Restore registers
    # (reverse order of the pushes in the prologue)
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdi
    pop %rsi
    pop %rbp
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
    # Save every general-purpose register except %rsp, including the
    # caller-saved ones, for simplicity.
# Body of HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
# (the entry labels sit immediately above).
#
# In:  %rdi = HUF_DecompressAsmArgs*. Fields read here, by byte offset:
#        0/8/16/24   ip[0..3]      32/40/48/56  op[0..3]
#        64/72/80/88 bits[0..3]    96 dtable    104 ilimit    112 oend
# Out: ip[], op[] and bits[] are written back to the same offsets.
#      Every general-purpose register is restored before returning.
#
# Each loop iteration runs DECODE 5 times per stream. One DECODE stores
# 2 bytes at op[n] and advances op[n] by a length read from the table entry.
#
# Stack layout after the prologue below (offsets from %rsp):
#    0   scratch (rdx inside compute_olimit, r8 inside the loop)
#    8   oend0 = initial op1
#   16   oend1 = initial op2
#   24   oend2 = initial op3
#   32   oend3 = oend field of the arguments
#   40   ilimit
#   48   olimit
#   56   HUF_DecompressAsmArgs*
#   64...  the 15 saved registers, then the return address
    push %rax
    push %rbx
    push %rcx
    push %rdx
    push %rbp
    push %rsi
    push %rdi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15

    # The argument pointer is copied out of %rdi first because %rdi is
    # op3 and is overwritten by the loads below.
    movq %rdi, %rax
    movq 0(%rax), %ip0
    movq 8(%rax), %ip1
    movq 16(%rax), %ip2
    movq 24(%rax), %ip3
    movq 32(%rax), %op0
    movq 40(%rax), %op1
    movq 48(%rax), %op2
    movq 56(%rax), %op3
    movq 64(%rax), %bits0
    movq 72(%rax), %bits1
    movq 80(%rax), %bits2
    movq 88(%rax), %bits3
    movq 96(%rax), %dtable
    push %rax # argument
    push %rax # olimit (placeholder value; the slot is filled before the loop)
    push 104(%rax) # ilimit

    # Per-stream output limits: the output of stream n ends where the
    # output of stream n+1 starts, and stream 3 ends at oend.
    movq 112(%rax), %rax
    push %rax # oend3

    movq %op3, %rax
    push %rax # oend2

    movq %op2, %rax
    push %rax # oend1

    movq %op1, %rax
    push %rax # oend0

    # Scratch space
    subq $8, %rsp

.L_4X2_compute_olimit:
    # Computes how many iterations we can do safely
    # %r15, %rax may be clobbered
    # rdx must be saved
    # op[0,1,2,3] & ip0 must not be clobbered
    #
    # All divisions below are multiplications by a fixed-point reciprocal:
    # mulq leaves the high 64 bits of %rax * operand in %rdx.
    movq %rdx, 0(%rsp)

    # We can consume up to 7 input bytes each iteration.
    movq %ip0, %rax # rax = ip0
    movq 40(%rsp), %rdx # rdx = ilimit
    subq %rdx, %rax # rax = ip0 - ilimit
    movq %rax, %r15 # r15 = ip0 - ilimit

    # rdx = rax / 7
    # The constant is 0x2492492492492493. With n = ip0 - ilimit and
    # q = high half of the product, the quotient is (((n - q) >> 1) + q) >> 2.
    movabsq $2635249153387078803, %rdx
    mulq %rdx
    subq %rdx, %r15
    shrq %r15
    addq %r15, %rdx
    shrq $2, %rdx

    # r15 = (ip0 - ilimit) / 7
    movq %rdx, %r15

    # For each stream: rdx = (oend[n] - op[n]) / 10, then fold into the min.
    # The constant is 0xCCCCCCCCCCCCCCCD = (2^66 + 1) / 5, so the high half
    # of the product shifted right by 3 is the quotient by 10.
    # NOTE(review): presumably 10 is the most output bytes one stream can
    # produce per iteration (5 symbols of at most 2 bytes) - confirm.
    movabsq $-3689348814741910323, %rdx
    movq 8(%rsp), %rax # rax = oend0
    subq %op0, %rax # rax = oend0 - op0
    mulq %rdx
    shrq $3, %rdx # rdx = rax / 10

    # r15 = min(%rdx, %r15)
    cmpq %rdx, %r15
    cmova %rdx, %r15

    movabsq $-3689348814741910323, %rdx
    movq 16(%rsp), %rax # rax = oend1
    subq %op1, %rax # rax = oend1 - op1
    mulq %rdx
    shrq $3, %rdx # rdx = rax / 10

    # r15 = min(%rdx, %r15)
    cmpq %rdx, %r15
    cmova %rdx, %r15

    movabsq $-3689348814741910323, %rdx
    movq 24(%rsp), %rax # rax = oend2
    subq %op2, %rax # rax = oend2 - op2
    mulq %rdx
    shrq $3, %rdx # rdx = rax / 10

    # r15 = min(%rdx, %r15)
    cmpq %rdx, %r15
    cmova %rdx, %r15

    movabsq $-3689348814741910323, %rdx
    movq 32(%rsp), %rax # rax = oend3
    subq %op3, %rax # rax = oend3 - op3
    mulq %rdx
    shrq $3, %rdx # rdx = rax / 10

    # r15 = min(%rdx, %r15)
    cmpq %rdx, %r15
    cmova %rdx, %r15

    # olimit = op3 + 5 * r15
    # (olimit is %r15 itself, so the count is copied to %rax first)
    movq %r15, %rax
    leaq (%op3, %rax, 4), %olimit
    addq %rax, %olimit

    movq 0(%rsp), %rdx

    # If (op3 + 10 > olimit)
    movq %op3, %rax # rax = op3
    addq $10, %rax # rax = op3 + 10
    cmpq %rax, %olimit # op3 + 10 > olimit
    jb .L_4X2_exit

    # If (ip1 < ip0) go to exit
    cmpq %ip0, %ip1
    jb .L_4X2_exit

    # If (ip2 < ip1) go to exit
    cmpq %ip1, %ip2
    jb .L_4X2_exit

    # If (ip3 < ip2) go to exit
    cmpq %ip2, %ip3
    jb .L_4X2_exit

# Decodes one table entry for stream n (idx is accepted but not used).
#   rax   = top 11 bits of bits[n]; table entries are 4 bytes wide
#   r8d   = entry bytes [0, 2): stored as 2 bytes at op[n]
#   r15d  = entry byte 2: shift count applied to bits[n]
#   eax   = entry byte 3: number of bytes op[n] advances
# Clobbers %rax, %r8 (ip0) and %r15 (olimit).
#define DECODE(n, idx)                    \
    movq %bits##n, %rax;                  \
    shrq $53, %rax;                       \
    movzwl 0(%dtable,%rax,4),%r8d;        \
    movzbl 2(%dtable,%rax,4),%r15d;       \
    movzbl 3(%dtable,%rax,4),%eax;        \
    movw %r8w, (%op##n);                  \
    shlxq %r15, %bits##n, %bits##n;       \
    addq %rax, %op##n

# Refills bits[n] from ip[n].
#   ctz = count of trailing zero bits of bits[n]
#   ip[n] -= ctz >> 3
#   bits[n] = (load 8 bytes at ip[n]) | 1
#   bits[n] <<= ctz & 7
# The OR-ed in low bit is a marker: its position at the next reload is
# the total left shift applied to bits[n] since this load.
# Clobbers %rax.
#define RELOAD_BITS(n)                    \
    bsfq %bits##n, %bits##n;              \
    movq %bits##n, %rax;                  \
    shrq $3, %bits##n;                    \
    andq $7, %rax;                        \
    subq %bits##n, %ip##n;                \
    movq (%ip##n), %bits##n;              \
    orq $1, %bits##n;                     \
    shlxq %rax, %bits##n, %bits##n;

    # DECODE clobbers %r15, so the loop compares against the stack copy.
    movq %olimit, 48(%rsp)

    .p2align 6

.L_4X2_loop_body:
    # LLVM-MCA-BEGIN decode-4X2

    # We clobber r8, so store it on the stack
    movq %r8, 0(%rsp)

    # Decode 5 symbols from each of the 4 streams (20 symbols total).
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)

    # Reload r8
    # (r8 is ip0, which RELOAD_BITS(0) needs)
    movq 0(%rsp), %r8

    FOR_EACH_STREAM(RELOAD_BITS)

    # Continue while op3 < olimit, otherwise recompute the limit.
    cmp %op3, 48(%rsp)
    ja .L_4X2_loop_body
    jmp .L_4X2_compute_olimit

#undef DECODE
#undef RELOAD_BITS
    # LLVM-MCA-END

.L_4X2_exit:
    # Drop the scratch slot.
    addq $8, %rsp

    # Restore stack (oend & olimit)
    # The last pop leaves the HUF_DecompressAsmArgs* in %rax.
    pop %rax # oend0
    pop %rax # oend1
    pop %rax # oend2
    pop %rax # oend3
    pop %rax # ilimit
    pop %rax # olimit
    pop %rax # arg

    # Save ip / op / bits
    movq %ip0, 0(%rax)
    movq %ip1, 8(%rax)
    movq %ip2, 16(%rax)
    movq %ip3, 24(%rax)
    movq %op0, 32(%rax)
    movq %op1, 40(%rax)
    movq %op2, 48(%rax)
    movq %op3, 56(%rax)
    movq %bits0, 64(%rax)
    movq %bits1, 72(%rax)
    movq %bits2, 80(%rax)
    movq %bits3, 88(%rax)

    # Restore registers
    # (reverse order of the pushes in the prologue)
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdi
    pop %rsi
    pop %rbp
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rax
    ret

#endif