mirror of
https://github.com/facebook/zstd.git
synced 2025-11-27 00:05:09 -05:00
Putting a stack marking into every assembly file is required to indicate that the stack does not need to be executable. An executable flag on the stack conflicts with some security measures, for example systemd's MemoryDenyWriteExecute=yes.
579 lines
14 KiB
ArmAsm
579 lines
14 KiB
ArmAsm
/* Stack marking
 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
 *
 * The note is deliberately emitted BEFORE the architecture guard below.
 * When that guard is false this file still assembles to an (otherwise
 * empty) ELF object; if that object carried no .note.GNU-stack section,
 * the linker could treat it as requiring an executable stack and mark the
 * stack of the final binary executable.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#if !defined(HUF_DISABLE_ASM) && defined(__x86_64__)
|
|
|
|
/* Calling convention:
|
|
*
|
|
* %rdi contains the first argument: HUF_DecompressAsmArgs*.
|
|
 * %rbp isn't maintained (no frame pointer).
|
|
* %rsp contains the stack pointer that grows down.
|
|
* No red-zone is assumed, only addresses >= %rsp are used.
|
|
* All register contents are preserved.
|
|
*
|
|
* TODO: Support Windows calling convention.
|
|
*/
|
|
|
|
/* Export every entry point under two names: the plain name and the same
 * name with a leading '_' (MacOS prefixes C symbols with '_', Linux does not).
 */
.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop

/* Everything that follows is code. */
.text
|
|
|
|
/* Symbolic register names used by both decoding loops.
 *
 *   op[n]    - written through by the decode macros (per-stream output)
 *   ip[n]    - read through by RELOAD_BITS (per-stream input)
 *   bits[n]  - per-stream bit container
 *   dtable   - base address of the decoding table
 *   olimit   - output limit computed in the *_compute_olimit sections
 *
 * op[], bits[], dtable and ip[0] each own a register.
 * ip[1,2,3] and olimit share their registers with var[] (see below).
 * %rax is always a scratch register.
 */

#define op0     rsi
#define op1     rbx
#define op2     rcx
#define op3     rdi

#define ip0     r8
#define ip1     r9
#define ip2     r10
#define ip3     r11

#define bits0   rbp
#define bits1   rdx
#define bits2   r12
#define bits3   r13

#define dtable  r14
#define olimit  r15

/* var[] temporaries. They occupy the same registers as olimit (var0) and
 * ip[1,2,3] (var1..var3):
 *  - ip[1,2,3] are saved every iteration.
 *  - olimit is only used in compute_olimit.
 */
#define var0    r15
#define var1    r9
#define var2    r10
#define var3    r11

/* 32-bit views of the var[] registers */
#define vard0   r15d
#define vard1   r9d
#define vard2   r10d
#define vard3   r11d
|
|
|
|
/* IF_NOT_4(idx, ...) expands to its trailing arguments when idx is 0, 1, 2
 * or 3, and to nothing when idx is 4.
 * The extra IF_NOT_4_ level lets idx be macro-expanded before it is pasted
 * onto the IF_NOT_4_ prefix.
 */
#define IF_NOT_4_0(...)         __VA_ARGS__
#define IF_NOT_4_1(...)         __VA_ARGS__
#define IF_NOT_4_2(...)         __VA_ARGS__
#define IF_NOT_4_3(...)         __VA_ARGS__
#define IF_NOT_4_4(...)
#define IF_NOT_4_(idx, ...)     IF_NOT_4_##idx(__VA_ARGS__)
#define IF_NOT_4(idx, ...)      IF_NOT_4_(idx, __VA_ARGS__)

/* Expands to X(0); X(1); X(2); X(3) - one invocation per stream. */
#define FOR_EACH_STREAM(X) X(0); X(1); X(2); X(3)

/* Expands to X(0, idx); X(1, idx); X(2, idx); X(3, idx) - one invocation
 * per stream, each receiving the same idx.
 */
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) X(0, idx); X(1, idx); X(2, idx); X(3, idx)
|
|
|
|
/* HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
 *
 * In:   %rdi = HUF_DecompressAsmArgs*. Fields read, by byte offset:
 *          0.. 24  ip[0..3]
 *         32.. 56  op[0..3]
 *         64.. 88  bits[0..3]
 *         96       dtable
 *        104       ilimit
 *        112       oend
 * Out:  ip[], op[] and bits[] are written back to the same offsets.
 *       Every general purpose register is restored before returning.
 *
 * Stack frame below the 15 saved registers (offsets from %rsp):
 *          0.. 23  scratch: rbx/rdx in compute_olimit, ip[1,2,3] in the loop
 *         24       olimit
 *         32       oend
 *         40       ilimit
 *         48       argument pointer
 *
 * Both the _HUF_* and the HUF_* label are defined because MacOS C symbols
 * are prefixed with '_' and Linux symbols aren't.
 */
_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
    /* For simplicity every register is saved, callee saved or not. */
    pushq   %rax
    pushq   %rbx
    pushq   %rcx
    pushq   %rdx
    pushq   %rbp
    pushq   %rsi
    pushq   %rdi
    pushq   %r8
    pushq   %r9
    pushq   %r10
    pushq   %r11
    pushq   %r12
    pushq   %r13
    pushq   %r14
    pushq   %r15

    /* Copy the argument pointer to %rax first: %rdi doubles as op3 and is
     * overwritten by the loads below.
     */
    movq    %rdi, %rax
    movq      0(%rax), %ip0
    movq      8(%rax), %ip1
    movq     16(%rax), %ip2
    movq     24(%rax), %ip3
    movq     32(%rax), %op0
    movq     40(%rax), %op1
    movq     48(%rax), %op2
    movq     56(%rax), %op3
    movq     64(%rax), %bits0
    movq     72(%rax), %bits1
    movq     80(%rax), %bits2
    movq     88(%rax), %bits3
    movq     96(%rax), %dtable
    pushq   %rax                        /* argument */
    pushq   104(%rax)                   /* ilimit */
    pushq   112(%rax)                   /* oend */
    pushq   %olimit                     /* olimit space (value is a placeholder) */

    /* 24 bytes of scratch space */
    subq    $24, %rsp

.L_4X1_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rbx, rdx must be saved
     * op3 & ip0 mustn't be clobbered
     */
    movq    %rbx, 0(%rsp)
    movq    %rdx, 8(%rsp)

    movq    32(%rsp), %rax              /* rax = oend */
    subq    %op3, %rax                  /* rax = oend - op3 */

    /* r15 = (oend - op3) / 5
     * Unsigned division by reciprocal multiplication: the constant is
     * 0xCCCCCCCCCCCCCCCD, the high half of the product lands in %rdx.
     */
    movabsq $-3689348814741910323, %rdx
    mulq    %rdx
    movq    %rdx, %r15
    shrq    $2, %r15

    movq    %ip0, %rax                  /* rax = ip0 */
    movq    40(%rsp), %rdx              /* rdx = ilimit */
    subq    %rdx, %rax                  /* rax = ip0 - ilimit */
    movq    %rax, %rbx                  /* rbx = ip0 - ilimit */

    /* rdx = (ip0 - ilimit) / 7
     * Reciprocal multiplication again; with t = high half of the product
     * the quotient is (((x - t) >> 1) + t) >> 2.
     */
    movabsq $2635249153387078803, %rdx
    mulq    %rdx
    subq    %rdx, %rbx
    shrq    %rbx
    addq    %rbx, %rdx
    shrq    $2, %rdx

    /* r15 = min(%rdx, %r15) */
    cmpq    %rdx, %r15
    cmova   %rdx, %r15

    /* r15 = r15 * 5 */
    leaq    (%r15, %r15, 4), %r15

    /* olimit = op3 + r15   (olimit lives in %r15) */
    addq    %op3, %olimit

    /* Restore the two registers borrowed above */
    movq    8(%rsp), %rdx
    movq    0(%rsp), %rbx

    /* If (op3 + 20 > olimit) go to exit */
    movq    %op3, %rax                  /* rax = op3 */
    addq    $20, %rax                   /* rax = op3 + 20 */
    cmpq    %rax, %olimit               /* op3 + 20 > olimit */
    jb      .L_4X1_exit

    /* If (ip1 < ip0) go to exit */
    cmpq    %ip0, %ip1
    jb      .L_4X1_exit

    /* If (ip2 < ip1) go to exit */
    cmpq    %ip1, %ip2
    jb      .L_4X1_exit

    /* If (ip3 < ip2) go to exit */
    cmpq    %ip2, %ip3
    jb      .L_4X1_exit

/* Reads top 11 bits from bits[n]
 * Loads dt[bits[n]] into var[n]
 */
#define GET_NEXT_DELT(n)                        \
    movq    $53, %var##n;                       \
    shrxq   %var##n, %bits##n, %var##n;         \
    movzwl  (%dtable,%var##n,2),%vard##n

/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
 * Moves var[n] to %rax
 * bits[n] <<= var[n] & 63
 * op[n][idx] = %rax >> 8
 * %ah is a way to access bits [8, 16) of %rax
 */
#define DECODE_FROM_DELT(n, idx)                \
    movq    %var##n, %rax;                      \
    shlxq   %var##n, %bits##n, %bits##n;        \
    movb    %ah, idx(%op##n)

/* Assumes GET_NEXT_DELT has been called.
 * Calls DECODE_FROM_DELT then GET_NEXT_DELT if idx < 4
 */
#define DECODE(n, idx)                          \
    DECODE_FROM_DELT(n, idx);                   \
    IF_NOT_4(idx, GET_NEXT_DELT(n))

/* // ctz & nbBytes is stored in bits[n]
 * // nbBits is stored in %rax
 * ctz = CTZ[bits[n]]
 * nbBits  = ctz & 7
 * nbBytes = ctz >> 3
 * op[n] += 5
 * ip[n] -= nbBytes
 * // Note: x86-64 is little-endian ==> no bswap
 * bits[n] = MEM_readST(ip[n]) | 1
 * bits[n] <<= nbBits
 */
#define RELOAD_BITS(n)                          \
    bsfq    %bits##n, %bits##n;                 \
    movq    %bits##n, %rax;                     \
    andq    $7, %rax;                           \
    shrq    $3, %bits##n;                       \
    leaq    5(%op##n), %op##n;                  \
    subq    %bits##n, %ip##n;                   \
    movq    (%ip##n), %bits##n;                 \
    orq     $1, %bits##n;                       \
    shlxq   %rax, %bits##n, %bits##n

    /* var[] is about to overwrite olimit and ip[1,2,3]: spill them */
    movq    %olimit, 24(%rsp)
    movq    %ip1, 0(%rsp)
    movq    %ip2, 8(%rsp)
    movq    %ip3, 16(%rsp)

    /* Prime var[n] with the first table entry of every stream */
    FOR_EACH_STREAM(GET_NEXT_DELT)

    .p2align 6

.L_4X1_loop_body:
    /* Decode 5 symbols in each of the 4 streams (20 total)
     * Must have called GET_NEXT_DELT for each stream
     */
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)

    /* Load ip[1,2,3] from stack (var[] aliases them)
     * ip[] is needed for RELOAD_BITS
     * Each will be stored back to the stack after RELOAD
     */
    movq    0(%rsp), %ip1
    movq    8(%rsp), %ip2
    movq    16(%rsp), %ip3

    /* Reload each stream & fetch the next table entry
     * to prepare for the next iteration
     */
    RELOAD_BITS(0)
    GET_NEXT_DELT(0)

    RELOAD_BITS(1)
    movq    %ip1, 0(%rsp)
    GET_NEXT_DELT(1)

    RELOAD_BITS(2)
    movq    %ip2, 8(%rsp)
    GET_NEXT_DELT(2)

    RELOAD_BITS(3)
    movq    %ip3, 16(%rsp)
    GET_NEXT_DELT(3)

    /* If op3 < olimit: continue the loop */
    cmpq    %op3, 24(%rsp)
    ja      .L_4X1_loop_body

    /* Reload ip[1,2,3] from stack */
    movq    0(%rsp), %ip1
    movq    8(%rsp), %ip2
    movq    16(%rsp), %ip3

    /* Re-compute olimit */
    jmp     .L_4X1_compute_olimit

#undef GET_NEXT_DELT
#undef DECODE_FROM_DELT
#undef DECODE
#undef RELOAD_BITS
.L_4X1_exit:
    /* Drop the scratch space */
    addq    $24, %rsp

    /* Unwind the four pushed slots; the last pop leaves the argument
     * pointer in %rax.
     */
    popq    %rax                        /* olimit */
    popq    %rax                        /* oend */
    popq    %rax                        /* ilimit */
    popq    %rax                        /* arg */

    /* Save ip / op / bits */
    movq    %ip0,    0(%rax)
    movq    %ip1,    8(%rax)
    movq    %ip2,   16(%rax)
    movq    %ip3,   24(%rax)
    movq    %op0,   32(%rax)
    movq    %op1,   40(%rax)
    movq    %op2,   48(%rax)
    movq    %op3,   56(%rax)
    movq    %bits0, 64(%rax)
    movq    %bits1, 72(%rax)
    movq    %bits2, 80(%rax)
    movq    %bits3, 88(%rax)

    /* Restore registers (reverse of the push order at entry) */
    popq    %r15
    popq    %r14
    popq    %r13
    popq    %r12
    popq    %r11
    popq    %r10
    popq    %r9
    popq    %r8
    popq    %rdi
    popq    %rsi
    popq    %rbp
    popq    %rdx
    popq    %rcx
    popq    %rbx
    popq    %rax
    ret
|
|
|
|
/* HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
 *
 * In:   %rdi = HUF_DecompressAsmArgs*. Fields read, by byte offset:
 *          0.. 24  ip[0..3]
 *         32.. 56  op[0..3]
 *         64.. 88  bits[0..3]
 *         96       dtable
 *        104       ilimit
 *        112       oend
 * Out:  ip[], op[] and bits[] are written back to the same offsets.
 *       Every general purpose register is restored before returning.
 *
 * Stack frame below the 15 saved registers (offsets from %rsp):
 *          0       scratch: rdx in compute_olimit, r8 (ip0) in the loop
 *          8       oend0 = value of op1 at entry
 *         16       oend1 = value of op2 at entry
 *         24       oend2 = value of op3 at entry
 *         32       oend3 = oend field of the argument
 *         40       ilimit
 *         48       olimit
 *         56       argument pointer
 */
_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
    /* For simplicity every register is saved, callee saved or not. */
    pushq   %rax
    pushq   %rbx
    pushq   %rcx
    pushq   %rdx
    pushq   %rbp
    pushq   %rsi
    pushq   %rdi
    pushq   %r8
    pushq   %r9
    pushq   %r10
    pushq   %r11
    pushq   %r12
    pushq   %r13
    pushq   %r14
    pushq   %r15

    /* Copy the argument pointer to %rax first: %rdi doubles as op3 and is
     * overwritten by the loads below.
     */
    movq    %rdi, %rax
    movq      0(%rax), %ip0
    movq      8(%rax), %ip1
    movq     16(%rax), %ip2
    movq     24(%rax), %ip3
    movq     32(%rax), %op0
    movq     40(%rax), %op1
    movq     48(%rax), %op2
    movq     56(%rax), %op3
    movq     64(%rax), %bits0
    movq     72(%rax), %bits1
    movq     80(%rax), %bits2
    movq     88(%rax), %bits3
    movq     96(%rax), %dtable
    pushq   %rax                        /* argument */
    pushq   %rax                        /* olimit (placeholder, set before the loop) */
    pushq   104(%rax)                   /* ilimit */

    movq    112(%rax), %rax
    pushq   %rax                        /* oend3 */

    movq    %op3, %rax
    pushq   %rax                        /* oend2 */

    movq    %op2, %rax
    pushq   %rax                        /* oend1 */

    movq    %op1, %rax
    pushq   %rax                        /* oend0 */

    /* Scratch space */
    subq    $8, %rsp

.L_4X2_compute_olimit:
    /* Computes how many iterations we can do safely
     * %r15, %rax may be clobbered
     * rdx must be saved
     * op[0,1,2,3] & ip0 mustn't be clobbered
     */
    movq    %rdx, 0(%rsp)

    /* We can consume up to 7 input bytes each iteration. */
    movq    %ip0, %rax                  /* rax = ip0 */
    movq    40(%rsp), %rdx              /* rdx = ilimit */
    subq    %rdx, %rax                  /* rax = ip0 - ilimit */
    movq    %rax, %r15                  /* r15 = ip0 - ilimit */

    /* rdx = rax / 7
     * Unsigned division by reciprocal multiplication; with t = high half
     * of the product the quotient is (((x - t) >> 1) + t) >> 2.
     */
    movabsq $2635249153387078803, %rdx
    mulq    %rdx
    subq    %rdx, %r15
    shrq    %r15
    addq    %r15, %rdx
    shrq    $2, %rdx

    /* r15 = (ip0 - ilimit) / 7 */
    movq    %rdx, %r15

    /* The four groups below are identical except for the stream number:
     * rdx = (oend[n] - op[n]) / 10 by reciprocal multiplication, then
     * r15 = min(rdx, r15).
     */
    movabsq $-3689348814741910323, %rdx
    movq    8(%rsp), %rax               /* rax = oend0 */
    subq    %op0, %rax                  /* rax = oend0 - op0 */
    mulq    %rdx
    shrq    $3, %rdx                    /* rdx = rax / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq    %rdx, %r15
    cmova   %rdx, %r15

    movabsq $-3689348814741910323, %rdx
    movq    16(%rsp), %rax              /* rax = oend1 */
    subq    %op1, %rax                  /* rax = oend1 - op1 */
    mulq    %rdx
    shrq    $3, %rdx                    /* rdx = rax / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq    %rdx, %r15
    cmova   %rdx, %r15

    movabsq $-3689348814741910323, %rdx
    movq    24(%rsp), %rax              /* rax = oend2 */
    subq    %op2, %rax                  /* rax = oend2 - op2 */
    mulq    %rdx
    shrq    $3, %rdx                    /* rdx = rax / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq    %rdx, %r15
    cmova   %rdx, %r15

    movabsq $-3689348814741910323, %rdx
    movq    32(%rsp), %rax              /* rax = oend3 */
    subq    %op3, %rax                  /* rax = oend3 - op3 */
    mulq    %rdx
    shrq    $3, %rdx                    /* rdx = rax / 10 */

    /* r15 = min(%rdx, %r15) */
    cmpq    %rdx, %r15
    cmova   %rdx, %r15

    /* olimit = op3 + 5 * r15   (computed as op3 + 4 * rax + rax) */
    movq    %r15, %rax
    leaq    (%op3, %rax, 4), %olimit
    addq    %rax, %olimit

    /* Restore the register borrowed above */
    movq    0(%rsp), %rdx

    /* If (op3 + 10 > olimit) go to exit */
    movq    %op3, %rax                  /* rax = op3 */
    addq    $10, %rax                   /* rax = op3 + 10 */
    cmpq    %rax, %olimit               /* op3 + 10 > olimit */
    jb      .L_4X2_exit

    /* If (ip1 < ip0) go to exit */
    cmpq    %ip0, %ip1
    jb      .L_4X2_exit

    /* If (ip2 < ip1) go to exit */
    cmpq    %ip1, %ip2
    jb      .L_4X2_exit

    /* If (ip3 < ip2) go to exit */
    cmpq    %ip2, %ip3
    jb      .L_4X2_exit

/* Decodes one table entry for stream n (idx is accepted but unused):
 *   rax   = top 11 bits of bits[n]; table entries are 4 bytes wide
 *   r8d   = entry bytes [0, 2), stored as a 16-bit word at op[n]
 *   r15d  = entry byte 2, the shift count applied to bits[n]
 *   eax   = entry byte 3, the amount op[n] advances by
 * Clobbers %rax, %r8 (ip0) and %r15 (olimit).
 */
#define DECODE(n, idx)                          \
    movq    %bits##n, %rax;                     \
    shrq    $53, %rax;                          \
    movzwl  0(%dtable,%rax,4),%r8d;             \
    movzbl  2(%dtable,%rax,4),%r15d;            \
    movzbl  3(%dtable,%rax,4),%eax;             \
    movw    %r8w, (%op##n);                     \
    shlxq   %r15, %bits##n, %bits##n;           \
    addq    %rax, %op##n

/* Refills the bit container of stream n:
 *   ctz = index of the lowest set bit of bits[n]
 *   rax = ctz & 7        (bits to skip)
 *   ip[n] -= ctz >> 3    (bytes to step back)
 *   bits[n] = (8 bytes loaded from ip[n] | 1) << rax
 * Clobbers %rax.
 */
#define RELOAD_BITS(n)                          \
    bsfq    %bits##n, %bits##n;                 \
    movq    %bits##n, %rax;                     \
    shrq    $3, %bits##n;                       \
    andq    $7, %rax;                           \
    subq    %bits##n, %ip##n;                   \
    movq    (%ip##n), %bits##n;                 \
    orq     $1, %bits##n;                       \
    shlxq   %rax, %bits##n, %bits##n

    /* DECODE clobbers %r15, so olimit is kept on the stack during the loop */
    movq    %olimit, 48(%rsp)

    .p2align 6

.L_4X2_loop_body:
    /* We clobber r8, so store it on the stack */
    movq    %r8, 0(%rsp)

    /* Five DECODE rounds over each of the 4 streams (20 table lookups) */
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)

    /* Reload r8 */
    movq    0(%rsp), %r8

    FOR_EACH_STREAM(RELOAD_BITS)

    /* If op3 < olimit: continue the loop, otherwise re-compute olimit */
    cmpq    %op3, 48(%rsp)
    ja      .L_4X2_loop_body
    jmp     .L_4X2_compute_olimit

#undef DECODE
#undef RELOAD_BITS
.L_4X2_exit:
    /* Drop the scratch space */
    addq    $8, %rsp

    /* Unwind the seven pushed slots; the last pop leaves the argument
     * pointer in %rax.
     */
    popq    %rax                        /* oend0 */
    popq    %rax                        /* oend1 */
    popq    %rax                        /* oend2 */
    popq    %rax                        /* oend3 */
    popq    %rax                        /* ilimit */
    popq    %rax                        /* olimit */
    popq    %rax                        /* arg */

    /* Save ip / op / bits */
    movq    %ip0,    0(%rax)
    movq    %ip1,    8(%rax)
    movq    %ip2,   16(%rax)
    movq    %ip3,   24(%rax)
    movq    %op0,   32(%rax)
    movq    %op1,   40(%rax)
    movq    %op2,   48(%rax)
    movq    %op3,   56(%rax)
    movq    %bits0, 64(%rax)
    movq    %bits1, 72(%rax)
    movq    %bits2, 80(%rax)
    movq    %bits3, 88(%rax)

    /* Restore registers (reverse of the push order at entry) */
    popq    %r15
    popq    %r14
    popq    %r13
    popq    %r12
    popq    %r11
    popq    %r10
    popq    %r9
    popq    %r8
    popq    %rdi
    popq    %rsi
    popq    %rbp
    popq    %rdx
    popq    %rcx
    popq    %rbx
    popq    %rax
    ret
|
|
|
|
#endif
|