libm: Use funnel shifts to speed up u256 shifting #1120

tgross35 wants to merge 1 commit into rust-lang:main
Conversation
A regression shows up in the benchmark for the softfloat version, which doesn't set it.

It should be possible to use this same algorithm for signed and unsigned shl. I can't think of an easy way to do this for i256 shr, but I'm not even sure we use that outside of filling in traits.

@quaternic I'm sure you're taking a look at this too.
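The shl-sharing point can be seen directly; as a quick check (illustrative only, not from the PR, and using 128-bit values for brevity):

```rust
// Illustrative check only (not the PR's code): a left shift produces the same
// bit pattern regardless of signedness, which is why a single shl
// implementation can serve both i256 and u256. Right shifts differ in what
// fills the vacated high bits (zeros vs. the sign bit), so they can't be
// shared as easily.
fn shl_is_sign_agnostic(x: u128, n: u32) -> bool {
    debug_assert!(n < 128);
    let unsigned = x << n;
    let signed = ((x as i128) << n) as u128;
    unsigned == signed // always true
}
```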
Yeah, the compiler output doesn't look ideal for the current impl. I think you could add the funnel shift as a method to one of the traits (or just as a local helper) to avoid needing a recent nightly. Using something like:
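As an illustrative sketch of such a helper (assumed name and 64-bit word size; this is not the snippet from the comment), mirroring what LLVM's `fshl` intrinsic computes:

```rust
// Sketch only: "funnel shift left" on two 64-bit words. Conceptually, shift
// the 128-bit value `hi:lo` left by `n` bits (n < 64) and return the new high
// word. Each output word of a wide shift is one such funnel shift of two
// adjacent input words.
fn funnel_shl_64(hi: u64, lo: u64, n: u32) -> u64 {
    debug_assert!(n < 64);
    if n == 0 {
        hi
    } else {
        (hi << n) | (lo >> (64 - n))
    }
}
```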
Original asm before this PR:

x86 i256 shl

bigint_i256_shl:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset rbx, -16
mov rax, rcx
mov ecx, r9d
and ecx, 127
mov r10, r8
shld r10, rax, cl
mov r11, rax
shl r11, cl
mov ebx, ecx
xor ecx, ecx
test bl, 64
cmovne r10, r11
cmovne r11, rcx
test r9b, r9b
js .LBB287_1
mov ecx, ebx
xor cl, 127
mov r9d, ebx
shrd rax, r8, cl
shr r8, cl
xor ebx, ebx
test cl, 64
cmovne rax, r8
cmovne r8, rbx
shrd rax, r8, 1
mov ecx, r9d
shld rdx, rsi, cl
shl rsi, cl
shr r8
test r9b, 64
cmovne rdx, rsi
cmovne rsi, rbx
or rdx, r8
or rsi, rax
mov rcx, r11
mov rax, r10
mov r11, rsi
mov r10, rdx
jmp .LBB287_3
.LBB287_1:
xor eax, eax
.LBB287_3:
mov qword ptr [rdi], r11
mov qword ptr [rdi + 8], r10
mov qword ptr [rdi + 16], rcx
mov qword ptr [rdi + 24], rax
mov rax, rdi
pop rbx
.cfi_def_cfa_offset 8
ret

x86 i256 shr

bigint_i256_shr:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset rbx, -16
mov rax, rcx
mov ecx, r9d
and ecx, 127
mov r10, rsi
shrd r10, rdx, cl
mov r11, rdx
sar r11, cl
mov ebx, ecx
mov rcx, rdx
sar rcx, 63
test bl, 64
cmovne r10, r11
cmovne r11, rcx
test r9b, r9b
js .LBB288_2
mov ecx, ebx
xor cl, 127
mov r9d, ebx
shld rdx, rsi, cl
shl rsi, cl
xor ebx, ebx
test cl, 64
cmovne rdx, rsi
cmovne rsi, rbx
shld rdx, rsi, 1
mov ecx, r9d
shrd rax, r8, cl
shr r8, cl
add rsi, rsi
test r9b, 64
cmovne rax, r8
cmovne r8, rbx
or r8, rdx
or rax, rsi
jmp .LBB288_3
.LBB288_2:
mov rax, r10
mov r8, r11
mov r10, rcx
mov r11, rcx
.LBB288_3:
mov qword ptr [rdi], r10
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi + 16], rax
mov qword ptr [rdi + 24], r8
mov rax, rdi
pop rbx
.cfi_def_cfa_offset 8
ret

x86 u256 shr

bigint_u256_shr:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset rbx, -16
mov rax, rcx
mov ecx, r9d
and ecx, 127
mov r10, rsi
shrd r10, rdx, cl
mov r11, rdx
shr r11, cl
mov ebx, ecx
xor ecx, ecx
test bl, 64
cmovne r10, r11
cmovne r11, rcx
test r9b, r9b
js .LBB289_1
mov ecx, ebx
xor cl, 127
mov r9d, ebx
shld rdx, rsi, cl
shl rsi, cl
xor ebx, ebx
test cl, 64
cmovne rdx, rsi
cmovne rsi, rbx
shld rdx, rsi, 1
mov ecx, r9d
shrd rax, r8, cl
shr r8, cl
add rsi, rsi
test r9b, 64
cmovne rax, r8
cmovne r8, rbx
or r8, rdx
or rax, rsi
jmp .LBB289_3
.LBB289_1:
mov rax, r10
mov r8, r11
xor r10d, r10d
xor r11d, r11d
.LBB289_3:
mov qword ptr [rdi], r10
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi + 16], rax
mov qword ptr [rdi + 24], r8
mov rax, rdi
pop rbx
.cfi_def_cfa_offset 8
ret

aarch64 i256 shl

bigint_i256_shl:
.cfi_startproc
lsr x9, x2, #1
and w11, w4, #0x7f
mvn w10, w4
lsl x12, x2, x11
tst x11, #0x40
lsr x9, x9, x10
lsl x10, x3, x11
orr x9, x10, x9
csel x10, xzr, x12, ne
csel x9, x12, x9, ne
tbnz w4, #7, .LBB275_2
lsl x12, x3, #1
eor w13, w11, #0x7f
lsr x14, x0, #1
lsr x15, x2, x13
lsr x16, x3, x13
tst x13, #0x40
lsl x12, x12, x11
mvn w13, w11
lsr x13, x14, x13
lsl x14, x1, x11
orr x12, x12, x15
lsl x15, x0, x11
csel x12, x16, x12, ne
csel x16, xzr, x16, ne
orr x13, x14, x13
tst x11, #0x40
extr x12, x16, x12, #1
csel x11, x15, x13, ne
csel x13, xzr, x15, ne
orr x14, x11, x16, lsr #1
mov x11, x10
orr x10, x12, x13
mov x12, x9
stp x10, x14, [x8]
stp x11, x12, [x8, #16]
ret
.LBB275_2:
stp x10, x9, [x8]
stp xzr, xzr, [x8, #16]
ret

aarch64 i256 shr

bigint_i256_shr:
.cfi_startproc
lsl x9, x1, #1
and w11, w4, #0x7f
mvn w10, w4
asr x12, x1, x11
asr x13, x1, #63
tst x11, #0x40
lsl x9, x9, x10
lsr x10, x0, x11
orr x9, x9, x10
csel x10, x13, x12, ne
csel x9, x12, x9, ne
tbnz w4, #7, .LBB276_2
lsr x12, x0, #1
eor w13, w11, #0x7f
lsl x14, x3, #1
lsl x15, x1, x13
lsl x16, x0, x13
tst x13, #0x40
lsr x12, x12, x11
mvn w13, w11
lsl x13, x14, x13
lsr x14, x2, x11
orr x12, x15, x12
lsr x15, x3, x11
csel x12, x16, x12, ne
csel x16, xzr, x16, ne
orr x13, x13, x14
tst x11, #0x40
extr x12, x12, x16, #63
csel x11, x15, x13, ne
csel x13, xzr, x15, ne
orr x11, x11, x16, lsl #1
orr x12, x12, x13
stp x9, x10, [x8]
stp x11, x12, [x8, #16]
ret
.LBB276_2:
mov x11, x9
mov x12, x10
stp x13, x13, [x8]
stp x11, x12, [x8, #16]
ret

aarch64 u256 shr

bigint_u256_shr:
.cfi_startproc
lsl x9, x1, #1
and w11, w4, #0x7f
mvn w10, w4
lsr x12, x1, x11
tst x11, #0x40
lsl x9, x9, x10
lsr x10, x0, x11
orr x9, x9, x10
csel x10, xzr, x12, ne
csel x9, x12, x9, ne
tbnz w4, #7, .LBB277_2
lsr x12, x0, #1
eor w13, w11, #0x7f
lsl x14, x3, #1
lsl x15, x1, x13
lsl x16, x0, x13
tst x13, #0x40
lsr x12, x12, x11
mvn w13, w11
lsl x13, x14, x13
lsr x14, x2, x11
orr x12, x15, x12
lsr x15, x3, x11
csel x12, x16, x12, ne
csel x16, xzr, x16, ne
orr x13, x13, x14
tst x11, #0x40
extr x12, x12, x16, #63
csel x11, x15, x13, ne
csel x13, xzr, x15, ne
orr x11, x11, x16, lsl #1
orr x12, x12, x13
stp x9, x10, [x8]
stp x11, x12, [x8, #16]
ret
.LBB277_2:
mov x11, x9
mov x12, x10
stp xzr, xzr, [x8]
stp x11, x12, [x8, #16]
ret
With this change currently:

x86 i256 shl

bigint_i256_shl:
.cfi_startproc
xorps xmm0, xmm0
movaps xmmword ptr [rsp - 56], xmm0
movaps xmmword ptr [rsp - 72], xmm0
mov qword ptr [rsp - 40], rcx
mov qword ptr [rsp - 32], r8
mov qword ptr [rsp - 24], rsi
mov qword ptr [rsp - 16], rdx
mov eax, r9d
shr eax, 6
and eax, 3
mov edx, 4
sub edx, eax
mov rsi, qword ptr [rsp + 8*rdx - 56]
mov r8, qword ptr [rsp + 8*rdx - 48]
mov ecx, r9d
shld r8, rsi, cl
mov r10, qword ptr [rsp + 8*rdx - 64]
shld rsi, r10, cl
mov rdx, qword ptr [rsp + 8*rdx - 72]
shld r10, rdx, cl
mov rax, rdi
shl rdx, cl
mov qword ptr [rdi + 8], r8
mov qword ptr [rdi], rsi
mov qword ptr [rdi + 24], r10
mov qword ptr [rdi + 16], rdx
ret

x86 i256 shr

bigint_i256_shr:
.cfi_startproc
mov qword ptr [rsp - 64], rcx
mov qword ptr [rsp - 56], r8
mov qword ptr [rsp - 48], rsi
mov qword ptr [rsp - 40], rdx
sar rdx, 63
mov qword ptr [rsp - 32], rdx
mov qword ptr [rsp - 24], rdx
mov qword ptr [rsp - 16], rdx
mov eax, r9d
shr eax, 3
and eax, 24
mov rdx, qword ptr [rsp + rax - 40]
mov rsi, rdx
mov ecx, r9d
sar rsi, cl
mov r8, qword ptr [rsp + rax - 48]
mov r10, r8
shrd r10, rdx, cl
mov rdx, qword ptr [rsp + rax - 56]
mov r11, rdx
shrd r11, r8, cl
mov r8, qword ptr [rsp + rax - 64]
shrd r8, rdx, cl
mov rax, rdi
mov qword ptr [rdi + 8], rsi
mov qword ptr [rdi], r10
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], r8
ret

x86 u256 shr

bigint_u256_shr:
.cfi_startproc
xorps xmm0, xmm0
movups xmmword ptr [rsp - 16], xmm0
movups xmmword ptr [rsp - 32], xmm0
mov qword ptr [rsp - 64], rcx
mov qword ptr [rsp - 56], r8
mov qword ptr [rsp - 48], rsi
mov qword ptr [rsp - 40], rdx
mov eax, r9d
shr eax, 3
and eax, 24
mov rdx, qword ptr [rsp + rax - 48]
mov rsi, qword ptr [rsp + rax - 40]
mov r8, rdx
mov ecx, r9d
shrd r8, rsi, cl
shr rsi, cl
mov r10, qword ptr [rsp + rax - 56]
mov r11, r10
shrd r11, rdx, cl
mov rdx, qword ptr [rsp + rax - 64]
shrd rdx, r10, cl
mov rax, rdi
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], rdx
mov qword ptr [rdi + 8], rsi
mov qword ptr [rdi], r8
ret

aarch64 i256 shl

bigint_i256_shl:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
movi v0.2d, #0000000000000000
ubfx w9, w4, #6, #2
mov w10, #4
stp x2, x3, [sp, #32]
and w12, w4, #0x3f
sub w9, w10, w9
mov x10, sp
stp x0, x1, [sp, #48]
add x9, x10, w9, uxtw #3
eor x12, x12, #0x3f
stp q0, q0, [sp]
ldp x11, x10, [x9, #8]
ldr x13, [x9, #24]
ldr x9, [x9]
lsl x13, x13, x4
lsr x14, x10, #1
lsr x15, x11, #1
lsr x16, x9, #1
lsl x10, x10, x4
lsl x11, x11, x4
lsl x9, x9, x4
lsr x14, x14, x12
lsr x15, x15, x12
lsr x12, x16, x12
orr x13, x13, x14
orr x10, x10, x15
stp x10, x13, [x8]
orr x10, x11, x12
stp x9, x10, [x8, #16]
add sp, sp, #64
.cfi_def_cfa_offset 0
ret

aarch64 i256 shr

bigint_i256_shr:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
ubfx w10, w4, #6, #2
asr x9, x1, #63
mov x11, sp
stp x0, x1, [sp, #16]
and w12, w4, #0x3f
add x10, x11, w10, uxtw #3
stp x9, x9, [sp, #32]
eor x12, x12, #0x3f
str x9, [sp, #48]
stp x2, x3, [sp]
ldp x11, x9, [x10, #16]
ldp x10, x14, [x10]
lsl x13, x9, #1
lsl x15, x11, #1
lsr x11, x11, x4
lsl x16, x14, #1
asr x9, x9, x4
lsr x14, x14, x4
lsl x13, x13, x12
lsr x10, x10, x4
lsl x15, x15, x12
lsl x12, x16, x12
orr x11, x13, x11
stp x11, x9, [x8]
orr x9, x15, x14
orr x10, x12, x10
stp x10, x9, [x8, #16]
add sp, sp, #64
.cfi_def_cfa_offset 0
ret

aarch64 u256 shr

bigint_u256_shr:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
movi v0.2d, #0000000000000000
ubfx w9, w4, #6, #2
mov x10, sp
stp x0, x1, [sp, #16]
and w12, w4, #0x3f
add x9, x10, w9, uxtw #3
stp x2, x3, [sp]
eor x12, x12, #0x3f
stp q0, q0, [sp, #32]
ldp x11, x10, [x9, #8]
ldr x13, [x9, #24]
ldr x9, [x9]
lsl x16, x13, #1
lsr x13, x13, x4
lsl x14, x10, #1
lsl x15, x11, #1
lsr x11, x11, x4
lsr x9, x9, x4
lsr x10, x10, x4
lsl x14, x14, x12
lsl x15, x15, x12
lsl x12, x16, x12
orr x11, x14, x11
orr x9, x15, x9
stp x9, x11, [x8, #16]
orr x9, x12, x10
stp x9, x13, [x8]
add sp, sp, #64
.cfi_def_cfa_offset 0
ret
Per the benchmarks, x86 shows about a 7% improvement in shifts and a 6% improvement in ….

I'm leaning toward taking this still because of the improvement on other arches, especially 32-bit; it doesn't seem worth keeping a separate implementation only for aarch64 (excluding something written in asm), and this seems to pretty closely match what LLVM generates for native `u256`.

Any thoughts @quaternic?
Not being familiar with ARM, that wasn't very clear to me, but it's actually a branch for shifts of at least 128 bits, which explains why it has a significant effect on the benchmarks: half the time it avoids more than half the work, with some added overhead for the branching.

For the uses in …, the use of funnel shifts is a clear improvement. I was still wondering whether doing the coarse shifting via memory is ideal, given the added complexity. However, it's probably unavoidable for good performance on 32-bit targets.
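The rough shape of the branch being described (illustrative only, not the crate's actual code; shown with a pair of `u128` halves for brevity): once the shift amount reaches 128, the low input half can only land in the high output half, so the rest of the work is skipped.

```rust
// Illustrative sketch of a branchy 256-bit left shift, not libm's code:
// for n >= 128 the low half of the input cannot contribute to the low half
// of the output at all, so half the shifting work disappears.
fn u256_shl_branchy(lo: u128, hi: u128, n: u32) -> (u128, u128) {
    debug_assert!(n < 256);
    if n >= 128 {
        (0, lo << (n - 128))
    } else if n == 0 {
        (lo, hi)
    } else {
        (lo << n, (hi << n) | (lo >> (128 - n)))
    }
}
```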
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.
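As a rough illustration of that algorithm (assumed word layout and names, not the PR's exact code), an unsigned 256-bit right shift over four little-endian `u64` words might look like the following; the `n / 64` indexing corresponds to the scaled load offsets visible in the new x86 output, and the zero-filled buffer corresponds to the `xorps`/`movi` stores.

```rust
/// Illustrative sketch only: logical right shift of a 256-bit value held as
/// four little-endian u64 words. The word-granular ("coarse") part of the
/// shift is done by indexing into a zero-extended array; the remaining
/// 0..64-bit part is done with funnel shifts of adjacent words.
fn u256_shr(words: [u64; 4], n: u32) -> [u64; 4] {
    debug_assert!(n < 256);
    // Zero-extend so out-of-range reads below pick up zeros.
    let mut buf = [0u64; 8];
    buf[..4].copy_from_slice(&words);

    let word_shift = (n / 64) as usize; // coarse part
    let bit_shift = n % 64;             // fine part

    let mut out = [0u64; 4];
    for i in 0..4 {
        let lo = buf[i + word_shift];
        let hi = buf[i + word_shift + 1];
        out[i] = if bit_shift == 0 {
            lo
        } else {
            // Funnel shift: high bits of `lo` come from the low bits of `hi`.
            (lo >> bit_shift) | (hi << (64 - bit_shift))
        };
    }
    out
}
```

The same structure works for shl (indexing from the other end of the buffer) and for arithmetic shr (filling the upper buffer words with the sign word instead of zeros, matching the `sar rdx, 63` stores in the i256 shr assembly above).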
This PR was rebased onto a different main commit. Here's a range-diff highlighting what actually changed. Rebasing is a normal part of keeping PRs up to date, so no action is needed; this note is just to help reviewers.
☔ The latest upstream changes (possibly #1193) made this pull request unmergeable. Please resolve the merge conflicts.
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.