libm: Use funnel shifts to speed up u256 shifting #1120

tgross35 wants to merge 1 commit into rust-lang:main
Conversation
A regression shows up in the benchmark for the softfloat version, which doesn't set it.

It should be possible to use this same algorithm for signed and unsigned shl. I can't think of an easy way to do this for i256 shr, but I'm not even sure we use that outside of filling in traits.

@quaternic I'm sure you're taking a look at this too.
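The shl-sharing point can be seen directly; as a quick check (illustrative only, not from the PR, and using 128-bit values for brevity):

```rust
// Illustrative check only (not the PR's code): a left shift produces the same
// bit pattern regardless of signedness, which is why a single shl
// implementation can serve both i256 and u256. Right shifts differ in what
// fills the vacated high bits (zeros vs. the sign bit), so they can't be
// shared as easily.
fn shl_is_sign_agnostic(x: u128, n: u32) -> bool {
    debug_assert!(n < 128);
    let unsigned = x << n;
    let signed = ((x as i128) << n) as u128;
    unsigned == signed // always true
}
```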
Yeah, the compiler output doesn't look ideal for the current impl. I think you could add the funnel shift as a method to one of the traits (or just as a local helper) to avoid needing a recent nightly. Using something like:
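As an illustrative sketch of such a helper (assumed name and 64-bit word size; this is not the snippet from the comment), mirroring what LLVM's `fshl` intrinsic computes:

```rust
// Sketch only: "funnel shift left" on two 64-bit words. Conceptually, shift
// the 128-bit value `hi:lo` left by `n` bits (n < 64) and return the new high
// word. Each output word of a wide shift is one such funnel shift of two
// adjacent input words.
fn funnel_shl_64(hi: u64, lo: u64, n: u32) -> u64 {
    debug_assert!(n < 64);
    if n == 0 {
        hi
    } else {
        (hi << n) | (lo >> (64 - n))
    }
}
```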
Original asm before this PR:

x86 i256 shl

bigint_i256_shl:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset rbx, -16
mov rax, rcx
mov ecx, r9d
and ecx, 127
mov r10, r8
shld r10, rax, cl
mov r11, rax
shl r11, cl
mov ebx, ecx
xor ecx, ecx
test bl, 64
cmovne r10, r11
cmovne r11, rcx
test r9b, r9b
js .LBB287_1
mov ecx, ebx
xor cl, 127
mov r9d, ebx
shrd rax, r8, cl
shr r8, cl
xor ebx, ebx
test cl, 64
cmovne rax, r8
cmovne r8, rbx
shrd rax, r8, 1
mov ecx, r9d
shld rdx, rsi, cl
shl rsi, cl
shr r8
test r9b, 64
cmovne rdx, rsi
cmovne rsi, rbx
or rdx, r8
or rsi, rax
mov rcx, r11
mov rax, r10
mov r11, rsi
mov r10, rdx
jmp .LBB287_3
.LBB287_1:
xor eax, eax
.LBB287_3:
mov qword ptr [rdi], r11
mov qword ptr [rdi + 8], r10
mov qword ptr [rdi + 16], rcx
mov qword ptr [rdi + 24], rax
mov rax, rdi
pop rbx
.cfi_def_cfa_offset 8
ret

x86 i256 shr

bigint_i256_shr:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset rbx, -16
mov rax, rcx
mov ecx, r9d
and ecx, 127
mov r10, rsi
shrd r10, rdx, cl
mov r11, rdx
sar r11, cl
mov ebx, ecx
mov rcx, rdx
sar rcx, 63
test bl, 64
cmovne r10, r11
cmovne r11, rcx
test r9b, r9b
js .LBB288_2
mov ecx, ebx
xor cl, 127
mov r9d, ebx
shld rdx, rsi, cl
shl rsi, cl
xor ebx, ebx
test cl, 64
cmovne rdx, rsi
cmovne rsi, rbx
shld rdx, rsi, 1
mov ecx, r9d
shrd rax, r8, cl
shr r8, cl
add rsi, rsi
test r9b, 64
cmovne rax, r8
cmovne r8, rbx
or r8, rdx
or rax, rsi
jmp .LBB288_3
.LBB288_2:
mov rax, r10
mov r8, r11
mov r10, rcx
mov r11, rcx
.LBB288_3:
mov qword ptr [rdi], r10
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi + 16], rax
mov qword ptr [rdi + 24], r8
mov rax, rdi
pop rbx
.cfi_def_cfa_offset 8
ret

x86 u256 shr

bigint_u256_shr:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset rbx, -16
mov rax, rcx
mov ecx, r9d
and ecx, 127
mov r10, rsi
shrd r10, rdx, cl
mov r11, rdx
shr r11, cl
mov ebx, ecx
xor ecx, ecx
test bl, 64
cmovne r10, r11
cmovne r11, rcx
test r9b, r9b
js .LBB289_1
mov ecx, ebx
xor cl, 127
mov r9d, ebx
shld rdx, rsi, cl
shl rsi, cl
xor ebx, ebx
test cl, 64
cmovne rdx, rsi
cmovne rsi, rbx
shld rdx, rsi, 1
mov ecx, r9d
shrd rax, r8, cl
shr r8, cl
add rsi, rsi
test r9b, 64
cmovne rax, r8
cmovne r8, rbx
or r8, rdx
or rax, rsi
jmp .LBB289_3
.LBB289_1:
mov rax, r10
mov r8, r11
xor r10d, r10d
xor r11d, r11d
.LBB289_3:
mov qword ptr [rdi], r10
mov qword ptr [rdi + 8], r11
mov qword ptr [rdi + 16], rax
mov qword ptr [rdi + 24], r8
mov rax, rdi
pop rbx
.cfi_def_cfa_offset 8
ret

aarch64 i256 shl

bigint_i256_shl:
.cfi_startproc
lsr x9, x2, #1
and w11, w4, #0x7f
mvn w10, w4
lsl x12, x2, x11
tst x11, #0x40
lsr x9, x9, x10
lsl x10, x3, x11
orr x9, x10, x9
csel x10, xzr, x12, ne
csel x9, x12, x9, ne
tbnz w4, #7, .LBB275_2
lsl x12, x3, #1
eor w13, w11, #0x7f
lsr x14, x0, #1
lsr x15, x2, x13
lsr x16, x3, x13
tst x13, #0x40
lsl x12, x12, x11
mvn w13, w11
lsr x13, x14, x13
lsl x14, x1, x11
orr x12, x12, x15
lsl x15, x0, x11
csel x12, x16, x12, ne
csel x16, xzr, x16, ne
orr x13, x14, x13
tst x11, #0x40
extr x12, x16, x12, #1
csel x11, x15, x13, ne
csel x13, xzr, x15, ne
orr x14, x11, x16, lsr #1
mov x11, x10
orr x10, x12, x13
mov x12, x9
stp x10, x14, [x8]
stp x11, x12, [x8, #16]
ret
.LBB275_2:
stp x10, x9, [x8]
stp xzr, xzr, [x8, #16]
ret

aarch64 i256 shr

bigint_i256_shr:
.cfi_startproc
lsl x9, x1, #1
and w11, w4, #0x7f
mvn w10, w4
asr x12, x1, x11
asr x13, x1, #63
tst x11, #0x40
lsl x9, x9, x10
lsr x10, x0, x11
orr x9, x9, x10
csel x10, x13, x12, ne
csel x9, x12, x9, ne
tbnz w4, #7, .LBB276_2
lsr x12, x0, #1
eor w13, w11, #0x7f
lsl x14, x3, #1
lsl x15, x1, x13
lsl x16, x0, x13
tst x13, #0x40
lsr x12, x12, x11
mvn w13, w11
lsl x13, x14, x13
lsr x14, x2, x11
orr x12, x15, x12
lsr x15, x3, x11
csel x12, x16, x12, ne
csel x16, xzr, x16, ne
orr x13, x13, x14
tst x11, #0x40
extr x12, x12, x16, #63
csel x11, x15, x13, ne
csel x13, xzr, x15, ne
orr x11, x11, x16, lsl #1
orr x12, x12, x13
stp x9, x10, [x8]
stp x11, x12, [x8, #16]
ret
.LBB276_2:
mov x11, x9
mov x12, x10
stp x13, x13, [x8]
stp x11, x12, [x8, #16]
ret

aarch64 u256 shr

bigint_u256_shr:
.cfi_startproc
lsl x9, x1, #1
and w11, w4, #0x7f
mvn w10, w4
lsr x12, x1, x11
tst x11, #0x40
lsl x9, x9, x10
lsr x10, x0, x11
orr x9, x9, x10
csel x10, xzr, x12, ne
csel x9, x12, x9, ne
tbnz w4, #7, .LBB277_2
lsr x12, x0, #1
eor w13, w11, #0x7f
lsl x14, x3, #1
lsl x15, x1, x13
lsl x16, x0, x13
tst x13, #0x40
lsr x12, x12, x11
mvn w13, w11
lsl x13, x14, x13
lsr x14, x2, x11
orr x12, x15, x12
lsr x15, x3, x11
csel x12, x16, x12, ne
csel x16, xzr, x16, ne
orr x13, x13, x14
tst x11, #0x40
extr x12, x12, x16, #63
csel x11, x15, x13, ne
csel x13, xzr, x15, ne
orr x11, x11, x16, lsl #1
orr x12, x12, x13
stp x9, x10, [x8]
stp x11, x12, [x8, #16]
ret
.LBB277_2:
mov x11, x9
mov x12, x10
stp xzr, xzr, [x8]
stp x11, x12, [x8, #16]
ret
With this change currently:

x86 i256 shl

bigint_i256_shl:
.cfi_startproc
xorps xmm0, xmm0
movaps xmmword ptr [rsp - 56], xmm0
movaps xmmword ptr [rsp - 72], xmm0
mov qword ptr [rsp - 40], rcx
mov qword ptr [rsp - 32], r8
mov qword ptr [rsp - 24], rsi
mov qword ptr [rsp - 16], rdx
mov eax, r9d
shr eax, 6
and eax, 3
mov edx, 4
sub edx, eax
mov rsi, qword ptr [rsp + 8*rdx - 56]
mov r8, qword ptr [rsp + 8*rdx - 48]
mov ecx, r9d
shld r8, rsi, cl
mov r10, qword ptr [rsp + 8*rdx - 64]
shld rsi, r10, cl
mov rdx, qword ptr [rsp + 8*rdx - 72]
shld r10, rdx, cl
mov rax, rdi
shl rdx, cl
mov qword ptr [rdi + 8], r8
mov qword ptr [rdi], rsi
mov qword ptr [rdi + 24], r10
mov qword ptr [rdi + 16], rdx
ret

x86 i256 shr

bigint_i256_shr:
.cfi_startproc
mov qword ptr [rsp - 64], rcx
mov qword ptr [rsp - 56], r8
mov qword ptr [rsp - 48], rsi
mov qword ptr [rsp - 40], rdx
sar rdx, 63
mov qword ptr [rsp - 32], rdx
mov qword ptr [rsp - 24], rdx
mov qword ptr [rsp - 16], rdx
mov eax, r9d
shr eax, 3
and eax, 24
mov rdx, qword ptr [rsp + rax - 40]
mov rsi, rdx
mov ecx, r9d
sar rsi, cl
mov r8, qword ptr [rsp + rax - 48]
mov r10, r8
shrd r10, rdx, cl
mov rdx, qword ptr [rsp + rax - 56]
mov r11, rdx
shrd r11, r8, cl
mov r8, qword ptr [rsp + rax - 64]
shrd r8, rdx, cl
mov rax, rdi
mov qword ptr [rdi + 8], rsi
mov qword ptr [rdi], r10
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], r8
ret

x86 u256 shr

bigint_u256_shr:
.cfi_startproc
xorps xmm0, xmm0
movups xmmword ptr [rsp - 16], xmm0
movups xmmword ptr [rsp - 32], xmm0
mov qword ptr [rsp - 64], rcx
mov qword ptr [rsp - 56], r8
mov qword ptr [rsp - 48], rsi
mov qword ptr [rsp - 40], rdx
mov eax, r9d
shr eax, 3
and eax, 24
mov rdx, qword ptr [rsp + rax - 48]
mov rsi, qword ptr [rsp + rax - 40]
mov r8, rdx
mov ecx, r9d
shrd r8, rsi, cl
shr rsi, cl
mov r10, qword ptr [rsp + rax - 56]
mov r11, r10
shrd r11, rdx, cl
mov rdx, qword ptr [rsp + rax - 64]
shrd rdx, r10, cl
mov rax, rdi
mov qword ptr [rdi + 24], r11
mov qword ptr [rdi + 16], rdx
mov qword ptr [rdi + 8], rsi
mov qword ptr [rdi], r8
ret

aarch64 i256 shl

bigint_i256_shl:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
movi v0.2d, #0000000000000000
ubfx w9, w4, #6, #2
mov w10, #4
stp x2, x3, [sp, #32]
and w12, w4, #0x3f
sub w9, w10, w9
mov x10, sp
stp x0, x1, [sp, #48]
add x9, x10, w9, uxtw #3
eor x12, x12, #0x3f
stp q0, q0, [sp]
ldp x11, x10, [x9, #8]
ldr x13, [x9, #24]
ldr x9, [x9]
lsl x13, x13, x4
lsr x14, x10, #1
lsr x15, x11, #1
lsr x16, x9, #1
lsl x10, x10, x4
lsl x11, x11, x4
lsl x9, x9, x4
lsr x14, x14, x12
lsr x15, x15, x12
lsr x12, x16, x12
orr x13, x13, x14
orr x10, x10, x15
stp x10, x13, [x8]
orr x10, x11, x12
stp x9, x10, [x8, #16]
add sp, sp, #64
.cfi_def_cfa_offset 0
ret

aarch64 i256 shr

bigint_i256_shr:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
ubfx w10, w4, #6, #2
asr x9, x1, #63
mov x11, sp
stp x0, x1, [sp, #16]
and w12, w4, #0x3f
add x10, x11, w10, uxtw #3
stp x9, x9, [sp, #32]
eor x12, x12, #0x3f
str x9, [sp, #48]
stp x2, x3, [sp]
ldp x11, x9, [x10, #16]
ldp x10, x14, [x10]
lsl x13, x9, #1
lsl x15, x11, #1
lsr x11, x11, x4
lsl x16, x14, #1
asr x9, x9, x4
lsr x14, x14, x4
lsl x13, x13, x12
lsr x10, x10, x4
lsl x15, x15, x12
lsl x12, x16, x12
orr x11, x13, x11
stp x11, x9, [x8]
orr x9, x15, x14
orr x10, x12, x10
stp x10, x9, [x8, #16]
add sp, sp, #64
.cfi_def_cfa_offset 0
ret

aarch64 u256 shr

bigint_u256_shr:
.cfi_startproc
sub sp, sp, #64
.cfi_def_cfa_offset 64
movi v0.2d, #0000000000000000
ubfx w9, w4, #6, #2
mov x10, sp
stp x0, x1, [sp, #16]
and w12, w4, #0x3f
add x9, x10, w9, uxtw #3
stp x2, x3, [sp]
eor x12, x12, #0x3f
stp q0, q0, [sp, #32]
ldp x11, x10, [x9, #8]
ldr x13, [x9, #24]
ldr x9, [x9]
lsl x16, x13, #1
lsr x13, x13, x4
lsl x14, x10, #1
lsl x15, x11, #1
lsr x11, x11, x4
lsr x9, x9, x4
lsr x10, x10, x4
lsl x14, x14, x12
lsl x15, x15, x12
lsl x12, x16, x12
orr x11, x14, x11
orr x9, x15, x9
stp x9, x11, [x8, #16]
orr x9, x12, x10
stp x9, x13, [x8]
add sp, sp, #64
.cfi_def_cfa_offset 0
ret
Per the benchmarks, x86 shows about a 7% improvement in shifts and a 6% improvement in ….

I'm leaning toward taking this still because of the improvement on other arches, especially 32-bit; it doesn't seem worth keeping a separate implementation only for aarch64 (excluding something written in asm), and this seems to pretty closely match what LLVM generates for native `u256`.

Any thoughts @quaternic?
Not being familiar with ARM, that wasn't very clear to me, but it's actually a branch for shifts of at least 128 bits, which explains why it has a significant effect on the benchmarks: half the time it avoids more than half the work, with some added overhead for the branching.

For the uses in …, the use of funnel shifts is a clear improvement. I was still wondering whether doing the coarse shifting via memory is ideal, given the added complexity. However, it's probably unavoidable for good performance on 32-bit targets.
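The rough shape of the branch being described (illustrative only, not the crate's actual code; shown with a pair of `u128` halves for brevity): once the shift amount reaches 128, the low input half can only land in the high output half, so the rest of the work is skipped.

```rust
// Illustrative sketch of a branchy 256-bit left shift, not libm's code:
// for n >= 128 the low half of the input cannot contribute to the low half
// of the output at all, so half the shifting work disappears.
fn u256_shl_branchy(lo: u128, hi: u128, n: u32) -> (u128, u128) {
    debug_assert!(n < 256);
    if n >= 128 {
        (0, lo << (n - 128))
    } else if n == 0 {
        (lo, hi)
    } else {
        (lo << n, (hi << n) | (lo >> (128 - n)))
    }
}
```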
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.
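As a rough illustration of that algorithm (assumed word layout and names, not the PR's exact code), an unsigned 256-bit right shift over four little-endian `u64` words might look like the following; the `n / 64` indexing corresponds to the scaled load offsets visible in the new x86 output, and the zero-filled buffer corresponds to the `xorps`/`movi` stores.

```rust
/// Illustrative sketch only: logical right shift of a 256-bit value held as
/// four little-endian u64 words. The word-granular ("coarse") part of the
/// shift is done by indexing into a zero-extended array; the remaining
/// 0..64-bit part is done with funnel shifts of adjacent words.
fn u256_shr(words: [u64; 4], n: u32) -> [u64; 4] {
    debug_assert!(n < 256);
    // Zero-extend so out-of-range reads below pick up zeros.
    let mut buf = [0u64; 8];
    buf[..4].copy_from_slice(&words);

    let word_shift = (n / 64) as usize; // coarse part
    let bit_shift = n % 64;             // fine part

    let mut out = [0u64; 4];
    for i in 0..4 {
        let lo = buf[i + word_shift];
        let hi = buf[i + word_shift + 1];
        out[i] = if bit_shift == 0 {
            lo
        } else {
            // Funnel shift: high bits of `lo` come from the low bits of `hi`.
            (lo >> bit_shift) | (hi << (64 - bit_shift))
        };
    }
    out
}
```

The same structure works for shl (indexing from the other end of the buffer) and for arithmetic shr (filling the upper buffer words with the sign word instead of zeros, matching the `sar rdx, 63` stores in the i256 shr assembly above).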
This PR was rebased onto a different main commit. Here's a range-diff highlighting what actually changed. Rebasing is a normal part of keeping PRs up to date, so no action is needed; this note is just to help reviewers.
☔ The latest upstream changes (possibly #1193) made this pull request unmergeable. Please resolve the merge conflicts.
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.