Skip to content

Commit 21ae715

Browse files
committed
perf: improve small size fft
1 parent 9c7daa1 commit 21ae715

File tree

3 files changed

+181
-2
lines changed

3 files changed

+181
-2
lines changed

field/babybear/fft/fft.go

Lines changed: 59 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

field/koalabear/fft/fft.go

Lines changed: 59 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/generator/field/template/fft/fft.go.tmpl

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,13 @@ func difFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E
218218
kerDIFNP_{{$ksize}}(a, twiddles, stage-twiddlesStartStage)
219219
return
220220
}
221-
{{- end }}
221+
{{- end }}{{- if .HasASMKernel}} else if n == 512 {
222+
kerDIFNP_512(a, twiddles, stage-twiddlesStartStage)
223+
return
224+
} else if n == 1024 {
225+
kerDIFNP_1024(a, twiddles, stage-twiddlesStartStage)
226+
return
227+
}{{- end}}
222228
}
223229
m := n >> 1
224230

@@ -312,7 +318,13 @@ func ditFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E
312318
kerDITNP_{{$ksize}}(a, twiddles, stage-twiddlesStartStage)
313319
return
314320
}
315-
{{- end }}
321+
{{- end }}{{- if .HasASMKernel}} else if n == 512 {
322+
kerDITNP_512(a, twiddles, stage-twiddlesStartStage)
323+
return
324+
} else if n == 1024 {
325+
kerDITNP_1024(a, twiddles, stage-twiddlesStartStage)
326+
return
327+
}{{- end}}
316328
}
317329

318330
m := n >> 1
@@ -397,6 +409,55 @@ func innerDITWithoutTwiddles(a []{{ .FF }}.Element, at, w {{ .FF }}.Element, sta
397409
{{genKernel $.FF $ksize $klog2}}
398410
{{end}}
399411

412+
{{- if .HasASMKernel}}
413+
// kerDIFNP_512 is an optimized 512-element DIF kernel that avoids recursion overhead
414+
// by directly processing the outer butterfly layer and then calling the 256-element kernel.
415+
func kerDIFNP_512(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) {
416+
// Outer layer of this kernel (uses twiddles[stage], i.e. relative to the caller's stage): butterfly with m=256
417+
innerDIFWithTwiddles(a, twiddles[stage], 0, 256, 256)
418+
// Process two halves with the 256-element kernel
419+
kerDIFNP_256(a[:256], twiddles, stage+1)
420+
kerDIFNP_256(a[256:], twiddles, stage+1)
421+
}
422+
423+
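// kerDITNP_512 is an optimized 512-element DIT kernel that avoids recursion overhead
// by calling the 256-element kernel on each half and then directly processing the outer butterfly layer.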
424+
func kerDITNP_512(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) {
425+
// Process two halves with the 256-element kernel first (DIT order)
426+
kerDITNP_256(a[:256], twiddles, stage+1)
427+
kerDITNP_256(a[256:], twiddles, stage+1)
428+
// Final stage: butterfly with m=256
429+
innerDITWithTwiddles(a, twiddles[stage], 0, 256, 256)
430+
}
431+
432+
// kerDIFNP_1024 is an optimized 1024-element DIF kernel that avoids recursion overhead.
433+
func kerDIFNP_1024(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) {
434+
// First layer of this kernel (uses twiddles[stage], i.e. relative to the caller's stage): butterfly with m=512
435+
innerDIFWithTwiddles(a, twiddles[stage], 0, 512, 512)
436+
// Second layer of this kernel (uses twiddles[stage+1]): butterfly with m=256 on each 512-element half
437+
innerDIFWithTwiddles(a[:512], twiddles[stage+1], 0, 256, 256)
438+
innerDIFWithTwiddles(a[512:], twiddles[stage+1], 0, 256, 256)
439+
// Process four quarters with the 256-element kernel
440+
kerDIFNP_256(a[:256], twiddles, stage+2)
441+
kerDIFNP_256(a[256:512], twiddles, stage+2)
442+
kerDIFNP_256(a[512:768], twiddles, stage+2)
443+
kerDIFNP_256(a[768:], twiddles, stage+2)
444+
}
445+
446+
// kerDITNP_1024 is an optimized 1024-element DIT kernel that avoids recursion overhead.
447+
func kerDITNP_1024(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) {
448+
// Process four quarters with the 256-element kernel first (DIT order)
449+
kerDITNP_256(a[:256], twiddles, stage+2)
450+
kerDITNP_256(a[256:512], twiddles, stage+2)
451+
kerDITNP_256(a[512:768], twiddles, stage+2)
452+
kerDITNP_256(a[768:], twiddles, stage+2)
453+
// Stage 1: butterfly with m=256 on both halves
454+
innerDITWithTwiddles(a[:512], twiddles[stage+1], 0, 256, 256)
455+
innerDITWithTwiddles(a[512:], twiddles[stage+1], 0, 256, 256)
456+
// Final stage: butterfly with m=512
457+
innerDITWithTwiddles(a, twiddles[stage], 0, 512, 512)
458+
}
459+
{{- end}}
460+
400461
{{define "genKernel FF sizeKernel sizeKernelLog2"}}
401462

402463
func kerDIFNP_{{.sizeKernel}}generic(a []{{ .FF }}.Element, twiddles [][]{{ .FF }}.Element, stage int) {

0 commit comments

Comments
 (0)