@@ -218,7 +218,13 @@ func difFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E
218218 kerDIFNP_{{$ksize }}(a, twiddles, stage-twiddlesStartStage)
219219 return
220220 }
221- {{- end }}
221+ {{- end }}{{- if .HasASMKernel }} else if n == 512 {
222+ kerDIFNP_512(a, twiddles, stage-twiddlesStartStage)
223+ return
224+ } else if n == 1024 {
225+ kerDIFNP_1024(a, twiddles, stage-twiddlesStartStage)
226+ return
227+ }{{- end }}
222228 }
223229 m := n >> 1
224230
@@ -312,7 +318,13 @@ func ditFFT(a []{{ .FF }}.Element, w {{ .FF }}.Element, twiddles [][]{{ .FF }}.E
312318 kerDITNP_{{$ksize }}(a, twiddles, stage-twiddlesStartStage)
313319 return
314320 }
315- {{- end }}
321+ {{- end }}{{- if .HasASMKernel }} else if n == 512 {
322+ kerDITNP_512(a, twiddles, stage-twiddlesStartStage)
323+ return
324+ } else if n == 1024 {
325+ kerDITNP_1024(a, twiddles, stage-twiddlesStartStage)
326+ return
327+ }{{- end }}
316328 }
317329
318330 m := n >> 1
@@ -397,6 +409,55 @@ func innerDITWithoutTwiddles(a []{{ .FF }}.Element, at, w {{ .FF }}.Element, sta
397409 {{genKernel $ .FF $ksize $klog2 }}
398410{{end }}
399411
412+ {{- if .HasASMKernel }}
413+ // kerDIFNP_512 is the 512-element DIF kernel, generated only when .HasASMKernel is set; difFFT calls it for n == 512.
414+ // It applies one innerDIFWithTwiddles pass over a with twiddles[stage], then runs kerDIFNP_256 on each half at stage+1, with no recursive call.
415+ func kerDIFNP_512(a []{{ .FF }}.Element , twiddles [][]{{ .FF }}.Element , stage int) {
416+ // Outer layer first (DIF order): butterfly pass with m = 256 (presumably start=0, end=256, m=256 — confirm against innerDIFWithTwiddles).
417+ innerDIFWithTwiddles(a, twiddles[stage], 0, 256, 256)
418+ // Then a[:256] and a[256:] go through the 256-element kernel one stage deeper (assumes len(a) == 512 — confirm against caller).
419+ kerDIFNP_256(a[:256], twiddles, stage+1)
420+ kerDIFNP_256(a[256:], twiddles, stage+1)
421+ }
422+
423+ // kerDITNP_512 is the 512-element DIT kernel (generated only when .HasASMKernel is set; ditFFT calls it for n == 512): kerDITNP_256 on each half at stage+1, then one innerDITWithTwiddles pass with twiddles[stage], with no recursive call.
424+ func kerDITNP_512(a []{{ .FF }}.Element , twiddles [][]{{ .FF }}.Element , stage int) {
425+ // Sub-transforms first (DIT order): a[:256] and a[256:] go through the 256-element kernel (assumes len(a) == 512 — confirm against caller).
426+ kerDITNP_256(a[:256], twiddles, stage+1)
427+ kerDITNP_256(a[256:], twiddles, stage+1)
428+ // Outer layer last: butterfly pass with m = 256 (presumably start=0, end=256, m=256 — confirm against innerDITWithTwiddles).
429+ innerDITWithTwiddles(a, twiddles[stage], 0, 256, 256)
430+ }
431+
432+ // kerDIFNP_1024 is the 1024-element DIF kernel (generated only when .HasASMKernel is set; difFFT calls it for n == 1024): two explicit innerDIFWithTwiddles layers (twiddles[stage], then twiddles[stage+1]) followed by kerDIFNP_256 on each quarter at stage+2, with no recursive call.
433+ func kerDIFNP_1024(a []{{ .FF }}.Element , twiddles [][]{{ .FF }}.Element , stage int) {
434+ // First layer: butterfly pass over the whole slice with m = 512 (presumably start=0, end=512, m=512 — confirm against innerDIFWithTwiddles).
435+ innerDIFWithTwiddles(a, twiddles[stage], 0, 512, 512)
436+ // Second layer: the same pass with m = 256, applied to each 512-element half using the next twiddle table.
437+ innerDIFWithTwiddles(a[:512], twiddles[stage+1], 0, 256, 256)
438+ innerDIFWithTwiddles(a[512:], twiddles[stage+1], 0, 256, 256)
439+ // Remaining layers: each 256-element quarter goes through the 256-element kernel two stages deeper (assumes len(a) == 1024 — confirm against caller).
440+ kerDIFNP_256(a[:256], twiddles, stage+2)
441+ kerDIFNP_256(a[256:512], twiddles, stage+2)
442+ kerDIFNP_256(a[512:768], twiddles, stage+2)
443+ kerDIFNP_256(a[768:], twiddles, stage+2)
444+ }
445+
446+ // kerDITNP_1024 is the 1024-element DIT kernel (generated only when .HasASMKernel is set; ditFFT calls it for n == 1024): kerDITNP_256 on each quarter at stage+2, then two explicit innerDITWithTwiddles layers (twiddles[stage+1], then twiddles[stage]), with no recursive call.
447+ func kerDITNP_1024(a []{{ .FF }}.Element , twiddles [][]{{ .FF }}.Element , stage int) {
448+ // Sub-transforms first (DIT order): each 256-element quarter goes through the 256-element kernel (assumes len(a) == 1024 — confirm against caller).
449+ kerDITNP_256(a[:256], twiddles, stage+2)
450+ kerDITNP_256(a[256:512], twiddles, stage+2)
451+ kerDITNP_256(a[512:768], twiddles, stage+2)
452+ kerDITNP_256(a[768:], twiddles, stage+2)
453+ // Next layer: butterfly pass with m = 256 on each 512-element half (presumably start=0, end=256, m=256 — confirm against innerDITWithTwiddles).
454+ innerDITWithTwiddles(a[:512], twiddles[stage+1], 0, 256, 256)
455+ innerDITWithTwiddles(a[512:], twiddles[stage+1], 0, 256, 256)
456+ // Outermost layer last: butterfly pass over the whole slice with m = 512.
457+ innerDITWithTwiddles(a, twiddles[stage], 0, 512, 512)
458+ }
459+ {{- end }}
460+
400461{{define " genKernel FF sizeKernel sizeKernelLog2" }}
401462
402463func kerDIFNP_{{.sizeKernel }}generic(a []{{ .FF }}.Element , twiddles [][]{{ .FF }}.Element , stage int) {
0 commit comments