summaryrefslogtreecommitdiffstats
path: root/libavcodec/aarch64/imdct15_neon.S
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/aarch64/imdct15_neon.S')
-rw-r--r--libavcodec/aarch64/imdct15_neon.S647
1 files changed, 0 insertions, 647 deletions
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
deleted file mode 100644
index d99edf4..0000000
--- a/libavcodec/aarch64/imdct15_neon.S
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
- * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-#include "asm-offsets.h"
-// shuffle a, b, c, d: emit a 16-byte constant named shuffle_<a><b><c><d> holding the byte indices that select 32-bit lanes a, b, c, d (in that order) of a 16-byte vector. fft5_neon loads these and uses them as tbl index vectors.
-.macro shuffle a, b, c, d
-const shuffle_\a\b\c\d, align=4
- .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3) // result lane 0 takes source lane a
- .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3) // result lane 1 takes source lane b
- .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3) // result lane 2 takes source lane c
- .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3) // result lane 3 takes source lane d
-endconst
-.endm
-
-shuffle 0, 2, 1, 3 // shuffle_0213, addressed through x11
-shuffle 1, 0, 3, 2 // shuffle_1032, addressed through x12
-shuffle 2, 3, 0, 1 // shuffle_2301, addressed through x13
-shuffle 3, 1, 2, 0 // shuffle_3120, addressed through x14
-
-// fft5_neon: 5-element complex transform helper. In: x1 = first input element, x2 = stride in complex elements, v15 = fact5 constants, x11-x14 = shuffle_* tables. Out: v0.2s = out[0], v6.4s / v7.4s = summed real / imaginary parts. Clobbers x1, x2, v16-v31.
-function fft5_neon
- lsl x2, x2, #3 // x2 = stride in bytes (8 bytes per {re, im} float pair)
- ld1 {v24.2s}, [x1], x2 // v24 = {re, im} of element 0, then advance x1 by the stride
- ld2 {v25.s,v26.s}[0], [x1], x2 // element 1: re into v25.s[0], im into v26.s[0]
- ld2 {v25.s,v26.s}[1], [x1], x2 // element 2 into lane 1
- ld2 {v25.s,v26.s}[2], [x1], x2 // element 3 into lane 2
- ld2 {v25.s,v26.s}[3], [x1] // element 4 into lane 3 (no post-increment)
- dup v6.4s, v24.s[0] // v6 = element 0 re in every lane
- dup v7.4s, v24.s[1] // v7 = element 0 im in every lane
-
- faddp v0.4s, v25.4s, v26.4s // [re1+re2, re3+re4, im1+im2, im3+im4]
- // z[][0], z[][3]
- fmul v16.4s, v25.4s, v15.s[0] // rr
- fmul v17.4s, v25.4s, v15.s[1] // ri
- fmul v18.4s, v26.4s, v15.s[0] // ir
- fmul v19.4s, v26.4s, v15.s[1] // ii
- faddp v0.4s, v0.4s, v0.4s // v0.s[0] = sum of re, v0.s[1] = sum of im over elements 1-4
- // z[][1], z[][2]
- fmul v20.4s, v25.4s, v15.s[2] // rr
- fmul v21.4s, v25.4s, v15.s[3] // ri
- fmul v22.4s, v26.4s, v15.s[2] // ir
- fmul v23.4s, v26.4s, v15.s[3] // ii
- fadd v0.2s, v24.2s, v0.2s // out[0] = sum of all five input elements
-
- // z[0123][0], z[0123][3]
- fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii;
- fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii;
- ld1 {v16.16b}, [x11] // tbl indices; x11 is pointed at shuffle_0213 by fft_b15_calc_neon
- ld1 {v19.16b}, [x14] // tbl indices; x14 is pointed at shuffle_3120
- fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir;
- fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir;
- ld1 {v17.16b}, [x12] // tbl indices; x12 is pointed at shuffle_1032
- // z[0123][1], z[0123][2]
- fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii;
- fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii;
- ld1 {v18.16b}, [x13] // tbl indices; x13 is pointed at shuffle_2301
- fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir;
- fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir;
-
- //real
- tbl v20.16b, {v24.16b}, v16.16b // v24 lanes reordered 0,2,1,3
- tbl v21.16b, {v25.16b}, v17.16b // v25 lanes reordered 1,0,3,2
- tbl v22.16b, {v26.16b}, v18.16b // v26 lanes reordered 2,3,0,1
- tbl v23.16b, {v27.16b}, v19.16b // v27 lanes reordered 3,1,2,0
- //imag
- tbl v16.16b, {v28.16b}, v16.16b // same four permutations on the imaginary terms
- tbl v17.16b, {v29.16b}, v17.16b
- tbl v18.16b, {v30.16b}, v18.16b
- tbl v19.16b, {v31.16b}, v19.16b
-
- fadd v6.4s, v6.4s, v20.4s // re: element 0 plus first permuted term
- fadd v22.4s, v22.4s, v23.4s // re: third plus fourth term
- fadd v7.4s, v7.4s, v16.4s // im: element 0 plus first permuted term
- fadd v18.4s, v18.4s, v19.4s // im: third plus fourth term
-
- fadd v21.4s, v21.4s, v22.4s // re: second plus (third + fourth)
- fadd v17.4s, v17.4s, v18.4s // im: second plus (third + fourth)
- fadd v6.4s, v6.4s, v21.4s // v6 = element 0 re plus all four permuted re terms
- fadd v7.4s, v7.4s, v17.4s // v7 = element 0 im plus all four permuted im terms
-
- ret
-endfunc
-// fft15_neon: 15-element complex transform built from three fft5_neon calls. In: x0 = out, x1 = in, x3 = stride in complex elements, v8-v14 = exptab values prepared by fft_b15_calc_neon. Stores out[0..14]; x0 ends 120 bytes past its entry value. Uses x8, x9 as scratch.
-function fft15_neon
- mov x8, x1 // x8 = in (fft5_neon advances x1)
- mov x9, x30 // keep the return address; the bl calls below overwrite x30
- add x2, x3, x3, lsl #1 // 3 * stride
-
- add x1, x8, x3, lsl #3 // in + 1 * stride
- bl fft5_neon
- mov v1.8b, v0.8b // v1 = out[0] of the first 5-element transform (call it a)
- mov v2.16b, v6.16b // v2 = re results of the first transform
- mov v3.16b, v7.16b // v3 = im results of the first transform
-
- add x1, x8, x3, lsl #4 // in + 2 * stride
- add x2, x3, x3, lsl #1 // 3 * stride
- bl fft5_neon
- zip1 v1.4s, v1.4s, v0.4s // v1 = [a.re, b.re, a.im, b.im], b = out[0] of the second transform
- mov v4.16b, v6.16b // v4 = re results of the second transform
- mov v5.16b, v7.16b // v5 = im results of the second transform
-
- mov x1, x8 // in + 0 * stride
- add x2, x3, x3, lsl #1 // 3 * stride
- bl fft5_neon
-
- faddp v20.4s, v1.4s, v1.4s // v20.s[0] = a.re + b.re, v20.s[1] = a.im + b.im
-
- ext v18.16b, v8.16b, v8.16b, #4 // v8 rotated by one lane: exp[2,3,4,1].re
- ext v19.16b, v9.16b, v9.16b, #4 // v9 rotated by one lane: exp[2,3,4,1].im
- mov v16.16b, v6.16b // re accumulator starts from the third transform
- mov v17.16b, v7.16b // im accumulator starts from the third transform
- fadd v20.2s, v20.2s, v0.2s // out[0] = sum of the three sub-transform out[0] values
-
- uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re
- uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im
-
- st1 {v20.2s}, [x0], #8 // out[0]
-
- fmla v16.4s, v2.4s, v8.4s // re += re1 * exp[1-4].re
- fmls v16.4s, v3.4s, v9.4s // re -= im1 * exp[1-4].im
-
- fmla v17.4s, v2.4s, v9.4s // im += re1 * exp[1-4].im
- fmla v17.4s, v3.4s, v8.4s // im += im1 * exp[1-4].re
-
- fmla v16.4s, v4.4s, v18.4s // re += re2 * exp[2,4,6,8].re
- fmls v16.4s, v5.4s, v19.4s // re -= im2 * exp[2,4,6,8].im
-
- fmla v17.4s, v4.4s, v19.4s // im += re2 * exp[2,4,6,8].im
- fmla v17.4s, v5.4s, v18.4s // im += im2 * exp[2,4,6,8].re
-
- zip1 v18.4s, v16.4s, v17.4s // interleave re/im back into complex pairs
- zip2 v19.4s, v16.4s, v17.4s
-
- rev64 v31.4s, v14.4s // [exp10.re, exp5.re, exp10.im, exp5.im]
- trn1 v28.2d, v1.2d, v1.2d // [a.re, b.re, a.re, b.re]
- trn2 v29.2d, v1.2d, v1.2d // [a.im, b.im, a.im, b.im]
- zip1 v30.2d, v14.2d, v31.2d // [exp5.re, exp10.re, exp10.re, exp5.re]
- zip2 v31.2d, v14.2d, v31.2d // [exp5.im, exp10.im, exp10.im, exp5.im]
-
- st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4]
-
- fmul v16.4s, v28.4s, v30.4s // re * exp.re
- fmul v17.4s, v29.4s, v30.4s // im * exp.re
- fmls v16.4s, v29.4s, v31.4s // re parts of a*exp5, b*exp10, a*exp10, b*exp5
- fmla v17.4s, v28.4s, v31.4s // im parts of the same four products
- faddp v16.4s, v16.4s, v16.4s // re of (a*exp5 + b*exp10) and of (a*exp10 + b*exp5)
- faddp v17.4s, v17.4s, v17.4s // im of the same two sums
- zip1 v18.2s, v16.2s, v17.2s // {re, im} of a*exp5 + b*exp10
- zip2 v19.2s, v16.2s, v17.2s // {re, im} of a*exp10 + b*exp5
-
- fadd v18.2s, v18.2s, v0.2s // out[5]: add out[0] of the third transform
- fadd v0.2s, v19.2s, v0.2s // out[10]: add out[0] of the third transform
-
- ext v30.16b, v12.16b, v12.16b, #4 // v12 rotated by one lane: exp[12,13,14,11].re
- ext v31.16b, v13.16b, v13.16b, #4 // v13 rotated by one lane: exp[12,13,14,11].im
- mov v16.16b, v6.16b // restart the accumulators from the third transform
- mov v17.16b, v7.16b
-
- uzp1 v30.4s, v30.4s, v8.4s // exp[12,14,1,3].re
- uzp1 v31.4s, v31.4s, v9.4s // exp[12,14,1,3].im
-
- st1 {v18.2s}, [x0], #8 // out[5]
-
- fmla v16.4s, v2.4s, v10.4s // re += re1 * exp[6-9].re
- fmls v16.4s, v3.4s, v11.4s // re -= im1 * exp[6-9].im
-
- fmla v17.4s, v2.4s, v11.4s // im += re1 * exp[6-9].im
- fmla v17.4s, v3.4s, v10.4s // im += im1 * exp[6-9].re
-
- fmla v16.4s, v4.4s, v30.4s // re += re2 * exp[12,14,1,3].re
- fmls v16.4s, v5.4s, v31.4s // re -= im2 * exp[12,14,1,3].im
-
- fmla v17.4s, v4.4s, v31.4s // im += re2 * exp[12,14,1,3].im
- fmla v17.4s, v5.4s, v30.4s // im += im2 * exp[12,14,1,3].re
-
- zip1 v18.4s, v16.4s, v17.4s // interleave re/im back into complex pairs
- zip2 v19.4s, v16.4s, v17.4s
-
- ext v30.16b, v10.16b, v10.16b, #4 // v10 rotated by one lane: exp[7,8,9,6].re
- ext v31.16b, v11.16b, v11.16b, #4 // v11 rotated by one lane: exp[7,8,9,6].im
-
- fmla v6.4s, v2.4s, v12.4s // last group accumulates directly in v6/v7: re += re1 * exp[11-14].re
- fmls v6.4s, v3.4s, v13.4s // re -= im1 * exp[11-14].im
-
- st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9]
-
- uzp1 v30.4s, v30.4s, v12.4s // exp[7,9,11,13].re
- uzp1 v31.4s, v31.4s, v13.4s // exp[7,9,11,13].im
-
- fmla v7.4s, v2.4s, v13.4s // im += re1 * exp[11-14].im
- fmla v7.4s, v3.4s, v12.4s // im += im1 * exp[11-14].re
-
- st1 {v0.2s}, [x0], #8 // out[10]
-
- fmla v6.4s, v4.4s, v30.4s // re += re2 * exp[7,9,11,13].re
- fmls v6.4s, v5.4s, v31.4s // re -= im2 * exp[7,9,11,13].im
-
- fmla v7.4s, v4.4s, v31.4s // im += re2 * exp[7,9,11,13].im
- fmla v7.4s, v5.4s, v30.4s // im += im2 * exp[7,9,11,13].re
-
- zip1 v18.4s, v6.4s, v7.4s // interleave re/im back into complex pairs
- zip2 v19.4s, v6.4s, v7.4s
-
- st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14]
-
- ret x9 // return through the address saved on entry
-endfunc
-// fft15_pass: in-place combine of two halves a (at x0) and b (at x1): for each element, t = b * exptab entry, then a = a + t and b = a - t. A remainder of len2 modulo 4 is done one element at a time, the rest four at a time.
-// x0: out, x1: out+len2, x2: exptab, x3: len2 (count of complex elements per half). Clobbers x0-x6, v0-v7, v16-v23.
-function fft15_pass
- ands x6, x3, #3 // x6 = len2 modulo 4: elements handled singly before the 4-wide loop
- mov x4, x0 // x4 = store pointer for the first half
- mov x5, x1 // x5 = store pointer for the second half
- b.eq 9f // len2 is a multiple of 4: go straight to the vector loop
- ld1 {v0.2s}, [x0], #8 // a[0]
- ld1 {v1.2s}, [x1], #8 // b[0]
- sub x3, x3, x6 // x3 = element count left for the vector loop
- subs x6, x6, #1 // one single element is handled right here
- fadd v2.2s, v0.2s, v1.2s // a[0] + b[0], no multiply for the first element
- fsub v3.2s, v0.2s, v1.2s // a[0] - b[0]
- add x2, x2, #8 // step past the first exptab entry, which is not applied
- st1 {v2.2s}, [x4], #8
- st1 {v3.2s}, [x5], #8
- b.eq 9f // flags are still those of the subs above: no single elements left
-1:
- subs x6, x6, #1
- ldp s4, s5, [x2], #8 // exptab entry w: s4 = re, s5 = im
- ldp s2, s3, [x1], #8 // b: s2 = re, s3 = im
- ldp s0, s1, [x0], #8 // a: s0 = re, s1 = im
-
- fmul s6, s2, s4 // b.re * w.re
- fmul s7, s2, s5 // b.re * w.im
- fmls s6, s3, v5.s[0] // s6 = re(b * w)
- fmla s7, s3, v4.s[0] // s7 = im(b * w)
-
- fsub s2, s0, s6 // new b = a - b * w
- fsub s3, s1, s7
- fadd s0, s0, s6 // new a = a + b * w
- fadd s1, s1, s7
-
- stp s2, s3, [x5], #8
- stp s0, s1, [x4], #8
- b.gt 1b
-9:
- ld1 {v4.4s,v5.4s}, [x2], #32 // four exptab entries, interleaved re/im
- ld2 {v2.4s,v3.4s}, [x1], #32 // four b elements: v2 = re, v3 = im
- uzp1 v6.4s, v4.4s, v5.4s // v6 = exptab re
- uzp2 v7.4s, v4.4s, v5.4s // v7 = exptab im
- ld2 {v0.4s,v1.4s}, [x0], #32 // four a elements: v0 = re, v1 = im
-8:
- subs x3, x3, #8 // each trip handles two groups of four elements
-
- fmul v4.4s, v2.4s, v6.4s // b.re * w.re
- fmul v5.4s, v2.4s, v7.4s // b.re * w.im
- b.lt 4f // only the group already loaded remains: finish it at label 4
-
- ld1 {v18.4s,v19.4s}, [x2], #32 // exptab entries for the second group
-
- fmls v4.4s, v3.4s, v7.4s // v4 = re(b * w)
- fmla v5.4s, v3.4s, v6.4s // v5 = im(b * w)
-
- ld2 {v22.4s,v23.4s}, [x1], #32 // b elements of the second group
-
- fsub v2.4s, v0.4s, v4.4s // new b.re
- fadd v0.4s, v0.4s, v4.4s // new a.re
- fsub v3.4s, v1.4s, v5.4s // new b.im
- fadd v1.4s, v1.4s, v5.4s // new a.im
-
- uzp1 v16.4s, v18.4s, v19.4s // v16 = exptab re for the second group
- uzp2 v17.4s, v18.4s, v19.4s // v17 = exptab im for the second group
-
- st2 {v2.4s,v3.4s}, [x5], #32 // store first group, second half
- st2 {v0.4s,v1.4s}, [x4], #32 // store first group, first half
- ld2 {v20.4s,v21.4s}, [x0], #32 // a elements of the second group
-
- fmul v18.4s, v22.4s, v16.4s // b.re * w.re
- fmul v19.4s, v22.4s, v17.4s // b.re * w.im
- b.eq 0f // count reached zero: the second group is the last, finish it at label 0
-
- ld1 {v4.4s,v5.4s}, [x2], #32 // exptab entries for the next trip
-
- fmls v18.4s, v23.4s, v17.4s // v18 = re(b * w)
- fmla v19.4s, v23.4s, v16.4s // v19 = im(b * w)
-
- ld2 {v2.4s,v3.4s}, [x1], #32 // b elements for the next trip
-
- fsub v22.4s, v20.4s, v18.4s // new b.re
- fadd v20.4s, v20.4s, v18.4s // new a.re
- fsub v23.4s, v21.4s, v19.4s // new b.im
- fadd v21.4s, v21.4s, v19.4s // new a.im
-
- uzp1 v6.4s, v4.4s, v5.4s // v6 = exptab re for the next trip
- uzp2 v7.4s, v4.4s, v5.4s // v7 = exptab im for the next trip
-
- st2 {v22.4s,v23.4s}, [x5], #32 // store second group, second half
- st2 {v20.4s,v21.4s}, [x4], #32 // store second group, first half
- ld2 {v0.4s,v1.4s}, [x0], #32 // a elements for the next trip
-
- b 8b
-4:
- fmls v4.4s, v3.4s, v7.4s // tail for a final first-slot group: finish b * w
- fmla v5.4s, v3.4s, v6.4s
-
- fsub v2.4s, v0.4s, v4.4s
- fadd v0.4s, v0.4s, v4.4s
- fsub v3.4s, v1.4s, v5.4s
- fadd v1.4s, v1.4s, v5.4s
-
- st2 {v2.4s,v3.4s}, [x5], #32
- st2 {v0.4s,v1.4s}, [x4], #32
-
- ret
-0:
- fmls v18.4s, v23.4s, v17.4s // tail for a final second-slot group: finish b * w
- fmla v19.4s, v23.4s, v16.4s
-
- fsub v22.4s, v20.4s, v18.4s
- fadd v20.4s, v20.4s, v18.4s
- fsub v23.4s, v21.4s, v19.4s
- fadd v21.4s, v21.4s, v19.4s
-
- st2 {v22.4s,v23.4s}, [x5], #32
- st2 {v20.4s,v21.4s}, [x4], #32
-
- ret
-endfunc
-// fft30_neon: two fft15_neon calls (input elements 0,2,4,.. and 1,3,5,.. in units of the stride) into the two 15-element halves of out, then a tail call to fft15_pass. In: x1 = out, x2 = in, x4 = stride in complex elements, x10 = context pointer.
-function fft30_neon, align=6
- sub sp, sp, #0x20 // frame for x20-x22 and x30
- stp x20, x21, [sp]
- stp x22, x30, [sp, #0x10]
- mov x21, x1 // x21 = out
- mov x22, x2 // x22 = in
- mov x20, x4 // x20 = stride
- mov x0, x21 // first half of out
- mov x1, x22 // in + 0 * stride
- lsl x3, x20, #1 // 2 * stride: every other input element
- bl fft15_neon
-
- add x0, x21, #15*8 // second half of out (15 complex values further on)
- add x1, x22, x20, lsl #3 // in + 1 * stride
- lsl x3, x20, #1 // 2 * stride
- bl fft15_neon
-
- ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1]
- add x0, x21, #0 // fft15_pass x0 = first half
- add x1, x21, #15*8 // fft15_pass x1 = second half
- mov x3, #15 // fft15_pass len2
- ldp x20, x21, [sp]
- ldp x22, x30, [sp, #0x10]
- add sp, sp, #0x20
- b fft15_pass // tail call: x30 is restored, so its ret returns to our caller
-endfunc
-// def_fft n, n2: define fft<n>_neon as two fft<n2>_neon calls on alternating input elements followed by a tail call to fft15_pass over the two n2-element halves. In: x1 = out, x2 = in, x3 = N (index used for s->exptab[N]), x4 = stride, x10 = context pointer.
-.macro def_fft n, n2
-function fft\n\()_neon, align=6
- sub sp, sp, #0x30 // frame for x20-x24 and x30
- stp x20, x21, [sp]
- stp x22, x30, [sp, #0x10]
- stp x23, x24, [sp, #0x20]
- mov x21, x1 // x21 = out
- mov x22, x2 // x22 = in
- mov x23, x3 // x23 = N
- mov x20, x4 // x20 = stride
- sub x3, x3, #1 // N - 1 for the half-size transform
- lsl x4, x4, #1 // 2 * stride: every other input element
- bl fft\n2\()_neon
-
- add x1, x21, #(\n2 * 8) // second half of out
- add x2, x22, x20, lsl #3 // in + 1 * stride
- sub x3, x23, #1 // N - 1
- lsl x4, x20, #1 // 2 * stride
- bl fft\n2\()_neon
-
- add x5, x10, #CELT_EXPTAB // x5 = address of the exptab pointer array inside the context
- mov x0, x21 // fft15_pass x0 = first half
- ldr x2, [x5, x23, lsl #3] // s->exptab[N]
- add x1, x21, #(\n2 * 8) // fft15_pass x1 = second half
- mov x3, #\n2 // fft15_pass len2
- ldp x20, x21, [sp]
- ldp x22, x30, [sp, #0x10]
- ldp x23, x24, [sp, #0x20]
- add sp, sp, #0x30
- b fft15_pass // tail call: x30 is restored, so its ret returns to our caller
-endfunc
-.endm
-
- def_fft 60, 30 // fft60_neon from two fft30_neon
- def_fft 120, 60 // fft120_neon from two fft60_neon
- def_fft 240, 120 // fft240_neon from two fft120_neon
- def_fft 480, 240 // fft480_neon from two fft240_neon
- def_fft 960, 480 // fft960_neon from two fft480_neon
-// fft_b15_calc_neon: set up the shared state (v8-v14 exptab values, v15 = fact5, x11-x14 = shuffle tables, x10 = context) and call fft_tab_neon[x3]. In: x0 = context, x3 = table index; x1, x2, x4 are passed through untouched to the selected routine. Preserves d8-d15.
-function fft_b15_calc_neon
- sub sp, sp, #0x50 // frame for x20, x30 and d8-d15
- ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0]
- movrel x6, fact5
- movrel x11, shuffle_0213 // x11-x14 are read by fft5_neon
- movrel x12, shuffle_1032
- movrel x13, shuffle_2301
- movrel x14, shuffle_3120
- add x8, x8, #8 // step past entry 0 of exptab[0]: loads below begin at entry 1
- movrel x5, fft_tab_neon
- stp x20, x30, [sp]
- stp d8, d9, [sp, #0x10] // d8-d15 are callee-saved and are overwritten below
- stp d10, d11, [sp, #0x20]
- stp d12, d13, [sp, #0x30]
- stp d14, d15, [sp, #0x40]
- ld1 {v15.4s}, [x6] // v15 = the four fact5 constants used by fft5_neon
- ld1 {v0.4s,v1.4s}, [x8], #32 // entries 1-4, interleaved re/im
- ld1 {v6.2s}, [x8], #8 // entry 5
- ld1 {v2.4s,v3.4s}, [x8], #32 // entries 6-9
- ld1 {v7.2s}, [x8], #8 // entry 10
- ld1 {v4.4s,v5.4s}, [x8], #32 // entries 11-14
- uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re
- uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im
- uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re
- uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im
- uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re
- uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im
- zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im
- add x5, x5, x3, lsl #3 // address of fft_tab_neon[x3]
- ldr x5, [x5] // entry point of the selected transform
- mov x10, x0 // context pointer, read by fft30_neon and the def_fft routines
- blr x5
- ldp x20, x30, [sp]
- ldp d8, d9, [sp, #0x10]
- ldp d10, d11, [sp, #0x20]
- ldp d12, d13, [sp, #0x30]
- ldp d14, d15, [sp, #0x40]
- add sp, sp, #0x50
- ret
-endfunc
-// fft_tab_neon: table of transform entry points, 8 bytes each, indexed by x3 in fft_b15_calc_neon.
-const fft_tab_neon, relocate=1
- .quad fft15_neon // index 0
- .quad fft30_neon // index 1
- .quad fft60_neon // index 2
- .quad fft120_neon // index 3
- .quad fft240_neon // index 4
- .quad fft480_neon // index 5
- .quad fft960_neon // index 6
-endconst
-// ff_celt_imdct_half_neon (exported): (1) multiply strided input pairs by the twiddle table into the tmp buffer, (2) run fft_b15_calc_neon from tmp into the output, (3) rotate the output in place by the twiddle table and scale it by s0. In: x0 = context, x1 = output, x2 = input, x3 = input stride in floats, s0 = scale.
-function ff_celt_imdct_half_neon, export=1
- sub sp, sp, #0x20 // frame for x21, x30 and the spilled scale
- stp x21, x30, [sp]
- str s0, [sp, #0x10] // spill the scale factor; reloaded into s31 after the FFT
-
- ldp w5, w6, [x0, #CELT_LEN2] // w5 = field at CELT_LEN2, w6 = the following word (CELT_LEN4)
- mov x10, x0 // NOTE(review): x10 is overwritten by the ldp below before it is read
- mov x21, x1 // keep the output pointer across the FFT call
- sub w5, w5, #1
- lsl x7, x3, #3 // 2 * stride * sizeof(float)
- sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float)
- mul x5, x5, x3 // (len2 - 1) * stride
- ldp x9, x10, [x0, #CELT_TMP] // x9 = tmp buffer, x10 = following pointer (CELT_TWIDDLE)
- ldr w3, [x0, #CELT_FFT_N] // x3 = table index handed to fft_b15_calc_neon
- add x5, x2, x5, lsl #2 // x5 = address of input float (len2 - 1) * stride, read backwards
- mov x11, x9 // remember the start of tmp as the FFT input
-
- sub w6, w6, #4 // the first group of four is loaded ahead of the loop
- ld1 {v0.s}[0], [x5], x8 // v0 gathers inputs from the far end, stepping backwards
- ld1 {v1.s}[0], [x2], x7 // v1 gathers inputs from the start, stepping forwards
- ld1 {v4.4s,v5.4s}, [x10], #32 // four twiddles, interleaved re/im
- ld1 {v0.s}[1], [x5], x8
- ld1 {v1.s}[1], [x2], x7
- uzp1 v2.4s, v4.4s, v5.4s // v2 = twiddle re
- ld1 {v0.s}[2], [x5], x8
- ld1 {v1.s}[2], [x2], x7
- uzp2 v3.4s, v4.4s, v5.4s // v3 = twiddle im
- ld1 {v0.s}[3], [x5], x8
- ld1 {v1.s}[3], [x2], x7
-1:
- subs w6, w6, #4
-
- ld1 {v20.s}[0], [x5], x8 // gather the next group into v20/v21 while v0/v1 is processed
- ld1 {v21.s}[0], [x2], x7
- ld1 {v4.4s,v5.4s}, [x10], #32 // twiddles for the next group
-
- fmul v6.4s, v0.4s, v2.4s // v0 * t.re
- fmul v7.4s, v0.4s, v3.4s // v0 * t.im
-
- ld1 {v20.s}[1], [x5], x8
- ld1 {v21.s}[1], [x2], x7
-
- fmls v6.4s, v1.4s, v3.4s // v6 = v0 * t.re - v1 * t.im
- fmla v7.4s, v1.4s, v2.4s // v7 = v0 * t.im + v1 * t.re
-
- ld1 {v20.s}[2], [x5], x8
- ld1 {v21.s}[2], [x2], x7
-
- uzp1 v2.4s, v4.4s, v5.4s // twiddle re for the v20/v21 group
- uzp2 v3.4s, v4.4s, v5.4s // twiddle im for the v20/v21 group
- ld1 {v20.s}[3], [x5], x8
- ld1 {v21.s}[3], [x2], x7
-
- zip1 v4.4s, v6.4s, v7.4s // interleave v6/v7 into four pairs
- zip2 v5.4s, v6.4s, v7.4s
-
- fmul v6.4s, v20.4s, v2.4s // same multiply for the v20/v21 group
- fmul v7.4s, v20.4s, v3.4s
-
- st1 {v4.4s,v5.4s}, [x9], #32 // store four pairs to tmp
-
- fmls v6.4s, v21.4s, v3.4s
- fmla v7.4s, v21.4s, v2.4s
-
- b.eq 3f // counter reached zero: only the group in v6/v7 is left to store
-
- subs w6, w6, #4
- ld1 {v4.4s,v5.4s}, [x10], #32 // twiddles for the following group
- ld1 {v0.s}[0], [x5], x8 // gather the following group into v0/v1
- ld1 {v1.s}[0], [x2], x7
- uzp1 v2.4s, v4.4s, v5.4s
- ld1 {v0.s}[1], [x5], x8
- ld1 {v1.s}[1], [x2], x7
- uzp2 v3.4s, v4.4s, v5.4s
- ld1 {v0.s}[2], [x5], x8
- ld1 {v1.s}[2], [x2], x7
- zip1 v4.4s, v6.4s, v7.4s // interleave the v20/v21 group results
- zip2 v5.4s, v6.4s, v7.4s
- ld1 {v0.s}[3], [x5], x8
- ld1 {v1.s}[3], [x2], x7
-
- st1 {v4.4s,v5.4s}, [x9], #32
-
- b.gt 1b
-
- fmul v6.4s, v0.4s, v2.4s // loop fell through: finish the group already gathered in v0/v1
- fmul v7.4s, v0.4s, v3.4s
- fmls v6.4s, v1.4s, v3.4s
- fmla v7.4s, v1.4s, v2.4s
-3:
- zip1 v4.4s, v6.4s, v7.4s // store the last group
- zip2 v5.4s, v6.4s, v7.4s
- st1 {v4.4s,v5.4s}, [x9], #32
-
- mov x2, x11 // FFT input = start of tmp
- mov x4, #1 // FFT input stride of one element
-
- bl fft_b15_calc_neon // x0 still holds the context, x1 the output, x3 the table index
-
- ldr w5, [x10, #CELT_LEN4] // x10 is the context here (set inside fft_b15_calc_neon)
- ldr x6, [x10, #CELT_TWIDDLE]
- ldr s31, [sp, #0x10] // s31 = scale factor spilled on entry
-
- add x1, x21, x5, lsl #2 // x1 = output + len4 * 4 bytes, read forwards
- add x3, x6, x5, lsl #2 // x3 = twiddle table + len4 * 4 bytes, read forwards
- sub x0, x1, #16 // x0 = 16 bytes below x1, read backwards
- sub x2, x3, #16 // x2 = 16 bytes below x3, read backwards
- mov x8, #-16 // backward step in bytes
- mov x7, #16 // forward step in bytes
- mov x10, x0 // x10 = backward store pointer
- mov x11, x1 // x11 = forward store pointer
-
- sub w5, w5, #4 // the first group is loaded ahead of the loop
-
- ld1 {v0.4s}, [x0], x8
- ld1 {v1.4s}, [x1], x7
- ld1 {v2.4s}, [x2], x8
- ld1 {v3.4s}, [x3], x7
-
- uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re
- uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im
-
- uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
- uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
-
- fmul v1.4s, v6.4s, v5.4s // z.im * t.re
- fmul v0.4s, v6.4s, v7.4s // z.im * t.im
-2:
- subs w5, w5, #4
-
- ld1 {v20.4s}, [x0], x8 // next group, backward side
-
- fmla v1.4s, v4.4s, v7.4s // v1 = z.im * t.re + z.re * t.im
- fmls v0.4s, v4.4s, v5.4s // v0 = z.im * t.im - z.re * t.re
-
- ld1 {v21.4s}, [x1], x7 // next group, forward side
-
- ext v1.16b, v1.16b, v1.16b, #8 // swap the two 64-bit halves of v1
- fmul v0.4s, v0.4s, v31.s[0] // apply the scale
-
- ld1 {v2.4s}, [x2], x8 // next twiddles, backward side
-
- rev64 v1.4s, v1.4s // with the ext above, v1 lanes are now fully reversed
- fmul v1.4s, v1.4s, v31.s[0] // apply the scale
-
- ld1 {v3.4s}, [x3], x7 // next twiddles, forward side
-
- zip1 v5.4s, v0.4s, v1.4s // pairs for the backward side
- zip2 v7.4s, v0.4s, v1.4s // pairs for the forward side
-
- uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re
- uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im
-
- st1 {v5.4s}, [x10], x8
- st1 {v7.4s}, [x11], x7
-
- uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
- uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
-
- fmul v1.4s, v6.4s, v5.4s // start the next group: z.im * t.re
- fmul v0.4s, v6.4s, v7.4s // z.im * t.im
- b.gt 2b
-
- fmla v1.4s, v4.4s, v7.4s // finish the last group outside the loop
- fmls v0.4s, v4.4s, v5.4s
- ext v1.16b, v1.16b, v1.16b, #8
- fmul v0.4s, v0.4s, v31.s[0]
- rev64 v1.4s, v1.4s
- fmul v1.4s, v1.4s, v31.s[0]
- zip1 v5.4s, v0.4s, v1.4s
- zip2 v7.4s, v0.4s, v1.4s
- st1 {v5.4s}, [x10], x8
- st1 {v7.4s}, [x11], x7
-
- ldp x21, x30, [sp]
- add sp, sp, #0x20
- ret
-endfunc
-
-// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5); stored as {re, im} float pairs and loaded into v15 by fft_b15_calc_neon
-const fact5, align=4
- .float 0.30901699437494745, 0.95105651629515353 // cos(2*pi/5), sin(2*pi/5)
- .float -0.80901699437494734, 0.58778525229247325 // cos(4*pi/5), sin(4*pi/5)
-endconst
OpenPOWER on IntegriCloud