diff options
author | Haruki Hasegawa <h6a.h4i.0@gmail.com> | 2016-05-05 13:24:55 +0900 |
---|---|---|
committer | Haruki Hasegawa <h6a.h4i.0@gmail.com> | 2016-05-05 13:24:55 +0900 |
commit | da3213cf045e0c7c4971d8b44272d1d86d689ceb (patch) | |
tree | 33e17e0a166f03307ebf11e8ab2891ae1ab90f61 /src/macros-neon.h | |
parent | fa1780c68593762b1e4bdbc46d83912db3eba27a (diff) | |
parent | 944d14c9151f6b20145de0cdae38e366e73c9432 (diff) | |
download | ffts-da3213cf045e0c7c4971d8b44272d1d86d689ceb.zip ffts-da3213cf045e0c7c4971d8b44272d1d86d689ceb.tar.gz |
Merge remote-tracking branch 'linkotec/master'
Diffstat (limited to 'src/macros-neon.h')
-rw-r--r-- | src/macros-neon.h | 168 |
1 files changed, 95 insertions, 73 deletions
diff --git a/src/macros-neon.h b/src/macros-neon.h index c8b5720..29aa49f 100644 --- a/src/macros-neon.h +++ b/src/macros-neon.h @@ -1,97 +1,119 @@ /* - - This file is part of FFTS -- The Fastest Fourier Transform in the South - - Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the organization nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __MACROS_NEON_H__ -#define __MACROS_NEON_H__ -#include "neon.h" +#ifndef FFTS_MACROS_NEON_H +#define FFTS_MACROS_NEON_H + #include <arm_neon.h> -typedef float32x4_t V; +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif -typedef float32x4x2_t VS; +#define FFTS_MALLOC(d,a) (valloc(d)) +#define FFTS_FREE(d) (free(d)) -#define ADD vaddq_f32 -#define SUB vsubq_f32 -#define MUL vmulq_f32 -#define VADD vaddq_f32 -#define VSUB vsubq_f32 -#define VMUL vmulq_f32 -#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) -#define VST vst1q_f32 -#define VLD vld1q_f32 -#define VST2 vst2q_f32 -#define VLD2 vld2q_f32 +typedef float32x4_t V4SF; +typedef float32x4x2_t V4SF2; -#define VSWAPPAIRS(x) (vrev64q_f32(x)) +#define V4SF_ADD vaddq_f32 +#define V4SF_SUB vsubq_f32 +#define V4SF_MUL vmulq_f32 -#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) -#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) +#define V4SF_XOR(x,y) \ + (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y)))) -#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) +#define V4SF_ST vst1q_f32 +#define V4SF_LD vld1q_f32 -__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) { - data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3}; - return VLD(d); -} +#define V4SF_SWAP_PAIRS(x) \ + (vrev64q_f32(x)) -#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) -#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) +#define V4SF_UNPACK_HI(a,b) \ + (vcombine_f32(vget_high_f32(a), vget_high_f32(b))) -#define FFTS_MALLOC(d,a) (valloc(d)) -#define FFTS_FREE(d) (free(d)) +#define V4SF_UNPACK_LO(a,b) \ + (vcombine_f32(vget_low_f32(a), vget_low_f32(b))) -__INLINE void STORESPR(data_t * addr, VS p) { +#define V4SF_BLEND(x,y) \ + (vcombine_f32(vget_low_f32(x), vget_high_f32(y))) - vst1q_f32(addr, p.val[0]); - vst1q_f32(addr + 4, p.val[1]); - +static FFTS_ALWAYS_INLINE V4SF +V4SF_LIT4(float f3, float f2, float f1, float f0) +{ + float FFTS_ALIGN(16) d[4] = {f0, f1, f2, f3}; + return V4SF_LD(d); } -__INLINE V IMULI(int inv, V a) { - if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f))); - else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f))); +#define V4SF_DUPLICATE_RE(r) \ + vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0)) + +#define V4SF_DUPLICATE_IM(r) \ + vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1)) + +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULI(int inv, V4SF a) +{ + if (inv) { + return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f))); + } else { + return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f))); + } } -__INLINE V IMUL(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VSUB(re, im); +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMUL(V4SF d, V4SF re, V4SF im) +{ + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_SUB(re, im); } -__INLINE V IMULJ(V d, V re, V im) { - re = VMUL(re, d); - im = VMUL(im, VSWAPPAIRS(d)); - return VADD(re, im); +static FFTS_ALWAYS_INLINE V4SF +V4SF_IMULJ(V4SF d, V4SF re, V4SF im) +{ + re = V4SF_MUL(re, d); + im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d)); + return V4SF_ADD(re, im); } -#endif -// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3: +#define V4SF2_ST vst2q_f32 +#define V4SF2_LD vld2q_f32 + +static FFTS_ALWAYS_INLINE void +V4SF2_STORE_SPR(float *addr, V4SF2 p) +{ + vst1q_f32(addr, p.val[0]); + vst1q_f32(addr + 4, p.val[1]); +} + +#endif /* FFTS_MACROS_NEON_H */ |