summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Erik Schnetter <schnetter@gmail.com> 2013-06-17 19:34:42 -0400
committer: Erik Schnetter <schnetter@gmail.com> 2013-06-17 19:34:42 -0400
commit: a3559ff558a81a80fe86765966e4a7f2548d0ad3 (patch)
tree: ff68c2eec66a1bf89bf33c136cfcb1676756a280
parent: a66c6ecfdd734ae511cbb2d76c282c4bf84f08a4 (diff)
download: vecmathlib-a3559ff558a81a80fe86765966e4a7f2548d0ad3.zip
vecmathlib-a3559ff558a81a80fe86765966e4a7f2548d0ad3.tar.gz
Correct NEON implementation
-rw-r--r-- vec_neon_float2.h 6
-rw-r--r-- vec_neon_float4.h 32
2 files changed, 26 insertions, 12 deletions
diff --git a/vec_neon_float2.h b/vec_neon_float2.h
index 258a091..0030c88 100644
--- a/vec_neon_float2.h
+++ b/vec_neon_float2.h
@@ -96,12 +96,12 @@ namespace vecmathlib {
bool all() const
{
boolvec r = vpmin_u32(v, v);
- return to_bool(r[0]);
+ return r[0];
}
bool any() const
{
boolvec r = vpmax_u32(v, v);
- return to_bool(r[0]);
+ return r[0];
}
@@ -155,7 +155,7 @@ namespace vecmathlib {
}
static intvec iota()
{
- return vcreate_s32((uint64_t(0) << uint64_t(32)) | uint64_t(1));
+ return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
}
operator ivector_t() const { return v; }
diff --git a/vec_neon_float4.h b/vec_neon_float4.h
index 4ce0800..cf72a6d 100644
--- a/vec_neon_float4.h
+++ b/vec_neon_float4.h
@@ -95,13 +95,17 @@ namespace vecmathlib {
bool all() const
{
- boolvec r = vpminq_u32(v, v);
- return to_bool(r[0]);
+ uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
+ uint32x2_t y = vpmin_u32(x, x);
+ uint32_t z = vget_lane_u32(y, 0);
+ return to_bool(z);
}
bool any() const
{
- boolvec r = vpmaxq_u32(v, v);
- return to_bool(r[0]);
+ uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
+ uint32x2_t y = vpmax_u32(x, x);
+ uint32_t z = vget_lane_u32(y, 0);
+ return to_bool(z);
}
@@ -155,7 +159,9 @@ namespace vecmathlib {
}
static intvec iota()
{
- return vcreateq_s32((uint64_t(0) << uint64_t(32)) | uint64_t(1));
+ return
+ vcombine_s32(vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
+ vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
}
operator ivector_t() const { return v; }
@@ -223,7 +229,7 @@ namespace vecmathlib {
boolvec_t signbit() const
{
//return *this < IV(I(0));
- return intvec(vshr_n_s32(v, FP::bits-1)).as_bool();
+ return intvec(vshrq_n_s32(v, FP::bits-1)).as_bool();
}
boolvec_t operator==(intvec const& x) const { return vceqq_s32(v, x.v); }
@@ -342,6 +348,8 @@ namespace vecmathlib {
// be atomic
// p[0] = (*this)[0];
// p[1] = (*this)[1];
+ // p[2] = (*this)[2];
+ // p[3] = (*this)[3];
#if defined __ARM_FEATURE_UNALIGNED
vst1q_f32(p, v);
#else
@@ -362,6 +370,8 @@ namespace vecmathlib {
} else {
if (m.m[0]) p[0] = (*this)[0];
if (m.m[1]) p[1] = (*this)[1];
+ if (m.m[2]) p[2] = (*this)[2];
+ if (m.m[3]) p[3] = (*this)[3];
}
}
void storeu(real_t* p, mask_t const& m) const
@@ -371,6 +381,8 @@ namespace vecmathlib {
} else {
if (m.m[0]) p[0] = (*this)[0];
if (m.m[1]) p[1] = (*this)[1];
+ if (m.m[2]) p[2] = (*this)[2];
+ if (m.m[3]) p[3] = (*this)[3];
}
}
void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
@@ -402,12 +414,14 @@ namespace vecmathlib {
real_t prod() const
{
- return (*this)[0] * (*this)[1];
+ return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
}
real_t sum() const
{
- realvec r = vpaddq_f32(v, v);
- return r[0];
+ float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
+ float32x2_t y = vpadd_f32(x, x);
+ float32_t z = vget_lane_f32(y, 0);
+ return z;
}
OpenPOWER on IntegriCloud