4 files changed, 31 insertions, 8 deletions
diff --git a/vec_avx_double4.h b/vec_avx_double4.h
index 9d60b3d..f45855c 100644
--- a/vec_avx_double4.h
+++ b/vec_avx_double4.h
@@ -533,7 +533,11 @@ namespace vecmathlib {
     }
     real_t prod() const
     {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+      // Product of all 4 lanes, evaluated as (x0*x1) * (x2*x3); rounding may differ from a left-to-right scalar chain.
+      realvec_t x0123 = *this;
+      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0x5); // mask 0101 (binary): swap the two doubles in each 128-bit half
+      realvec_t y0022 = x0123 * x1032; // lanes 0,1 hold x0*x1; lanes 2,3 hold x2*x3
+      return y0022[0] * y0022[2];
     }
     real_t sum() const
     {
diff --git a/vec_avx_float8.h b/vec_avx_float8.h
index 3bb13ec..02a1ec8 100644
--- a/vec_avx_float8.h
+++ b/vec_avx_float8.h
@@ -526,9 +526,15 @@ namespace vecmathlib {
     }
     real_t prod() const
     {
-      return
-        (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
-        (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
+      // Product of all 8 lanes as a multiply tree: neighbours first, then
+      // pairs inside each 128-bit half, then the two halves. The association
+      // differs from a left-to-right scalar chain, so rounding may differ.
+      realvec_t x01234567 = *this;
+      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, _MM_SHUFFLE(2, 3, 0, 1)); // swap neighbours
+      realvec_t y00224466 = x01234567 * x10325476;
+      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, _MM_SHUFFLE(1, 0, 3, 2)); // swap pairs
+      realvec_t z00004444 = y00224466 * y22006644;
+      return z00004444[0] * z00004444[4];
     }
     real_t sum() const
     {
diff --git a/vec_qpx_double4.h b/vec_qpx_double4.h
index 5bf830f..684c161 100644
--- a/vec_qpx_double4.h
+++ b/vec_qpx_double4.h
@@ -531,11 +531,16 @@ namespace vecmathlib {
     }
     real_t prod() const
     {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+      // Scalar reference: (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; computed below as x[1] * x[3].
+      realvec_t x = vec_xmul(v, v); // NOTE(review): presumably x[1] = v[0]*v[1] and x[3] = v[2]*v[3] -- confirm against the QPX vec_xmul reference
+      return x[1] * x[3];
     }
     real_t sum() const
     {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+      // Scalar reference: (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; computed below as x[0] + x[2].
+      realvec_t c1 = vec_logical(v, v, 0b1111); // NOTE(review): presumably an all-true vector, i.e. 1.0 in every lane -- confirm
+      realvec_t x = vec_xxmadd(v, c1, v); // NOTE(review): presumably x[0] = v[0]+v[1] and x[2] = v[2]+v[3] when c1 is all ones -- confirm
+      return x[0] + x[2];
     }
     
     
diff --git a/vec_sse_float4.h b/vec_sse_float4.h
index 1466013..dc3ecb6 100644
--- a/vec_sse_float4.h
+++ b/vec_sse_float4.h
@@ -499,7 +499,11 @@ namespace vecmathlib {
     }
     real_t prod() const
     {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+      // Product of all 4 lanes, evaluated as (x0*x1) * (x2*x3); rounding may differ from a left-to-right scalar chain.
+      realvec_t x0123 = *this;
+      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, _MM_SHUFFLE(2, 3, 0, 1)); // swap neighbouring lanes
+      realvec_t y0022 = x0123 * x1032; // lanes 0,1 hold x0*x1; lanes 2,3 hold x2*x3
+      return y0022[0] * y0022[2];
     }
     real_t sum() const
     {
@@ -509,7 +513,11 @@ namespace vecmathlib {
       x = _mm_hadd_ps(x.v, x.v);
       return x[0];
 #else
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+      realvec_t x0123 = *this;
+      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+      realvec_t y0022 = x0123 + x1032;
+      return y0022[0] + y0022[2];
 #endif
     }