summary refs log tree commit diff stats
path: root/floatprops.h
diff options
context:
space:
mode:
author Erik Schnetter <schnetter@gmail.com> 2013-02-19 12:49:36 -0500
committer Erik Schnetter <schnetter@gmail.com> 2013-02-19 12:49:36 -0500
commit 4e9653e4441ba216fdedb4b6105a12ccad797028 (patch)
tree 569ae7b500ce689410f5e6bb7677017876a5a6d5 /floatprops.h
parent 0208cd9fcea032092d892587c444f1a6b0891425 (diff)
download vecmathlib-4e9653e4441ba216fdedb4b6105a12ccad797028.zip
vecmathlib-4e9653e4441ba216fdedb4b6105a12ccad797028.tar.gz
Provide vector types with 8 and 16 bit precision
Use non-functional dummy float types fp8 and fp16.
Diffstat (limited to 'floatprops.h')
-rw-r--r-- floatprops.h 140
1 files changed, 120 insertions, 20 deletions
diff --git a/floatprops.h b/floatprops.h
index 8d4ddcf..c39c788 100644
--- a/floatprops.h
+++ b/floatprops.h
@@ -3,6 +3,8 @@
#ifndef FLOATPROPS_H
#define FLOATPROPS_H
+#include "floattypes.h"
+
#include <cmath>
#include <cstdint>
#include <cstring>
@@ -27,8 +29,116 @@ namespace vecmathlib {
// max_exponent
};
- template<typename int_t>
- struct intprops {
+
+
+ // Properties of fp8: bit-layout constants and bit-pattern reinterpretation helpers for the 8-bit type; the value-level operations (epsilon, convert_float, convert_int) are unimplemented placeholders
+ template<>
+ struct floatprops<fp8> {
+ typedef fp8 real_t; // the floating-point type these properties describe
+ typedef int8_t int_t; // signed integer type that as_int()/as_float() use to carry the bit pattern
+ typedef uint8_t uint_t; // unsigned integer type in which the bit masks below are expressed
+
+ // Definitions that might come from numeric_limits<> instead (hard-coded here):
+ static int const digits = 4; // mantissa_bits below is derived as digits - 1, i.e. 3
+ static int epsilon() { __builtin_unreachable(); } // placeholder, must never be called; NOTE(review): declared to return int rather than real_t -- confirm against the other specializations
+ static int const min_exponent = -6; // feeds exponent_offset below
+ static int const max_exponent = 7; // not used elsewhere in this specialization
+
+ // Ensure the sizes match (as_float()/as_int() memcpy sizeof(result) bytes between these types)
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation (sign bit, exponent field, mantissa field)
+ static int const bits = 8 * sizeof(real_t); // 8 when real_t is one byte, as the size asserts above require
+ static int const mantissa_bits = digits - 1; // 4 - 1 = 3
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits; // whatever remains: 4 when bits is 8
+ static int const exponent_offset = 2 - min_exponent; // evaluates to 8; NOTE(review): presumably the exponent bias -- a 4-bit IEEE-style exponent would have bias 7, which this formula gives only for min_exponent = -5; confirm the convention
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts"); // holds by construction of exponent_bits
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; // low mantissa_bits bits set: 0x07
+ static uint_t const exponent_mask =
+ ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; // exponent_bits bits directly above the mantissa: 0x78 when bits is 8
+ static uint_t const signbit_mask = uint_t(1) << (bits-1); // most significant bit: 0x80 when bits is 8
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks"); // note: tests only the three-way intersection, not pairwise overlap
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks"); // the three masks together must cover every bit of uint_t
+
+ // Re-interpret bit patterns (bytes are copied with memcpy; no numeric conversion takes place)
+ static real_t as_float(int_t x) // returns a real_t holding the same bytes as x
+ {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x) // returns an int_t holding the same bytes as x
+ {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+
+ // Convert values (not implemented for fp8: both bodies are __builtin_unreachable() and must never be executed)
+ static real_t convert_float(int_t x) { __builtin_unreachable(); }
+ static int_t convert_int(real_t x) { __builtin_unreachable(); }
+ };
+
+
+
+ // Properties of fp16: bit-layout constants and bit-pattern reinterpretation helpers for the 16-bit type; the value-level operations (epsilon, convert_float, convert_int) are unimplemented placeholders
+ template<>
+ struct floatprops<fp16> {
+ typedef fp16 real_t; // the floating-point type these properties describe
+ typedef int16_t int_t; // signed integer type that as_int()/as_float() use to carry the bit pattern
+ typedef uint16_t uint_t; // unsigned integer type in which the bit masks below are expressed
+
+ // Definitions that might come from numeric_limits<> instead (hard-coded here):
+ static int const digits = 11; // mantissa_bits below is derived as digits - 1, i.e. 10
+ static int epsilon() { __builtin_unreachable(); } // placeholder, must never be called; NOTE(review): declared to return int rather than real_t -- confirm against the other specializations
+ static int const min_exponent = -14; // feeds exponent_offset below
+ static int const max_exponent = 15; // not used elsewhere in this specialization
+
+ // Ensure the sizes match (as_float()/as_int() memcpy sizeof(result) bytes between these types)
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation (sign bit, exponent field, mantissa field)
+ static int const bits = 8 * sizeof(real_t); // 16 when real_t is two bytes, as the size asserts above require
+ static int const mantissa_bits = digits - 1; // 11 - 1 = 10
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits; // whatever remains: 5 when bits is 16
+ static int const exponent_offset = 2 - min_exponent; // evaluates to 16; NOTE(review): presumably the exponent bias -- an IEEE-style 5-bit exponent has bias 15, which this formula gives only for min_exponent = -13; confirm the convention
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts"); // holds by construction of exponent_bits
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; // low mantissa_bits bits set: 0x03ff
+ static uint_t const exponent_mask =
+ ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; // exponent_bits bits directly above the mantissa: 0x7c00 when bits is 16
+ static uint_t const signbit_mask = uint_t(1) << (bits-1); // most significant bit: 0x8000 when bits is 16
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks"); // note: tests only the three-way intersection, not pairwise overlap
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks"); // the three masks together must cover every bit of uint_t
+
+ // Re-interpret bit patterns (bytes are copied with memcpy; no numeric conversion takes place)
+ static real_t as_float(int_t x) // returns a real_t holding the same bytes as x
+ {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x) // returns an int_t holding the same bytes as x
+ {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+
+ // Convert values (not implemented for fp16: both bodies are __builtin_unreachable() and must never be executed)
+ static real_t convert_float(int_t x) { __builtin_unreachable(); }
+ static int_t convert_int(real_t x) { __builtin_unreachable(); }
};
@@ -66,7 +176,7 @@ namespace vecmathlib {
"error in masks");
// Re-interpret bit patterns
- static inline real_t as_float(int_t x)
+ static real_t as_float(int_t x)
{
// return *(real_t*)&x;
// union { int_t i; real_t r; } ir;
@@ -75,7 +185,7 @@ namespace vecmathlib {
std::memcpy(&res, &x, sizeof res);
return res;
}
- static inline int_t as_int(real_t x)
+ static int_t as_int(real_t x)
{
// return *(int_t*)&x;
// union { real_t r; int_t i; } ri;
@@ -86,8 +196,8 @@ namespace vecmathlib {
}
// Convert values
- static inline real_t convert_float(int_t x) { return real_t(x); }
- static inline int_t convert_int(real_t x)
+ static real_t convert_float(int_t x) { return real_t(x); }
+ static int_t convert_int(real_t x)
{
static_assert(sizeof std::lrint(x) >= sizeof(int_t),
"lrint() has wrong return type");
@@ -103,11 +213,6 @@ namespace vecmathlib {
}
};
- template<>
- struct intprops<floatprops<float>::int_t> {
- typedef float real_t;
- };
-
// Properties of double
@@ -143,7 +248,7 @@ namespace vecmathlib {
"error in masks");
// Re-interpret bit patterns
- static inline real_t as_float(int_t x)
+ static real_t as_float(int_t x)
{
// return *(real_t*)&x;
// union { int_t i; real_t r; } ir;
@@ -152,7 +257,7 @@ namespace vecmathlib {
std::memcpy(&res, &x, sizeof res);
return res;
}
- static inline int_t as_int(real_t x)
+ static int_t as_int(real_t x)
{
// return *(int_t*)&x;
// union { real_t r; int_t i; } ri;
@@ -163,8 +268,8 @@ namespace vecmathlib {
}
// Convert values
- static inline real_t convert_float(int_t x) { return real_t(x); }
- static inline int_t convert_int(real_t x)
+ static real_t convert_float(int_t x) { return real_t(x); }
+ static int_t convert_int(real_t x)
{
static_assert(sizeof std::lrint(x) >= sizeof(int_t),
"lrint() has wrong return type");
@@ -180,11 +285,6 @@ namespace vecmathlib {
}
};
- template<>
- struct intprops<floatprops<double>::int_t> {
- typedef double real_t;
- };
-
} // namespace vecmathlib
OpenPOWER on IntegriCloud