Provide vector types with 8 and 16 bit precision

Use non-functional dummy float types fp8 and fp16.
author: Erik Schnetter <schnetter@gmail.com> 2013-02-19 12:49:36 -0500
committer: Erik Schnetter <schnetter@gmail.com> 2013-02-19 12:49:36 -0500
commit: 4e9653e4441ba216fdedb4b6105a12ccad797028 (patch)
tree: 569ae7b500ce689410f5e6bb7677017876a5a6d5 /floatprops.h
parent: 0208cd9fcea032092d892587c444f1a6b0891425 (diff)
download: vecmathlib-4e9653e4441ba216fdedb4b6105a12ccad797028.zip
vecmathlib-4e9653e4441ba216fdedb4b6105a12ccad797028.tar.gz
1 files changed, 120 insertions, 20 deletions
diff --git a/floatprops.h b/floatprops.h
index 8d4ddcf..c39c788 100644
--- a/floatprops.h
+++ b/floatprops.h
@@ -3,6 +3,8 @@
 #ifndef FLOATPROPS_H
 #define FLOATPROPS_H
 
+#include "floattypes.h"
+
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@@ -27,8 +29,116 @@ namespace vecmathlib {
     //    max_exponent
   };
   
-  template<typename int_t>
-  struct intprops {
+  
+  
+  // Properties of fp8: bit-layout description of the placeholder 8-bit float type (1 sign, 4 exponent, 3 mantissa bits)
+  template<>
+  struct floatprops<fp8> {
+    typedef fp8 real_t;  // the floating-point type being described
+    typedef int8_t int_t;  // signed integer type; size equality with real_t is asserted below
+    typedef uint8_t uint_t;  // unsigned integer type; size equality with real_t is asserted below
+    
+    // Definitions that might come from numeric_limits<> instead:
+    static int const digits = 4;  // one more than the stored mantissa width (mantissa_bits = digits - 1)
+    static int epsilon() { __builtin_unreachable(); }  // placeholder, must never be called; NOTE(review): returns int, not real_t -- confirm
+    static int const min_exponent = -6;  // feeds exponent_offset below
+    static int const max_exponent = 7;  // not used by any other member of this struct
+    
+    // Ensure the sizes match
+    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+    
+    // Number of bits in internal representation
+    static int const bits = 8 * sizeof(real_t);  // 8, given the size checks above
+    static int const mantissa_bits = digits - 1;  // 3
+    static int const signbit_bits = 1;
+    static int const exponent_bits = bits - mantissa_bits - signbit_bits;  // 8 - 3 - 1 = 4
+    static int const exponent_offset = 2 - min_exponent;  // 2 - (-6) = 8; NOTE(review): confirm this is the intended exponent bias
+    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                  "error in bit counts");
+    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;  // 0x07: the low mantissa_bits bits
+    static uint_t const exponent_mask =
+      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;  // 0x78: exponent_bits bits directly above the mantissa
+    static uint_t const signbit_mask = uint_t(1) << (bits-1);  // 0x80: the top bit
+    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),  // no bit is set in all three masks
+                  "error in masks");
+    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==  // together the masks cover every bit of uint_t
+                  uint_t(~uint_t(0)),
+                  "error in masks");
+    
+    // Re-interpret bit patterns
+    static real_t as_float(int_t x)  // returns a real_t holding the same bytes as x; no numeric conversion
+    {
+      real_t res;
+      std::memcpy(&res, &x, sizeof res);  // byte-wise copy of the object representation
+      return res;
+    }
+    static int_t as_int(real_t x)  // returns an int_t holding the same bytes as x; no numeric conversion
+    {
+      int_t res;
+      std::memcpy(&res, &x, sizeof res);  // byte-wise copy of the object representation
+      return res;
+    }
+    
+    // Convert values
+    static real_t convert_float(int_t x) { __builtin_unreachable(); }  // placeholder: no conversion implemented, must never be called
+    static int_t convert_int(real_t x) { __builtin_unreachable(); }  // placeholder: no conversion implemented, must never be called
+  };
+  
+  
+  
+  // Properties of fp16: bit-layout description of the placeholder 16-bit float type (1 sign, 5 exponent, 10 mantissa bits)
+  template<>
+  struct floatprops<fp16> {
+    typedef fp16 real_t;  // the floating-point type being described
+    typedef int16_t int_t;  // signed integer type; size equality with real_t is asserted below
+    typedef uint16_t uint_t;  // unsigned integer type; size equality with real_t is asserted below
+    
+    // Definitions that might come from numeric_limits<> instead:
+    static int const digits = 11;  // one more than the stored mantissa width (mantissa_bits = digits - 1)
+    static int epsilon() { __builtin_unreachable(); }  // placeholder, must never be called; NOTE(review): returns int, not real_t -- confirm
+    static int const min_exponent = -14;  // feeds exponent_offset below
+    static int const max_exponent = 15;  // not used by any other member of this struct
+    
+    // Ensure the sizes match
+    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+    
+    // Number of bits in internal representation
+    static int const bits = 8 * sizeof(real_t);  // 16, given the size checks above
+    static int const mantissa_bits = digits - 1;  // 10
+    static int const signbit_bits = 1;
+    static int const exponent_bits = bits - mantissa_bits - signbit_bits;  // 16 - 10 - 1 = 5
+    static int const exponent_offset = 2 - min_exponent;  // 2 - (-14) = 16; NOTE(review): IEEE 754 binary16 uses bias 15 -- confirm whether 16 is intended
+    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                  "error in bit counts");
+    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;  // 0x03ff: the low mantissa_bits bits
+    static uint_t const exponent_mask =
+      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;  // 0x7c00: exponent_bits bits directly above the mantissa
+    static uint_t const signbit_mask = uint_t(1) << (bits-1);  // 0x8000: the top bit
+    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),  // no bit is set in all three masks
+                  "error in masks");
+    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==  // together the masks cover every bit of uint_t
+                  uint_t(~uint_t(0)),
+                  "error in masks");
+    
+    // Re-interpret bit patterns
+    static real_t as_float(int_t x)  // returns a real_t holding the same bytes as x; no numeric conversion
+    {
+      real_t res;
+      std::memcpy(&res, &x, sizeof res);  // byte-wise copy of the object representation
+      return res;
+    }
+    static int_t as_int(real_t x)  // returns an int_t holding the same bytes as x; no numeric conversion
+    {
+      int_t res;
+      std::memcpy(&res, &x, sizeof res);  // byte-wise copy of the object representation
+      return res;
+    }
+    
+    // Convert values
+    static real_t convert_float(int_t x) { __builtin_unreachable(); }  // placeholder: no conversion implemented, must never be called
+    static int_t convert_int(real_t x) { __builtin_unreachable(); }  // placeholder: no conversion implemented, must never be called
   };
   
   
@@ -66,7 +176,7 @@ namespace vecmathlib {
                   "error in masks");
     
     // Re-interpret bit patterns
-    static inline real_t as_float(int_t x)
+    static real_t as_float(int_t x)
     {
       // return *(real_t*)&x;
       // union { int_t i; real_t r; } ir;
@@ -75,7 +185,7 @@ namespace vecmathlib {
       std::memcpy(&res, &x, sizeof res);
       return res;
     }
-    static inline int_t as_int(real_t x)
+    static int_t as_int(real_t x)
     {
       // return *(int_t*)&x;
       // union { real_t r; int_t i; } ri;
@@ -86,8 +196,8 @@ namespace vecmathlib {
     }
     
     // Convert values
-    static inline real_t convert_float(int_t x) { return real_t(x); }
-    static inline int_t convert_int(real_t x)
+    static real_t convert_float(int_t x) { return real_t(x); }
+    static int_t convert_int(real_t x)
     {
       static_assert(sizeof std::lrint(x) >= sizeof(int_t),
                     "lrint() has wrong return type");
@@ -103,11 +213,6 @@ namespace vecmathlib {
     }
   };
   
-  template<>
-  struct intprops<floatprops<float>::int_t> {
-    typedef float real_t;
-  };
-  
   
   
   // Properties of double
@@ -143,7 +248,7 @@ namespace vecmathlib {
                   "error in masks");
     
     // Re-interpret bit patterns
-    static inline real_t as_float(int_t x)
+    static real_t as_float(int_t x)
     {
       // return *(real_t*)&x;
       // union { int_t i; real_t r; } ir;
@@ -152,7 +257,7 @@ namespace vecmathlib {
       std::memcpy(&res, &x, sizeof res);
       return res;
     }
-    static inline int_t as_int(real_t x)
+    static int_t as_int(real_t x)
     {
       // return *(int_t*)&x;
       // union { real_t r; int_t i; } ri;
@@ -163,8 +268,8 @@ namespace vecmathlib {
     }
     
     // Convert values
-    static inline real_t convert_float(int_t x) { return real_t(x); }
-    static inline int_t convert_int(real_t x)
+    static real_t convert_float(int_t x) { return real_t(x); }
+    static int_t convert_int(real_t x)
     {
       static_assert(sizeof std::lrint(x) >= sizeof(int_t),
                     "lrint() has wrong return type");
@@ -180,11 +285,6 @@ namespace vecmathlib {
     }
   };
   
-  template<>
-  struct intprops<floatprops<double>::int_t> {
-    typedef double real_t;
-  };
-  
   
   
 } // namespace vecmathlib
author	Erik Schnetter <schnetter@gmail.com>	2013-02-19 12:49:36 -0500
committer	Erik Schnetter <schnetter@gmail.com>	2013-02-19 12:49:36 -0500
commit	4e9653e4441ba216fdedb4b6105a12ccad797028 (patch)
tree	569ae7b500ce689410f5e6bb7677017876a5a6d5 /floatprops.h
parent	0208cd9fcea032092d892587c444f1a6b0891425 (diff)
download	vecmathlib-4e9653e4441ba216fdedb4b6105a12ccad797028.zip vecmathlib-4e9653e4441ba216fdedb4b6105a12ccad797028.tar.gz