Generate pocl-compatible OpenCL math function bindings

author: Erik Schnetter <schnetter@gmail.com> 2013-02-17 20:46:39 -0500
committer: Erik Schnetter <schnetter@gmail.com> 2013-02-17 20:46:39 -0500
commit: 1fccde626317f3c6aad9186200d169baa1673a0f (patch)
tree: 36c0c7a22e35b0737dbf63212e1629a6caa09079 /pocl
parent: 16627815106407a34b85ca691bd98ee18c92abe3 (diff)
download: vecmathlib-1fccde626317f3c6aad9186200d169baa1673a0f.zip
vecmathlib-1fccde626317f3c6aad9186200d169baa1673a0f.tar.gz
11 files changed, 921 insertions, 0 deletions
diff --git a/pocl/CMakeLists.txt b/pocl/CMakeLists.txt
new file mode 100644
index 0000000..2586bac
--- /dev/null
+++ b/pocl/CMakeLists.txt
@@ -0,0 +1,33 @@
+# See file "BUILD" for instructions
+
+set (GEN_SRCS
+acos.cc acosh.cc asin.cc asinh.cc atan.cc atanh.cc cbrt.cc ceil.cc copysign.cc cos.cc cosh.cc exp.cc exp2.cc exp10.cc expm1.cc fabs.cc fdim.cc floor.cc fma.cc fmax.cc fmin.cc fmod.cc hypot.cc ilogb.cc ldexp.cc log.cc log2.cc log10.cc log1p.cc pow.cc remainder.cc round.cc rsqrt.cc sin.cc sinh.cc sqrt.cc tan.cc tanh.cc trunc.cc isfinite.cc isinf.cc isnan.cc isnormal.cc signbit.cc acospi.cl asinpi.cl atanpi.cl atan2pi.cl cospi.cl fmax.cl fmin.cl mad.cl maxmag.cl minmag.cl nan.cl pown.cl powr.cl rint.cl rootn.cl sinpi.cl tanpi.cl half_cos.cl half_divide.cl half_exp.cl half_exp2.cl half_exp10.cl half_log.cl half_log2.cl half_log10.cl half_powr.cl half_recip.cl half_rsqrt.cl half_sin.cl half_sqrt.cl half_tan.cl native_cos.cl native_divide.cl native_exp.cl native_exp2.cl native_exp10.cl native_log.cl native_log2.cl native_log10.cl native_powr.cl native_recip.cl native_rsqrt.cl native_sin.cl native_sqrt.cl native_tan.cl clamp.cl degrees.cl max.cl min.cl mix.cl radians.cl step.cl smoothstep.cl sign.cl isequal.cl isnotequal.cl isgreater.cl isgreaterequal.cl isless.cl islessequal.cl islessgreater.cl isordered.cl isunordered.cl
+  )
+
+set (SRCS
+  cross.cl dot.cl distance.cl length.cl normalize.cl fast_distance.cl
+  fast_length.cl fast_normalize.cl
+  ${GEN_SRCS}
+)
+
+add_custom_command(
+  OUTPUT ${GEN_SRCS} kernel-vecmathlib.h
+  COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/generate-files.py"  # source-tree path: also works out-of-source
+  DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/generate-files.py" VERBATIM)
+
+add_library (kernel-vecmathlib ${SRCS})
+# Generated .cc files are written to the binary dir but include "pocl-compat.h" from here
+target_include_directories (kernel-vecmathlib PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+
+# GCC:
+# set (CMAKE_CXX_COMPILER "g++")
+# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -std=gnu++11 -march=native -Ofast")
+
+# Clang:
+# Note: This fails to link with -O4
+set (CMAKE_CXX_COMPILER "clang++-mp-3.3")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -Dcl_khr_fp64 -Dcles_khr_int64 -std=gnu++11 -stdlib=libc++ -march=native -O3")
+
+# Intel:
+# set (CMAKE_CXX_COMPILER "icpc")
+# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -g -std=c++11 -fast")
diff --git a/pocl/cross.cl b/pocl/cross.cl
new file mode 100644
index 0000000..7b8e861
--- /dev/null
+++ b/pocl/cross.cl
@@ -0,0 +1,43 @@
+__attribute__((__overloadable__))
+float4 cross(float4 p0, float4 p1)
+{
+  float4 r;
+  r.x = p0.y*p1.z - p0.z*p1.y;
+  r.y = p0.z*p1.x - p0.x*p1.z;
+  r.z = p0.x*p1.y - p0.y*p1.x;
+  r.w = 0.0f;
+  return r;
+}
+
+__attribute__((__overloadable__))
+float3 cross(float3 p0, float3 p1)
+{
+  float3 r;
+  r.x = p0.y*p1.z - p0.z*p1.y;
+  r.y = p0.z*p1.x - p0.x*p1.z;
+  r.z = p0.x*p1.y - p0.y*p1.x;
+  return r;
+}
+
+#ifdef cl_khr_fp64
+// Cross product of the xyz parts of p0 and p1; the w component is zero.
+__attribute__((__overloadable__))
+double4 cross(double4 p0, double4 p1)
+{
+  double4 r = (double4)(0.0);  // double-typed zero; w is never overwritten
+  r.x = p0.y*p1.z - p0.z*p1.y;
+  r.y = p0.z*p1.x - p0.x*p1.z;
+  r.z = p0.x*p1.y - p0.y*p1.x;
+  return r;
+}
+
+__attribute__((__overloadable__))
+double3 cross(double3 p0, double3 p1)
+{
+  double3 r;
+  r.x = p0.y*p1.z - p0.z*p1.y;
+  r.y = p0.z*p1.x - p0.x*p1.z;
+  r.z = p0.x*p1.y - p0.y*p1.x;
+  return r;
+}
+#endif
diff --git a/pocl/distance.cl b/pocl/distance.cl
new file mode 100644
index 0000000..5df8637
--- /dev/null
+++ b/pocl/distance.cl
@@ -0,0 +1,49 @@
+__attribute__((__overloadable__))
+float distance(float p0, float p1)
+{
+  return length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+float distance(float2 p0, float2 p1)
+{
+  return length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+float distance(float3 p0, float3 p1)
+{
+  return length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+float distance(float4 p0, float4 p1)
+{
+  return length(p0-p1);
+}
+
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__))
+double distance(double p0, double p1)
+{
+  return length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+double distance(double2 p0, double2 p1)
+{
+  return length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+double distance(double3 p0, double3 p1)
+{
+  return length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+double distance(double4 p0, double4 p1)
+{
+  return length(p0-p1);
+}
+#endif
diff --git a/pocl/dot.cl b/pocl/dot.cl
new file mode 100644
index 0000000..91bb400
--- /dev/null
+++ b/pocl/dot.cl
@@ -0,0 +1,49 @@
+__attribute__((__overloadable__))
+float dot(float p0, float p1)
+{
+  return p0*p1;
+}
+
+__attribute__((__overloadable__))
+float dot(float2 p0, float2 p1)
+{
+  return p0.x*p1.x + p0.y*p1.y;
+}
+
+__attribute__((__overloadable__))
+float dot(float3 p0, float3 p1)
+{
+  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+}
+
+__attribute__((__overloadable__))
+float dot(float4 p0, float4 p1)
+{
+  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+}
+
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__))
+double dot(double p0, double p1)
+{
+  return p0*p1;
+}
+
+__attribute__((__overloadable__))
+double dot(double2 p0, double2 p1)
+{
+  return p0.x*p1.x + p0.y*p1.y;
+}
+
+__attribute__((__overloadable__))
+double dot(double3 p0, double3 p1)
+{
+  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z;
+}
+
+__attribute__((__overloadable__))
+double dot(double4 p0, double4 p1)
+{
+  return p0.x*p1.x + p0.y*p1.y + p0.z*p1.z + p0.w*p1.w;
+}
+#endif
diff --git a/pocl/fast_distance.cl b/pocl/fast_distance.cl
new file mode 100644
index 0000000..c2a7e9e
--- /dev/null
+++ b/pocl/fast_distance.cl
@@ -0,0 +1,23 @@
+__attribute__((__overloadable__))
+float fast_distance(float p0, float p1)
+{
+  return fast_length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+float fast_distance(float2 p0, float2 p1)
+{
+  return fast_length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+float fast_distance(float3 p0, float3 p1)
+{
+  return fast_length(p0-p1);
+}
+
+__attribute__((__overloadable__))
+float fast_distance(float4 p0, float4 p1)
+{
+  return fast_length(p0-p1);
+}
diff --git a/pocl/fast_length.cl b/pocl/fast_length.cl
new file mode 100644
index 0000000..eb765b9
--- /dev/null
+++ b/pocl/fast_length.cl
@@ -0,0 +1,26 @@
+// Note: Chapter 6.12.5 of the OpenCL standard says to use half_sqrt,
+// not fast_sqrt
+
+__attribute__((__overloadable__))
+float fast_length(float p)
+{
+  return half_sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float fast_length(float2 p)
+{
+  return half_sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float fast_length(float3 p)
+{
+  return half_sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float fast_length(float4 p)
+{
+  return half_sqrt(dot(p, p));
+}
diff --git a/pocl/fast_normalize.cl b/pocl/fast_normalize.cl
new file mode 100644
index 0000000..ecdd524
--- /dev/null
+++ b/pocl/fast_normalize.cl
@@ -0,0 +1,26 @@
+// Note: Chapter 6.12.5 of the OpenCL standard says to use half_rsqrt,
+// not fast_rsqrt
+
+__attribute__((__overloadable__))
+float fast_normalize(float p)
+{
+  return p * half_rsqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float2 fast_normalize(float2 p)
+{
+  return p * half_rsqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float3 fast_normalize(float3 p)
+{
+  return p * half_rsqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float4 fast_normalize(float4 p)
+{
+  return p * half_rsqrt(dot(p, p));
+}
diff --git a/pocl/generate-files.py b/pocl/generate-files.py
new file mode 100755
index 0000000..00d8ce2
--- /dev/null
+++ b/pocl/generate-files.py
@@ -0,0 +1,496 @@
+#! /usr/bin/env python
+
+import re
+
+
+
+# Types:
+SI = "SI"                       # int/long
+SJ = "SJ"                       # int (even for double)
+SF = "SF"                       # float/double
+VB = "VB"                       # boolN
+VF = "VF"                       # floatN/doubleN
+VI = "VI"                       # intN/longN
+VJ = "VJ"                       # intN/longN (except int1 for double1)
+VK = "VK"                       # intN (even for doubleN)
+VU = "VU"                       # uintN/ulongN
+
+# Each function is described by a tuple with the following entries:
+#    1. name
+#    2. external argument types (see above)
+#    3. external return type
+#    4. vecmathlib argument types (see above)
+#    5. vecmathlib return type
+# This allows generating externally visible functions with different
+# signatures, e.g. to support OpenCL.
+vmlfuncs = [
+    ("acos"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("acosh"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("asin"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("asinh"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("atan"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("atanh"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("cbrt"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("ceil"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("copysign" , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("cos"      , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("cosh"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("exp"      , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("exp2"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("exp10"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("expm1"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("fabs"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("fdim"     , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("floor"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("fma"      , [VF, VF, VF], VF, [VF, VF, VF], VF), # 6.12.2
+    ("fmax"     , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("fmin"     , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("fmod"     , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("hypot"    , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("ilogb"    , [VF        ], VJ, [VF        ], VI), # 6.12.2 (but should return VK)
+    ("ldexp"    , [VF, VJ    ], VF, [VF, VI    ], VF), # 6.12.2 (but should take VK)
+    ("ldexp"    , [VF, SJ    ], VF, [VF, SI    ], VF), # 6.12.2
+    ("log"      , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("log2"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("log10"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("log1p"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("pow"      , [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("remainder", [VF, VF    ], VF, [VF, VF    ], VF), # 6.12.2
+    ("round"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("rsqrt"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("sin"      , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("sinh"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("sqrt"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("tan"      , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("tanh"     , [VF        ], VF, [VF        ], VF), # 6.12.2
+    ("trunc"    , [VF        ], VF, [VF        ], VF), # 6.12.2
+    
+    ("isfinite" , [VF        ], VJ, [VF        ], VB), # 6.12.6
+    ("isinf"    , [VF        ], VJ, [VF        ], VB), # 6.12.6
+    ("isnan"    , [VF        ], VJ, [VF        ], VB), # 6.12.6
+    ("isnormal" , [VF        ], VJ, [VF        ], VB), # 6.12.6
+    ("signbit"  , [VF        ], VJ, [VF        ], VB), # 6.12.6
+    ]
+
+
+
+directfuncs = [
+    ("acospi"       , [VF        ], VF, "acos(x0)/(scalar_t)M_PI"),     # 6.12.2
+    ("asinpi"       , [VF        ], VF, "asin(x0)/(scalar_t)M_PI"),     # 6.12.2
+    ("atanpi"       , [VF        ], VF, "atan(x0)/(scalar_t)M_PI"),     # 6.12.2
+    ("atan2pi"      , [VF, VF    ], VF, "atan2(x0,x1)/(scalar_t)M_PI"), # 6.12.2
+    ("cospi"        , [VF        ], VF, "cos((scalar_t)M_PI*x0)"),      # 6.12.2
+    ("fmax"         , [VF, SF    ], VF, "fmax(x0,(vector_t)x1)"),       # 6.12.2
+    ("fmin"         , [VF, SF    ], VF, "fmin(x0,(vector_t)x1)"),       # 6.12.2
+    ("mad"          , [VF, VF, VF], VF, "fma(x0,x1,x2)"),               # 6.12.2
+    ("maxmag"       , [VF, VF    ], VF, "fabs(x0)>fabs(x1) ? x0 : fabs(x1)>fabs(x0) ? x1 : fmax(x0,x1)"), # 6.12.2
+    ("minmag"       , [VF, VF    ], VF, "fabs(x0)<fabs(x1) ? x0 : fabs(x1)<fabs(x0) ? x1 : fmin(x0,x1)"), # 6.12.2
+    ("nan"          , [VU        ], VF, "(scalar_t)0.0/(scalar_t)0.0"), # 6.12.2
+    ("pown"         , [VF, VK    ], VF, "pow(x0,convert_vector_t(x1))"), # 6.12.2
+    ("powr"         , [VF, VF    ], VF, "pow(x0,x1)"), # 6.12.2
+    ("rint"         , [VF        ], VF, "round(x0)"),  # 6.12.2
+    ("rootn"        , [VF, VK    ], VF, "pow(x0,(scalar_t)1.0/convert_vector_t(x1))"), # 6.12.2
+    ("sinpi"        , [VF        ], VF, "sin((scalar_t)M_PI*x0)"), # 6.12.2
+    ("tanpi"        , [VF        ], VF, "tan((scalar_t)M_PI*x0)"), # 6.12.2
+    
+    ("half_cos"     , [VF        ], VF, "cos(x0)"),          # 6.12.2
+    ("half_divide"  , [VF, VF    ], VF, "x0/x1"),            # 6.12.2
+    ("half_exp"     , [VF        ], VF, "exp(x0)"),          # 6.12.2
+    ("half_exp2"    , [VF        ], VF, "exp2(x0)"),         # 6.12.2
+    ("half_exp10"   , [VF        ], VF, "exp10(x0)"),        # 6.12.2
+    ("half_log"     , [VF        ], VF, "log(x0)"),          # 6.12.2
+    ("half_log2"    , [VF        ], VF, "log2(x0)"),         # 6.12.2
+    ("half_log10"   , [VF        ], VF, "log10(x0)"),        # 6.12.2
+    ("half_powr"    , [VF, VF    ], VF, "powr(x0,x1)"),      # 6.12.2
+    ("half_recip"   , [VF        ], VF, "(scalar_t)1.0/x0"), # 6.12.2
+    ("half_rsqrt"   , [VF        ], VF, "rsqrt(x0)"),        # 6.12.2
+    ("half_sin"     , [VF        ], VF, "sin(x0)"),          # 6.12.2
+    ("half_sqrt"    , [VF        ], VF, "sqrt(x0)"),         # 6.12.2
+    ("half_tan"     , [VF        ], VF, "tan(x0)"),          # 6.12.2
+    
+    ("native_cos"   , [VF        ], VF, "cos(x0)"),          # 6.12.2
+    ("native_divide", [VF, VF    ], VF, "x0/x1"),            # 6.12.2
+    ("native_exp"   , [VF        ], VF, "exp(x0)"),          # 6.12.2
+    ("native_exp2"  , [VF        ], VF, "exp2(x0)"),         # 6.12.2
+    ("native_exp10" , [VF        ], VF, "exp10(x0)"),        # 6.12.2
+    ("native_log"   , [VF        ], VF, "log(x0)"),          # 6.12.2
+    ("native_log2"  , [VF        ], VF, "log2(x0)"),         # 6.12.2
+    ("native_log10" , [VF        ], VF, "log10(x0)"),        # 6.12.2
+    ("native_powr"  , [VF, VF    ], VF, "powr(x0,x1)"),      # 6.12.2
+    ("native_recip" , [VF        ], VF, "(scalar_t)1.0/x0"), # 6.12.2
+    ("native_rsqrt" , [VF        ], VF, "rsqrt(x0)"),        # 6.12.2
+    ("native_sin"   , [VF        ], VF, "sin(x0)"),          # 6.12.2
+    ("native_sqrt"  , [VF        ], VF, "sqrt(x0)"),         # 6.12.2
+    ("native_tan"   , [VF        ], VF, "tan(x0)"),          # 6.12.2
+    
+    ("clamp"        , [VF, VF, VF], VF, "fmin(fmax(x0,x1),x2)"), # 6.12.4
+    ("clamp"        , [VF, SF, SF], VF, "fmin(fmax(x0,x1),x2)"), # 6.12.4
+    ("degrees"      , [VF        ], VF, "(scalar_t)(180.0/M_PI)*x0"), # 6.12.4
+    ("max"          , [VF, VF    ], VF, "fmax(x0,x1)"),   # 6.12.4
+    ("max"          , [VF, SF    ], VF, "fmax(x0,x1)"),   # 6.12.4
+    ("min"          , [VF, VF    ], VF, "fmin(x0,x1)"),   # 6.12.4
+    ("min"          , [VF, SF    ], VF, "fmin(x0,x1)"),   # 6.12.4
+    ("mix"          , [VF, VF, VF], VF, "x0+(x1-x0)*x2"), # 6.12.4
+    ("mix"          , [VF, VF, SF], VF, "x0+(x1-x0)*x2"), # 6.12.4
+    ("radians"      , [VF        ], VF, "(scalar_t)(M_PI/180.0)*x0"), # 6.12.4
+    ("step"         , [VF, VF    ], VF, "x1<x0 ? (vector_t)(scalar_t)0.0 : (vector_t)(scalar_t)1.0"), # 6.12.4
+    ("step"         , [SF, VF    ], VF, "x1<x0 ? (vector_t)(scalar_t)0.0 : (vector_t)(scalar_t)1.0"), # 6.12.4
+    ("smoothstep"   , [VF, VF, VF], VF, "({ vector_t t = clamp((x2-x0)/(x1-x0), (scalar_t)0.0, (scalar_t)1.0); t*t*((scalar_t)3.0-(scalar_t)2.0*t); })"), # 6.12.4
+    ("smoothstep"   , [SF, SF, VF], VF, "({ vector_t t = clamp((x2-x0)/(x1-x0), (scalar_t)0.0, (scalar_t)1.0); t*t*((scalar_t)3.0-(scalar_t)2.0*t); })"), # 6.12.4
+    ("sign"         , [VF        ], VF, "copysign(x0!=(scalar_t)0.0 ? (vector_t)(scalar_t)1.0 : (vector_t)(scalar_t)0.0,x0)"), # 6.12.4
+    
+    ("isequal"       , [VF, VF    ], VJ, "x0==x1"),                 # 6.12.6
+    ("isnotequal"    , [VF, VF    ], VJ, "x0!=x1"),                 # 6.12.6
+    ("isgreater"     , [VF, VF    ], VJ, "x0>x1"),                  # 6.12.6
+    ("isgreaterequal", [VF, VF    ], VJ, "x0>=x1"),                 # 6.12.6
+    ("isless"        , [VF, VF    ], VJ, "x0<x1"),                  # 6.12.6
+    ("islessequal"   , [VF, VF    ], VJ, "x0<=x1"),                 # 6.12.6
+    ("islessgreater" , [VF, VF    ], VJ, "x0<x1 || x0>x1"),         # 6.12.6
+    ("isordered"     , [VF, VF    ], VJ, "!isunordered(x0,x1)"),    # 6.12.6
+    ("isunordered"   , [VF, VF    ], VJ, "isnan(x0) || isnan(x1)"), # 6.12.6
+]
+
+# Missing functions from 6.12.2: atan2, erfc, erf, fract, frexp,
+# lgamma, lgamma_r, logb, modf, nextafter, remquo, sincos, tgamma
+
+# Unchecked: 6.12.3 (integer functions)
+
+# Missing functions from 6.12.6 (relational functions): any, all,
+# bitselect, select
+
+# Unchecked: 6.12.7 (vector data load and store functions)
+
+# Unchecked: 6.12.12 (miscellaneous vector functions)
+
+
+
+outfile = None
+outfile_did_truncate = set()
+def out(str): outfile.write("%s\n" % str)
+def out_open(name):
+    """Open NAME for appending as the current output; True on first open."""
+    global outfile
+    if outfile: raise RuntimeError("file already open")
+    is_first_open = name not in outfile_did_truncate
+    if is_first_open:
+        # Truncate once per run; later opens of the same name only append
+        open(name, "w").close()
+        outfile_did_truncate.add(name)
+        print(name)
+    outfile = open(name, "a")
+    return is_first_open
+def out_close():
+    global outfile
+    outfile.close()
+    outfile = None
+
+declfile = None
+def decl(str):
+    if str=="" or str.startswith("//") or str.startswith("#"):
+        declfile.write("%s\n" % str)
+    else:
+        declfile.write("__attribute__((__overloadable__)) %s;\n" % str)
+def decl_open(name):
+    global declfile
+    declfile = open(name, "w")
+def decl_close():
+    global declfile
+    declfile.close()
+    declfile = None
+
+
+
+def mktype(tp, vectype):
+    """Return the external type name for type code TP.
+
+    VECTYPE is the floating-point type being generated, e.g. "float4"
+    or "double". Raises ValueError for a code with no external name.
+    """
+    (basetype, size) = re.match("([A-Za-z]+)([0-9]*)", vectype).groups()
+    size = 1 if size=="" else int(size)
+    suffix = "" if size==1 else str(size)
+    ibasetype = "int" if basetype=="float" else "long"
+    if tp==SF: return basetype
+    if tp==VF: return vectype
+    if tp==SJ:
+        return "int" if size==1 else ibasetype
+    if tp==VI:
+        return ibasetype + suffix
+    if tp==VJ:
+        # size 1 is always plain int, even when basetype is double
+        return "int" if size==1 else ibasetype + suffix
+    if tp==VK:
+        return "int" + suffix
+    if tp==VU:
+        return "u" + ibasetype + suffix
+    raise ValueError("mktype: unsupported type code %r" % (tp,))
+
+def mkvmltype(tp, vectype):
+    """Return the vecmathlib type for code TP, spelled relative to VECTYPE."""
+    members = {SI: "::int_t", SF: "::real_t", VB: "::boolvec_t",
+               VI: "::intvec_t", VF: ""}
+    if tp not in members:
+        raise ValueError("mkvmltype: unsupported type code %r" % (tp,))
+    return vectype + members[tp]
+
+
+
+def output_vmlfunc_vml(func, vectype):
+    """Emit __vml_NAME for VECTYPE as a wrapper around vecmathlib::NAME."""
+    (name, args, ret, vmlargs, vmlret) = func
+    out("// Implement %s by calling vecmathlib" % name)
+    (basetype, size) = re.match("([A-Za-z]+)([0-9]*)", vectype).groups()
+    size = 1 if size=="" else int(size)
+    vmltype = "vecmathlib::realvec<%s,%d>" % (basetype, size)
+    vmlinttype = "%s::intvec_t" % vmltype
+    funcargstr = ", ".join(["%s x%d" % (mktype(arg, vectype), n)
+                            for (n, arg) in enumerate(args)])
+    funcretstr = mktype(ret, vectype)
+    decl("%s __vml_%s(%s)" % (funcretstr, name, funcargstr))
+    out("%s __vml_%s(%s)" % (funcretstr, name, funcargstr))
+    out("{")
+    # Reinterpret each external argument as its vecmathlib counterpart
+    for (n, (arg, vmlarg)) in enumerate(zip(args, vmlargs)):
+        out("  %s y%d = bitcast<%s,%s>(x%d);" %
+            (mkvmltype(vmlarg, vmltype), n,
+             mktype(arg, vectype), mkvmltype(vmlarg, vmltype), n))
+    callargstr = ", ".join(["y%d" % n for n in range(len(args))])
+    callretstr = mkvmltype(vmlret, vmltype)
+    out("  %s r = vecmathlib::%s(%s);" % (callretstr, name, callargstr))
+    # We may need to convert from the VML type to the OpenCL type
+    # before bitcasting. This may be a real conversion, e.g. bool to
+    # int. This may also involve a change in size (e.g. long to int),
+    # but only if the type is scalar. These conversions are applied
+    # before bitcasting.
+    # convfunc: conversion function to call
+    # convtype: result type of conversion, also input to bitcast
+    # bitcasttype: output of bitcast; may differ from function result
+    #              if a size change is needed
+    if vmlret==ret:
+        convfunc    = ""
+        convtype    = callretstr
+        bitcasttype = funcretstr
+    else:
+        if vmlret==VI and ret in (VJ,VK):
+            convfunc    = ""
+            convtype    = callretstr
+        elif vmlret==VB and ret in (VJ,VK):
+            convfunc = "vecmathlib::convert_int"
+            convtype = vmlinttype
+        else:
+            # Raise a real exception class; a bare string is itself a TypeError
+            raise NotImplementedError("%s: no conversion %s -> %s" % (name, vmlret, ret))
+        if ret in (VJ,VK):
+            bitcasttype = mktype(VI, vectype)
+        else:
+            raise NotImplementedError("%s: no bitcast type for %s" % (name, ret))
+    out("  return bitcast<%s,%s>(%s(r));" % (convtype, bitcasttype, convfunc))
+    out("}")
+
+def output_vmlfunc_libm(func, vectype):
+    """Emit the scalar fallback of __vml_NAME, computed via realpseudovec."""
+    (name, args, ret, vmlargs, vmlret) = func
+    out("// Implement %s by calling libm" % name)
+    (basetype, size) = re.match("([A-Za-z]+)([0-9]*)", vectype).groups()
+    size = 1 if size=="" else int(size)
+    othertype = "vecmathlib::realpseudovec<%s,%d>" % (basetype, size)
+    otherinttype = "%s::intvec_t" % othertype
+    funcargstr = ", ".join(["%s x%d" % (mktype(arg, vectype), n)
+                            for (n, arg) in enumerate(args)])
+    # Declared return type (e.g. int for isnan), not unconditionally VECTYPE
+    funcretstr = mktype(ret, vectype)
+    decl("%s __vml_%s(%s)" % (funcretstr, name, funcargstr))
+    out("%s __vml_%s(%s)" % (funcretstr, name, funcargstr))
+    out("{")
+    for n in range(len(args)): out("  %s y%d = x%d;" % (othertype, n, n))
+    callargstr = ", ".join(["y%d" % n for n in range(len(args))])
+    callretstr = othertype if ret==VF else otherinttype
+    out("  %s r = vecmathlib::%s(%s);" % (callretstr, name, callargstr))
+    out("  return r[0];")
+    out("}")
+
+def output_vmlfunc_upcast(func, vectype):
+    """Emit __vml_NAME for VECTYPE via the next larger vector size."""
+    (name, args, ret, vmlargs, vmlret) = func
+    out("// Implement %s by using a larger vector size" % name)
+    (basetype, size) = re.match("([A-Za-z]+)([0-9]*)", vectype).groups()
+    size = 1 if size=="" else int(size)
+    widesize = 4 if size==3 else size*2 # next power of 2
+    widetype = basetype + ("" if widesize==1 else str(widesize))
+    wideargs = ", ".join([mktype(arg, widetype) for arg in args])
+    wideret = mktype(ret, widetype)
+    # Forward-declare the wide overload that this wrapper delegates to
+    out("%s __vml_%s(%s);" % (wideret, name, wideargs))
+    funcargstr = ", ".join(["%s x%d" % (mktype(arg, vectype), n)
+                            for (n, arg) in enumerate(args)])
+    narrowret = mktype(ret, vectype)
+    decl("%s __vml_%s(%s)" % (narrowret, name, funcargstr))
+    out("%s __vml_%s(%s)" % (narrowret, name, funcargstr))
+    out("{")
+    # Widen every argument with bitcast, call once, then narrow the result
+    for (n, arg) in enumerate(args):
+        widearg = mktype(arg, widetype)
+        out("  %s y%d = bitcast<%s,%s>(x%d);" %
+            (widearg, n, mktype(arg, vectype), widearg, n))
+    callargstr = ", ".join(["y%d" % n for n in range(len(args))])
+    out("  %s r = __vml_%s(%s);" % (wideret, name, callargstr))
+    out("  return bitcast<%s,%s>(r);" % (wideret, narrowret))
+    out("}")
+
+def output_vmlfunc_split(func, vectype):  # emit __vml_NAME for VECTYPE via the half-size overload
+    (name, args, ret, vmlargs, vmlret) = func
+    out("// Implement %s by splitting into a smaller vector size" % name)
+    (basetype, size) = re.match("([A-Za-z]+)([0-9]*)", vectype).groups()
+    size = 1 if size=="" else int(size)
+    size2 = (size+1) / 2        # divide by 2, rounding up (Python 2 integer division)
+    size2 = "" if size2==1 else str(size2)
+    othertype = "%s%s" % (basetype, size2)
+    declargstr = ", ".join(map(lambda (n, arg): "%s" % mktype(arg, othertype),
+                               zip(range(0, 100), args)))
+    out("%s __vml_%s(%s);" % (mktype(ret, othertype), name, declargstr))  # forward declaration
+    funcargstr = ", ".join(map(lambda (n, arg):
+                                   "%s x%d" % (mktype(arg, vectype), n),
+                               zip(range(0, 100), args)))
+    decl("%s __vml_%s(%s)" % (mktype(ret, vectype), name, funcargstr))
+    out("%s __vml_%s(%s)" % (mktype(ret, vectype), name, funcargstr))
+    out("{")
+    out("  struct pair { %s lo, hi; };" % othertype)  # NOTE(review): no line emitted below uses pair
+    for (n, arg) in zip(range(0, 100), args):
+        out("  %s y%d = bitcast<%s,%s>(x%d);" %  # each argument is bitcast to the half-size type
+            (mktype(arg, othertype), n,
+             mktype(arg, vectype), mktype(arg, othertype), n))
+    callargstr = ", ".join(map(lambda (n, arg): "y%d" % n,
+                               zip(range(0, 100), args)))
+    out("  %s r = __vml_%s(%s);" % (mktype(ret, othertype), name, callargstr))  # a single call
+    out("  return bitcast<%s,%s>(r);" %  # NOTE(review): looks like only the low half is computed
+        (mktype(ret, othertype), mktype(ret, vectype)))  # TODO confirm a lo/hi split was intended
+    out("}")
+
+
+
+def output_directfunc_direct(func, vectype):
+    """Emit an OpenCL __vml_NAME overload for VECTYPE whose body is IMPL."""
+    (name, args, ret, impl) = func
+    out("// Implement %s directly" % name)
+    basetype = re.match("([A-Za-z]+)([0-9]*)", vectype).group(1)
+    params = ["%s x%d" % (mktype(arg, vectype), n)
+              for (n, arg) in enumerate(args)]
+    signature = "%s __vml_%s(%s)" % (mktype(ret, vectype), name,
+                                     ", ".join(params))
+    decl(signature)
+    # scalar_t, vector_t and convert_vector_t are the names IMPL may use
+    for line in ["__attribute__((__overloadable__))", signature, "{",
+                 "  typedef %s scalar_t;" % basetype,
+                 "  typedef %s vector_t;" % vectype,
+                 "#define convert_vector_t convert_%s" % vectype,
+                 "  return %s;" % impl,
+                 "#undef convert_vector_t",
+                 "}"]:
+        out(line)
+
+
+
+def output_vmlfunc(func):
+    (name, args, ret, vmlargs, vmlret) = func
+    is_first_open = out_open("%s.cc" % name)
+    if is_first_open:
+        out("// Note: This file has been automatically generated. Do not modify.")
+        out("")
+        out("#include \"pocl-compat.h\"")
+        out("")
+    else:
+        out("")
+        out("")
+        out("")
+    decl("")
+    decl("// %s: %s -> %s" % (name, args, ret))
+    decl("#undef %s" % name)
+    decl("#define %s __vml_%s" % (name, name))
+    out("// %s: %s -> %s" % (name, args, ret))
+    for basetype in ["float", "double"]:
+        if basetype=="double":
+            out("")
+            out("#ifdef cl_khr_fp64")
+        for size in [1, 2, 3, 4, 8, 16]:
+            # Ignore this prototype for size==1 if there are any
+            # scalar arguments; this prevents duplicate definitions
+            if size==1 and any(map(lambda arg: arg in (SI, SJ, SF), args)):
+                continue
+            sizename = '' if size==1 else str(size)
+            vectype = basetype + sizename
+            # always use vecmathlib if available
+            out("")
+            out("// %s: VF=%s" % (name, vectype))
+            out("#if defined VECMATHLIB_HAVE_VEC_%s_%d" %
+                (basetype.upper(), size))
+            output_vmlfunc_vml(func, vectype)
+            if size==1:
+                # a scalar type: use libm
+                out("#else")
+                output_vmlfunc_libm(func, vectype)
+            else:
+                # a vector type: try upcasting to next power of 2
+                size2 = 4 if size==3 else size*2
+                out("#elif defined VECMATHLIB_HAVE_VEC_%s_%d" %
+                    (basetype.upper(), size2))
+                output_vmlfunc_upcast(func, vectype)
+                # a vector type: split into smaller vector type
+                out("#else")
+                output_vmlfunc_split(func, vectype)
+            out("#endif")
+        if basetype=="double":
+            out("")
+            out("#endif // #ifdef cl_khr_fp64")
+    out_close()
+
+
+
+def output_directfunc(func):
+    (name, args, ret, impl) = func
+    is_first_open = out_open("%s.cl" % name)
+    if is_first_open:
+        out("// Note: This file has been automatically generated. Do not modify.")
+        out("")
+    else:
+        out("")
+        out("")
+        out("")
+    decl("")
+    decl("// %s: %s -> %s" % (name, args, ret))
+    decl("#undef %s" % name)
+    decl("#define %s __vml_%s" % (name, name))
+    out("// %s: %s -> %s" % (name, args, ret))
+    for basetype in ["float", "double"]:
+        if ((name.startswith("half_") or name.startswith("native_")) and
+            basetype=="double"):
+            continue
+        if basetype=="double":
+            out("")
+            out("#ifdef cl_khr_fp64")
+        for size in [1, 2, 3, 4, 8, 16]:
+            # Ignore this prototype for size==1 if there are any
+            # scalar arguments; this prevents duplicate definitions
+            if size==1 and any(map(lambda arg: arg in (SI, SJ, SF), args)):
+                continue
+            sizename = '' if size==1 else str(size)
+            vectype = basetype + sizename
+            # always use vecmathlib if available
+            out("")
+            out("// %s: VF=%s" % (name, vectype))
+            output_directfunc_direct(func, vectype)
+        if basetype=="double":
+            out("")
+            out("#endif // #ifdef cl_khr_fp64")
+    out_close()
+
+
+
+decl_open("kernel-vecmathlib.h")
+# Include guard wrapped around every declaration the generators append
+for guardline in ["// Note: This file has been automatically generated. Do not modify.",
+                  "#ifndef KERNEL_VECMATHLIB_H", "#define KERNEL_VECMATHLIB_H 1"]:
+    decl(guardline)
+for vmlfunc in vmlfuncs: output_vmlfunc(vmlfunc)
+for directfunc in directfuncs: output_directfunc(directfunc)
+for guardline in ["", "#endif // #ifndef KERNEL_VECMATHLIB_H"]: decl(guardline)
+decl_close()
diff --git a/pocl/length.cl b/pocl/length.cl
new file mode 100644
index 0000000..9715e59
--- /dev/null
+++ b/pocl/length.cl
@@ -0,0 +1,49 @@
+__attribute__((__overloadable__))
+float length(float p)
+{
+  return sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float length(float2 p)
+{
+  return sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float length(float3 p)
+{
+  return sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+float length(float4 p)
+{
+  return sqrt(dot(p, p));
+}
+
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__))
+double length(double p)
+{
+  return sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+double length(double2 p)
+{
+  return sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+double length(double3 p)
+{
+  return sqrt(dot(p, p));
+}
+
+__attribute__((__overloadable__))
+double length(double4 p)
+{
+  return sqrt(dot(p, p));
+}
+#endif
diff --git a/pocl/normalize.cl b/pocl/normalize.cl
new file mode 100644
index 0000000..e033567
--- /dev/null
+++ b/pocl/normalize.cl
@@ -0,0 +1,49 @@
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+float __attribute__((__overloadable__)) normalize(float p) {
+  const float norm2 = dot(p, p);
+  return norm2 == 0.0f ? p : p * rsqrt(norm2);
+}
+
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+float2 __attribute__((__overloadable__)) normalize(float2 p) {
+  const float norm2 = dot(p, p);
+  return norm2 == 0.0f ? p : p * rsqrt(norm2);
+}
+
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+float3 __attribute__((__overloadable__)) normalize(float3 p) {
+  const float norm2 = dot(p, p);
+  return norm2 == 0.0f ? p : p * rsqrt(norm2);
+}
+
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+float4 __attribute__((__overloadable__)) normalize(float4 p) {
+  const float norm2 = dot(p, p);
+  return norm2 == 0.0f ? p : p * rsqrt(norm2);
+}
+
+#ifdef cl_khr_fp64
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+double __attribute__((__overloadable__)) normalize(double p) {
+  const double norm2 = dot(p, p);
+  return norm2 == 0.0 ? p : p * rsqrt(norm2);
+}
+
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+double2 __attribute__((__overloadable__)) normalize(double2 p) {
+  const double norm2 = dot(p, p);
+  return norm2 == 0.0 ? p : p * rsqrt(norm2);
+}
+
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+double3 __attribute__((__overloadable__)) normalize(double3 p) {
+  const double norm2 = dot(p, p);
+  return norm2 == 0.0 ? p : p * rsqrt(norm2);
+}
+
+// Scale p by rsqrt(dot(p, p)); p is returned as-is when dot(p, p) == 0 (else 0 * rsqrt(0) = NaN)
+double4 __attribute__((__overloadable__)) normalize(double4 p) {
+  const double norm2 = dot(p, p);
+  return norm2 == 0.0 ? p : p * rsqrt(norm2);
+}
+#endif
diff --git a/pocl/pocl-compat.h b/pocl/pocl-compat.h
new file mode 100644
index 0000000..8b4041e
--- /dev/null
+++ b/pocl/pocl-compat.h
@@ -0,0 +1,78 @@
+// -*-C++-*- Compatibility layer to help instantiate functions to
+// create a library that can be called from elsewhere
+
+
+
+// Make things go fast (and debugging difficult...)
+#define VML_NODEBUG
+#include "../vecmathlib.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+
+
+// Reinterpret the bytes of a as a B; bytes of B beyond sizeof(A) are zeroed
+template<typename A, typename B>
+static B bitcast(A a)
+{
+  const std::size_t ncopy = std::min(sizeof(A), sizeof(B));
+  B result;
+  std::memcpy(&result, &a, ncopy);
+  if (ncopy < sizeof(B))
+    std::memset(reinterpret_cast<char*>(&result) + ncopy, 0, sizeof(B) - ncopy);
+  return result;
+}
+
+
+
+// Define vector types
+// Scalar names are macros for fixed-width <cstdint> types; vectors use ext_vector_type
+using std::int32_t;
+#define int int32_t  // NOTE: every later `int` token in this translation unit expands to int32_t
+typedef int int2  __attribute__((__ext_vector_type__( 2)));
+typedef int int3  __attribute__((__ext_vector_type__( 3)));
+typedef int int4  __attribute__((__ext_vector_type__( 4)));
+typedef int int8  __attribute__((__ext_vector_type__( 8)));
+typedef int int16 __attribute__((__ext_vector_type__(16)));
+
+using std::uint32_t;
+#define uint uint32_t  // unsigned counterpart, 32 bits wide
+typedef uint uint2  __attribute__((__ext_vector_type__( 2)));
+typedef uint uint3  __attribute__((__ext_vector_type__( 3)));
+typedef uint uint4  __attribute__((__ext_vector_type__( 4)));
+typedef uint uint8  __attribute__((__ext_vector_type__( 8)));
+typedef uint uint16 __attribute__((__ext_vector_type__(16)));
+
+#ifdef cles_khr_int64
+using std::int64_t;
+#define long int64_t  // NOTE: every later `long` token expands to int64_t as well
+typedef long long2  __attribute__((__ext_vector_type__( 2)));
+typedef long long3  __attribute__((__ext_vector_type__( 3)));
+typedef long long4  __attribute__((__ext_vector_type__( 4)));
+typedef long long8  __attribute__((__ext_vector_type__( 8)));
+typedef long long16 __attribute__((__ext_vector_type__(16)));
+
+using std::uint64_t;
+#define ulong uint64_t  // unsigned counterpart, 64 bits wide
+typedef ulong ulong2  __attribute__((__ext_vector_type__( 2)));
+typedef ulong ulong3  __attribute__((__ext_vector_type__( 3)));
+typedef ulong ulong4  __attribute__((__ext_vector_type__( 4)));
+typedef ulong ulong8  __attribute__((__ext_vector_type__( 8)));
+typedef ulong ulong16 __attribute__((__ext_vector_type__(16)));
+#endif // #ifdef cles_khr_int64
+
+typedef float float2  __attribute__((__ext_vector_type__( 2)));
+typedef float float3  __attribute__((__ext_vector_type__( 3)));
+typedef float float4  __attribute__((__ext_vector_type__( 4)));
+typedef float float8  __attribute__((__ext_vector_type__( 8)));
+typedef float float16 __attribute__((__ext_vector_type__(16)));
+
+#ifdef cl_khr_fp64
+typedef double double2  __attribute__((__ext_vector_type__( 2)));
+typedef double double3  __attribute__((__ext_vector_type__( 3)));
+typedef double double4  __attribute__((__ext_vector_type__( 4)));
+typedef double double8  __attribute__((__ext_vector_type__( 8)));
+typedef double double16 __attribute__((__ext_vector_type__(16)));
+#endif // #ifdef cl_khr_fp64
author	Erik Schnetter <schnetter@gmail.com>	2013-02-17 20:46:39 -0500
committer	Erik Schnetter <schnetter@gmail.com>	2013-02-17 20:46:39 -0500
commit	1fccde626317f3c6aad9186200d169baa1673a0f (patch)
tree	36c0c7a22e35b0737dbf63212e1629a6caa09079 /pocl
parent	16627815106407a34b85ca691bd98ee18c92abe3 (diff)
download	vecmathlib-1fccde626317f3c6aad9186200d169baa1673a0f.zip vecmathlib-1fccde626317f3c6aad9186200d169baa1673a0f.tar.gz