16 files changed, 5075 insertions, 0 deletions
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt
new file mode 100644
index 0000000..3c42167
--- /dev/null
+++ b/lib/Headers/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Headers shipped with clang. Each one is copied into the build tree and
+# installed.
+# NOTE(review): the Makefile build picks up every *.h in this directory, while
+# this list is hand-maintained -- confirm it names every header that must ship.
+set(files
+  iso646.h
+  mmintrin.h
+  stdarg.h
+  stdbool.h
+  stddef.h
+  )
+
+set(output_dir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/../Headers)
+
+# One copy rule per header. The build-tree paths are collected in out_files so
+# that the clang_headers target can depend on the outputs of these rules.
+set(out_files)
+foreach( f ${files} )
+  set( src "${CMAKE_CURRENT_SOURCE_DIR}/${f}" )
+  set( dst "${output_dir}/${f}" )
+  add_custom_command(OUTPUT "${dst}"
+    DEPENDS "${src}"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different "${src}" "${dst}"
+    COMMENT "Copying clang's ${f}..."
+    VERBATIM)
+  list(APPEND out_files "${dst}")
+endforeach( f )
+
+# Depend on the copied files rather than on the bare header names: only the
+# OUTPUT paths are attached to the copy rules above.
+add_custom_target(clang_headers ALL
+  DEPENDS ${out_files})
+
+install(FILES ${files}
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION Headers)
diff --git a/lib/Headers/Makefile b/lib/Headers/Makefile
new file mode 100644
index 0000000..77eb96d
--- /dev/null
+++ b/lib/Headers/Makefile
@@ -0,0 +1,40 @@
+##===- clang/lib/Headers/Makefile --------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+include $(LEVEL)/Makefile.common
+# Build-tree directory that receives a copy of every header in this directory.
+# FIXME: Get version from a common place.
+HeaderDir := $(PROJ_OBJ_ROOT)/$(BuildMode)/lib/clang/1.0/include
+# Base names of all *.h files found in the source directory.
+HEADERS := $(notdir $(wildcard $(PROJ_SRC_DIR)/*.h))
+# Build-tree path of each header copy.
+OBJHEADERS := $(addprefix $(HeaderDir)/, $(HEADERS))
+
+# Copy each header into $(HeaderDir); .dir rule presumably from Makefile.common.
+$(OBJHEADERS): $(HeaderDir)/%.h: $(PROJ_SRC_DIR)/%.h $(HeaderDir)/.dir
+	$(Verb) cp $< $@
+	$(Echo) Copying $(notdir $<) to build dir
+
+# Hook into the standard Makefile rules.
+all-local:: $(OBJHEADERS)
+# Install-tree header directory; the 1.0 version is hard-coded here as well.
+PROJ_headers := $(DESTDIR)$(PROJ_prefix)/lib/clang/1.0/include
+# Install-tree path of each header.
+INSTHEADERS := $(addprefix $(PROJ_headers)/, $(HEADERS))
+# Create the install directory on demand.
+$(PROJ_headers):
+	$(Verb) $(MKDIR) $@
+# Install each header from its build-tree copy; the directory is order-only.
+$(INSTHEADERS): $(PROJ_headers)/%.h: $(HeaderDir)/%.h | $(PROJ_headers)
+	$(Verb) $(DataInstall) $< $(PROJ_headers)
+	$(Echo) Installing compiler include file: $(notdir $<)
+
+install-local:: $(INSTHEADERS)
+
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
new file mode 100644
index 0000000..c96000a
--- /dev/null
+++ b/lib/Headers/emmintrin.h
@@ -0,0 +1,1329 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#ifndef __SSE2__
+#error "SSE2 instruction set not enabled"
+#else
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_addsd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_add_pd(__m128d a, __m128d b)
+{
+  return a + b;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_subsd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pd(__m128d a, __m128d b)
+{
+  return a - b;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_mulsd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_mul_pd(__m128d a, __m128d b)
+{
+  return a * b;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_divsd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_div_pd(__m128d a, __m128d b)
+{
+  return a / b;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_sd(__m128d a, __m128d b)
+{
+  __m128d c = __builtin_ia32_sqrtsd(b);
+  return (__m128d) { c[0], a[1] };
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_pd(__m128d a)
+{
+  return __builtin_ia32_sqrtpd(a);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_minsd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_min_pd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_minpd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_maxsd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_max_pd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_maxpd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_and_pd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_andpd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_pd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_andnpd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_or_pd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_orpd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_xor_pd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_xorpd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_pd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
+}
+/* Low element: mask for a > b, computed as b < a. Upper element: copied from a. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_sd(__m128d a, __m128d b)
+{
+  return __builtin_shufflevector((__m128d)__builtin_ia32_cmpsd(b, a, 1), a, 0, 3); /* swapped compare would pass b[1] through; index 3 selects a[1] */
+}
+/* Low element: mask for a >= b, computed as b <= a. Upper element: copied from a. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_sd(__m128d a, __m128d b)
+{
+  return __builtin_shufflevector((__m128d)__builtin_ia32_cmpsd(b, a, 2), a, 0, 3); /* swapped compare would pass b[1] through; index 3 selects a[1] */
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_sd(__m128d a, __m128d b)
+{
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
+}
+/* Low element: mask for !(a > b), computed as !(b < a). Upper element: copied from a. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_sd(__m128d a, __m128d b)
+{
+  return __builtin_shufflevector((__m128d)__builtin_ia32_cmpsd(b, a, 5), a, 0, 3); /* swapped compare would pass b[1] through; index 3 selects a[1] */
+}
+/* Low element: mask for !(a >= b), computed as !(b <= a). Upper element: copied from a. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_sd(__m128d a, __m128d b)
+{
+  return __builtin_shufflevector((__m128d)__builtin_ia32_cmpsd(b, a, 6), a, 0, 3); /* swapped compare would pass b[1] through; index 3 selects a[1] */
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_comisdeq(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_comisdlt(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_comisdle(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_comisdgt(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_comisdneq(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_ucomisdeq(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_ucomisdlt(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_ucomisdle(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_ucomisdgt(a, b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_sd(__m128d a, __m128d b)
+{
+  return __builtin_ia32_ucomisdneq(a, b);
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_ps(__m128d a)
+{
+  return __builtin_ia32_cvtpd2ps(a);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pd(__m128 a)
+{
+  return __builtin_ia32_cvtps2pd(a);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_pd(__m128i a)
+{
+  return __builtin_ia32_cvtdq2pd((__v4si)a);
+}
+/* Wraps the cvtpd2dq builtin; the result is cast to __m128i like _mm_cvttpd_epi32 does. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_epi32(__m128d a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq(a);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si32(__m128d a)
+{
+  return __builtin_ia32_cvtsd2si(a);
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_ss(__m128 a, __m128d b)
+{
+  return __builtin_ia32_cvtsd2ss(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_sd(__m128d a, int b)
+{
+  return __builtin_ia32_cvtsi2sd(a, b);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_sd(__m128d a, __m128 b)
+{
+  return __builtin_ia32_cvtss2sd(a, b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_epi32(__m128d a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq(a);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si32(__m128d a)
+{
+  return __builtin_ia32_cvttsd2si(a);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpd_pi32(__m128d a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi(a);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttpd_pi32(__m128d a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi(a);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_pd(__m64 a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)a);
+}
+
+static inline double __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_f64(__m128d a)
+{
+  return a[0];
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_pd(double const *dp)
+{
+  return *(__m128d*)dp;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load1_pd(double const *dp)
+{
+  return (__m128d){ dp[0], dp[0] };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_pd(double const *dp)
+{
+  return (__m128d){ dp[1], dp[0] };
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_pd(double const *dp)
+{
+  return __builtin_ia32_loadupd(dp);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_load_sd(double const *dp)
+{
+  return (__m128d){ *dp, 0.0 };
+}
+/* Returns { a[0], *dp }: keeps a's low element and loads the high one from dp. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pd(__m128d a, double const *dp)
+{
+  return (__m128d){ a[0], dp[0] }; /* reads exactly one double; no 16-byte vector load through dp */
+}
+/* Returns { *dp, a[1] }: loads the low element from dp and keeps a's high one. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pd(__m128d a, double const *dp)
+{
+  return (__m128d){ dp[0], a[1] }; /* reads exactly one double; no 16-byte vector load through dp */
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_sd(double w)
+{
+  return (__m128d){ w, 0 };
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pd(double w)
+{
+  return (__m128d){ w, w };
+}
+/* Builds a vector with x in the low element and w in the high element. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_set_pd(double w, double x)
+{
+  return (__m128d){ x, w }; /* last argument is element 0, matching _mm_set_epi64 */
+}
+/* Reversed-order variant of _mm_set_pd: w is the low element, x the high element. */
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pd(double w, double x)
+{
+  return (__m128d){ w, x }; /* first argument is element 0, matching _mm_setr_epi64 */
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_move_sd(__m128d a, __m128d b)
+{
+  return (__m128d){ b[0], a[1] };
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store_sd(double *dp, __m128d a)
+{
+  dp[0] = a[0];
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_pd(double *dp, __m128d a)
+{
+  dp[0] = a[0];
+  dp[1] = a[0];
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store_pd(double *dp, __m128d a)
+{
+  *(__m128d *)dp = a;
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_pd(double *dp, __m128d a)
+{
+  __builtin_ia32_storeupd(dp, a);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_pd(double *dp, __m128d a)
+{
+  dp[0] = a[1];
+  dp[1] = a[0];
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storeh_pd(double *dp, __m128d a)
+{
+  dp[0] = a[1];
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_pd(double *dp, __m128d a)
+{
+  dp[0] = a[0];
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)((__v16qi)a + (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)((__v8hi)a + (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)((__v4si)a + (__v4si)b);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_si64(__m64 a, __m64 b)
+{
+  return a + b;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_add_epi64(__m128i a, __m128i b)
+{
+  return a + b;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_adds_epu16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_avg_epu16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_madd_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_max_epu8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_min_epu8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_epu16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_su32(__m64 a, __m64 b)
+{
+  return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mul_epu32(__m128i a, __m128i b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sad_epu8(__m128i a, __m128i b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)((__v16qi)a - (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)((__v8hi)a - (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)((__v4si)a - (__v4si)b);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_si64(__m64 a, __m64 b)
+{
+  return a - b;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sub_epi64(__m128i a, __m128i b)
+{
+  return a - b;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_subs_epu16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_and_si128(__m128i a, __m128i b)
+{
+  return __builtin_ia32_pand128(a, b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si128(__m128i a, __m128i b)
+{
+  return __builtin_ia32_pandn128(a, b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_or_si128(__m128i a, __m128i b)
+{
+  return __builtin_ia32_por128(a, b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si128(__m128i a, __m128i b)
+{
+  return __builtin_ia32_pxor128(a, b);
+}
+/* Wraps pslldqi128, passing imm * 8 (presumably bytes scaled to bits -- confirm). NOTE(review): imm likely must be a compile-time constant. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_si128(__m128i a, int imm)
+{
+  return __builtin_ia32_pslldqi128(a, imm * 8);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi16(__m128i a, int count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi16(__m128i a, __m128i count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi32(__m128i a, int count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi32(__m128i a, __m128i count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_slli_epi64(__m128i a, int count)
+{
+  return __builtin_ia32_psllqi128(a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sll_epi64(__m128i a, __m128i count)
+{
+  return __builtin_ia32_psllq128(a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi16(__m128i a, int count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi16(__m128i a, __m128i count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srai_epi32(__m128i a, int count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sra_epi32(__m128i a, __m128i count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
+}
+/* Wraps psrldqi128, passing imm * 8 (presumably bytes scaled to bits -- confirm). NOTE(review): imm likely must be a compile-time constant. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_si128(__m128i a, int imm)
+{
+  return __builtin_ia32_psrldqi128(a, imm * 8);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi16(__m128i a, int count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi16(__m128i a, __m128i count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi32(__m128i a, int count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi32(__m128i a, __m128i count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srli_epi64(__m128i a, int count)
+{
+  return __builtin_ia32_psrlqi128(a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_srl_epi64(__m128i a, __m128i count)
+{
+  return __builtin_ia32_psrlq128(a, count);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
+}
+
+#ifdef __x86_64__
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_sd(__m128d a, long long b)
+{
+  return __builtin_ia32_cvtsi642sd(a, b);
+}
+
+static inline long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsd_si64(__m128d a)
+{
+  return __builtin_ia32_cvtsd2si64(a);
+}
+
+static inline long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttsd_si64(__m128d a)
+{
+  return __builtin_ia32_cvttsd2si64(a);
+}
+#endif
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtepi32_ps(__m128i a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)a);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_epi32(__m128 a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq(a);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_epi32(__m128 a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq(a);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si128(int a)
+{
+  return (__m128i)(__v4si){ a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si128(long long a)
+{
+  return (__m128i){ a, 0 };
+}
+#endif
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si32(__m128i a)
+{
+  __v4si b = (__v4si)a;
+  return b[0];
+}
+
+#ifdef __x86_64__
+static inline long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi128_si64(__m128i a)
+{
+  return a[0];
+}
+#endif
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_load_si128(__m128i const *p)
+{
+  return *p;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_si128(__m128i const *p)
+{
+  return (__m128i)__builtin_ia32_loaddqu((char const *)p);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_epi64(__m128i const *p)
+{
+  return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi64(__m64 q1, __m64 q0)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi64(__m64 q)
+{
+  return (__m128i){ (long long)q, (long long)q };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi32(int i)
+{
+  return (__m128i)(__v4si){ i, i, i, i };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi16(short w)
+{
+  return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_set1_epi8(char b)
+{
+  return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi64(__m64 q0, __m64 q1)
+{
+  return (__m128i){ (long long)q0, (long long)q1 };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi32(int i0, int i1, int i2, int i3)
+{
+  return (__m128i)(__v4si){ i0, i1, i2, i3};
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
+{
+  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+{
+  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store_si128(__m128i *p, __m128i b)
+{
+  *p = b; /* plain assignment, no builtin; NOTE(review): presumably needs p aligned for __m128i - confirm */
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_si128(__m128i *p, __m128i b)
+{
+  __builtin_ia32_storedqu((char *)p, (__v16qi)b);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_epi64(__m128i *p, __m128i a)
+{
+  __builtin_ia32_storelv4si((__v2si *)p, a);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pd(double *p, __m128d a)
+{
+  __builtin_ia32_movntpd(p, a);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si128(__m128i *p, __m128i a)
+{
+  __builtin_ia32_movntdq(p, a);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_si32(int *p, int a)
+{
+  __builtin_ia32_movnti(p, a);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_clflush(void const *p)
+{
+  __builtin_ia32_clflush(p);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_lfence(void)
+{
+  __builtin_ia32_lfence();
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_mfence(void)
+{
+  __builtin_ia32_mfence();
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packs_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_packus_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_epi16(__m128i a, int imm)
+{
+  __v8hi b = (__v8hi)a; /* view a as eight 16-bit lanes */
+  return (unsigned short)b[imm & 7]; /* PEXTRW: zero-extend, only imm[2:0] selects */
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_insert_epi16(__m128i a, int b, int imm)
+{
+  return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_epi8(__m128i a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)a);
+}
+
+#define _mm_shuffle_epi32(a, imm) ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm)))
+#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm)))
+#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm)))
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi8(__m128i a, __m128i b)
+{ /* Shuffle indices 0-15 name bytes of a, 16-31 bytes of b: result is a[8],b[8],a[9],b[9],...,a[15],b[15]. */
+  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); /* { a[1], b[1] } */
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+  return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_movepi64_pi64(__m128i a)
+{
+  return (__m64)a[0]; /* element 0 of a, returned as an __m64; element 1 is dropped */
+}
+
+/* Intel names this intrinsic _mm_movpi64_epi64; the old misspelling is aliased. */
+#define _mm_movpi64_pi64 _mm_movpi64_epi64
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_movpi64_epi64(__m64 a)
+{ return (__m128i){ (long long)a, 0 }; } /* a -> element 0, element 1 = 0 */
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_move_epi64(__m128i a)
+{
+  return (__m128i){ a[0], 0 }; /* keep element 0 of a, zero element 1 */
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pd(__m128d a, __m128d b)
+{
+  return __builtin_shufflevector(a, b, 1, 2+1);
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pd(__m128d a, __m128d b)
+{
+  return __builtin_shufflevector(a, b, 0, 2+0);
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pd(__m128d a)
+{
+  return __builtin_ia32_movmskpd(a);
+}
+
+#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i)))
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_ps(__m128d in)
+{
+  return (__m128)in;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castpd_si128(__m128d in)
+{
+  return (__m128i)in;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castps_pd(__m128 in)
+{
+  return (__m128d)in;
+}
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_castps_si128(__m128 in)
+{
+  return (__m128i)in;
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_ps(__m128i in)
+{
+  return (__m128)in;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_castsi128_pd(__m128i in)
+{
+  return (__m128d)in;
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_pause(void)
+{
+  __asm__ volatile ("pause"); /* emits a bare "pause" instruction; volatile keeps the asm from being dropped */
+}
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /* x -> bit 1, y -> bit 0 of the selector */
+
+#endif /* __SSE2__ */
+
+#endif /* __EMMINTRIN_H */
diff --git a/lib/Headers/float.h b/lib/Headers/float.h
new file mode 100644
index 0000000..28fb882
--- /dev/null
+++ b/lib/Headers/float.h
@@ -0,0 +1,71 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#define DECIMAL_DIG __DECIMAL_DIG__
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#endif /* __FLOAT_H */
diff --git a/lib/Headers/iso646.h b/lib/Headers/iso646.h
new file mode 100644
index 0000000..dca13c5
--- /dev/null
+++ b/lib/Headers/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
diff --git a/lib/Headers/limits.h b/lib/Headers/limits.h
new file mode 100644
index 0000000..e4909ab
--- /dev/null
+++ b/lib/Headers/limits.h
@@ -0,0 +1,114 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>. */
+#include_next <limits.h>
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+/* MB_LEN_MAX belongs to the C library: keep its value (fallback is below). */
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long. */
+#if __STDC_VERSION__ >= 199901
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
diff --git a/lib/Headers/mm_malloc.h b/lib/Headers/mm_malloc.h
new file mode 100644
index 0000000..a680c47
--- /dev/null
+++ b/lib/Headers/mm_malloc.h
@@ -0,0 +1,59 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <errno.h>
+#include <stdlib.h>
+
+static inline void *__attribute__((__always_inline__, __nodebug__)) _mm_malloc(size_t size, size_t align)
+{
+  /* align must be zero or a power of two; anything else is EINVAL. */
+  if (align & (align - 1)) {
+    errno = EINVAL;
+    return 0;
+  }
+  if (!size)
+    return 0;
+  /* Leave room below the aligned block for the stashed malloc() pointer. */
+  if (align < 2 * sizeof(void *))
+    align = 2 * sizeof(void *);
+  /* Refuse requests whose padded size (size + align) would wrap size_t. */
+  if (size > (size_t)-1 - align)
+    return 0;
+  void *mallocedMemory = malloc(size + align);
+  if (!mallocedMemory)
+    return 0;
+  void *alignedMemory = (void *)(((size_t)mallocedMemory + align) & ~((size_t)align - 1));
+  ((void **)alignedMemory)[-1] = mallocedMemory; /* read back by _mm_free */
+  return alignedMemory;
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__)) _mm_free(void *p)
+{
+  void **stash = (void **)p; /* stash[-1]: the pointer slot just below the block */
+  if (stash != 0) free(stash[-1]); /* a null p is ignored */
+}
+
+#endif /* __MM_MALLOC_H */
diff --git a/lib/Headers/mmintrin.h b/lib/Headers/mmintrin.h
new file mode 100644
index 0000000..339d212
--- /dev/null
+++ b/lib/Headers/mmintrin.h
@@ -0,0 +1,449 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+#ifndef __MMX__
+#error "MMX instruction set not enabled"
+#else
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)(__v2si){__i, 0}; /* __i -> element 0, element 1 zeroed */
+}
+
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_si32(__m64 __m)
+{
+    __v2si __halves = (__v2si)__m; /* view the 64 bits as two 32-bit ints */
+    return __halves[0];            /* hand back element 0 */
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+static inline long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 8+4, 5,
+                                          8+5, 6, 8+6, 7, 8+7);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 4+2, 3,
+                                          4+3);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 2+1);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8+0, 1,
+                                          8+1, 2, 8+2, 3, 8+3);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4+0, 1,
+                                          4+1);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2+0);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)((__v8qi)__m1 + (__v8qi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)((__v4hi)__m1 + (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)((__v2si)__m1 + (__v2si)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu8(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_adds_pu16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)((__v8qi)__m1 - (__v8qi)__m2);
+}
+ 
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)((__v4hi)__m1 - (__v4hi)__m2);
+}
+ 
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)((__v2si)__m1 - (__v2si)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+ 
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+ 
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mullo_pi16(__m64 __m1, __m64 __m2) 
+{
+    return (__m64)((__v4hi)__m1 * (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return __builtin_ia32_psllq(__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return __builtin_ia32_psllqi(__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);       
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq(__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return __builtin_ia32_psrlqi(__m, __count);    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __m1 & __m2;
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return ~__m1 & __m2; /* the FIRST operand is the one complemented */
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __m1 | __m2;
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __m1 ^ __m2;
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)(__v2si){ __i0, __i1 }; /* last argument (__i0) -> element 0 */
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)(__v4hi){ __s0, __s1, __s2, __s3 };    
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)(__v8qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7 };
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi32(int __i)
+{
+    return (__m64)(__v2si){ __i, __i };
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi16(short __s)
+{   /* Broadcast: list all four lanes, since omitted initializers become 0. */
+    return (__m64)(__v4hi){ __s, __s, __s, __s };
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_pi8(char __b)
+{   /* Broadcast: list all eight lanes, since omitted initializers become 0. */
+    return (__m64)(__v8qi){ __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi32(int __i1, int __i0)
+{
+    return (__m64)(__v2si){ __i1, __i0 }; /* despite the names, the FIRST argument -> element 0 */
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)(__v4hi){ __s3, __s2, __s1, __s0 }; /* despite the names, the FIRST argument -> element 0 */
+}
+
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+             char __b1, char __b0)
+{
+    return (__m64)(__v8qi){ __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0 }; /* FIRST argument -> element 0 */
+}
+
+#endif /* __MMX__ */
+
+#endif /* __MMINTRIN_H */
+
diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h
new file mode 100644
index 0000000..cd90166
--- /dev/null
+++ b/lib/Headers/pmmintrin.h
@@ -0,0 +1,121 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#ifndef __SSE3__
+#error "SSE3 instruction set not enabled"
+#else
+
+#include <emmintrin.h>
+
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_lddqu_si128(__m128i const *p) {
+  char const *__src = (char const *)p;  /* handed to the builtin as bytes */
+  return (__m128i)__builtin_ia32_lddqu(__src);
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_ps(__m128 a, __m128 b) {
+  __m128 __res = __builtin_ia32_addsubps(a, b);  /* direct builtin wrapper */
+  return __res;
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_ps(__m128 a, __m128 b) {
+  __m128 __res = __builtin_ia32_haddps(a, b);  /* direct builtin wrapper */
+  return __res;
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_ps(__m128 a, __m128 b) {
+  __m128 __res = __builtin_ia32_hsubps(a, b);  /* direct builtin wrapper */
+  return __res;
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehdup_ps(__m128 a) {
+  __m128 __dup = __builtin_shufflevector(a, a, 1, 1, 3, 3);  /* a1,a1,a3,a3 */
+  return __dup;
+}
+
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_moveldup_ps(__m128 a) {
+  __m128 __dup = __builtin_shufflevector(a, a, 0, 0, 2, 2);  /* a0,a0,a2,a2 */
+  return __dup;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_addsub_pd(__m128d a, __m128d b) {
+  __m128d __res = __builtin_ia32_addsubpd(a, b);  /* direct builtin wrapper */
+  return __res;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pd(__m128d a, __m128d b) {
+  __m128d __res = __builtin_ia32_haddpd(a, b);  /* direct builtin wrapper */
+  return __res;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pd(__m128d a, __m128d b) {
+  __m128d __res = __builtin_ia32_hsubpd(a, b);  /* direct builtin wrapper */
+  return __res;
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_loaddup_pd(double const *dp) {
+  double __val = *dp;  /* the pointed-to double fills both elements */
+  return (__m128d){ __val, __val };
+}
+
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
+_mm_movedup_pd(__m128d a) {
+  __m128d __dup = __builtin_shufflevector(a, a, 0, 0);  /* a0 in both slots */
+  return __dup;
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)  /* mode value with the mask bit set */
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)  /* mode value with the bit clear */
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+/* GET masks _mm_getcsr(); SET clears the mask bit, ORs in x, then _mm_setcsr(). */
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_monitor(void const *p, unsigned extensions, unsigned hints) {
+  void *__addr = (void *)p;  /* const is cast away before the builtin call */
+  __builtin_ia32_monitor(__addr, extensions, hints);
+}
+
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_mwait(unsigned extensions, unsigned hints)
+{
+  __builtin_ia32_mwait(extensions, hints);  /* both arguments forwarded as-is */
+}
+
+#endif /* __SSE3__ */
+
+#endif /* __PMMINTRIN_H */
diff --git a/lib/Headers/stdarg.h b/lib/Headers/stdarg.h
new file mode 100644
index 0000000..c436ced
--- /dev/null
+++ b/lib/Headers/stdarg.h
@@ -0,0 +1,47 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+typedef __builtin_va_list va_list;  /* everything here maps to builtins */
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy.  va_copy was not part of C90, so it is only
+ * defined in C99 (or later) mode or when __STRICT_ANSI__ (-ansi) is not set.
+ */
+#define __va_copy(d,s)	__builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199900L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#define __GNUC_VA_LIST 1
+typedef __builtin_va_list __gnuc_va_list;  /* same builtin type as va_list */
+
+#endif /* __STDARG_H */
diff --git a/lib/Headers/stdbool.h b/lib/Headers/stdbool.h
new file mode 100644
index 0000000..e44a1f9
--- /dev/null
+++ b/lib/Headers/stdbool.h
@@ -0,0 +1,38 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, where they are keywords. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#endif /* !__cplusplus */
+
+#define __bool_true_false_are_defined 1  /* set for both C and C++ */
+
+#endif /* __STDBOOL_H */
diff --git a/lib/Headers/stddef.h b/lib/Headers/stddef.h
new file mode 100644
index 0000000..2c84b4b
--- /dev/null
+++ b/lib/Headers/stddef.h
@@ -0,0 +1,43 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDDEF_H
+#define __STDDEF_H
+
+typedef __typeof__(((int*)0)-((int*)0)) ptrdiff_t;  /* type of ptr - ptr */
+typedef __typeof__(sizeof(int)) size_t;  /* type of a sizeof expression */
+#ifndef __cplusplus  /* wchar_t is only typedef'd when not compiling C++ */
+typedef __typeof__(*L"") wchar_t;  /* element type of a wide string literal */
+#endif
+
+#ifdef __cplusplus
+#define NULL __null
+#else
+#define NULL ((void*)0)
+#endif
+
+#define offsetof(t, d) __builtin_offsetof(t, d)  /* forwarded to the builtin */
+
+#endif /* __STDDEF_H */
diff --git a/lib/Headers/stdint.h b/lib/Headers/stdint.h
new file mode 100644
index 0000000..a7020d8
--- /dev/null
+++ b/lib/Headers/stdint.h
@@ -0,0 +1,232 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__
+# include_next <stdint.h>
+#else
+
+/* We currently only support targets with power of two, 2s complement integers.
+ */
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ * Since we only support pow-2 targets, these map directly to exact width types.
+ */
+
+#ifndef __int8_t_defined  /* glibc does weird things with sys/types.h */
+#define __int8_t_defined
+typedef signed __INT8_TYPE__ int8_t;
+typedef __INT16_TYPE__ int16_t;
+typedef __INT32_TYPE__ int32_t;
+#ifdef __INT64_TYPE__
+typedef __INT64_TYPE__ int64_t;
+#endif /* __INT64_TYPE__ */
+#endif /* __int8_t_defined */
+
+typedef unsigned __INT8_TYPE__ uint8_t;  /* least/fast alias the exact type */
+typedef int8_t     int_least8_t;
+typedef uint8_t   uint_least8_t;
+typedef int8_t     int_fast8_t;
+typedef uint8_t   uint_fast8_t;
+
+typedef unsigned __INT16_TYPE__ uint16_t;
+typedef int16_t   int_least16_t;
+typedef uint16_t uint_least16_t;
+typedef int16_t   int_fast16_t;
+typedef uint16_t uint_fast16_t;
+
+#ifndef __uint32_t_defined  /* more glibc compatibility */
+#define __uint32_t_defined
+typedef unsigned __INT32_TYPE__ uint32_t;
+#endif /* __uint32_t_defined */
+typedef int32_t   int_least32_t;
+typedef uint32_t uint_least32_t;
+typedef int32_t   int_fast32_t;
+typedef uint32_t uint_fast32_t;
+
+/* Some 16-bit targets do not have a 64-bit datatype.  Only define the 64-bit
+ * typedefs if there is something to typedef them to.
+ */
+#ifdef __INT64_TYPE__
+typedef unsigned __INT64_TYPE__ uint64_t;
+typedef int64_t   int_least64_t;
+typedef uint64_t uint_least64_t;
+typedef int64_t   int_fast64_t;
+typedef uint64_t uint_fast64_t;
+#endif /* __INT64_TYPE__ */
+
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ * intptr_t is skipped when __intptr_t_defined is already set. */
+#ifndef __intptr_t_defined
+typedef __INTPTR_TYPE__          intptr_t;
+#define __intptr_t_defined
+#endif
+typedef unsigned __INTPTR_TYPE__ uintptr_t;
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ * Both come straight from the predefined __INTMAX_TYPE__/__UINTMAX_TYPE__. */
+typedef __INTMAX_TYPE__   intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.2.1 Limits of exact-width integer types. 
+ * Fixed sized values have fixed size max/min.
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * Since we map these directly onto fixed-sized types, these values are the same.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define INT8_MAX    127
+#define INT8_MIN  (-128)
+#define UINT8_MAX   255
+#define INT_LEAST8_MIN   INT8_MIN
+#define INT_LEAST8_MAX   INT8_MAX
+#define UINT_LEAST8_MAX UINT8_MAX
+#define INT_FAST8_MIN    INT8_MIN
+#define INT_FAST8_MAX    INT8_MAX
+#define UINT_FAST8_MAX  UINT8_MAX
+
+#define INT16_MAX    32767
+#define INT16_MIN  (-32768)
+#define UINT16_MAX   65535
+#define INT_LEAST16_MIN   INT16_MIN
+#define INT_LEAST16_MAX   INT16_MAX
+#define UINT_LEAST16_MAX UINT16_MAX
+#define INT_FAST16_MIN    INT16_MIN
+#define INT_FAST16_MAX    INT16_MAX
+#define UINT_FAST16_MAX  UINT16_MAX
+
+#define INT32_MAX         2147483647
+#define INT32_MIN        (-2147483647-1)  /* written as -MAX-1 */
+#define UINT32_MAX        4294967295U
+#define INT_LEAST32_MIN  INT32_MIN
+#define INT_LEAST32_MAX  INT32_MAX
+#define UINT_LEAST32_MAX UINT32_MAX
+#define INT_FAST32_MIN   INT32_MIN
+#define INT_FAST32_MAX   INT32_MAX
+#define UINT_FAST32_MAX  UINT32_MAX
+
+/* If we do not have 64-bit support, don't define the 64-bit size macros. */
+#ifdef __INT64_TYPE__
+#define INT64_MAX      9223372036854775807LL
+#define INT64_MIN    (-9223372036854775807LL-1)  /* written as -MAX-1 */
+#define UINT64_MAX    18446744073709551615ULL
+#define INT_LEAST64_MIN  INT64_MIN
+#define INT_LEAST64_MAX  INT64_MAX
+#define UINT_LEAST64_MAX UINT64_MAX
+#define INT_FAST64_MIN    INT64_MIN
+#define INT_FAST64_MAX    INT64_MAX
+#define UINT_FAST64_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types (PTRDIFF_* and SIZE_MAX here). */
+
+#if __POINTER_WIDTH__ == 64  /* all six limits follow the pointer width */
+
+#define  INTPTR_MIN  INT64_MIN
+#define  INTPTR_MAX  INT64_MAX
+#define UINTPTR_MAX UINT64_MAX
+#define PTRDIFF_MIN  INT64_MIN
+#define PTRDIFF_MAX  INT64_MAX
+#define SIZE_MAX    UINT64_MAX
+
+#elif __POINTER_WIDTH__ == 32
+
+#define  INTPTR_MIN  INT32_MIN
+#define  INTPTR_MAX  INT32_MAX
+#define UINTPTR_MAX UINT32_MAX
+#define PTRDIFF_MIN  INT32_MIN
+#define PTRDIFF_MAX  INT32_MAX
+#define SIZE_MAX    UINT32_MAX
+
+#elif __POINTER_WIDTH__ == 16
+
+#define  INTPTR_MIN  INT16_MIN
+#define  INTPTR_MAX  INT16_MAX
+#define UINTPTR_MAX UINT16_MAX
+#define PTRDIFF_MIN  INT16_MIN
+#define PTRDIFF_MAX  INT16_MAX
+#define SIZE_MAX    UINT16_MAX
+
+#else
+#error "unknown or unset pointer width!"
+#endif /* __POINTER_WIDTH__ */
+
+/* C99 7.18.2.5 Limits of greatest-width integer types (from __INTMAX_MAX__). */
+#define INTMAX_MIN  (-__INTMAX_MAX__-1)
+#define INTMAX_MAX   __INTMAX_MAX__
+#define UINTMAX_MAX (__INTMAX_MAX__*2ULL+1ULL)
+
+/* C99 7.18.3 Limits of other integer types; both fixed at the 32-bit range. */
+#define SIG_ATOMIC_MIN INT32_MIN
+#define SIG_ATOMIC_MAX INT32_MAX
+#define WINT_MIN       INT32_MIN
+#define WINT_MAX       INT32_MAX
+
+/* FIXME: if we ever support a target with unsigned wchar_t, this should be
+ * 0 .. Max.  Existing WCHAR_MIN/WCHAR_MAX definitions are left untouched.
+ */
+#ifndef WCHAR_MAX
+#define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+#define WCHAR_MIN (-__WCHAR_MAX__-1)
+#endif
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ * Signed forms expand to (v); unsigned forms paste a U suffix onto v.
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define INT8_C(v)   (v)
+#define UINT8_C(v)  (v##U)
+#define INT16_C(v)  (v)
+#define UINT16_C(v) (v##U)
+#define INT32_C(v)  (v)
+#define UINT32_C(v) (v##U)
+
+/* Only define the 64-bit size macros if we have 64-bit support. */
+#ifdef __INT64_TYPE__
+#define INT64_C(v)  (v##LL)
+#define UINT64_C(v) (v##ULL)
+#endif
+
+/* C99 7.18.4.2 greatest-width constants; NOTE(review): suffix fixed at LL/ULL. */
+#define INTMAX_C(v)  (v##LL)
+#define UINTMAX_C(v) (v##ULL)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
diff --git a/lib/Headers/tgmath.h b/lib/Headers/tgmath.h
new file mode 100644
index 0000000..e1a0023
--- /dev/null
+++ b/lib/Headers/tgmath.h
@@ -0,0 +1,1358 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion: integer types map to double; float/complex types to themselves.
+// The overloads are declared without bodies; the macros below use __typeof__.
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__)); // variadic catch-all
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+// Each macro expands to a parenthesised type, used as a cast: (T)(expr).
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos: real overloads call acosf/acos/acosl; complex call cacosf/cacos/cacosl
+
+static float _TG_ATTRS
+__tg_acos(float __x)
+{ return acosf(__x); }
+
+static double _TG_ATTRS
+__tg_acos(double __x)
+{ return acos(__x); }
+
+static long double _TG_ATTRS
+__tg_acos(long double __x)
+{ return acosl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_acos(float _Complex __x)
+{ return cacosf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_acos(double _Complex __x)
+{ return cacos(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_acos(long double _Complex __x)
+{ return cacosl(__x); }
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin: real overloads call asinf/asin/asinl; complex call casinf/casin/casinl
+
+static float _TG_ATTRS
+__tg_asin(float __x)
+{ return asinf(__x); }
+
+static double _TG_ATTRS
+__tg_asin(double __x)
+{ return asin(__x); }
+
+static long double _TG_ATTRS
+__tg_asin(long double __x)
+{ return asinl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_asin(float _Complex __x)
+{ return casinf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_asin(double _Complex __x)
+{ return casin(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_asin(long double _Complex __x)
+{ return casinl(__x); }
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan: real overloads call atanf/atan/atanl; complex call catanf/catan/catanl
+
+static float _TG_ATTRS
+__tg_atan(float __x)
+{ return atanf(__x); }
+
+static double _TG_ATTRS
+__tg_atan(double __x)
+{ return atan(__x); }
+
+static long double _TG_ATTRS
+__tg_atan(long double __x)
+{ return atanl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_atan(float _Complex __x)
+{ return catanf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_atan(double _Complex __x)
+{ return catan(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_atan(long double _Complex __x)
+{ return catanl(__x); }
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh: real overloads call acoshf/acosh/acoshl; complex call cacosh{f,,l}
+
+static float _TG_ATTRS
+__tg_acosh(float __x)
+{ return acoshf(__x); }
+
+static double _TG_ATTRS
+__tg_acosh(double __x)
+{ return acosh(__x); }
+
+static long double _TG_ATTRS
+__tg_acosh(long double __x)
+{ return acoshl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_acosh(float _Complex __x)
+{ return cacoshf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_acosh(double _Complex __x)
+{ return cacosh(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_acosh(long double _Complex __x)
+{ return cacoshl(__x); }
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh: real overloads call asinhf/asinh/asinhl; complex call casinh{f,,l}
+
+static float _TG_ATTRS
+__tg_asinh(float __x)
+{ return asinhf(__x); }
+
+static double _TG_ATTRS
+__tg_asinh(double __x)
+{ return asinh(__x); }
+
+static long double _TG_ATTRS
+__tg_asinh(long double __x)
+{ return asinhl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_asinh(float _Complex __x)
+{ return casinhf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_asinh(double _Complex __x)
+{ return casinh(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_asinh(long double _Complex __x)
+{ return casinhl(__x); }
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh: real overloads call atanhf/atanh/atanhl; complex call catanh{f,,l}
+
+static float _TG_ATTRS
+__tg_atanh(float __x)
+{ return atanhf(__x); }
+
+static double _TG_ATTRS
+__tg_atanh(double __x)
+{ return atanh(__x); }
+
+static long double _TG_ATTRS
+__tg_atanh(long double __x)
+{ return atanhl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_atanh(float _Complex __x)
+{ return catanhf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_atanh(double _Complex __x)
+{ return catanh(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_atanh(long double _Complex __x)
+{ return catanhl(__x); }
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos: real overloads call cosf/cos/cosl; complex call ccosf/ccos/ccosl
+
+static float _TG_ATTRS
+__tg_cos(float __x)
+{ return cosf(__x); }
+
+static double _TG_ATTRS
+__tg_cos(double __x)
+{ return cos(__x); }
+
+static long double _TG_ATTRS
+__tg_cos(long double __x)
+{ return cosl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_cos(float _Complex __x)
+{ return ccosf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_cos(double _Complex __x)
+{ return ccos(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_cos(long double _Complex __x)
+{ return ccosl(__x); }
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin: real overloads call sinf/sin/sinl; complex call csinf/csin/csinl
+
+static float _TG_ATTRS
+__tg_sin(float __x)
+{ return sinf(__x); }
+
+static double _TG_ATTRS
+__tg_sin(double __x)
+{ return sin(__x); }
+
+static long double _TG_ATTRS
+__tg_sin(long double __x)
+{ return sinl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_sin(float _Complex __x)
+{ return csinf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_sin(double _Complex __x)
+{ return csin(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_sin(long double _Complex __x)
+{ return csinl(__x); }
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan: real overloads call tanf/tan/tanl; complex call ctanf/ctan/ctanl
+
+static float _TG_ATTRS
+__tg_tan(float __x)
+{ return tanf(__x); }
+
+static double _TG_ATTRS
+__tg_tan(double __x)
+{ return tan(__x); }
+
+static long double _TG_ATTRS
+__tg_tan(long double __x)
+{ return tanl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_tan(float _Complex __x)
+{ return ctanf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_tan(double _Complex __x)
+{ return ctan(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_tan(long double _Complex __x)
+{ return ctanl(__x); }
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh: real overloads call coshf/cosh/coshl; complex call ccoshf/ccosh/ccoshl
+
+static float _TG_ATTRS
+__tg_cosh(float __x)
+{ return coshf(__x); }
+
+static double _TG_ATTRS
+__tg_cosh(double __x)
+{ return cosh(__x); }
+
+static long double _TG_ATTRS
+__tg_cosh(long double __x)
+{ return coshl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_cosh(float _Complex __x)
+{ return ccoshf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_cosh(double _Complex __x)
+{ return ccosh(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_cosh(long double _Complex __x)
+{ return ccoshl(__x); }
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh: real overloads call sinhf/sinh/sinhl; complex call csinhf/csinh/csinhl
+
+static float _TG_ATTRS
+__tg_sinh(float __x)
+{ return sinhf(__x); }
+
+static double _TG_ATTRS
+__tg_sinh(double __x)
+{ return sinh(__x); }
+
+static long double _TG_ATTRS
+__tg_sinh(long double __x)
+{ return sinhl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_sinh(float _Complex __x)
+{ return csinhf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_sinh(double _Complex __x)
+{ return csinh(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_sinh(long double _Complex __x)
+{ return csinhl(__x); }
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh: real overloads call tanhf/tanh/tanhl; complex call ctanhf/ctanh/ctanhl
+
+static float _TG_ATTRS
+__tg_tanh(float __x)
+{ return tanhf(__x); }
+
+static double _TG_ATTRS
+__tg_tanh(double __x)
+{ return tanh(__x); }
+
+static long double _TG_ATTRS
+__tg_tanh(long double __x)
+{ return tanhl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_tanh(float _Complex __x)
+{ return ctanhf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_tanh(double _Complex __x)
+{ return ctanh(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_tanh(long double _Complex __x)
+{ return ctanhl(__x); }
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp: real overloads call expf/exp/expl; complex call cexpf/cexp/cexpl
+
+static float _TG_ATTRS
+__tg_exp(float __x)
+{ return expf(__x); }
+
+static double _TG_ATTRS
+__tg_exp(double __x)
+{ return exp(__x); }
+
+static long double _TG_ATTRS
+__tg_exp(long double __x)
+{ return expl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_exp(float _Complex __x)
+{ return cexpf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_exp(double _Complex __x)
+{ return cexp(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_exp(long double _Complex __x)
+{ return cexpl(__x); }
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log: real overloads call logf/log/logl; complex call clogf/clog/clogl
+
+static float _TG_ATTRS
+__tg_log(float __x)
+{ return logf(__x); }
+
+static double _TG_ATTRS
+__tg_log(double __x)
+{ return log(__x); }
+
+static long double _TG_ATTRS
+__tg_log(long double __x)
+{ return logl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_log(float _Complex __x)
+{ return clogf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_log(double _Complex __x)
+{ return clog(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_log(long double _Complex __x)
+{ return clogl(__x); }
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow: real overloads call powf/pow/powl; complex call cpowf/cpow/cpowl
+
+static float _TG_ATTRS
+__tg_pow(float __x, float __y)
+{ return powf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_pow(double __x, double __y)
+{ return pow(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_pow(long double __x, long double __y)
+{ return powl(__x, __y); }
+
+static float _Complex _TG_ATTRS
+__tg_pow(float _Complex __x, float _Complex __y)
+{ return cpowf(__x, __y); }
+
+static double _Complex _TG_ATTRS
+__tg_pow(double _Complex __x, double _Complex __y)
+{ return cpow(__x, __y); }
+
+static long double _Complex _TG_ATTRS
+__tg_pow(long double _Complex __x,
+         long double _Complex __y)
+{ return cpowl(__x, __y); }
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt: real overloads call sqrtf/sqrt/sqrtl; complex call csqrtf/csqrt/csqrtl
+
+static float _TG_ATTRS
+__tg_sqrt(float __x)
+{ return sqrtf(__x); }
+
+static double _TG_ATTRS
+__tg_sqrt(double __x)
+{ return sqrt(__x); }
+
+static long double _TG_ATTRS
+__tg_sqrt(long double __x)
+{ return sqrtl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_sqrt(float _Complex __x)
+{ return csqrtf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_sqrt(double _Complex __x)
+{ return csqrt(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_sqrt(long double _Complex __x)
+{ return csqrtl(__x); }
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs: complex arguments go to cabsf/cabs/cabsl, so those overloads return real types
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2: real-only; both arguments are forwarded in order to atan2f/atan2/atan2l
+
+static float _TG_ATTRS
+__tg_atan2(float __x, float __y)
+{ return atan2f(__x, __y); }
+
+static double _TG_ATTRS
+__tg_atan2(double __x, double __y)
+{ return atan2(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_atan2(long double __x, long double __y)
+{ return atan2l(__x, __y); }
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt: real-only; overloads call cbrtf/cbrt/cbrtl
+
+static float _TG_ATTRS
+__tg_cbrt(float __x)
+{ return cbrtf(__x); }
+
+static double _TG_ATTRS
+__tg_cbrt(double __x)
+{ return cbrt(__x); }
+
+static long double _TG_ATTRS
+__tg_cbrt(long double __x)
+{ return cbrtl(__x); }
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil: float, double and long double overloads call ceilf, ceil, ceill.
+// The macro passes its argument through __tg_promote1 into __tg_ceil.
+static float _TG_ATTRS
+__tg_ceil(float __x)
+{ return ceilf(__x); }
+
+static double _TG_ATTRS
+__tg_ceil(double __x)
+{ return ceil(__x); }
+
+static long double _TG_ATTRS
+__tg_ceil(long double __x)
+{ return ceill(__x); }
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign: overloads call copysignf, copysign, copysignl by operand type.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_copysign(float __x, float __y)
+{ return copysignf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_copysign(double __x, double __y)
+{ return copysign(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_copysign(long double __x, long double __y)
+{ return copysignl(__x, __y); }
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf: float, double and long double overloads call erff, erf, erfl.
+// The macro passes its argument through __tg_promote1 into __tg_erf.
+static float _TG_ATTRS
+__tg_erf(float __x)
+{ return erff(__x); }
+
+static double _TG_ATTRS
+__tg_erf(double __x)
+{ return erf(__x); }
+
+static long double _TG_ATTRS
+__tg_erf(long double __x)
+{ return erfl(__x); }
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc: float, double and long double overloads call erfcf, erfc, erfcl.
+// The macro passes its argument through __tg_promote1 into __tg_erfc.
+static float _TG_ATTRS
+__tg_erfc(float __x)
+{ return erfcf(__x); }
+
+static double _TG_ATTRS
+__tg_erfc(double __x)
+{ return erfc(__x); }
+
+static long double _TG_ATTRS
+__tg_erfc(long double __x)
+{ return erfcl(__x); }
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2: float, double and long double overloads call exp2f, exp2, exp2l.
+// The macro passes its argument through __tg_promote1 into __tg_exp2.
+static float _TG_ATTRS
+__tg_exp2(float __x)
+{ return exp2f(__x); }
+
+static double _TG_ATTRS
+__tg_exp2(double __x)
+{ return exp2(__x); }
+
+static long double _TG_ATTRS
+__tg_exp2(long double __x)
+{ return exp2l(__x); }
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1: float, double and long double overloads call expm1f, expm1, expm1l.
+// The macro passes its argument through __tg_promote1 into __tg_expm1.
+static float _TG_ATTRS
+__tg_expm1(float __x)
+{ return expm1f(__x); }
+
+static double _TG_ATTRS
+__tg_expm1(double __x)
+{ return expm1(__x); }
+
+static long double _TG_ATTRS
+__tg_expm1(long double __x)
+{ return expm1l(__x); }
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim: float, double and long double overloads call fdimf, fdim, fdiml.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_fdim(float __x, float __y)
+{ return fdimf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_fdim(double __x, double __y)
+{ return fdim(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_fdim(long double __x, long double __y)
+{ return fdiml(__x, __y); }
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor: float, double and long double overloads call floorf, floor, floorl.
+// The macro passes its argument through __tg_promote1 into __tg_floor.
+static float _TG_ATTRS
+__tg_floor(float __x)
+{ return floorf(__x); }
+
+static double _TG_ATTRS
+__tg_floor(double __x)
+{ return floor(__x); }
+
+static long double _TG_ATTRS
+__tg_floor(long double __x)
+{ return floorl(__x); }
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma: three-operand overloads calling fmaf, fma, fmal. The macro passes
+// each argument through __tg_promote3((__x), (__y), (__z)).
+// float operands
+static float _TG_ATTRS
+__tg_fma(float __x, float __y, float __z)
+{ return fmaf(__x, __y, __z); }
+
+// double operands
+static double _TG_ATTRS
+__tg_fma(double __x, double __y, double __z)
+{ return fma(__x, __y, __z); }
+
+// long double operands
+static long double _TG_ATTRS
+__tg_fma(long double __x, long double __y, long double __z)
+{ return fmal(__x, __y, __z); }
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax: float, double and long double overloads call fmaxf, fmax, fmaxl.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_fmax(float __x, float __y)
+{ return fmaxf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_fmax(double __x, double __y)
+{ return fmax(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_fmax(long double __x, long double __y)
+{ return fmaxl(__x, __y); }
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin: float, double and long double overloads call fminf, fmin, fminl.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_fmin(float __x, float __y)
+{ return fminf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_fmin(double __x, double __y)
+{ return fmin(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_fmin(long double __x, long double __y)
+{ return fminl(__x, __y); }
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod: float, double and long double overloads call fmodf, fmod, fmodl.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_fmod(float __x, float __y)
+{ return fmodf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_fmod(double __x, double __y)
+{ return fmod(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_fmod(long double __x, long double __y)
+{ return fmodl(__x, __y); }
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp: overloads on __x call frexpf, frexp, frexpl; __y is always int*.
+// The macro passes only __x through __tg_promote1; __y is forwarded as is.
+static float _TG_ATTRS
+__tg_frexp(float __x, int* __y)
+{ return frexpf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_frexp(double __x, int* __y)
+{ return frexp(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_frexp(long double __x, int* __y)
+{ return frexpl(__x, __y); }
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), (__y))
+
+// hypot: float, double and long double overloads call hypotf, hypot, hypotl.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_hypot(float __x, float __y)
+{ return hypotf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_hypot(double __x, double __y)
+{ return hypot(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_hypot(long double __x, long double __y)
+{ return hypotl(__x, __y); }
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb: overloads call ilogbf, ilogb, ilogbl; every overload returns int.
+// The macro passes its argument through __tg_promote1 into __tg_ilogb.
+static int _TG_ATTRS
+__tg_ilogb(float __x)
+{ return ilogbf(__x); }
+
+static int _TG_ATTRS
+__tg_ilogb(double __x)
+{ return ilogb(__x); }
+
+static int _TG_ATTRS
+__tg_ilogb(long double __x)
+{ return ilogbl(__x); }
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp: overloads on __x call ldexpf, ldexp, ldexpl; __y is always int.
+// The macro passes only __x through __tg_promote1; __y is forwarded as is.
+static float _TG_ATTRS
+__tg_ldexp(float __x, int __y)
+{ return ldexpf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_ldexp(double __x, int __y)
+{ return ldexp(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_ldexp(long double __x, int __y)
+{ return ldexpl(__x, __y); }
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), (__y))
+
+// lgamma: overloads call lgammaf, lgamma, lgammal by argument type.
+// The macro passes its argument through __tg_promote1 into __tg_lgamma.
+static float _TG_ATTRS
+__tg_lgamma(float __x)
+{ return lgammaf(__x); }
+
+static double _TG_ATTRS
+__tg_lgamma(double __x)
+{ return lgamma(__x); }
+
+static long double _TG_ATTRS
+__tg_lgamma(long double __x)
+{ return lgammal(__x); }
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint: overloads call llrintf, llrint, llrintl; all return long long.
+// The macro passes its argument through __tg_promote1 into __tg_llrint.
+static long long _TG_ATTRS
+__tg_llrint(float __x)
+{ return llrintf(__x); }
+
+static long long _TG_ATTRS
+__tg_llrint(double __x)
+{ return llrint(__x); }
+
+static long long _TG_ATTRS
+__tg_llrint(long double __x)
+{ return llrintl(__x); }
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround: overloads call llroundf, llround, llroundl; all return long long.
+// The macro passes its argument through __tg_promote1 into __tg_llround.
+static long long _TG_ATTRS
+__tg_llround(float __x)
+{ return llroundf(__x); }
+
+static long long _TG_ATTRS
+__tg_llround(double __x)
+{ return llround(__x); }
+
+static long long _TG_ATTRS
+__tg_llround(long double __x)
+{ return llroundl(__x); }
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10: float, double and long double overloads call log10f, log10, log10l.
+// The macro passes its argument through __tg_promote1 into __tg_log10.
+static float _TG_ATTRS
+__tg_log10(float __x)
+{ return log10f(__x); }
+
+static double _TG_ATTRS
+__tg_log10(double __x)
+{ return log10(__x); }
+
+static long double _TG_ATTRS
+__tg_log10(long double __x)
+{ return log10l(__x); }
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p: float, double and long double overloads call log1pf, log1p, log1pl.
+// The macro passes its argument through __tg_promote1 into __tg_log1p.
+static float _TG_ATTRS
+__tg_log1p(float __x)
+{ return log1pf(__x); }
+
+static double _TG_ATTRS
+__tg_log1p(double __x)
+{ return log1p(__x); }
+
+static long double _TG_ATTRS
+__tg_log1p(long double __x)
+{ return log1pl(__x); }
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2: float, double and long double overloads call log2f, log2, log2l.
+// The macro passes its argument through __tg_promote1 into __tg_log2.
+static float _TG_ATTRS
+__tg_log2(float __x)
+{ return log2f(__x); }
+
+static double _TG_ATTRS
+__tg_log2(double __x)
+{ return log2(__x); }
+
+static long double _TG_ATTRS
+__tg_log2(long double __x)
+{ return log2l(__x); }
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// lrint: overloads call lrintf, lrint, lrintl; every overload returns long.
+// The macro passes its argument through __tg_promote1 into __tg_lrint.
+static long _TG_ATTRS
+__tg_lrint(float __x)
+{ return lrintf(__x); }
+
+static long _TG_ATTRS
+__tg_lrint(double __x)
+{ return lrint(__x); }
+
+static long _TG_ATTRS
+__tg_lrint(long double __x)
+{ return lrintl(__x); }
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround: overloads call lroundf, lround, lroundl; all return long.
+// The macro passes its argument through __tg_promote1 into __tg_lround.
+static long _TG_ATTRS
+__tg_lround(float __x)
+{ return lroundf(__x); }
+
+static long _TG_ATTRS
+__tg_lround(double __x)
+{ return lround(__x); }
+
+static long _TG_ATTRS
+__tg_lround(long double __x)
+{ return lroundl(__x); }
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint: overloads call nearbyintf, nearbyint, nearbyintl by type.
+// The macro passes its argument through __tg_promote1 into __tg_nearbyint.
+static float _TG_ATTRS
+__tg_nearbyint(float __x)
+{ return nearbyintf(__x); }
+
+static double _TG_ATTRS
+__tg_nearbyint(double __x)
+{ return nearbyint(__x); }
+
+static long double _TG_ATTRS
+__tg_nearbyint(long double __x)
+{ return nearbyintl(__x); }
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter: overloads call nextafterf, nextafter, nextafterl by type.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_nextafter(float __x, float __y)
+{ return nextafterf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_nextafter(double __x, double __y)
+{ return nextafter(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_nextafter(long double __x, long double __y)
+{ return nextafterl(__x, __y); }
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward: only __x is type-generic; __y is long double in every overload,
+// matching nexttowardf / nexttoward / nexttowardl, so it is never narrowed.
+static float _TG_ATTRS
+__tg_nexttoward(float __x, long double __y)
+{ return nexttowardf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_nexttoward(double __x, long double __y)
+{ return nexttoward(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_nexttoward(long double __x, long double __y)
+{ return nexttowardl(__x, __y); }
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), \
+                                             (__y))
+
+// remainder: overloads call remainderf, remainder, remainderl by type.
+// The macro passes each argument through __tg_promote2((__x), (__y)).
+static float _TG_ATTRS
+__tg_remainder(float __x, float __y)
+{ return remainderf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_remainder(double __x, double __y)
+{ return remainder(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_remainder(long double __x, long double __y)
+{ return remainderl(__x, __y); }
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo: overloads call remquof, remquo, remquol; __z is always int*.
+// The macro applies __tg_promote2 to __x and __y and forwards (__z) as is.
+// float operands
+static float _TG_ATTRS
+__tg_remquo(float __x, float __y, int* __z)
+{ return remquof(__x, __y, __z); }
+
+// double operands
+static double _TG_ATTRS
+__tg_remquo(double __x, double __y, int* __z)
+{ return remquo(__x, __y, __z); }
+
+// long double operands
+static long double _TG_ATTRS
+__tg_remquo(long double __x, long double __y, int* __z)
+{ return remquol(__x, __y, __z); }
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint: float, double and long double overloads call rintf, rint, rintl.
+// The macro passes its argument through __tg_promote1 into __tg_rint.
+static float _TG_ATTRS
+__tg_rint(float __x)
+{ return rintf(__x); }
+
+static double _TG_ATTRS
+__tg_rint(double __x)
+{ return rint(__x); }
+
+static long double _TG_ATTRS
+__tg_rint(long double __x)
+{ return rintl(__x); }
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round: float, double and long double overloads call roundf, round, roundl.
+// The macro passes its argument through __tg_promote1 into __tg_round.
+static float _TG_ATTRS
+__tg_round(float __x)
+{ return roundf(__x); }
+
+static double _TG_ATTRS
+__tg_round(double __x)
+{ return round(__x); }
+
+static long double _TG_ATTRS
+__tg_round(long double __x)
+{ return roundl(__x); }
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn: overloads on __x call scalbnf, scalbn, scalbnl; __y is always int.
+// The macro passes only __x through __tg_promote1; __y is forwarded as is.
+static float _TG_ATTRS
+__tg_scalbn(float __x, int __y)
+{ return scalbnf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_scalbn(double __x, int __y)
+{ return scalbn(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_scalbn(long double __x, int __y)
+{ return scalbnl(__x, __y); }
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), (__y))
+
+// scalbln: overloads on __x call scalblnf, scalbln, scalblnl; __y is long.
+// The macro passes only __x through __tg_promote1; __y is forwarded as is.
+static float _TG_ATTRS
+__tg_scalbln(float __x, long __y)
+{ return scalblnf(__x, __y); }
+
+static double _TG_ATTRS
+__tg_scalbln(double __x, long __y)
+{ return scalbln(__x, __y); }
+
+static long double _TG_ATTRS
+__tg_scalbln(long double __x, long __y)
+{ return scalblnl(__x, __y); }
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), (__y))
+
+// tgamma: overloads call tgammaf, tgamma, tgammal by argument type.
+// The macro passes its argument through __tg_promote1 into __tg_tgamma.
+static float _TG_ATTRS
+__tg_tgamma(float __x)
+{ return tgammaf(__x); }
+
+static double _TG_ATTRS
+__tg_tgamma(double __x)
+{ return tgamma(__x); }
+
+static long double _TG_ATTRS
+__tg_tgamma(long double __x)
+{ return tgammal(__x); }
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc: float, double and long double overloads call truncf, trunc, truncl.
+// The macro passes its argument through __tg_promote1 into __tg_trunc.
+static float _TG_ATTRS
+__tg_trunc(float __x)
+{ return truncf(__x); }
+
+static double _TG_ATTRS
+__tg_trunc(double __x)
+{ return trunc(__x); }
+
+static long double _TG_ATTRS
+__tg_trunc(long double __x)
+{ return truncl(__x); }
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg: real overloads return atan2f / atan2 / atan2l of (zero, __x); the
+// _Complex overloads call cargf / carg / cargl. All return a real type.
+static float _TG_ATTRS
+__tg_carg(float __x)
+{ return atan2f(0.F, __x); }
+
+static double _TG_ATTRS
+__tg_carg(double __x)
+{ return atan2(0., __x); }
+
+static long double _TG_ATTRS
+__tg_carg(long double __x)
+{ return atan2l(0.L, __x); }
+
+static float _TG_ATTRS
+__tg_carg(float _Complex __x)
+{ return cargf(__x); }
+
+static double _TG_ATTRS
+__tg_carg(double _Complex __x)
+{ return carg(__x); }
+
+static long double _TG_ATTRS
+__tg_carg(long double _Complex __x)
+{ return cargl(__x); }
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag: real overloads ignore __x and return 0; the _Complex overloads
+// call cimagf / cimag / cimagl. Every overload returns a real type.
+static float _TG_ATTRS
+__tg_cimag(float __x)
+{ return 0; }
+
+static double _TG_ATTRS
+__tg_cimag(double __x)
+{ return 0; }
+
+static long double _TG_ATTRS
+__tg_cimag(long double __x)
+{ return 0; }
+
+static float _TG_ATTRS
+__tg_cimag(float _Complex __x)
+{ return cimagf(__x); }
+
+static double _TG_ATTRS
+__tg_cimag(double _Complex __x)
+{ return cimag(__x); }
+
+static long double _TG_ATTRS
+__tg_cimag(long double _Complex __x)
+{ return cimagl(__x); }
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj: real overloads return __x converted to the _Complex return type;
+// the _Complex overloads call conjf / conj / conjl.
+static float _Complex _TG_ATTRS
+__tg_conj(float __x)
+{ return __x; }
+
+static double _Complex _TG_ATTRS
+__tg_conj(double __x)
+{ return __x; }
+
+static long double _Complex _TG_ATTRS
+__tg_conj(long double __x)
+{ return __x; }
+
+static float _Complex _TG_ATTRS
+__tg_conj(float _Complex __x)
+{ return conjf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_conj(double _Complex __x)
+{ return conj(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_conj(long double _Complex __x)
+{ return conjl(__x); }
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj: every overload, real or _Complex, calls cprojf / cproj / cprojl
+// for its precision and returns the _Complex result.
+static float _Complex _TG_ATTRS
+__tg_cproj(float __x)
+{ return cprojf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_cproj(double __x)
+{ return cproj(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_cproj(long double __x)
+{ return cprojl(__x); }
+
+static float _Complex _TG_ATTRS
+__tg_cproj(float _Complex __x)
+{ return cprojf(__x); }
+
+static double _Complex _TG_ATTRS
+__tg_cproj(double _Complex __x)
+{ return cproj(__x); }
+
+static long double _Complex _TG_ATTRS
+__tg_cproj(long double _Complex __x)
+{ return cprojl(__x); }
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal: real overloads return __x itself; the _Complex overloads call
+// crealf / creal / creall. Every overload returns a real (non-_Complex) type.
+static float _TG_ATTRS
+__tg_creal(float __x)
+{ return __x; }
+
+static double _TG_ATTRS
+__tg_creal(double __x)
+{ return __x; }
+
+static long double _TG_ATTRS
+__tg_creal(long double __x)
+{ return __x; }
+
+static float _TG_ATTRS
+__tg_creal(float _Complex __x)
+{ return crealf(__x); }
+
+static double _TG_ATTRS
+__tg_creal(double _Complex __x)
+{ return creal(__x); }
+
+static long double _TG_ATTRS
+__tg_creal(long double _Complex __x)
+{ return creall(__x); }
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
diff --git a/lib/Headers/tmmintrin.h b/lib/Headers/tmmintrin.h
new file mode 100644
index 0000000..e9715f1
--- /dev/null
+++ b/lib/Headers/tmmintrin.h
@@ -0,0 +1,218 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#ifndef __SSSE3__
+#error "SSSE3 instruction set not enabled"
+#else
+
+#include <pmmintrin.h>
+
+/* _mm_abs_pi8: passes a, cast to __v8qi, to __builtin_ia32_pabsb. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi8(__m64 a) {
+    return (__m64)__builtin_ia32_pabsb((__v8qi)a);
+}
+
+/* _mm_abs_epi8: passes a, cast to __v16qi, to __builtin_ia32_pabsb128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi8(__m128i a) {
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)a);
+}
+
+/* _mm_abs_pi16: passes a, cast to __v4hi, to __builtin_ia32_pabsw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi16(__m64 a) {
+    return (__m64)__builtin_ia32_pabsw((__v4hi)a);
+}
+
+/* _mm_abs_epi16: passes a, cast to __v8hi, to __builtin_ia32_pabsw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi16(__m128i a) {
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)a);
+}
+
+/* _mm_abs_pi32: passes a, cast to __v2si, to __builtin_ia32_pabsd. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_abs_pi32(__m64 a) {
+    return (__m64)__builtin_ia32_pabsd((__v2si)a);
+}
+
+/* _mm_abs_epi32: passes a, cast to __v4si, to __builtin_ia32_pabsd128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_abs_epi32(__m128i a) {
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)a);
+}
+/* Both macros forward (a), (b), (n) unchanged to the palignr builtins. */
+#define _mm_alignr_epi8(a, b, n) (__builtin_ia32_palignr128((a), (b), (n)))
+#define _mm_alignr_pi8(a, b, n) (__builtin_ia32_palignr((a), (b), (n)))
+
+/* _mm_hadd_epi16: a and b, cast to __v8hi, go to __builtin_ia32_phaddw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)a, (__v8hi)b);
+}
+
+/* _mm_hadd_epi32: a and b, cast to __v4si, go to __builtin_ia32_phaddd128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_epi32(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)a, (__v4si)b);
+}
+
+/* _mm_hadd_pi16: a and b, cast to __v4hi, go to __builtin_ia32_phaddw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_phaddw((__v4hi)a, (__v4hi)b);
+}
+
+/* _mm_hadd_pi32: a and b, cast to __v2si, go to __builtin_ia32_phaddd. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadd_pi32(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_phaddd((__v2si)a, (__v2si)b);
+}
+
+/* _mm_hadds_epi16: a, b cast to __v8hi go to __builtin_ia32_phaddsw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)a, (__v8hi)b);
+}
+
+/* _mm_hadds_pi16: a, b cast to __v4hi go to __builtin_ia32_phaddsw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hadds_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)a, (__v4hi)b);
+}
+
+/* _mm_hsub_epi16: a, b cast to __v8hi go to __builtin_ia32_phsubw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)a, (__v8hi)b);
+}
+
+/* Horizontal subtract: phsubd128. psubd128 is the element-wise subtract. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_epi32(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)a, (__v4si)b);
+}
+
+/* Horizontal subtract: phsubw. psubw is the element-wise subtract. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_phsubw((__v4hi)a, (__v4hi)b);
+}
+
+/* Horizontal subtract: phsubd. psubd is the element-wise subtract. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsub_pi32(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_phsubd((__v2si)a, (__v2si)b);
+}
+
+/* _mm_hsubs_epi16: a, b cast to __v8hi go to __builtin_ia32_phsubsw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)a, (__v8hi)b);
+}
+
+/* _mm_hsubs_pi16: a, b cast to __v4hi go to __builtin_ia32_phsubsw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_hsubs_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)a, (__v4hi)b);
+}
+
+/* _mm_maddubs_epi16: a, b cast to __v16qi go to __builtin_ia32_pmaddubsw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)a, (__v16qi)b);
+}
+
+/* _mm_maddubs_pi16: a, b cast to __v8qi go to __builtin_ia32_pmaddubsw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_maddubs_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)a, (__v8qi)b);
+}
+
+/* _mm_mulhrs_epi16: a, b cast to __v8hi go to __builtin_ia32_pmulhrsw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)a, (__v8hi)b);
+}
+
+/* _mm_mulhrs_pi16: a, b cast to __v4hi go to __builtin_ia32_pmulhrsw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhrs_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)a, (__v4hi)b);
+}
+
+/* _mm_shuffle_epi8: a, b cast to __v16qi go to __builtin_ia32_pshufb128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_epi8(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)a, (__v16qi)b);
+}
+
+/* _mm_shuffle_pi8: a, b cast to __v8qi go to __builtin_ia32_pshufb. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_shuffle_pi8(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_pshufb((__v8qi)a, (__v8qi)b);
+}
+
+/* _mm_sign_epi8: a, b cast to __v16qi go to __builtin_ia32_psignb128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi8(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)a, (__v16qi)b);
+}
+
+/* _mm_sign_epi16: a, b cast to __v8hi go to __builtin_ia32_psignw128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi16(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)a, (__v8hi)b);
+}
+
+/* _mm_sign_epi32: a, b cast to __v4si go to __builtin_ia32_psignd128. */
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
+_mm_sign_epi32(__m128i a, __m128i b) {
+    return (__m128i)__builtin_ia32_psignd128((__v4si)a, (__v4si)b);
+}
+
+/* _mm_sign_pi8: a, b cast to __v8qi go to __builtin_ia32_psignb. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi8(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_psignb((__v8qi)a, (__v8qi)b);
+}
+
+/* _mm_sign_pi16: a, b cast to __v4hi go to __builtin_ia32_psignw. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi16(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_psignw((__v4hi)a, (__v4hi)b);
+}
+
+/* _mm_sign_pi32: a, b cast to __v2si go to __builtin_ia32_psignd. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sign_pi32(__m64 a, __m64 b) {
+    return (__m64)__builtin_ia32_psignd((__v2si)a, (__v2si)b);
+}
+
+#endif /* __SSSE3__ */
+
+#endif /* __TMMINTRIN_H */
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
new file mode 100644
index 0000000..c104f63
--- /dev/null
+++ b/lib/Headers/xmmintrin.h
@@ -0,0 +1,888 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+ 
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+ 
+#ifndef __SSE__
+#error "SSE instruction set not enabled"
+#else
+
+#include <mmintrin.h>
+/* Two names for the same shape: a 16-byte vector whose elements are float. */
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+#include <mm_malloc.h>
+
+/* _mm_add_ss: returns __builtin_ia32_addss(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ss(__m128 a, __m128 b) {
+  return __builtin_ia32_addss(a, b);
+}
+
+/* _mm_add_ps: element-wise a + b through the vector + operator. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_add_ps(__m128 a, __m128 b) {
+  return a + b;
+}
+
+/* _mm_sub_ss: returns __builtin_ia32_subss(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ss(__m128 a, __m128 b) {
+  return __builtin_ia32_subss(a, b);
+}
+
+/* _mm_sub_ps: element-wise a - b through the vector - operator. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sub_ps(__m128 a, __m128 b) {
+  return a - b;
+}
+
+/* _mm_mul_ss: returns __builtin_ia32_mulss(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ss(__m128 a, __m128 b) {
+  return __builtin_ia32_mulss(a, b);
+}
+
+/* _mm_mul_ps: element-wise a * b through the vector * operator. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_mul_ps(__m128 a, __m128 b) {
+  return a * b;
+}
+
+/* _mm_div_ss: returns __builtin_ia32_divss(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ss(__m128 a, __m128 b) {
+  return __builtin_ia32_divss(a, b);
+}
+
+/* _mm_div_ps: element-wise a / b through the vector / operator. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_div_ps(__m128 a, __m128 b) {
+  return a / b;
+}
+
+/* _mm_sqrt_ss: returns __builtin_ia32_sqrtss(a). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ss(__m128 a) {
+  return __builtin_ia32_sqrtss(a);
+}
+
+/* _mm_sqrt_ps: returns __builtin_ia32_sqrtps(a). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_sqrt_ps(__m128 a) {
+  return __builtin_ia32_sqrtps(a);
+}
+
+/* _mm_rcp_ss: returns __builtin_ia32_rcpss(a). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ss(__m128 a) {
+  return __builtin_ia32_rcpss(a);
+}
+
+/* _mm_rcp_ps: returns __builtin_ia32_rcpps(a). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rcp_ps(__m128 a) {
+  return __builtin_ia32_rcpps(a);
+}
+
+/* _mm_rsqrt_ss: returns __builtin_ia32_rsqrtss(a). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ss(__m128 a) {
+  return __builtin_ia32_rsqrtss(a);
+}
+
+/* _mm_rsqrt_ps: returns __builtin_ia32_rsqrtps(a). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_rsqrt_ps(__m128 a) {
+  return __builtin_ia32_rsqrtps(a);
+}
+
+/* _mm_min_ss: returns __builtin_ia32_minss(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ss(__m128 a, __m128 b) {
+  return __builtin_ia32_minss(a, b);
+}
+
+/* _mm_min_ps: returns __builtin_ia32_minps(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_min_ps(__m128 a, __m128 b) {
+  return __builtin_ia32_minps(a, b);
+}
+
+/* _mm_max_ss: returns __builtin_ia32_maxss(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ss(__m128 a, __m128 b) {
+  return __builtin_ia32_maxss(a, b);
+}
+
+/* _mm_max_ps: returns __builtin_ia32_maxps(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_max_ps(__m128 a, __m128 b) {
+  return __builtin_ia32_maxps(a, b);
+}
+
+/* _mm_and_ps: returns __builtin_ia32_andps(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_and_ps(__m128 a, __m128 b) {
+  return __builtin_ia32_andps(a, b);
+}
+
+/* _mm_andnot_ps: returns __builtin_ia32_andnps(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_andnot_ps(__m128 a, __m128 b) {
+  return __builtin_ia32_andnps(a, b);
+}
+
+/* _mm_or_ps: returns __builtin_ia32_orps(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_or_ps(__m128 a, __m128 b) {
+  return __builtin_ia32_orps(a, b);
+}
+
+/* _mm_xor_ps: returns __builtin_ia32_xorps(a, b). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_xor_ps(__m128 a, __m128 b) {
+  return __builtin_ia32_xorps(a, b);
+}
+
+/* _mm_cmpeq_ss: cmpss builtin on (a, b) with predicate immediate 0. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ss(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpss(a, b, 0);
+}
+
+/* _mm_cmpeq_ps: cmpps builtin on (a, b) with predicate immediate 0. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpeq_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(a, b, 0);
+}
+
+/* _mm_cmplt_ss: cmpss builtin on (a, b) with predicate immediate 1. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ss(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpss(a, b, 1);
+}
+
+/* _mm_cmplt_ps: cmpps builtin on (a, b) with predicate immediate 1. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmplt_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(a, b, 1);
+}
+
+/* _mm_cmple_ss: cmpss builtin on (a, b) with predicate immediate 2. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ss(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpss(a, b, 2);
+}
+
+/* _mm_cmple_ps: cmpps builtin on (a, b) with predicate immediate 2. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmple_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(a, b, 2);
+}
+
+/* Lane 0: a > b, computed as b < a. Lanes 1-3 are re-taken from a, not b. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ss(__m128 a, __m128 b) {
+  return __builtin_shufflevector(a, (__m128)__builtin_ia32_cmpss(b, a, 1), 4, 1, 2, 3);
+}
+
+/* _mm_cmpgt_ps: a > b in every lane, as cmpps(b, a) with predicate 1. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpgt_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(b, a, 1);
+}
+
+/* Lane 0: a >= b, computed as b <= a. Lanes 1-3 are re-taken from a, not b. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ss(__m128 a, __m128 b) {
+  return __builtin_shufflevector(a, (__m128)__builtin_ia32_cmpss(b, a, 2), 4, 1, 2, 3);
+}
+
+/* _mm_cmpge_ps: a >= b in every lane, as cmpps(b, a) with predicate 2. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpge_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(b, a, 2);
+}
+
+/* _mm_cmpneq_ss: cmpss builtin on (a, b) with predicate immediate 4. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ss(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpss(a, b, 4);
+}
+
+/* _mm_cmpneq_ps: cmpps builtin on (a, b) with predicate immediate 4. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpneq_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(a, b, 4);
+}
+
+/* _mm_cmpnlt_ss: cmpss builtin on (a, b) with predicate immediate 5. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ss(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpss(a, b, 5);
+}
+
+/* _mm_cmpnlt_ps: cmpps builtin on (a, b) with predicate immediate 5. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnlt_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(a, b, 5);
+}
+
+/* _mm_cmpnle_ss: cmpss builtin on (a, b) with predicate immediate 6. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ss(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpss(a, b, 6);
+}
+
+/* _mm_cmpnle_ps: cmpps builtin on (a, b) with predicate immediate 6. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnle_ps(__m128 a, __m128 b) {
+  return (__m128)__builtin_ia32_cmpps(a, b, 6);
+}
+
+/* Scalar "not (a > b)" compare on element 0.
+   Evaluated as "not (b < a)" (operands swapped, predicate 5).  CMPSS passes
+   elements 1-3 through from its first operand, which after the swap is b;
+   the shuffle takes only element 0 of the compare result and keeps
+   elements 1-3 of a, as the intrinsic's contract requires. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ss(__m128 a, __m128 b)
+{
+  __m128 nlt = (__m128)__builtin_ia32_cmpss(b, a, 5);
+  return __builtin_shufflevector(a, nlt, 4, 1, 2, 3);
+}
+
+/* Packed "not (a > b)" compare, evaluated as "not (b < a)": the operands are
+   swapped and the CMPPS builtin is called with predicate immediate 5. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpngt_ps(__m128 a, __m128 b)
+{
+  __m128 mask = (__m128)__builtin_ia32_cmpps(b, a, 5);
+  return mask;
+}
+
+/* Scalar "not (a >= b)" compare on element 0.
+   Evaluated as "not (b <= a)" (operands swapped, predicate 6).  CMPSS passes
+   elements 1-3 through from its first operand, which after the swap is b;
+   the shuffle takes only element 0 of the compare result and keeps
+   elements 1-3 of a, as the intrinsic's contract requires. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ss(__m128 a, __m128 b)
+{
+  __m128 nle = (__m128)__builtin_ia32_cmpss(b, a, 6);
+  return __builtin_shufflevector(a, nle, 4, 1, 2, 3);
+}
+
+/* Packed "not (a >= b)" compare, evaluated as "not (b <= a)": the operands
+   are swapped and the CMPPS builtin is called with predicate immediate 6. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpnge_ps(__m128 a, __m128 b)
+{
+  __m128 mask = (__m128)__builtin_ia32_cmpps(b, a, 6);
+  return mask;
+}
+
+/* Scalar "ordered" compare (neither operand is NaN): forwards to the CMPSS
+   builtin with predicate immediate 7. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ss(__m128 a, __m128 b)
+{
+  __m128 mask = (__m128)__builtin_ia32_cmpss(a, b, 7);
+  return mask;
+}
+
+/* Packed "ordered" compare (neither operand is NaN): forwards to the CMPPS
+   builtin with predicate immediate 7. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpord_ps(__m128 a, __m128 b)
+{
+  __m128 mask = (__m128)__builtin_ia32_cmpps(a, b, 7);
+  return mask;
+}
+
+/* Scalar "unordered" compare (at least one operand is NaN): forwards to the
+   CMPSS builtin with predicate immediate 3. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ss(__m128 a, __m128 b)
+{
+  __m128 mask = (__m128)__builtin_ia32_cmpss(a, b, 3);
+  return mask;
+}
+
+/* Packed "unordered" compare (at least one operand is NaN): forwards to the
+   CMPPS builtin with predicate immediate 3. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cmpunord_ps(__m128 a, __m128 b)
+{
+  __m128 mask = (__m128)__builtin_ia32_cmpps(a, b, 3);
+  return mask;
+}
+
+/* COMISS-based "a == b" test; returns the int produced by
+   __builtin_ia32_comieq. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comieq_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_comieq(a, b);
+  return flag;
+}
+
+/* COMISS-based "a < b" test; returns the int produced by
+   __builtin_ia32_comilt. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comilt_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_comilt(a, b);
+  return flag;
+}
+
+/* COMISS-based "a <= b" test; returns the int produced by
+   __builtin_ia32_comile. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comile_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_comile(a, b);
+  return flag;
+}
+
+/* COMISS-based "a > b" test; returns the int produced by
+   __builtin_ia32_comigt. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comigt_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_comigt(a, b);
+  return flag;
+}
+
+/* COMISS-based "a >= b" test; returns the int produced by
+   __builtin_ia32_comige. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comige_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_comige(a, b);
+  return flag;
+}
+
+/* COMISS-based "a != b" test; returns the int produced by
+   __builtin_ia32_comineq. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_comineq_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_comineq(a, b);
+  return flag;
+}
+
+/* UCOMISS-based "a == b" test; returns the int produced by
+   __builtin_ia32_ucomieq. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomieq_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_ucomieq(a, b);
+  return flag;
+}
+
+/* UCOMISS-based "a < b" test; returns the int produced by
+   __builtin_ia32_ucomilt. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomilt_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_ucomilt(a, b);
+  return flag;
+}
+
+/* UCOMISS-based "a <= b" test; returns the int produced by
+   __builtin_ia32_ucomile. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomile_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_ucomile(a, b);
+  return flag;
+}
+
+/* UCOMISS-based "a > b" test; returns the int produced by
+   __builtin_ia32_ucomigt. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomigt_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_ucomigt(a, b);
+  return flag;
+}
+
+/* UCOMISS-based "a >= b" test; returns the int produced by
+   __builtin_ia32_ucomige. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomige_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_ucomige(a, b);
+  return flag;
+}
+
+/* UCOMISS-based "a != b" test; returns the int produced by
+   __builtin_ia32_ucomineq. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_ucomineq_ss(__m128 a, __m128 b)
+{
+  int flag = __builtin_ia32_ucomineq(a, b);
+  return flag;
+}
+
+/* Converts a float from a to a 32-bit int via the CVTSS2SI builtin
+   (non-truncating; see _mm_cvttss_si32 for the truncating form). */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si32(__m128 a)
+{
+  int converted = __builtin_ia32_cvtss2si(a);
+  return converted;
+}
+
+#ifdef __x86_64__
+
+/* Converts a float from a to a 64-bit integer via the CVTSS2SI64 builtin.
+   Guarded like _mm_cvtsi64_ss: the 64-bit form of the instruction only
+   exists when targeting x86-64. */
+static inline long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_si64(__m128 a)
+{
+  return __builtin_ia32_cvtss2si64(a);
+}
+
+#endif
+
+/* Converts floats from a to packed 32-bit ints via the CVTPS2PI builtin and
+   reinterprets the result as __m64. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi32(__m128 a)
+{
+  __m64 ints = (__m64)__builtin_ia32_cvtps2pi(a);
+  return ints;
+}
+
+/* Converts a float from a to a 32-bit int via the truncating CVTTSS2SI
+   builtin. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si32(__m128 a)
+{
+  int truncated = __builtin_ia32_cvttss2si(a);
+  return truncated;
+}
+
+#ifdef __x86_64__
+
+/* Converts a float from a to a 64-bit integer via the truncating
+   CVTTSS2SI64 builtin.  Guarded like _mm_cvtsi64_ss: the 64-bit form of the
+   instruction only exists when targeting x86-64. */
+static inline long long __attribute__((__always_inline__, __nodebug__))
+_mm_cvttss_si64(__m128 a)
+{
+  return __builtin_ia32_cvttss2si64(a);
+}
+
+#endif
+
+/* Converts floats from a to packed 32-bit ints via the truncating CVTTPS2PI
+   builtin and reinterprets the result as __m64. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvttps_pi32(__m128 a)
+{
+  __m64 ints = (__m64)__builtin_ia32_cvttps2pi(a);
+  return ints;
+}
+
+/* Passes vector a and 32-bit int b to the CVTSI2SS builtin and returns the
+   resulting vector. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi32_ss(__m128 a, int b)
+{
+  __m128 merged = __builtin_ia32_cvtsi2ss(a, b);
+  return merged;
+}
+
+#ifdef __x86_64__
+
+/* 64-bit counterpart of _mm_cvtsi32_ss: passes vector a and 64-bit integer b
+   to the CVTSI642SS builtin.  Only compiled for x86-64 (see enclosing #ifdef). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtsi64_ss(__m128 a, long long b)
+{
+  __m128 merged = __builtin_ia32_cvtsi642ss(a, b);
+  return merged;
+}
+
+#endif
+
+/* Passes vector a and the __m64 b (viewed as __v2si, i.e. two 32-bit ints)
+   to the CVTPI2PS builtin and returns the resulting vector. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+  __v2si ints = (__v2si)b;
+  return __builtin_ia32_cvtpi2ps(a, ints);
+}
+
+/* Returns element 0 of a as a plain float. */
+static inline float __attribute__((__always_inline__, __nodebug__))
+_mm_cvtss_f32(__m128 a)
+{
+  float lowest = a[0];
+  return lowest;
+}
+
+/* Passes a and the 64 bits at p to the LOADHPS builtin and returns the
+   resulting vector.  The builtin takes a non-const __v2si pointer, hence
+   the cast. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadh_pi(__m128 a, __m64 const *p)
+{
+  __v2si *src = (__v2si *)p;
+  return __builtin_ia32_loadhps(a, src);
+}
+
+/* Passes a and the 64 bits at p to the LOADLPS builtin and returns the
+   resulting vector.  The builtin takes a non-const __v2si pointer, hence
+   the cast. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadl_pi(__m128 a, __m64 const *p)
+{
+  __v2si *src = (__v2si *)p;
+  return __builtin_ia32_loadlps(a, src);
+}
+
+/* Builds { *p, 0, 0, 0 }.  The pointer is only read, so it is taken as
+   const float * to let callers load from read-only data without a cast. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ss(const float *p)
+{
+  return (__m128){ *p, 0, 0, 0 };
+}
+
+/* Broadcasts the float at p into all four elements.  The pointer is only
+   read, so it is taken as const float *; the value is loaded once into a
+   local before being replicated. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load1_ps(const float *p)
+{
+  float v = *p;
+  return (__m128){ v, v, v, v };
+}
+
+/* Alternate spelling of _mm_load1_ps; expands to a call to it. */
+#define        _mm_load_ps1(p) _mm_load1_ps(p)
+
+/* Loads four floats by dereferencing p as a __m128 pointer, so p must meet
+   __m128's alignment requirement.  The pointer is only read and is therefore
+   taken as const float *. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_load_ps(const float *p)
+{
+  return *(const __m128 *)p;
+}
+
+/* Loads four floats from p via the LOADUPS (unaligned load) builtin.  The
+   pointer is only read and is therefore taken as const float *. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadu_ps(const float *p)
+{
+  return __builtin_ia32_loadups(p);
+}
+
+/* Loads four floats from p and returns them in reverse element order
+   (result[0] = p[3] ... result[3] = p[0]).  The load dereferences p as a
+   __m128 pointer, exactly as _mm_load_ps does, so p must meet __m128's
+   alignment requirement.  The pointer is only read and is therefore taken
+   as const float *. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_loadr_ps(const float *p)
+{
+  __m128 a = *(const __m128 *)p;
+  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
+}
+
+/* Builds { w, 0, 0, 0 }: w in element 0, zeros above it. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ss(float w)
+{
+  __m128 v = { w, 0, 0, 0 };
+  return v;
+}
+
+/* Broadcasts w into all four elements. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set1_ps(float w)
+{
+  __m128 v = { w, w, w, w };
+  return v;
+}
+
+// Microsoft specific.
+// Same result as _mm_set1_ps: w replicated into all four elements.
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps1(float w)
+{
+  return (__m128){ w, w, w, w };
+}
+
+/* Builds a vector from four floats given highest element first:
+   z -> element 3, y -> element 2, x -> element 1, w -> element 0. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_set_ps(float z, float y, float x, float w)
+{
+  __m128 v = { w, x, y, z };
+  return v;
+}
+
+/* Builds a vector from four floats given lowest element first:
+   z -> element 0, y -> element 1, x -> element 2, w -> element 3. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_setr_ps(float z, float y, float x, float w)
+{
+  __m128 v = { z, y, x, w };
+  return v;
+}
+
+/* Returns a vector with all four elements set to zero.  Marked __nodebug__
+   like the other intrinsics in this header. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+/* Stores 64 bits of a to *p via the STOREHPS builtin (p is viewed as a
+   __v2si pointer).  Marked __nodebug__ like the other intrinsics in this
+   header. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storeh_pi(__m64 *p, __m128 a)
+{
+  __builtin_ia32_storehps((__v2si *)p, a);
+}
+
+/* Stores 64 bits of a to *p via the STORELPS builtin (p is viewed as a
+   __v2si pointer).  Marked __nodebug__ like the other intrinsics in this
+   header. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storel_pi(__m64 *p, __m128 a)
+{
+  __builtin_ia32_storelps((__v2si *)p, a);
+}
+
+/* Writes element 0 of a to *p.  Marked __nodebug__ like the other
+   intrinsics in this header. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ss(float *p, __m128 a)
+{
+  *p = a[0];
+}
+
+/* Stores the four floats of a to p via the STOREUPS (unaligned store)
+   builtin. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storeu_ps(float *p, __m128 a)
+{
+  __builtin_ia32_storeups(p, a);
+}
+
+/* Replicates element 0 of a into all four elements and writes the result
+   to p with _mm_storeu_ps (the unaligned store). */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store1_ps(float *p, __m128 a)
+{
+  __m128 splat = __builtin_shufflevector(a, a, 0, 0, 0, 0);
+  _mm_storeu_ps(p, splat);
+}
+
+/* Stores the four floats of a by assigning through p viewed as a __m128
+   pointer, so p must meet __m128's alignment requirement. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_store_ps(float *p, __m128 a)
+{
+  *(__m128 *)p = a;
+}
+
+/* Writes the four floats of a to p in reverse element order
+   (p[0] = a[3] ... p[3] = a[0]) using _mm_store_ps. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_storer_ps(float *p, __m128 a)
+{
+  __m128 reversed = __builtin_shufflevector(a, a, 3, 2, 1, 0);
+  _mm_store_ps(p, reversed);
+}
+
+/* Prefetch hints for _mm_prefetch.  _mm_prefetch passes the hint straight
+   through as the locality argument of __builtin_prefetch, where 3 means
+   "keep in all cache levels" and 0 means "no temporal locality".  T0 (the
+   closest cache level) must therefore be 3 and T2 must be 1; NTA is 0. */
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet.
+
+   Issues a read prefetch (second argument 0) for address a with locality
+   hint sel (one of the _MM_HINT_* values).  Both macro arguments are
+   parenthesized so that an expression such as "p + 1" is cast as a whole
+   instead of becoming "(void *)p + 1". */
+
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+
+/* Stores the 64-bit value a to *p via the MOVNTQ (non-temporal store)
+   builtin. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_pi(__m64 *p, __m64 a)
+{
+  __builtin_ia32_movntq(p, a);
+}
+
+/* Stores the four floats of a to p via the MOVNTPS (non-temporal store)
+   builtin. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_stream_ps(float *p, __m128 a)
+{
+  __builtin_ia32_movntps(p, a);
+}
+
+/* Issues a store fence via the SFENCE builtin. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_sfence(void)
+{
+  __builtin_ia32_sfence();
+}
+
+/* Returns 16-bit element n of a, zero-extended to int.
+   Only the low two bits of n select the element, matching how PEXTRW
+   interprets its immediate; the cast to unsigned short makes the result
+   zero-extended (as PEXTRW produces) rather than sign-extended through the
+   __v4hi element type. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_extract_pi16(__m64 a, int n)
+{
+  /* FIXME:
+   * This should force n to be an immediate.
+   * This does not use the PEXTRW instruction. From looking at the LLVM source,
+     the instruction doesn't seem to be hooked up.
+   */
+  __v4hi b = (__v4hi)a;
+  return (unsigned short)b[n & 3];
+}
+
+/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to
+   the already existing __builtin_shufflevector.
+*/
+/*
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_insert_pi16(__m64 a, int d, int n)
+{
+   return (__m64){ 0LL };
+}
+*/
+
+/* Element-wise maximum of a and b viewed as four 16-bit elements, via the
+   PMAXSW (signed word) builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pi16(__m64 a, __m64 b)
+{
+  __v4hi lhs = (__v4hi)a;
+  __v4hi rhs = (__v4hi)b;
+  return (__m64)__builtin_ia32_pmaxsw(lhs, rhs);
+}
+
+/* Element-wise maximum of a and b viewed as eight 8-bit elements, via the
+   PMAXUB (unsigned byte) builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_max_pu8(__m64 a, __m64 b)
+{
+  __v8qi lhs = (__v8qi)a;
+  __v8qi rhs = (__v8qi)b;
+  return (__m64)__builtin_ia32_pmaxub(lhs, rhs);
+}
+
+/* Element-wise minimum of a and b viewed as four 16-bit elements, via the
+   PMINSW (signed word) builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pi16(__m64 a, __m64 b)
+{
+  __v4hi lhs = (__v4hi)a;
+  __v4hi rhs = (__v4hi)b;
+  return (__m64)__builtin_ia32_pminsw(lhs, rhs);
+}
+
+/* Element-wise minimum of a and b viewed as eight 8-bit elements, via the
+   PMINUB (unsigned byte) builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_min_pu8(__m64 a, __m64 b)
+{
+  __v8qi lhs = (__v8qi)a;
+  __v8qi rhs = (__v8qi)b;
+  return (__m64)__builtin_ia32_pminub(lhs, rhs);
+}
+
+/* Returns the int produced by the PMOVMSKB builtin for a viewed as eight
+   8-bit elements. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_pi8(__m64 a)
+{
+  __v8qi bytes = (__v8qi)a;
+  return __builtin_ia32_pmovmskb(bytes);
+}
+
+/* Multiplies a and b viewed as four 16-bit elements via the PMULHUW
+   (unsigned multiply, high half) builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_mulhi_pu16(__m64 a, __m64 b)
+{
+  __v4hi lhs = (__v4hi)a;
+  __v4hi rhs = (__v4hi)b;
+  return (__m64)__builtin_ia32_pmulhuw(lhs, rhs);
+}
+
+/* Shuffles the four 16-bit elements of a according to the immediate n via
+   the PSHUFW builtin.  A macro because n must be a compile-time constant.
+   Both arguments are parenthesized so that the cast applies to the whole
+   expression passed as a. */
+#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)(a), (n)))
+
+/* Passes d and n (both viewed as eight 8-bit elements) and the destination
+   pointer p to the MASKMOVQ builtin, which performs a byte-masked store of
+   d to p under control of n. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_maskmove_si64(__m64 d, __m64 n, char *p)
+{
+  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
+}
+
+/* Element-wise average of a and b viewed as eight 8-bit elements, via the
+   PAVGB builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu8(__m64 a, __m64 b)
+{
+  __v8qi lhs = (__v8qi)a;
+  __v8qi rhs = (__v8qi)b;
+  return (__m64)__builtin_ia32_pavgb(lhs, rhs);
+}
+
+/* Element-wise average of a and b viewed as four 16-bit elements, via the
+   PAVGW builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_avg_pu16(__m64 a, __m64 b)
+{
+  __v4hi lhs = (__v4hi)a;
+  __v4hi rhs = (__v4hi)b;
+  return (__m64)__builtin_ia32_pavgw(lhs, rhs);
+}
+
+/* Sum of absolute differences of a and b viewed as eight 8-bit elements,
+   via the PSADBW builtin. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_sad_pu8(__m64 a, __m64 b)
+{
+  __v8qi lhs = (__v8qi)a;
+  __v8qi rhs = (__v8qi)b;
+  return (__m64)__builtin_ia32_psadbw(lhs, rhs);
+}
+
+/* Reads the MXCSR control/status register via the STMXCSR builtin. */
+static inline unsigned int __attribute__((__always_inline__, __nodebug__))
+_mm_getcsr(void)
+{
+  unsigned int csr = __builtin_ia32_stmxcsr();
+  return csr;
+}
+
+/* Writes i to the MXCSR control/status register via the LDMXCSR builtin. */
+static inline void __attribute__((__always_inline__, __nodebug__))
+_mm_setcsr(unsigned int i)
+{
+  __builtin_ia32_ldmxcsr(i);
+}
+
+/* Expands to the SHUFPS builtin applied to a, b and mask.  A macro rather
+   than a function; mask is typically built with _MM_SHUFFLE. */
+#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask))
+
+/* Interleaves the upper halves of a and b: { a[2], b[2], a[3], b[3] }.
+   (Shuffle indices 0-3 pick from a, 4-7 from b.) */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpackhi_ps(__m128 a, __m128 b)
+{
+  __m128 interleaved = __builtin_shufflevector(a, b, 2, 6, 3, 7);
+  return interleaved;
+}
+
+/* Interleaves the lower halves of a and b: { a[0], b[0], a[1], b[1] }.
+   (Shuffle indices 0-3 pick from a, 4-7 from b.) */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_unpacklo_ps(__m128 a, __m128 b)
+{
+  __m128 interleaved = __builtin_shufflevector(a, b, 0, 4, 1, 5);
+  return interleaved;
+}
+
+/* Replaces element 0 of a with element 0 of b: { b[0], a[1], a[2], a[3] }.
+   (Shuffle indices 0-3 pick from a, 4-7 from b.) */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_move_ss(__m128 a, __m128 b)
+{
+  __m128 merged = __builtin_shufflevector(a, b, 4, 1, 2, 3);
+  return merged;
+}
+
+/* Moves the upper half of b into the lower half of a:
+   { b[2], b[3], a[2], a[3] }.
+   (Shuffle indices 0-3 pick from a, 4-7 from b.) */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movehl_ps(__m128 a, __m128 b)
+{
+  __m128 merged = __builtin_shufflevector(a, b, 6, 7, 2, 3);
+  return merged;
+}
+
+/* Moves the lower half of b into the upper half of a:
+   { a[0], a[1], b[0], b[1] }.
+   (Shuffle indices 0-3 pick from a, 4-7 from b.) */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_movelh_ps(__m128 a, __m128 b)
+{
+  __m128 merged = __builtin_shufflevector(a, b, 0, 1, 4, 5);
+  return merged;
+}
+
+/* Converts the four signed 16-bit elements of a to four floats.
+   Each element is widened to 32 bits by interleaving it with a sign mask
+   (0 > a, i.e. all-ones for negative elements).  The upper pair is converted
+   first and moved into the high half of the result, then the lower pair is
+   converted into the low half. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi16_ps(__m64 a)
+{
+  __m64 sign = _mm_cmpgt_pi16(_mm_setzero_si64(), a);
+  __m128 result;
+
+  /* Elements 2 and 3 -> floats in the low half, then shift to the high half. */
+  result = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(a, sign));
+  result = _mm_movelh_ps(result, result);
+
+  /* Elements 0 and 1 -> floats in the low half. */
+  return _mm_cvtpi32_ps(result, _mm_unpacklo_pi16(a, sign));
+}
+
+/* Converts the four unsigned 16-bit elements of a to four floats.
+   Each element is widened to 32 bits by interleaving it with zero.  The
+   upper pair is converted first and moved into the high half of the result,
+   then the lower pair is converted into the low half. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu16_ps(__m64 a)
+{
+  __m64 zero = _mm_setzero_si64();
+  __m128 result;
+
+  /* Elements 2 and 3 -> floats in the low half, then shift to the high half. */
+  result = _mm_cvtpi32_ps(_mm_setzero_ps(), _mm_unpackhi_pi16(a, zero));
+  result = _mm_movelh_ps(result, result);
+
+  /* Elements 0 and 1 -> floats in the low half. */
+  return _mm_cvtpi32_ps(result, _mm_unpacklo_pi16(a, zero));
+}
+
+/* Converts the lower four signed 8-bit elements of a to four floats: each
+   byte is widened to 16 bits by interleaving it with a sign mask (0 > a),
+   and the result is handed to _mm_cvtpi16_ps. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi8_ps(__m64 a)
+{
+  __m64 sign = _mm_cmpgt_pi8(_mm_setzero_si64(), a);
+  __m64 words = _mm_unpacklo_pi8(a, sign);
+
+  return _mm_cvtpi16_ps(words);
+}
+
+/* Converts the lower four unsigned 8-bit elements of a to four floats: each
+   byte is widened to 16 bits by interleaving it with zero, and the result
+   is handed to _mm_cvtpi16_ps (the widened values are never negative, so
+   the signed conversion is safe). */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpu8_ps(__m64 a)
+{
+  __m64 words = _mm_unpacklo_pi8(a, _mm_setzero_si64());
+
+  return _mm_cvtpi16_ps(words);
+}
+
+/* Converts two pairs of 32-bit ints to four floats: the pair in a ends up
+   in the low half of the result and the pair in b in the high half.  b is
+   converted first and moved up, then a is converted into the low half. */
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+  __m128 upper = _mm_cvtpi32_ps(_mm_setzero_ps(), b);
+
+  upper = _mm_movelh_ps(upper, upper);
+  return _mm_cvtpi32_ps(upper, a);
+}
+
+/* Converts the four floats of a to four signed 16-bit integers.
+   The lower and upper float pairs are each converted to a pair of 32-bit
+   ints, and the two pairs are then narrowed to 16 bits with signed
+   saturation.  The narrowing step must be _mm_packs_pi32 (32 -> 16 bit);
+   _mm_packs_pi16 would treat the inputs as 16-bit elements and produce
+   8-bit results. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi16(__m128 a)
+{
+  __m64 b, c;
+
+  b = _mm_cvtps_pi32(a);       /* elements 0 and 1 */
+  a = _mm_movehl_ps(a, a);     /* bring elements 2 and 3 down */
+  c = _mm_cvtps_pi32(a);       /* elements 2 and 3 */
+
+  return _mm_packs_pi32(b, c);
+}
+
+/* Converts the four floats of a to 8-bit integers: _mm_cvtps_pi16 produces
+   four 16-bit values, which are packed together with a zero vector by
+   _mm_packs_pi16, so the converted bytes occupy the lower half of the
+   result and the upper half is zero. */
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
+_mm_cvtps_pi8(__m128 a)
+{
+  __m64 words = _mm_cvtps_pi16(a);
+
+  return _mm_packs_pi16(words, _mm_setzero_si64());
+}
+
+/* Returns the int produced by the MOVMSKPS builtin for a. */
+static inline int __attribute__((__always_inline__, __nodebug__))
+_mm_movemask_ps(__m128 a)
+{
+  int bits = __builtin_ia32_movmskps(a);
+  return bits;
+}
+
+/* Packs four 2-bit selectors into one 8-bit immediate: z in bits 7:6,
+   y in bits 5:4, x in bits 3:2 and w in bits 1:0. */
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+/* MXCSR exception status flags (bits 0-5); _MM_EXCEPT_MASK covers all six. */
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+/* MXCSR exception mask bits (bits 7-12); _MM_MASK_MASK covers all six. */
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+/* MXCSR rounding-control field (bits 13-14); _MM_ROUND_MASK covers the
+   whole field. */
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+/* MXCSR flush-to-zero control (bit 15).  ON sets the bit; OFF must be 0 so
+   that _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) actually clears it. */
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+/* Accessors that read MXCSR with _mm_getcsr() and isolate one field by
+   AND-ing with the corresponding *_MASK constant. */
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+/* Read-modify-write setters: read MXCSR, clear one field with the
+   corresponding *_MASK constant, OR in x, and write the result back with
+   _mm_setcsr().  x is not masked, so it should only contain bits of the
+   field being set. */
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+/* Transposes, in place, the 4x4 float matrix whose rows are the four __m128
+   lvalues row0..row3.
+   The unpack steps interleave row pairs so that tmp0/tmp2 hold columns 0-1
+   and tmp1/tmp3 hold columns 2-3; each output row is then assembled from
+   the matching halves: movelh takes the low halves (columns 0 and 2),
+   movehl takes the high halves (columns 1 and 3).  row3 therefore needs
+   movehl(tmp3, tmp1); using movelh there would duplicate column 2.
+   The temporaries are named tmp0..tmp3, so the arguments must not be
+   variables with those names. */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+#include <emmintrin.h>
+
+#endif /* __SSE__ */
+
+#endif /* __XMMINTRIN_H */