author     dim <dim@FreeBSD.org>    2015-05-27 18:47:56 +0000
committer  dim <dim@FreeBSD.org>    2015-05-27 18:47:56 +0000
commit     3191b2b32a96e1a6ee833fcca73e5c8e0c67ba65 (patch)
tree       dbbd4047878da71c1a706e26ce05b4e7791b14cc /test/CodeGenCUDA
parent     38d6f2e7f2ce51a5b3836d26596c6c34a3288752 (diff)
Vendor import of clang trunk r238337:
https://llvm.org/svn/llvm-project/cfe/trunk@238337
Diffstat (limited to 'test/CodeGenCUDA')
-rw-r--r--  test/CodeGenCUDA/address-spaces.cu           31
-rw-r--r--  test/CodeGenCUDA/cuda-builtin-vars.cu        28
-rw-r--r--  test/CodeGenCUDA/device-stub.cu              41
-rw-r--r--  test/CodeGenCUDA/filter-decl.cu               6
-rw-r--r--  test/CodeGenCUDA/host-device-calls-host.cu   32
-rw-r--r--  test/CodeGenCUDA/launch-bounds.cu            51
6 files changed, 181 insertions(+), 8 deletions(-)
diff --git a/test/CodeGenCUDA/address-spaces.cu b/test/CodeGenCUDA/address-spaces.cu
index b808206..b0ef355 100644
--- a/test/CodeGenCUDA/address-spaces.cu
+++ b/test/CodeGenCUDA/address-spaces.cu
@@ -27,25 +27,25 @@ struct MyStruct {
// CHECK: @b = addrspace(3) global float 0.000000e+00
__device__ void foo() {
- // CHECK: load i32* addrspacecast (i32 addrspace(1)* @i to i32*)
+ // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*)
i++;
- // CHECK: load i32* addrspacecast (i32 addrspace(4)* @j to i32*)
+ // CHECK: load i32, i32* addrspacecast (i32 addrspace(4)* @j to i32*)
j++;
- // CHECK: load i32* addrspacecast (i32 addrspace(3)* @k to i32*)
+ // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @k to i32*)
k++;
static int li;
- // CHECK: load i32* addrspacecast (i32 addrspace(1)* @_ZZ3foovE2li to i32*)
+ // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @_ZZ3foovE2li to i32*)
li++;
__constant__ int lj;
- // CHECK: load i32* addrspacecast (i32 addrspace(4)* @_ZZ3foovE2lj to i32*)
+ // CHECK: load i32, i32* addrspacecast (i32 addrspace(4)* @_ZZ3foovE2lj to i32*)
lj++;
__shared__ int lk;
- // CHECK: load i32* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32*)
+ // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32*)
lk++;
}
@@ -75,7 +75,7 @@ __device__ void func2() {
*ap = 1.0f;
}
// CHECK: define void @_Z5func2v()
-// CHECK: store float* getelementptr inbounds ([256 x float]* addrspacecast ([256 x float] addrspace(3)* @_ZZ5func2vE1a to [256 x float]*), i32 0, i32 128), float** %ap
+// CHECK: store float* getelementptr inbounds ([256 x float], [256 x float]* addrspacecast ([256 x float] addrspace(3)* @_ZZ5func2vE1a to [256 x float]*), i32 0, i32 128), float** %ap
__device__ void func3() {
__shared__ float a;
@@ -100,3 +100,20 @@ __device__ float *func5() {
}
// CHECK: define float* @_Z5func5v()
// CHECK: ret float* addrspacecast (float addrspace(3)* @b to float*)
+
+struct StructWithCtor {
+ __device__ StructWithCtor(): data(1) {}
+ __device__ StructWithCtor(const StructWithCtor &second): data(second.data) {}
+ __device__ int getData() { return data; }
+ int data;
+};
+
+__device__ int construct_shared_struct() {
+// CHECK-LABEL: define i32 @_Z23construct_shared_structv()
+ __shared__ StructWithCtor s;
+// CHECK: call void @_ZN14StructWithCtorC1Ev(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1s to %struct.StructWithCtor*))
+ __shared__ StructWithCtor t(s);
+// CHECK: call void @_ZN14StructWithCtorC1ERKS_(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*), %struct.StructWithCtor* dereferenceable(4) addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1s to %struct.StructWithCtor*))
+ return t.getData();
+// CHECK: call i32 @_ZN14StructWithCtor7getDataEv(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*))
+}
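
Editor's note: the updated CHECK lines above track LLVM's switch to explicitly typed load and getelementptr syntax ("load i32, i32* ..."), and the new hunk adds coverage for constructors running on __shared__ objects. As orientation only, here is a minimal standalone CUDA sketch (an assumption, not part of the test; names are illustrative) of the three variable kinds whose address-space casts the checks verify:

    // Minimal sketch, not part of the test: __device__ -> addrspace(1),
    // __constant__ -> addrspace(4), __shared__ -> addrspace(3). Each device-side
    // access is emitted through an addrspacecast to the generic address space.
    __device__   int g_dev = 0;      // hypothetical global in device memory
    __constant__ int g_const = 0;    // hypothetical constant-memory global

    __global__ void bump(int *out) {
      __shared__ int s;              // block-shared, addrspace(3)
      if (threadIdx.x == 0)
        s = g_dev + g_const;
      __syncthreads();
      out[threadIdx.x] = s;
    }
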
diff --git a/test/CodeGenCUDA/cuda-builtin-vars.cu b/test/CodeGenCUDA/cuda-builtin-vars.cu
new file mode 100644
index 0000000..834e16d
--- /dev/null
+++ b/test/CodeGenCUDA/cuda-builtin-vars.cu
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#include "cuda_builtin_vars.h"
+
+// CHECK: define void @_Z6kernelPi(i32* %out)
+__attribute__((global))
+void kernel(int *out) {
+ int i = 0;
+ out[i++] = threadIdx.x; // CHECK: call i32 @llvm.ptx.read.tid.x()
+ out[i++] = threadIdx.y; // CHECK: call i32 @llvm.ptx.read.tid.y()
+ out[i++] = threadIdx.z; // CHECK: call i32 @llvm.ptx.read.tid.z()
+
+ out[i++] = blockIdx.x; // CHECK: call i32 @llvm.ptx.read.ctaid.x()
+ out[i++] = blockIdx.y; // CHECK: call i32 @llvm.ptx.read.ctaid.y()
+ out[i++] = blockIdx.z; // CHECK: call i32 @llvm.ptx.read.ctaid.z()
+
+ out[i++] = blockDim.x; // CHECK: call i32 @llvm.ptx.read.ntid.x()
+ out[i++] = blockDim.y; // CHECK: call i32 @llvm.ptx.read.ntid.y()
+ out[i++] = blockDim.z; // CHECK: call i32 @llvm.ptx.read.ntid.z()
+
+ out[i++] = gridDim.x; // CHECK: call i32 @llvm.ptx.read.nctaid.x()
+ out[i++] = gridDim.y; // CHECK: call i32 @llvm.ptx.read.nctaid.y()
+ out[i++] = gridDim.z; // CHECK: call i32 @llvm.ptx.read.nctaid.z()
+
+ out[i++] = warpSize; // CHECK: store i32 32,
+
+ // CHECK: ret void
+}
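
Editor's note: each builtin above lowers to the matching @llvm.ptx.read.* intrinsic when compiling for the device. For context, a hedged sketch (not part of the test; kernel name and launch parameters are assumptions) of how these builtins are typically combined and launched from host code:

    // Hypothetical usage sketch, assuming a device buffer d_out of length n.
    __global__ void fill(int *out, int n) {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;  // ctaid/ntid/tid reads
      if (idx < n)
        out[idx] = idx;
    }

    void launch_fill(int *d_out, int n) {
      // Host-side launch (ordinary CUDA compilation, not the cc1 device run above).
      fill<<<(n + 255) / 256, 256>>>(d_out, n);
    }
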
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu
index ed94d10..7f5e159 100644
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -1,7 +1,21 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -fcuda-include-gpubinary %s -o - | FileCheck %s
#include "Inputs/cuda.h"
+// Make sure that all parts of GPU code init/cleanup are there:
+// * constant unnamed string with the kernel name
+// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
+// * constant unnamed string with GPU binary
+// CHECK: private unnamed_addr constant{{.*}}\00"
+// * constant struct that wraps GPU binary
+// CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* }
+// CHECK: { i32 1180844977, i32 1, {{.*}}, i8* null }
+// * variable to save GPU binary handle after initialization
+// CHECK: @__cuda_gpubin_handle = internal global i8** null
+// * Make sure our constructor/destructor was added to global ctor/dtor list.
+// CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+
// Test that we build the correct number of calls to cudaSetupArgument followed
// by a call to cudaLaunch.
@@ -11,3 +25,28 @@
// CHECK: call{{.*}}cudaSetupArgument
// CHECK: call{{.*}}cudaLaunch
__global__ void kernelfunc(int i, int j, int k) {}
+
+// Test that we've built correct kernel launch sequence.
+// CHECK: define{{.*}}hostfunc
+// CHECK: call{{.*}}cudaConfigureCall
+// CHECK: call{{.*}}kernelfunc
+void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+
+// Test that we've built a function to register kernels
+// CHECK: define internal void @__cuda_register_kernels
+// CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
+
+// Test that we've built the constructor..

+// CHECK: define internal void @__cuda_module_ctor
+// .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
+// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
+// .. stores return value in __cuda_gpubin_handle
+// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
+// .. and then calls __cuda_register_kernels
+// CHECK-NEXT: call void @__cuda_register_kernels
+
+// Test that we've created the destructor.
+// CHECK: define internal void @__cuda_module_dtor
+// CHECK: load{{.*}}__cuda_gpubin_handle
+// CHECK-NEXT: call void @__cudaUnregisterFatBinary
+
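
Editor's note: the CHECK lines above pin down the host-side registration sequence: a module constructor registers the fat binary, stashes the returned handle, and registers each kernel, while the destructor unregisters the binary. A rough hand-written approximation (an assumption for orientation only, not actual compiler output) of that sequence:

    // Sketch only: these extern "C" entry points exist in the CUDA runtime, but
    // the wiring below is a simplification of what the IR-level ctor/dtor do.
    extern "C" void **__cudaRegisterFatBinary(void *fatbin);
    extern "C" void __cudaUnregisterFatBinary(void **handle);

    static void **gpubin_handle;             // mirrors @__cuda_gpubin_handle

    static void module_ctor_sketch(void *fatbin_wrapper) {  // hypothetical helper
      gpubin_handle = __cudaRegisterFatBinary(fatbin_wrapper);
      // ... followed by one __cudaRegisterFunction call per kernel stub
      // (the body of @__cuda_register_kernels).
    }

    static void module_dtor_sketch() {
      __cudaUnregisterFatBinary(gpubin_handle);
    }
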
diff --git a/test/CodeGenCUDA/filter-decl.cu b/test/CodeGenCUDA/filter-decl.cu
index faaeb69..e69473f 100644
--- a/test/CodeGenCUDA/filter-decl.cu
+++ b/test/CodeGenCUDA/filter-decl.cu
@@ -3,6 +3,12 @@
#include "Inputs/cuda.h"
+// This has to be at the top of the file as that's where file-scope
+// asm ends up.
+// CHECK-HOST: module asm "file scope asm is host only"
+// CHECK-DEVICE-NOT: module asm "file scope asm is host only"
+__asm__("file scope asm is host only");
+
// CHECK-HOST-NOT: constantdata = global
// CHECK-DEVICE: constantdata = global
__constant__ char constantdata[256];
diff --git a/test/CodeGenCUDA/host-device-calls-host.cu b/test/CodeGenCUDA/host-device-calls-host.cu
new file mode 100644
index 0000000..8140f61
--- /dev/null
+++ b/test/CodeGenCUDA/host-device-calls-host.cu
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-allow-host-calls-from-host-device -fcuda-is-device -Wno-cuda-compat -emit-llvm -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+extern "C"
+void host_function() {}
+
+// CHECK-LABEL: define void @hd_function_a
+extern "C"
+__host__ __device__ void hd_function_a() {
+ // CHECK: call void @host_function
+ host_function();
+}
+
+// CHECK: declare void @host_function
+
+// CHECK-LABEL: define void @hd_function_b
+extern "C"
+__host__ __device__ void hd_function_b(bool b) { if (b) host_function(); }
+
+// CHECK-LABEL: define void @device_function_b
+extern "C"
+__device__ void device_function_b() { hd_function_b(false); }
+
+// CHECK-LABEL: define void @global_function
+extern "C"
+__global__ void global_function() {
+ // CHECK: call void @device_function_b
+ device_function_b();
+}
+
+// CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
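
Editor's note: this test exercises -fcuda-allow-host-calls-from-host-device: during device compilation, calls from __host__ __device__ code into host-only functions are emitted as calls to declared-but-undefined symbols instead of being rejected. A small hedged sketch of the same pattern (assumed names, not part of the test):

    extern "C" void host_log();              // host-only helper, never defined in
                                             // the device TU (hence the
                                             // "declare void" CHECK pattern above)

    extern "C" __host__ __device__ void trace(bool on_host_path) {
      if (on_host_path)
        host_log();                          // only safe to execute on the host
    }
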
diff --git a/test/CodeGenCUDA/launch-bounds.cu b/test/CodeGenCUDA/launch-bounds.cu
index 6f4102e..ecbd0ad 100644
--- a/test/CodeGenCUDA/launch-bounds.cu
+++ b/test/CodeGenCUDA/launch-bounds.cu
@@ -28,3 +28,54 @@ Kernel2()
}
// CHECK: !{{[0-9]+}} = !{void ()* @Kernel2, !"maxntidx", i32 256}
+
+template <int max_threads_per_block>
+__global__ void
+__launch_bounds__(max_threads_per_block)
+Kernel3()
+{
+}
+
+template void Kernel3<MAX_THREADS_PER_BLOCK>();
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
+
+template <int max_threads_per_block, int min_blocks_per_mp>
+__global__ void
+__launch_bounds__(max_threads_per_block, min_blocks_per_mp)
+Kernel4()
+{
+}
+template void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
+
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
+
+const int constint = 100;
+template <int max_threads_per_block, int min_blocks_per_mp>
+__global__ void
+__launch_bounds__(max_threads_per_block + constint,
+ min_blocks_per_mp + max_threads_per_block)
+Kernel5()
+{
+}
+template void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
+
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
+
+// Make sure we don't emit negative launch bounds values.
+__global__ void
+__launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
+Kernel6()
+{
+}
+// CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel6{{.*}}, !"maxntidx",
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel6{{.*}}, !"minctasm",
+
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
+Kernel7()
+{
+}
+// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"maxntidx",
+// CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"minctasm",
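
Editor's note: the Kernel5 values follow from the constants the test defines in its earlier, unchanged lines (the CHECK values imply MAX_THREADS_PER_BLOCK = 256 and MIN_BLOCKS_PER_MP = 2): 256 + 100 = 356 for maxntidx and 2 + 256 = 258 for minctasm. For reference, a hedged non-template sketch of the same attribute (illustrative, not part of the test):

    // With these literal bounds, NVPTX codegen would attach
    // !"maxntidx", i32 256 and !"minctasm", i32 2 to the kernel.
    __global__ void __launch_bounds__(256, 2) KernelPlain() {}
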