diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp index b4722c9ea814..e8039045a720 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -54,6 +54,18 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { // Map a kernel handle to the kernel stub. llvm::DenseMap KernelStubs; + struct VarInfo { + cir::GlobalOp var; + const VarDecl *declaration; + DeviceVarFlags flags; + }; + + llvm::SmallVector deviceVars; + + /// Keeps track of variable containing handle of GPU binary. Populated by + /// ModuleCtorFunction() and used to create corresponding cleanup calls in + /// ModuleDtorFunction() + llvm::GlobalVariable *gpuBinaryHandle = nullptr; // Mangle context for device. std::unique_ptr deviceMC; @@ -72,6 +84,7 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, FunctionArgList &args) override; + void handleVarRegistration(const VarDecl *vd, cir::GlobalOp var) override; mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD) override; @@ -86,6 +99,15 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { /// Returns function or variable name on device side even if the current /// compilation is for host. std::string getDeviceSideName(const NamedDecl *nd) override; + + void registerDeviceVar(const VarDecl *vd, cir::GlobalOp &var, bool isExtern, + bool isConstant) { + deviceVars.push_back({var, + vd, + {DeviceVarFlags::Variable, isExtern, isConstant, + vd->hasAttr(), + /*Normalized*/ false, 0}}); + } }; } // namespace @@ -401,3 +423,35 @@ void CIRGenNVCUDARuntime::internalizeDeviceSideVar( d->getType()->isCUDADeviceBuiltinTextureType()) llvm_unreachable("NYI"); } + +void CIRGenNVCUDARuntime::handleVarRegistration(const VarDecl *declaration, + cir::GlobalOp globalVariable) { + if (declaration->hasAttr() || + declaration->hasAttr()) { + // Shadow variables and their properties must be registered with CUDA + // runtime. Skip Extern global variables, which will be registered in + // the TU where they are defined. + // + // Don't register a C++17 inline variable. The local symbol can be + // discarded and referencing a discarded local symbol from outside the + // comdat (__cuda_register_globals) is disallowed by the ELF spec. + // + // HIP managed variables need to be always recorded in device and host + // compilations for transformation. + // + // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are + // added to llvm.compiler-used, therefore they are safe to be registered. + if ((!declaration->hasExternalStorage() && !declaration->isInline()) || + cgm.getASTContext().CUDADeviceVarODRUsedByHost.contains(declaration) || + declaration->hasAttr()) { + registerDeviceVar(declaration, globalVariable, + !declaration->hasDefinition(), + declaration->hasAttr()); + } + } else if (declaration->getType()->isCUDADeviceBuiltinSurfaceType() || + declaration->getType()->isCUDADeviceBuiltinTextureType()) { + // Builtin surfaces and textures and their template arguments are + // also registered with CUDA runtime. + llvm_unreachable("Surface and Texture registration NYI"); + } +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h index a7c99b75cb36..ef15a796fba8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h @@ -15,6 +15,7 @@ #ifndef LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H #define LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H +#include "clang/Basic/Sanitizers.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" #include "clang/CIR/Dialect/IR/CIROpsEnums.h" @@ -35,6 +36,41 @@ class CIRGenCUDARuntime { CIRGenModule &cgm; public: + // Global variable properties that must be passed to CUDA runtime. + class DeviceVarFlags { + public: + enum DeviceVarKind { + Variable, // Variable + Surface, // Builtin surface + Texture, // Builtin texture + }; + + private: + LLVM_PREFERRED_TYPE(DeviceVarKind) + unsigned Kind : 2; + LLVM_PREFERRED_TYPE(bool) + unsigned Extern : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned Constant : 1; // Constant variable. + LLVM_PREFERRED_TYPE(bool) + unsigned Managed : 1; // Managed variable. + LLVM_PREFERRED_TYPE(bool) + unsigned Normalized : 1; // Normalized texture. + int SurfTexType; // Type of surface/texutre. + + public: + DeviceVarFlags(DeviceVarKind K, bool E, bool C, bool M, bool N, int T) + : Kind(K), Extern(E), Constant(C), Managed(M), Normalized(N), + SurfTexType(T) {} + + DeviceVarKind getKind() const { return static_cast(Kind); } + bool isExtern() const { return Extern; } + bool isConstant() const { return Constant; } + bool isManaged() const { return Managed; } + bool isNormalized() const { return Normalized; } + int getSurfTexType() const { return SurfTexType; } + }; + CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {} virtual ~CIRGenCUDARuntime(); @@ -50,6 +86,10 @@ class CIRGenCUDARuntime { virtual void internalizeDeviceSideVar(const VarDecl *d, cir::GlobalLinkageKind &linkage) = 0; + + /// Check whether a variable is a device variable and register it if true. + virtual void handleVarRegistration(const VarDecl *vd, cir::GlobalOp var) = 0; + /// Returns function or variable name on device side even if the current /// compilation is for host. virtual std::string getDeviceSideName(const NamedDecl *nd) = 0; diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 0902f0751bae..37e419ba87c4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1242,7 +1242,7 @@ CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty, // in both device and host compilations. if (getLangOpts().CUDA && d && d->hasAttr() && d->hasExternalStorage()) - llvm_unreachable("NYI"); + getCUDARuntime().handleVarRegistration(d, gv); } // TODO(cir): address space cast when needed for DAddrSpace. @@ -1515,16 +1515,21 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d, // the device. [...]" // CUDA B.2.2 "The __constant__ qualifier, optionally used together with // __device__, declares a variable that: [...] - if (langOpts.CUDA && langOpts.CUDAIsDevice) { - // __shared__ variables is not marked as externally initialized, - // because they must not be initialized. - if (linkage != cir::GlobalLinkageKind::InternalLinkage && - (d->hasAttr() || d->hasAttr() || - d->getType()->isCUDADeviceBuiltinSurfaceType() || - d->getType()->isCUDADeviceBuiltinTextureType())) { - gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(), - CUDAExternallyInitializedAttr::get(&getMLIRContext())); - } + if (langOpts.CUDA) { + if (langOpts.CUDAIsDevice) { + // __shared__ variables is not marked as externally initialized, + // because they must not be initialized. + if (linkage != cir::GlobalLinkageKind::InternalLinkage && + (d->hasAttr() || d->hasAttr() || + d->getType()->isCUDADeviceBuiltinSurfaceType() || + d->getType()->isCUDADeviceBuiltinTextureType())) { + gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(), + CUDAExternallyInitializedAttr::get(&getMLIRContext())); + } + } else + getCUDARuntime().internalizeDeviceSideVar(d, linkage); + + getCUDARuntime().handleVarRegistration(d, gv); } // Decorate CUDA shadow variables with the cu.shadow_name attribute so we know diff --git a/clang/test/CIR/CodeGen/CUDA/global-vars.cu b/clang/test/CIR/CodeGen/CUDA/global-vars.cu index 81ad8794dc4e..499908688536 100644 --- a/clang/test/CIR/CodeGen/CUDA/global-vars.cu +++ b/clang/test/CIR/CodeGen/CUDA/global-vars.cu @@ -7,24 +7,175 @@ // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \ // RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ -// RUN: %s -o %t.cir -// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.cir %s +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ // RUN: -x cuda -emit-cir -target-sdk-version=12.3 \ // RUN: %s -o %t.cir // RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s -__device__ int a; -// CIR-DEVICE: cir.global external lang_address_space(offload_global) @a = #cir.int<0> -// LLVM-DEVICE: @a = addrspace(1) externally_initialized global i32 0, align 4 -// CIR-HOST: {{.*}}cir.global external @a = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name}{{.*}} +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s + +// HIP tests +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fclangir \ +// RUN: -fcuda-is-device -emit-cir -target-sdk-version=12.3 \ +// RUN: -x hip %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR-DEVICE-HIP --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fclangir \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: -x hip %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-DEVICE-HIP --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x hip -emit-cir -target-sdk-version=12.3 \ +// RUN: %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR-HOST-HIP --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x hip -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-HOST-HIP --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x hip -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST-HIP --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: -x hip %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-DEVICE-HIP --input-file=%t.ll %s __shared__ int shared; -// CIR-DEVICE: cir.global external lang_address_space(offload_local) @shared = #cir.undef +// CIR-DEVICE: cir.global external{{.*}}lang_address_space(offload_local) @shared = #cir.undef // LLVM-DEVICE: @shared = addrspace(3) global i32 undef, align 4 +// CIR-HOST: cir.global{{.*}}@shared = #cir.undef : !s32i {alignment = 4 : i64} +// CIR-HOST-NOT: cu.shadow_name +// LLVM-HOST: @shared = internal global i32 undef, align 4 +// OGCG-HOST: @shared = internal global i32 +// OGCG-DEVICE: @shared = addrspace(3) global i32 undef, align 4 +// CIR-DEVICE-HIP: cir.global external{{.*}}lang_address_space(offload_local) @shared = #cir.undef +// LLVM-DEVICE-HIP: @shared = addrspace(3) global i32 undef, align 4 +// CIR-HOST-HIP: cir.global{{.*}}@shared = #cir.undef : !s32i {alignment = 4 : i64} +// CIR-HOST-HIP-NOT: cu.shadow_name +// LLVM-HOST-HIP: @shared = internal global i32 undef, align 4 +// OGCG-HOST-HIP: @shared = internal global i32 +// OGCG-DEVICE-HIP: @shared = addrspace(3) global i32 undef, align 4 __constant__ int b; -// CIR-DEVICE: cir.global constant external lang_address_space(offload_constant) @b = #cir.int<0> : !s32i {alignment = 4 : i64, cu.externally_initialized = #cir.cu.externally_initialized} +// CIR-DEVICE: cir.global constant external{{.*}}lang_address_space(offload_constant) @b = #cir.int<0> : !s32i {alignment = 4 : i64, cu.externally_initialized = #cir.cu.externally_initialized} // LLVM-DEVICE: @b = addrspace(4) externally_initialized constant i32 0, align 4 -// CIR-HOST: {{.*}}cir.global external @b = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name}{{.*}} +// CIR-HOST: cir.global{{.*}}"private"{{.*}}internal{{.*}}@b = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name} +// LLVM-HOST: @b = internal global i32 undef, align 4 +// OGCG-HOST: @b = internal global i32 +// OGCG-DEVICE: @b = addrspace(4) externally_initialized constant i32 0, align 4 +// CIR-DEVICE-HIP: cir.global constant external{{.*}}lang_address_space(offload_constant) @b = #cir.int<0> : !s32i {alignment = 4 : i64, cu.externally_initialized = #cir.cu.externally_initialized} +// LLVM-DEVICE-HIP: @b = addrspace(4) externally_initialized constant i32 0, align 4 +// CIR-HOST-HIP: cir.global{{.*}}"private"{{.*}}internal{{.*}}@b = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name} +// LLVM-HOST-HIP: @b = internal global i32 undef, align 4 +// OGCG-HOST-HIP: @b = internal global i32 +// OGCG-DEVICE-HIP: @b = addrspace(4) externally_initialized constant i32 0, align 4 + +// External device variables should remain external on host side (they're just declarations) +// Note: External declarations may not appear in output if they're not referenced +extern __device__ int ext_device_var; +// CIR-HOST-NOT: cir.global{{.*}}@ext_device_var +// LLVM-HOST-NOT: @ext_device_var +// OGCG-HOST-NOT: @ext_device_var +// OGCG-DEVICE-NOT: @ext_device_var +// CIR-HOST-HIP-NOT: cir.global{{.*}}@ext_device_var +// LLVM-HOST-HIP-NOT: @ext_device_var +// OGCG-HOST-HIP-NOT: @ext_device_var +// OGCG-DEVICE-HIP-NOT: @ext_device_var + +extern __constant__ int ext_constant_var; +// CIR-HOST-NOT: cir.global{{.*}}@ext_constant_var +// LLVM-HOST-NOT: @ext_constant_var +// OGCG-HOST-NOT: @ext_constant_var +// OGCG-DEVICE-NOT: @ext_constant_var +// CIR-HOST-HIP-NOT: cir.global{{.*}}@ext_constant_var +// LLVM-HOST-HIP-NOT: @ext_constant_var +// OGCG-HOST-HIP-NOT: @ext_constant_var +// OGCG-DEVICE-HIP-NOT: @ext_constant_var + +// External device variables with definitions should be internal on host +extern __device__ int ext_device_var_def; +__device__ int ext_device_var_def = 1; +// CIR-DEVICE: cir.global external{{.*}}lang_address_space(offload_global) @ext_device_var_def = #cir.int<1> +// LLVM-DEVICE: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4 +// CIR-HOST: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_device_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name} +// LLVM-HOST: @ext_device_var_def = internal global i32 undef, align 4 +// OGCG-HOST: @ext_device_var_def = internal global i32 +// OGCG-DEVICE: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4 +// CIR-DEVICE-HIP: cir.global external{{.*}}lang_address_space(offload_global) @ext_device_var_def = #cir.int<1> +// LLVM-DEVICE-HIP: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4 +// CIR-HOST-HIP: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_device_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name} +// LLVM-HOST-HIP: @ext_device_var_def = internal global i32 undef, align 4 +// OGCG-HOST-HIP: @ext_device_var_def = internal global i32 +// OGCG-DEVICE-HIP: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4 + +extern __constant__ int ext_constant_var_def; +__constant__ int ext_constant_var_def = 2; +// CIR-DEVICE: cir.global constant external{{.*}}lang_address_space(offload_constant) @ext_constant_var_def = #cir.int<2> +// LLVM-DEVICE: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4 +// OGCG-DEVICE: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4 +// CIR-HOST: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_constant_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name} +// LLVM-HOST: @ext_constant_var_def = internal global i32 undef, align 4 +// OGCG-HOST: @ext_constant_var_def = internal global i32 +// CIR-DEVICE-HIP: cir.global constant external{{.*}}lang_address_space(offload_constant) @ext_constant_var_def = #cir.int<2> +// LLVM-DEVICE-HIP: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4 +// CIR-HOST-HIP: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_constant_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name} +// LLVM-HOST-HIP: @ext_constant_var_def = internal global i32 undef, align 4 +// OGCG-HOST-HIP: @ext_constant_var_def = internal global i32 +// OGCG-DEVICE-HIP: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4 + +// Regular host variables should NOT be internalized +int host_var; +// CIR-HOST: cir.global external @host_var = #cir.int<0> : !s32i +// LLVM-HOST: @host_var = global i32 0, align 4 +// OGCG-HOST: @host_var ={{.*}} global i32 + +// CIR-DEVICE-NOT: cir.global{{.*}}@host_var +// LLVM-DEVICE-NOT: @host_var +// OGCG-DEVICE-NOT: @host_var + +// CIR-HOST-HIP: cir.global external @host_var = #cir.int<0> : !s32i +// LLVM-HOST-HIP: @host_var = global i32 0, align 4 +// OGCG-HOST-HIP: @host_var ={{.*}} global i32 + +// CIR-DEVICE-HIP-NOT: cir.global{{.*}}@host_var +// LLVM-DEVICE-HIP-NOT: @host_var +// OGCG-DEVICE-HIP-NOT: @host_var + +// External host variables should remain external (may not appear if not referenced) +extern int ext_host_var; +// CIR-HOST-NOT: cir.global{{.*}}@ext_host_var +// LLVM-HOST-NOT: @ext_host_var +// OGCG-HOST-NOT: @ext_host_var + +// CIR-DEVICE-NOT: cir.global{{.*}}@ext_host_var +// LLVM-DEVICE-NOT: @ext_host_var +// OGCG-DEVICE-NOT: @ext_host_var + +// CIR-HOST-HIP-NOT: cir.global{{.*}}@ext_host_var +// LLVM-HOST-HIP-NOT: @ext_host_var +// OGCG-HOST-HIP-NOT: @ext_host_var + +// CIR-DEVICE-HIP-NOT: cir.global{{.*}}@ext_host_var +// LLVM-DEVICE-HIP-NOT: @ext_host_var +// OGCG-DEVICE-HIP-NOT: @ext_host_var diff --git a/clang/test/CIR/CodeGen/HIP/registration.cpp b/clang/test/CIR/CodeGen/HIP/registration.cpp index 815fae6afbfa..22429aaf4aa0 100644 --- a/clang/test/CIR/CodeGen/HIP/registration.cpp +++ b/clang/test/CIR/CodeGen/HIP/registration.cpp @@ -1,4 +1,4 @@ -#include "cuda.h" +#include "../Inputs/cuda.h" // RUN: echo "sample fatbin" > %t.fatbin // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ @@ -33,7 +33,7 @@ // LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null // LLVM-HOST: }, section ".hipFatBinSegment" // LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8 -// LLVM-HOST: @a = global i32 undef, align 4 +// LLVM-HOST: @a = internal global i32 undef, align 4 // LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor // CIR-HOST: cir.func internal private @__hip_module_dtor() {