Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {

// Map a kernel handle to the kernel stub.
llvm::DenseMap<mlir::Operation *, mlir::Operation *> KernelStubs;
struct VarInfo {
cir::GlobalOp var;
const VarDecl *declaration;
DeviceVarFlags flags;
};

llvm::SmallVector<VarInfo, 16> deviceVars;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need this? Does this exist in OG?


/// Keeps track of variable containing handle of GPU binary. Populated by
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()
llvm::GlobalVariable *gpuBinaryHandle = nullptr;
Comment on lines +65 to +68
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see this being used at CodeGen. We handle the "gpuBinaryHandle" during "lowering". Why do you think we need this here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, this is merely an artifact from bringing the skeleton from OG, Will remove.


// Mangle context for device.
std::unique_ptr<MangleContext> deviceMC;
Expand All @@ -72,6 +84,7 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {

void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args) override;
void handleVarRegistration(const VarDecl *vd, cir::GlobalOp var) override;

mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD) override;

Expand All @@ -86,6 +99,15 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
/// Returns function or variable name on device side even if the current
/// compilation is for host.
std::string getDeviceSideName(const NamedDecl *nd) override;

void registerDeviceVar(const VarDecl *vd, cir::GlobalOp &var, bool isExtern,
bool isConstant) {
deviceVars.push_back({var,
vd,
{DeviceVarFlags::Variable, isExtern, isConstant,
vd->hasAttr<HIPManagedAttr>(),
/*Normalized*/ false, 0}});
}
};

} // namespace
Expand Down Expand Up @@ -401,3 +423,35 @@ void CIRGenNVCUDARuntime::internalizeDeviceSideVar(
d->getType()->isCUDADeviceBuiltinTextureType())
llvm_unreachable("NYI");
}

void CIRGenNVCUDARuntime::handleVarRegistration(const VarDecl *declaration,
cir::GlobalOp globalVariable) {
if (declaration->hasAttr<CUDADeviceAttr>() ||
declaration->hasAttr<CUDAConstantAttr>()) {
// Shadow variables and their properties must be registered with CUDA
// runtime. Skip Extern global variables, which will be registered in
// the TU where they are defined.
//
// Don't register a C++17 inline variable. The local symbol can be
// discarded and referencing a discarded local symbol from outside the
// comdat (__cuda_register_globals) is disallowed by the ELF spec.
//
// HIP managed variables need to be always recorded in device and host
// compilations for transformation.
//
// HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
// added to llvm.compiler-used, therefore they are safe to be registered.
if ((!declaration->hasExternalStorage() && !declaration->isInline()) ||
cgm.getASTContext().CUDADeviceVarODRUsedByHost.contains(declaration) ||
declaration->hasAttr<HIPManagedAttr>()) {
registerDeviceVar(declaration, globalVariable,
!declaration->hasDefinition(),
declaration->hasAttr<CUDAConstantAttr>());
}
} else if (declaration->getType()->isCUDADeviceBuiltinSurfaceType() ||
declaration->getType()->isCUDADeviceBuiltinTextureType()) {
// Builtin surfaces and textures and their template arguments are
// also registered with CUDA runtime.
llvm_unreachable("Surface and Texture registration NYI");
}
}
40 changes: 40 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#ifndef LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H
#define LLVM_CLANG_LIB_CIR_CIRGENCUDARUNTIME_H

#include "clang/Basic/Sanitizers.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/Dialect/IR/CIROpsEnums.h"

Expand All @@ -35,6 +36,41 @@ class CIRGenCUDARuntime {
CIRGenModule &cgm;

public:
// Global variable properties that must be passed to CUDA runtime.
class DeviceVarFlags {
public:
enum DeviceVarKind {
Variable, // Variable
Surface, // Builtin surface
Texture, // Builtin texture
};

private:
LLVM_PREFERRED_TYPE(DeviceVarKind)
unsigned Kind : 2;
LLVM_PREFERRED_TYPE(bool)
unsigned Extern : 1;
LLVM_PREFERRED_TYPE(bool)
unsigned Constant : 1; // Constant variable.
LLVM_PREFERRED_TYPE(bool)
unsigned Managed : 1; // Managed variable.
LLVM_PREFERRED_TYPE(bool)
unsigned Normalized : 1; // Normalized texture.
int SurfTexType; // Type of surface/texutre.

public:
DeviceVarFlags(DeviceVarKind K, bool E, bool C, bool M, bool N, int T)
: Kind(K), Extern(E), Constant(C), Managed(M), Normalized(N),
SurfTexType(T) {}

DeviceVarKind getKind() const { return static_cast<DeviceVarKind>(Kind); }
bool isExtern() const { return Extern; }
bool isConstant() const { return Constant; }
bool isManaged() const { return Managed; }
bool isNormalized() const { return Normalized; }
int getSurfTexType() const { return SurfTexType; }
};

CIRGenCUDARuntime(CIRGenModule &cgm) : cgm(cgm) {}
virtual ~CIRGenCUDARuntime();

Expand All @@ -50,6 +86,10 @@ class CIRGenCUDARuntime {

virtual void internalizeDeviceSideVar(const VarDecl *d,
cir::GlobalLinkageKind &linkage) = 0;

/// Check whether a variable is a device variable and register it if true.
virtual void handleVarRegistration(const VarDecl *vd, cir::GlobalOp var) = 0;

/// Returns function or variable name on device side even if the current
/// compilation is for host.
virtual std::string getDeviceSideName(const NamedDecl *nd) = 0;
Expand Down
27 changes: 16 additions & 11 deletions clang/lib/CIR/CodeGen/CIRGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1242,7 +1242,7 @@ CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty,
// in both device and host compilations.
if (getLangOpts().CUDA && d && d->hasAttr<HIPManagedAttr>() &&
d->hasExternalStorage())
llvm_unreachable("NYI");
getCUDARuntime().handleVarRegistration(d, gv);
}

// TODO(cir): address space cast when needed for DAddrSpace.
Expand Down Expand Up @@ -1515,16 +1515,21 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d,
// the device. [...]"
// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
// __device__, declares a variable that: [...]
if (langOpts.CUDA && langOpts.CUDAIsDevice) {
// __shared__ variables is not marked as externally initialized,
// because they must not be initialized.
if (linkage != cir::GlobalLinkageKind::InternalLinkage &&
(d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() ||
d->getType()->isCUDADeviceBuiltinSurfaceType() ||
d->getType()->isCUDADeviceBuiltinTextureType())) {
gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(),
CUDAExternallyInitializedAttr::get(&getMLIRContext()));
}
if (langOpts.CUDA) {
if (langOpts.CUDAIsDevice) {
// __shared__ variables is not marked as externally initialized,
// because they must not be initialized.
if (linkage != cir::GlobalLinkageKind::InternalLinkage &&
(d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() ||
d->getType()->isCUDADeviceBuiltinSurfaceType() ||
d->getType()->isCUDADeviceBuiltinTextureType())) {
gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(),
CUDAExternallyInitializedAttr::get(&getMLIRContext()));
}
} else
getCUDARuntime().internalizeDeviceSideVar(d, linkage);

getCUDARuntime().handleVarRegistration(d, gv);
}

// Decorate CUDA shadow variables with the cu.shadow_name attribute so we know
Expand Down
169 changes: 160 additions & 9 deletions clang/test/CIR/CodeGen/CUDA/global-vars.cu
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you are mixing CUDA and HIP tests here. This is ok, but we had historically split them between CUDA/HIP directories.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good Point, I'll make sure to split both things from now on.

Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,175 @@

// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.cir
// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.cir %s
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s

// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x cuda -emit-cir -target-sdk-version=12.3 \
// RUN: %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s

__device__ int a;
// CIR-DEVICE: cir.global external lang_address_space(offload_global) @a = #cir.int<0>
// LLVM-DEVICE: @a = addrspace(1) externally_initialized global i32 0, align 4
// CIR-HOST: {{.*}}cir.global external @a = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<a>}{{.*}}
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s

// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s

// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \
// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s

// HIP tests
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fclangir \
// RUN: -fcuda-is-device -emit-cir -target-sdk-version=12.3 \
// RUN: -x hip %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR-DEVICE-HIP --input-file=%t.cir %s

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fclangir \
// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
// RUN: -x hip %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM-DEVICE-HIP --input-file=%t.ll %s

// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x hip -emit-cir -target-sdk-version=12.3 \
// RUN: %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR-HOST-HIP --input-file=%t.cir %s

// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
// RUN: -x hip -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM-HOST-HIP --input-file=%t.ll %s

// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
// RUN: -x hip -emit-llvm -target-sdk-version=12.3 \
// RUN: %s -o %t.ll
// RUN: FileCheck --check-prefix=OGCG-HOST-HIP --input-file=%t.ll %s

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
// RUN: -x hip %s -o %t.ll
// RUN: FileCheck --check-prefix=OGCG-DEVICE-HIP --input-file=%t.ll %s

__shared__ int shared;
// CIR-DEVICE: cir.global external lang_address_space(offload_local) @shared = #cir.undef
// CIR-DEVICE: cir.global external{{.*}}lang_address_space(offload_local) @shared = #cir.undef
// LLVM-DEVICE: @shared = addrspace(3) global i32 undef, align 4
// CIR-HOST: cir.global{{.*}}@shared = #cir.undef : !s32i {alignment = 4 : i64}
// CIR-HOST-NOT: cu.shadow_name
// LLVM-HOST: @shared = internal global i32 undef, align 4
// OGCG-HOST: @shared = internal global i32
// OGCG-DEVICE: @shared = addrspace(3) global i32 undef, align 4
// CIR-DEVICE-HIP: cir.global external{{.*}}lang_address_space(offload_local) @shared = #cir.undef
// LLVM-DEVICE-HIP: @shared = addrspace(3) global i32 undef, align 4
// CIR-HOST-HIP: cir.global{{.*}}@shared = #cir.undef : !s32i {alignment = 4 : i64}
// CIR-HOST-HIP-NOT: cu.shadow_name
// LLVM-HOST-HIP: @shared = internal global i32 undef, align 4
// OGCG-HOST-HIP: @shared = internal global i32
// OGCG-DEVICE-HIP: @shared = addrspace(3) global i32 undef, align 4

__constant__ int b;
// CIR-DEVICE: cir.global constant external lang_address_space(offload_constant) @b = #cir.int<0> : !s32i {alignment = 4 : i64, cu.externally_initialized = #cir.cu.externally_initialized}
// CIR-DEVICE: cir.global constant external{{.*}}lang_address_space(offload_constant) @b = #cir.int<0> : !s32i {alignment = 4 : i64, cu.externally_initialized = #cir.cu.externally_initialized}
// LLVM-DEVICE: @b = addrspace(4) externally_initialized constant i32 0, align 4
// CIR-HOST: {{.*}}cir.global external @b = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<b>}{{.*}}
// CIR-HOST: cir.global{{.*}}"private"{{.*}}internal{{.*}}@b = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<b>}
// LLVM-HOST: @b = internal global i32 undef, align 4
// OGCG-HOST: @b = internal global i32
// OGCG-DEVICE: @b = addrspace(4) externally_initialized constant i32 0, align 4
// CIR-DEVICE-HIP: cir.global constant external{{.*}}lang_address_space(offload_constant) @b = #cir.int<0> : !s32i {alignment = 4 : i64, cu.externally_initialized = #cir.cu.externally_initialized}
// LLVM-DEVICE-HIP: @b = addrspace(4) externally_initialized constant i32 0, align 4
// CIR-HOST-HIP: cir.global{{.*}}"private"{{.*}}internal{{.*}}@b = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<b>}
// LLVM-HOST-HIP: @b = internal global i32 undef, align 4
// OGCG-HOST-HIP: @b = internal global i32
// OGCG-DEVICE-HIP: @b = addrspace(4) externally_initialized constant i32 0, align 4

// External device variables should remain external on host side (they're just declarations)
// Note: External declarations may not appear in output if they're not referenced
extern __device__ int ext_device_var;
// CIR-HOST-NOT: cir.global{{.*}}@ext_device_var
// LLVM-HOST-NOT: @ext_device_var
// OGCG-HOST-NOT: @ext_device_var
// OGCG-DEVICE-NOT: @ext_device_var
// CIR-HOST-HIP-NOT: cir.global{{.*}}@ext_device_var
// LLVM-HOST-HIP-NOT: @ext_device_var
// OGCG-HOST-HIP-NOT: @ext_device_var
// OGCG-DEVICE-HIP-NOT: @ext_device_var

extern __constant__ int ext_constant_var;
// CIR-HOST-NOT: cir.global{{.*}}@ext_constant_var
// LLVM-HOST-NOT: @ext_constant_var
// OGCG-HOST-NOT: @ext_constant_var
// OGCG-DEVICE-NOT: @ext_constant_var
// CIR-HOST-HIP-NOT: cir.global{{.*}}@ext_constant_var
// LLVM-HOST-HIP-NOT: @ext_constant_var
// OGCG-HOST-HIP-NOT: @ext_constant_var
// OGCG-DEVICE-HIP-NOT: @ext_constant_var

// External device variables with definitions should be internal on host
extern __device__ int ext_device_var_def;
__device__ int ext_device_var_def = 1;
// CIR-DEVICE: cir.global external{{.*}}lang_address_space(offload_global) @ext_device_var_def = #cir.int<1>
// LLVM-DEVICE: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4
// CIR-HOST: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_device_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<ext_device_var_def>}
// LLVM-HOST: @ext_device_var_def = internal global i32 undef, align 4
// OGCG-HOST: @ext_device_var_def = internal global i32
// OGCG-DEVICE: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4
// CIR-DEVICE-HIP: cir.global external{{.*}}lang_address_space(offload_global) @ext_device_var_def = #cir.int<1>
// LLVM-DEVICE-HIP: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4
// CIR-HOST-HIP: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_device_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<ext_device_var_def>}
// LLVM-HOST-HIP: @ext_device_var_def = internal global i32 undef, align 4
// OGCG-HOST-HIP: @ext_device_var_def = internal global i32
// OGCG-DEVICE-HIP: @ext_device_var_def = addrspace(1) externally_initialized global i32 1, align 4

extern __constant__ int ext_constant_var_def;
__constant__ int ext_constant_var_def = 2;
// CIR-DEVICE: cir.global constant external{{.*}}lang_address_space(offload_constant) @ext_constant_var_def = #cir.int<2>
// LLVM-DEVICE: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4
// OGCG-DEVICE: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4
// CIR-HOST: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_constant_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<ext_constant_var_def>}
// LLVM-HOST: @ext_constant_var_def = internal global i32 undef, align 4
// OGCG-HOST: @ext_constant_var_def = internal global i32
// CIR-DEVICE-HIP: cir.global constant external{{.*}}lang_address_space(offload_constant) @ext_constant_var_def = #cir.int<2>
// LLVM-DEVICE-HIP: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4
// CIR-HOST-HIP: cir.global{{.*}}"private"{{.*}}internal{{.*}}@ext_constant_var_def = #cir.undef : !s32i {alignment = 4 : i64, cu.shadow_name = #cir.cu.shadow_name<ext_constant_var_def>}
// LLVM-HOST-HIP: @ext_constant_var_def = internal global i32 undef, align 4
// OGCG-HOST-HIP: @ext_constant_var_def = internal global i32
// OGCG-DEVICE-HIP: @ext_constant_var_def = addrspace(4) externally_initialized constant i32 2, align 4

// Regular host variables should NOT be internalized
int host_var;
// CIR-HOST: cir.global external @host_var = #cir.int<0> : !s32i
// LLVM-HOST: @host_var = global i32 0, align 4
// OGCG-HOST: @host_var ={{.*}} global i32

// CIR-DEVICE-NOT: cir.global{{.*}}@host_var
// LLVM-DEVICE-NOT: @host_var
// OGCG-DEVICE-NOT: @host_var

// CIR-HOST-HIP: cir.global external @host_var = #cir.int<0> : !s32i
// LLVM-HOST-HIP: @host_var = global i32 0, align 4
// OGCG-HOST-HIP: @host_var ={{.*}} global i32

// CIR-DEVICE-HIP-NOT: cir.global{{.*}}@host_var
// LLVM-DEVICE-HIP-NOT: @host_var
// OGCG-DEVICE-HIP-NOT: @host_var

// External host variables should remain external (may not appear if not referenced)
extern int ext_host_var;
// CIR-HOST-NOT: cir.global{{.*}}@ext_host_var
// LLVM-HOST-NOT: @ext_host_var
// OGCG-HOST-NOT: @ext_host_var

// CIR-DEVICE-NOT: cir.global{{.*}}@ext_host_var
// LLVM-DEVICE-NOT: @ext_host_var
// OGCG-DEVICE-NOT: @ext_host_var

// CIR-HOST-HIP-NOT: cir.global{{.*}}@ext_host_var
// LLVM-HOST-HIP-NOT: @ext_host_var
// OGCG-HOST-HIP-NOT: @ext_host_var

// CIR-DEVICE-HIP-NOT: cir.global{{.*}}@ext_host_var
// LLVM-DEVICE-HIP-NOT: @ext_host_var
// OGCG-DEVICE-HIP-NOT: @ext_host_var
4 changes: 2 additions & 2 deletions clang/test/CIR/CodeGen/HIP/registration.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "cuda.h"
#include "../Inputs/cuda.h"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We prefer on not having a relative path here #include <cuda.h> :D. We give th path through the -I%S/../Inputs/ flag we pass to CC1.


// RUN: echo "sample fatbin" > %t.fatbin
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
Expand Down Expand Up @@ -33,7 +33,7 @@
// LLVM-HOST: i32 1212764230, i32 1, ptr @__hip_fatbin_str, ptr null
// LLVM-HOST: }, section ".hipFatBinSegment"
// LLVM-HOST: @_Z2fnv = constant ptr @_Z17__device_stub__fnv, align 8
// LLVM-HOST: @a = global i32 undef, align 4
// LLVM-HOST: @a = internal global i32 undef, align 4
// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__hip_module_ctor

// CIR-HOST: cir.func internal private @__hip_module_dtor() {
Expand Down