From e1028110e77561574bfb7ea349154d46b5ea7b86 Mon Sep 17 00:00:00 2001 From: Flakebi Date: Sun, 14 Dec 2025 14:22:43 +0100 Subject: [PATCH] Add amdgpu intrinsics Add intrinsics for the amdgpu architecture. --- .github/workflows/main.yml | 15 +- ci/docker/amdgcn-amd-amdhsa/Dockerfile | 5 + ci/dox.sh | 9 + ci/run.sh | 3 + crates/core_arch/src/amdgpu/mod.rs | 818 +++++++++++++++++++++++++ crates/core_arch/src/core_arch_docs.md | 2 + crates/core_arch/src/mod.rs | 14 + 7 files changed, 864 insertions(+), 2 deletions(-) create mode 100644 ci/docker/amdgcn-amd-amdhsa/Dockerfile create mode 100644 crates/core_arch/src/amdgpu/mod.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 28c15cf473..6c8a5694f3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -84,6 +84,8 @@ jobs: os: ubuntu-latest - tuple: nvptx64-nvidia-cuda os: ubuntu-latest + - tuple: amdgcn-amd-amdhsa + os: ubuntu-latest - tuple: thumbv6m-none-eabi os: ubuntu-latest - tuple: thumbv7m-none-eabi @@ -201,6 +203,10 @@ jobs: tuple: aarch64-apple-ios-macabi os: macos-15 norun: true # https://github.com/rust-lang/stdarch/issues/1206 + - target: + tuple: amdgcn-amd-amdhsa + os: ubuntu-latest + norun: true steps: - uses: actions/checkout@v4 @@ -212,12 +218,17 @@ jobs: - run: rustup target add ${{ matrix.target.tuple }} shell: bash - if: matrix.build_std == '' + if: matrix.build_std == '' && matrix.target.tuple != 'amdgcn-amd-amdhsa' - run: | rustup component add rust-src echo "CARGO_UNSTABLE_BUILD_STD=std" >> $GITHUB_ENV shell: bash if: matrix.build_std != '' + - run: | + rustup component add rust-src + echo "CARGO_UNSTABLE_BUILD_STD=core,alloc" >> $GITHUB_ENV + shell: bash + if: matrix.target.tuple == 'amdgcn-amd-amdhsa' # Configure some env vars based on matrix configuration - run: echo "PROFILE=--profile=${{matrix.profile}}" >> $GITHUB_ENV @@ -233,7 +244,7 @@ jobs: if: matrix.disable_assert_instr != '' - run: echo "NOSTD=1" >> $GITHUB_ENV shell: bash - if: startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda' + if: startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda' || matrix.target.tuple == 'amdgcn-amd-amdhsa' # Windows & OSX go straight to `run.sh` ... - run: ./ci/run.sh diff --git a/ci/docker/amdgcn-amd-amdhsa/Dockerfile b/ci/docker/amdgcn-amd-amdhsa/Dockerfile new file mode 100644 index 0000000000..65cf281b14 --- /dev/null +++ b/ci/docker/amdgcn-amd-amdhsa/Dockerfile @@ -0,0 +1,5 @@ +FROM ubuntu:25.10 +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libc6-dev \ + ca-certificates diff --git a/ci/dox.sh b/ci/dox.sh index 94d76d4304..ab9608d728 100755 --- a/ci/dox.sh +++ b/ci/dox.sh @@ -15,6 +15,14 @@ dox() { cargo clean --target "${1}" + if [ "${1}" == "amdgcn-amd-amdhsa" ]; then + if [ "$CI" != "" ]; then + rustup component add rust-src + fi + export CARGO_UNSTABLE_BUILD_STD=core,alloc + export RUSTFLAGS="${RUSTFLAGS} -Ctarget-cpu=gfx900" + fi + cargo build --verbose --target "${1}" --manifest-path crates/core_arch/Cargo.toml cargo doc --verbose --target "${1}" --manifest-path crates/core_arch/Cargo.toml } @@ -33,6 +41,7 @@ if [ -z "$1" ]; then #dox mips64-unknown-linux-gnuabi64 dox wasm32-unknown-unknown dox nvptx64-nvidia-cuda + dox amdgcn-amd-amdhsa else dox "${1}" fi diff --git a/ci/run.sh b/ci/run.sh index 2bb77bae25..3ed62b6634 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -42,6 +42,9 @@ case ${TARGET} in armv7-*eabihf | thumbv7-*eabihf) export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon" ;; + amdgcn-*) + export RUSTFLAGS="${RUSTFLAGS} -Ctarget-cpu=gfx900" + ;; # Some of our test dependencies use the deprecated `gcc` crates which # doesn't detect RISC-V compilers automatically, so do it manually here. riscv*) diff --git a/crates/core_arch/src/amdgpu/mod.rs b/crates/core_arch/src/amdgpu/mod.rs new file mode 100644 index 0000000000..6c8c940dab --- /dev/null +++ b/crates/core_arch/src/amdgpu/mod.rs @@ -0,0 +1,818 @@ +//! amdgpu intrinsics +//! +//! The reference is the [LLVM amdgpu guide] and the [LLVM implementation]. +//! The order of intrinsics here follows the order in the [LLVM implementation]. +//! +//! [LLVM amdgpu guide]: https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics +//! [LLVM implementation]: https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/IR/IntrinsicsAMDGPU.td + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.amdgcn.workitem.id.x"] + safe fn llvm_workitem_id_x() -> u32; + #[link_name = "llvm.amdgcn.workitem.id.y"] + safe fn llvm_workitem_id_y() -> u32; + #[link_name = "llvm.amdgcn.workitem.id.z"] + safe fn llvm_workitem_id_z() -> u32; + + #[link_name = "llvm.amdgcn.workgroup.id.x"] + safe fn llvm_workgroup_id_x() -> u32; + #[link_name = "llvm.amdgcn.workgroup.id.y"] + safe fn llvm_workgroup_id_y() -> u32; + #[link_name = "llvm.amdgcn.workgroup.id.z"] + safe fn llvm_workgroup_id_z() -> u32; + + #[link_name = "llvm.amdgcn.groupstaticsize"] + safe fn llvm_groupstaticsize() -> u32; + #[link_name = "llvm.amdgcn.dispatch.id"] + safe fn llvm_dispatch_id() -> u64; + + #[link_name = "llvm.amdgcn.wavefrontsize"] + safe fn llvm_wavefrontsize() -> u32; + + #[link_name = "llvm.amdgcn.s.barrier"] + safe fn llvm_s_barrier(); + #[link_name = "llvm.amdgcn.s.barrier.signal"] + fn llvm_s_barrier_signal(barrier_type: u32); + #[link_name = "llvm.amdgcn.s.barrier.signal.isfirst"] + fn llvm_s_barrier_signal_isfirst(barrier_type: u32) -> bool; + #[link_name = "llvm.amdgcn.s.barrier.wait"] + fn llvm_s_barrier_wait(barrier_type: u16); + #[link_name = "llvm.amdgcn.s.barrier.leave"] + fn llvm_s_barrier_leave(barrier_type: u16); + #[link_name = "llvm.amdgcn.s.get.barrier.state"] + fn llvm_s_get_barrier_state(barrier_type: u32) -> u32; + #[link_name = "llvm.amdgcn.s.wave.barrier"] + safe fn llvm_s_wave_barrier(); + #[link_name = "llvm.amdgcn.sched.barrier"] + fn llvm_sched_barrier(mask: u32); + #[link_name = "llvm.amdgcn.sched.group.barrier"] + fn llvm_sched_group_barrier(mask: u32, size: u32, sync_id: u32); + + #[link_name = "llvm.amdgcn.s.sleep"] + safe fn llvm_s_sleep(count: u32); + + #[link_name = "llvm.amdgcn.s.sethalt"] + safe fn llvm_s_sethalt(value: u32) -> !; + + #[link_name = "llvm.amdgcn.s.getpc"] + safe fn llvm_s_getpc() -> i64; + + #[link_name = "llvm.amdgcn.mbcnt.lo"] + safe fn llvm_mbcnt_lo(value: u32, init: u32) -> u32; + #[link_name = "llvm.amdgcn.mbcnt.hi"] + safe fn llvm_mbcnt_hi(value: u32, init: u32) -> u32; + + #[link_name = "llvm.amdgcn.ballot"] + safe fn llvm_ballot(b: bool) -> u64; + + #[link_name = "llvm.amdgcn.inverse.ballot"] + safe fn llvm_inverse_ballot(value: u64) -> bool; + + #[link_name = "llvm.amdgcn.wave.reduce.umin"] + safe fn llvm_wave_reduce_umin(value: u32, strategy: u32) -> u32; + #[link_name = "llvm.amdgcn.wave.reduce.fmin"] + safe fn llvm_wave_reduce_fmin(value: f32, strategy: u32) -> f32; + #[link_name = "llvm.amdgcn.wave.reduce.min"] + safe fn llvm_wave_reduce_min(value: i32, strategy: u32) -> i32; + #[link_name = "llvm.amdgcn.wave.reduce.umax"] + safe fn llvm_wave_reduce_umax(value: u32, strategy: u32) -> u32; + #[link_name = "llvm.amdgcn.wave.reduce.fmax"] + safe fn llvm_wave_reduce_fmax(value: f32, strategy: u32) -> f32; + #[link_name = "llvm.amdgcn.wave.reduce.max"] + safe fn llvm_wave_reduce_max(value: i32, strategy: u32) -> i32; + #[link_name = "llvm.amdgcn.wave.reduce.add"] + safe fn llvm_wave_reduce_add(value: u32, strategy: u32) -> u32; + #[link_name = "llvm.amdgcn.wave.reduce.fadd"] + safe fn llvm_wave_reduce_fadd(value: f32, strategy: u32) -> f32; + #[link_name = "llvm.amdgcn.wave.reduce.and"] + safe fn llvm_wave_reduce_and(value: u32, strategy: u32) -> u32; + #[link_name = "llvm.amdgcn.wave.reduce.or"] + safe fn llvm_wave_reduce_or(value: u32, strategy: u32) -> u32; + #[link_name = "llvm.amdgcn.wave.reduce.xor"] + safe fn llvm_wave_reduce_xor(value: u32, strategy: u32) -> u32; + + // The following intrinsics can have multiple sizes + + #[link_name = "llvm.amdgcn.readfirstlane.i32"] + safe fn llvm_readfirstlane_u32(value: u32) -> u32; + #[link_name = "llvm.amdgcn.readfirstlane.i64"] + safe fn llvm_readfirstlane_u64(value: u64) -> u64; + #[link_name = "llvm.amdgcn.readlane.i32"] + fn llvm_readlane_u32(value: u32, lane: u32) -> u32; + #[link_name = "llvm.amdgcn.readlane.i64"] + fn llvm_readlane_u64(value: u64, lane: u32) -> u64; + #[link_name = "llvm.amdgcn.writelane.i32"] + fn llvm_writelane_u32(value: u32, lane: u32, default: u32) -> u32; + #[link_name = "llvm.amdgcn.writelane.i64"] + fn llvm_writelane_u64(value: u64, lane: u32, default: u64) -> u64; + + #[link_name = "llvm.amdgcn.endpgm"] + safe fn llvm_endpgm() -> !; + + #[link_name = "llvm.amdgcn.update.dpp.i32"] + fn llvm_update_dpp( + old: u32, + src: u32, + dpp_ctrl: u32, + row_mask: u32, + bank_mask: u32, + bound_control: bool, + ) -> u32; + + #[link_name = "llvm.amdgcn.s.memrealtime"] + safe fn llvm_s_memrealtime() -> u64; + + #[link_name = "llvm.amdgcn.ds.permute"] + fn llvm_ds_permute(lane: u32, value: u32) -> u32; + #[link_name = "llvm.amdgcn.ds.bpermute"] + fn llvm_ds_bpermute(lane: u32, value: u32) -> u32; + #[link_name = "llvm.amdgcn.perm"] + fn llvm_perm(src0: u32, src1: u32, selector: u32) -> u32; + + // gfx10 + #[link_name = "llvm.amdgcn.permlane16.i32"] + fn llvm_permlane16_u32( + old: u32, + src0: u32, + src1: u32, + src2: u32, + fi: bool, + bound_control: bool, + ) -> u32; + + // gfx10 + #[link_name = "llvm.amdgcn.permlanex16.i32"] + fn llvm_permlanex16_u32( + old: u32, + src0: u32, + src1: u32, + src2: u32, + fi: bool, + bound_control: bool, + ) -> u32; + + #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"] + safe fn llvm_s_get_waveid_in_workgroup() -> u32; + + // gfx11 + #[link_name = "llvm.amdgcn.permlane64"] + fn llvm_permlane64_u32(value: u32) -> u32; + + // gfx12 + #[link_name = "llvm.amdgcn.permlane16.var"] + fn llvm_permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32; + + // gfx12 + #[link_name = "llvm.amdgcn.permlanex16.var"] + fn llvm_permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32; + + #[link_name = "llvm.amdgcn.wave.id"] + safe fn llvm_wave_id() -> u32; + + // gfx950 + #[allow(improper_ctypes)] + #[link_name = "llvm.amdgcn.permlane16.swap"] + fn llvm_permlane16_swap( + vdst_old: u32, + vsrc_src0: u32, + fi: bool, + bound_control: bool, + ) -> (u32, u32); + + // gfx950 + #[allow(improper_ctypes)] + #[link_name = "llvm.amdgcn.permlane32.swap"] + fn llvm_permlane32_swap( + vdst_old: u32, + vsrc_src0: u32, + fi: bool, + bound_control: bool, + ) -> (u32, u32); +} + +/// Returns the x coordinate of the workitem index within the workgroup. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn workitem_id_x() -> u32 { + llvm_workitem_id_x() +} +/// Returns the y coordinate of the workitem index within the workgroup. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn workitem_id_y() -> u32 { + llvm_workitem_id_y() +} +/// Returns the z coordinate of the workitem index within the workgroup. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn workitem_id_z() -> u32 { + llvm_workitem_id_z() +} + +/// Returns the x coordinate of the workgroup index within the dispatch. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn workgroup_id_x() -> u32 { + llvm_workgroup_id_x() +} +/// Returns the y coordinate of the workgroup index within the dispatch. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn workgroup_id_y() -> u32 { + llvm_workgroup_id_y() +} +/// Returns the z coordinate of the workgroup index within the dispatch. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn workgroup_id_z() -> u32 { + llvm_workgroup_id_z() +} + +/// Returns the size of statically allocated shared memory for this program in bytes. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn groupstaticsize() -> u32 { + llvm_groupstaticsize() +} +/// Returns the id of the dispatch that is currently executed. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn dispatch_id() -> u64 { + llvm_dispatch_id() +} + +/// Returns the number of threads in a wavefront. +/// +/// Is always a power of 2. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wavefrontsize() -> u32 { + llvm_wavefrontsize() +} + +/// Synchronize all wavefronts in a workgroup. +/// +/// Each wavefronts in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_barrier() { + llvm_s_barrier() +} + +/// Signal a specific barrier type. +/// +/// Only for non-named barriers. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn s_barrier_signal(barrier_type: u32) { + unsafe { llvm_s_barrier_signal(barrier_type) } +} + +/// Signal a specific barrier type. +/// +/// Only for non-named barriers. +/// Provides access to the s_barrier_signal_first instruction; +/// additionally ensures that the result value is valid even when +/// the intrinsic is used from a wavefront that is not running in a workgroup. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn s_barrier_signal_isfirst(barrier_type: u32) -> bool { + unsafe { llvm_s_barrier_signal_isfirst(barrier_type) } +} + +/// Wait for a specific barrier type. +/// +/// Only for non-named barriers. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn s_barrier_wait(barrier_type: u16) { + unsafe { llvm_s_barrier_wait(barrier_type) } +} + +/// Leave a specific barrier type. +/// +/// Only for non-named barriers. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn s_barrier_leave(barrier_type: u16) { + unsafe { llvm_s_barrier_leave(barrier_type) } +} + +/// Get the state of a specific barrier type. +/// +/// The `barrier_type` argument must be uniform, otherwise behavior is undefined. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn s_get_barrier_state(barrier_type: u32) -> u32 { + unsafe { llvm_s_get_barrier_state(barrier_type) } +} + +/// A barrier for only the threads within the current wavefront. +/// +/// Does not result in an instruction but restricts the compiler. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_wave_barrier() { + llvm_s_wave_barrier() +} + +/// Prevent movement of some instruction types. +/// +/// Controls the types of instructions that may be allowed to cross the intrinsic during instruction scheduling. +/// The parameter is a mask for the instruction types that can cross the intrinsic. +/// +/// - 0x0000: No instructions may be scheduled across `sched_barrier`. +/// - 0x0001: All, non-memory, non-side-effect producing instructions may be scheduled across `sched_barrier`, i.e. allow ALU instructions to pass. +/// - 0x0002: VALU instructions may be scheduled across `sched_barrier`. +/// - 0x0004: SALU instructions may be scheduled across `sched_barrier`. +/// - 0x0008: MFMA/WMMA instructions may be scheduled across `sched_barrier`. +/// - 0x0010: All VMEM instructions may be scheduled across `sched_barrier`. +/// - 0x0020: VMEM read instructions may be scheduled across `sched_barrier`. +/// - 0x0040: VMEM write instructions may be scheduled across `sched_barrier`. +/// - 0x0080: All DS instructions may be scheduled across `sched_barrier`. +/// - 0x0100: All DS read instructions may be scheduled across `sched_barrier`. +/// - 0x0200: All DS write instructions may be scheduled across `sched_barrier`. +/// - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across `sched_barrier`. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn sched_barrier(mask: u32) { + unsafe { llvm_sched_barrier(mask) } +} + +/// Creates schedule groups with specific properties to create custom scheduling pipelines. +/// +/// The ordering between groups is enforced by the instruction scheduler. +/// The intrinsic applies to the code that precedes the intrinsic. +/// The intrinsic takes three values that control the behavior of the schedule groups. +/// +/// - `mask`: Classify instruction groups using the [`sched_barrier`] mask values. +/// - `size`: The number of instructions that are in the group. +/// - `sync_id`: Order is enforced between groups with matching values. +/// +/// The mask can include multiple instruction types. It is undefined behavior to set values beyond the range of valid masks. +/// +/// Combining multiple `sched_group_barrier` intrinsics enables an ordering of specific instruction types during instruction scheduling. +/// For example, the following enforces a sequence of 1 VMEM read, followed by 1 VALU instruction, followed by 5 MFMA instructions. +/// +/// ```rust +/// // 1 VMEM read +/// sched_group_barrier(32, 1, 0) +/// // 1 VALU +/// sched_group_barrier(2, 1, 0) +/// // 5 MFMA +/// sched_group_barrier(8, 5, 0) +/// ``` +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn sched_group_barrier(mask: u32, size: u32, sync_id: u32) { + unsafe { llvm_sched_group_barrier(mask, size, sync_id) } +} + +/// Sleeps for approximately `count * 64` cycles. +/// +/// `count` must be a constant. +/// Only the lower 7 bits of `count` are used. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_sleep(count: u32) { + llvm_s_sleep(count) +} + +/// Stop execution of the kernel. +/// +/// This usually signals an error state. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_sethalt(value: u32) -> ! { + llvm_s_sethalt(value) +} + +/// Returns the current process counter. +/// +/// Provides access to the s_getpc_b64 instruction, but with the return value sign-extended +/// from the width of the underlying PC hardware register even on processors where the +/// s_getpc_b64 instruction returns a zero-extended value. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_getpc() -> i64 { + llvm_s_getpc() +} + +/// Masked bit count, low 32 lanes. +/// +/// Computes the number of bits set in `value`, masked with a thread mask +/// which contains 1 for all active threads less than the current thread within a wavefront. +/// `init` is added to the result. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn mbcnt_lo(value: u32, init: u32) -> u32 { + llvm_mbcnt_lo(value, init) +} +/// Masked bit count, high 32 lanes. +/// +/// Computes the number of bits set in `value`, masked with a thread mask +/// which contains 1 for all active threads less than the current thread within a wavefront. +/// `init` is added to the result. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn mbcnt_hi(value: u32, init: u32) -> u32 { + llvm_mbcnt_hi(value, init) +} + +/// Returns a bitfield (`u32` or `u64`) containing the result of its i1 argument +/// in all active lanes, and zero in all inactive lanes. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn ballot(b: bool) -> u64 { + llvm_ballot(b) +} + +/// Indexes into the `value` with the current lane id and returns for each lane +/// if the corresponding bit is set. +/// +/// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`. +/// This means `inverse_ballot(ballot(b)) == b`. +/// The inverse of `ballot(inverse_ballot(value)) ~= value` is not always true as inactive lanes are set to zero by `ballot`. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn inverse_ballot(value: u64) -> bool { + llvm_inverse_ballot(value) +} + +/// Performs an arithmetic min reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_umin(value: u32, strategy: u32) -> u32 { + llvm_wave_reduce_umin(value, strategy) +} +/// Performs an arithmetic min reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_fmin(value: f32, strategy: u32) -> f32 { + llvm_wave_reduce_fmin(value, strategy) +} +/// Performs an arithmetic min reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_min(value: i32, strategy: u32) -> i32 { + llvm_wave_reduce_min(value, strategy) +} + +/// Performs an arithmetic max reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_umax(value: u32, strategy: u32) -> u32 { + llvm_wave_reduce_umax(value, strategy) +} +/// Performs an arithmetic max reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_fmax(value: f32, strategy: u32) -> f32 { + llvm_wave_reduce_fmax(value, strategy) +} +/// Performs an arithmetic max reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_max(value: i32, strategy: u32) -> i32 { + llvm_wave_reduce_max(value, strategy) +} + +/// Performs a logical and reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_and(value: u32, strategy: u32) -> u32 { + llvm_wave_reduce_and(value, strategy) +} +/// Performs a logical or reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_or(value: u32, strategy: u32) -> u32 { + llvm_wave_reduce_or(value, strategy) +} +/// Performs a logical xor reduction on the unsigned values provided by each lane in the wavefront. +/// +/// The `strategy` argument is a hint for the reduction strategy. +/// - 0: Target default preference +/// - 1: Iterative strategy +/// - 2: DPP +/// +/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_reduce_xor(value: u32, strategy: u32) -> u32 { + llvm_wave_reduce_xor(value, strategy) +} + +// The following intrinsics can have multiple sizes + +/// Get `value` from the first active lane in the wavefront. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn readfirstlane_u32(value: u32) -> u32 { + llvm_readfirstlane_u32(value) +} +/// Get `value` from the first active lane in the wavefront. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn readfirstlane_u64(value: u64) -> u64 { + llvm_readfirstlane_u64(value) +} +/// Get `value` from the lane at index `lane` in the wavefront. +/// +/// The lane argument must be uniform across the currently active threads +/// of the current wavefront. Otherwise, the result is undefined. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 { + unsafe { llvm_readlane_u32(value, lane) } +} +/// Get `value` from the lane at index `lane` in the wavefront. +/// +/// The lane argument must be uniform across the currently active threads +/// of the current wavefront. Otherwise, the result is undefined. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 { + unsafe { llvm_readlane_u64(value, lane) } +} +/// Return `value` for the lane at index `lane` in the wavefront. +/// Return `default` for all other lanes. +/// +/// The value to write and lane select arguments must be uniform across the +/// currently active threads of the current wavefront. Otherwise, the result is +/// undefined. +/// +/// `value` is the value returned by `lane`. +/// `default` is the value returned by all lanes other than `lane`. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 { + unsafe { llvm_writelane_u32(value, lane, default) } +} +/// Return `value` for the lane at index `lane` in the wavefront. +/// Return `default` for all other lanes. +/// +/// The value to write and lane select arguments must be uniform across the +/// currently active threads of the current wavefront. Otherwise, the result is +/// undefined. +/// +/// `value` is the value returned by `lane`. +/// `default` is the value returned by all lanes other than `lane`. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 { + unsafe { llvm_writelane_u64(value, lane, default) } +} + +/// Stop execution of the wavefront. +/// +/// This usually signals the end of a successful execution. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn endpgm() -> ! { + llvm_endpgm() +} + +/// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU. +/// It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control. +/// This operation is equivalent to a sequence of `v_mov_b32` operations. +/// +/// `llvm.amdgcn.update.dpp.i32 ` +/// Should be equivalent to: +/// ```asm +/// v_mov_b32 +/// v_mov_b32 +/// ``` +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn update_dpp( + old: u32, + src: u32, + dpp_ctrl: u32, + row_mask: u32, + bank_mask: u32, + bound_control: bool, +) -> u32 { + unsafe { llvm_update_dpp(old, src, dpp_ctrl, row_mask, bank_mask, bound_control) } +} + +/// Measures time based on a fixed frequency. +/// +/// Provides a real-time clock counter that runs at constant speed (typically 100 MHz) independent of ALU clock speeds. +/// The clock is consistent across the chip, so can be used for measuring between different wavefronts. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_memrealtime() -> u64 { + llvm_s_memrealtime() +} + +/// Scatter data across all lanes in a wavefront. +/// +/// Writes `value` to the lane `lane`. +/// +/// Reading from inactive lanes returns `0`. +/// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 { + unsafe { llvm_ds_permute(lane, value) } +} +/// Gather data across all lanes in a wavefront. +/// +/// Returns the `value` given to `ds_permute` by lane `lane`. +/// +/// Reading from inactive lanes returns `0`. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn ds_bpermute(lane: u32, value: u32) -> u32 { + unsafe { llvm_ds_bpermute(lane, value) } +} +/// Permute a 64-bit value. +/// +/// `selector` selects between different patterns in which the 64-bit values represented by `src0` and `src1` are permuted. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn perm(src0: u32, src1: u32, selector: u32) -> u32 { + unsafe { llvm_perm(src0, src1, selector) } +} + +// gfx10 +/// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand. +/// +/// The third and fourth inputs must be uniform across the current wavefront. +/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlane16_u32( + old: u32, + src0: u32, + src1: u32, + src2: u32, + fi: bool, + bound_control: bool, +) -> u32 { + unsafe { llvm_permlane16_u32(old, src0, src1, src2, fi, bound_control) } +} + +// gfx10 +/// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand. +/// +/// The third and fourth inputs must be uniform across the current wavefront. +/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlanex16_u32( + old: u32, + src0: u32, + src1: u32, + src2: u32, + fi: bool, + bound_control: bool, +) -> u32 { + unsafe { llvm_permlanex16_u32(old, src0, src1, src2, fi, bound_control) } +} + +/// Get the index of the current wavefront in the workgroup. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn s_get_waveid_in_workgroup() -> u32 { + llvm_s_get_waveid_in_workgroup() +} + +// gfx11 +/// Swap `value` between upper and lower 32 lanes in a wavefront. +/// +/// Does nothing for wave32. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlane64_u32(value: u32) -> u32 { + unsafe { llvm_permlane64_u32(value) } +} + +// gfx12 +/// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand. +/// +/// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32 { + unsafe { llvm_permlane16_var(old, src0, src1, fi, bound_control) } +} + +// gfx12 +/// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand. +/// +/// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlanex16_var( + old: u32, + src0: u32, + src1: u32, + fi: bool, + bound_control: bool, +) -> u32 { + unsafe { llvm_permlanex16_var(old, src0, src1, fi, bound_control) } +} + +/// Get the index of the current wavefront in the workgroup. +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub fn wave_id() -> u32 { + llvm_wave_id() +} + +// gfx950 +/// Provide direct access to `v_permlane16_swap_b32` instruction on supported targets. +/// +/// Swaps the values across lanes of first 2 operands. +/// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes). +/// Returns a pair for the swapped registers. +/// The first element of the return corresponds to the swapped element of the first argument. +#[allow(improper_ctypes)] +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlane16_swap( + vdst_old: u32, + vsrc_src0: u32, + fi: bool, + bound_control: bool, +) -> (u32, u32) { + unsafe { llvm_permlane16_swap(vdst_old, vsrc_src0, fi, bound_control) } +} + +// gfx950 +/// Provide direct access to `v_permlane32_swap_b32` instruction on supported targets. +/// +/// Swaps the values across lanes of first 2 operands. +/// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes). +/// Returns a pair for the swapped registers. +/// The first element of the return corresponds to the swapped element of the first argument. +#[allow(improper_ctypes)] +#[inline] +#[unstable(feature = "stdarch_amdgpu", issue = "149988")] +pub unsafe fn permlane32_swap( + vdst_old: u32, + vsrc_src0: u32, + fi: bool, + bound_control: bool, +) -> (u32, u32) { + unsafe { llvm_permlane32_swap(vdst_old, vsrc_src0, fi, bound_control) } +} diff --git a/crates/core_arch/src/core_arch_docs.md b/crates/core_arch/src/core_arch_docs.md index 6aea2b4618..7075945754 100644 --- a/crates/core_arch/src/core_arch_docs.md +++ b/crates/core_arch/src/core_arch_docs.md @@ -185,6 +185,7 @@ others at: * [`x86_64`] * [`arm`] * [`aarch64`] +* [`amdgpu`] * [`riscv32`] * [`riscv64`] * [`mips`] @@ -201,6 +202,7 @@ others at: [`x86_64`]: ../../core/arch/x86_64/index.html [`arm`]: ../../core/arch/arm/index.html [`aarch64`]: ../../core/arch/aarch64/index.html +[`amdgpu`]: ../../core/arch/amdgpu/index.html [`riscv32`]: ../../core/arch/riscv32/index.html [`riscv64`]: ../../core/arch/riscv64/index.html [`mips`]: ../../core/arch/mips/index.html diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index 2105cca1b4..fbb8562d31 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -274,6 +274,16 @@ pub mod arch { pub use crate::core_arch::nvptx::*; } + /// Platform-specific intrinsics for the `amdgpu` platform. + /// + /// See the [module documentation](../index.html) for more details. + #[cfg(any(target_arch = "amdgpu", doc))] + #[doc(cfg(target_arch = "amdgpu"))] + #[unstable(feature = "stdarch_amdgpu", issue = "149988")] + pub mod amdgpu { + pub use crate::core_arch::amdgpu::*; + } + /// Platform-specific intrinsics for the `loongarch32` platform. /// /// See the [module documentation](../index.html) for more details. @@ -349,6 +359,10 @@ mod powerpc64; #[doc(cfg(target_arch = "nvptx64"))] mod nvptx; +#[cfg(any(target_arch = "amdgpu", doc))] +#[doc(cfg(target_arch = "amdgpu"))] +mod amdgpu; + #[cfg(any(target_arch = "loongarch32", doc))] #[doc(cfg(target_arch = "loongarch32"))] mod loongarch32;