From e1028110e77561574bfb7ea349154d46b5ea7b86 Mon Sep 17 00:00:00 2001
From: Flakebi <flakebi@t-online.de>
Date: Sun, 14 Dec 2025 14:22:43 +0100
Subject: [PATCH] Add amdgpu intrinsics

Add intrinsics for the amdgpu architecture.
---
 .github/workflows/main.yml             |  15 +-
 ci/docker/amdgcn-amd-amdhsa/Dockerfile |   5 +
 ci/dox.sh                              |   9 +
 ci/run.sh                              |   3 +
 crates/core_arch/src/amdgpu/mod.rs     | 818 +++++++++++++++++++++++++
 crates/core_arch/src/core_arch_docs.md |   2 +
 crates/core_arch/src/mod.rs            |  14 +
 7 files changed, 864 insertions(+), 2 deletions(-)
 create mode 100644 ci/docker/amdgcn-amd-amdhsa/Dockerfile
 create mode 100644 crates/core_arch/src/amdgpu/mod.rs

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 28c15cf473..6c8a5694f3 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -84,6 +84,8 @@ jobs:
           os: ubuntu-latest
         - tuple: nvptx64-nvidia-cuda
           os: ubuntu-latest
+        - tuple: amdgcn-amd-amdhsa
+          os: ubuntu-latest
         - tuple: thumbv6m-none-eabi
           os: ubuntu-latest
         - tuple: thumbv7m-none-eabi
@@ -201,6 +203,10 @@ jobs:
             tuple: aarch64-apple-ios-macabi
             os: macos-15
           norun: true # https://github.com/rust-lang/stdarch/issues/1206
+        - target:
+            tuple: amdgcn-amd-amdhsa
+            os: ubuntu-latest
+          norun: true
 
     steps:
     - uses: actions/checkout@v4
@@ -212,12 +218,17 @@ jobs:
 
     - run: rustup target add ${{ matrix.target.tuple }}
       shell: bash
-      if: matrix.build_std == ''
+      if: matrix.build_std == '' && matrix.target.tuple != 'amdgcn-amd-amdhsa'
     - run: |
         rustup component add rust-src
         echo "CARGO_UNSTABLE_BUILD_STD=std" >> $GITHUB_ENV
       shell: bash
       if: matrix.build_std != ''
+    - run: |
+        rustup component add rust-src
+        echo "CARGO_UNSTABLE_BUILD_STD=core,alloc" >> $GITHUB_ENV
+      shell: bash
+      if: matrix.target.tuple == 'amdgcn-amd-amdhsa'
 
     # Configure some env vars based on matrix configuration
     - run: echo "PROFILE=--profile=${{matrix.profile}}" >> $GITHUB_ENV
@@ -233,7 +244,7 @@ jobs:
       if: matrix.disable_assert_instr != ''
     - run: echo "NOSTD=1" >> $GITHUB_ENV
       shell: bash
-      if: startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda'
+      if: startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda' || matrix.target.tuple == 'amdgcn-amd-amdhsa'
 
     # Windows & OSX go straight to `run.sh` ...
     - run: ./ci/run.sh
diff --git a/ci/docker/amdgcn-amd-amdhsa/Dockerfile b/ci/docker/amdgcn-amd-amdhsa/Dockerfile
new file mode 100644
index 0000000000..65cf281b14
--- /dev/null
+++ b/ci/docker/amdgcn-amd-amdhsa/Dockerfile
@@ -0,0 +1,5 @@
+FROM ubuntu:25.10
+RUN apt-get update && apt-get install -y --no-install-recommends \
+  gcc \
+  libc6-dev \
+  ca-certificates
diff --git a/ci/dox.sh b/ci/dox.sh
index 94d76d4304..ab9608d728 100755
--- a/ci/dox.sh
+++ b/ci/dox.sh
@@ -15,6 +15,14 @@ dox() {
 
   cargo clean --target "${1}"
 
+  if [ "${1}" == "amdgcn-amd-amdhsa" ]; then
+    if [ "$CI" != "" ]; then
+      rustup component add rust-src
+    fi
+    export CARGO_UNSTABLE_BUILD_STD=core,alloc
+    export RUSTFLAGS="${RUSTFLAGS} -Ctarget-cpu=gfx900"
+  fi
+
   cargo build --verbose --target "${1}" --manifest-path crates/core_arch/Cargo.toml
   cargo doc --verbose --target "${1}" --manifest-path crates/core_arch/Cargo.toml
 }
@@ -33,6 +41,7 @@ if [ -z "$1" ]; then
   #dox mips64-unknown-linux-gnuabi64
   dox wasm32-unknown-unknown
   dox nvptx64-nvidia-cuda
+  dox amdgcn-amd-amdhsa
 else
   dox "${1}"
 fi
diff --git a/ci/run.sh b/ci/run.sh
index 2bb77bae25..3ed62b6634 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -42,6 +42,9 @@ case ${TARGET} in
     armv7-*eabihf | thumbv7-*eabihf)
         export RUSTFLAGS="${RUSTFLAGS} -Ctarget-feature=+neon"
         ;;
+    amdgcn-*)
+        export RUSTFLAGS="${RUSTFLAGS} -Ctarget-cpu=gfx900"
+        ;;
     # Some of our test dependencies use the deprecated `gcc` crates which
     # doesn't detect RISC-V compilers automatically, so do it manually here.
     riscv*)
diff --git a/crates/core_arch/src/amdgpu/mod.rs b/crates/core_arch/src/amdgpu/mod.rs
new file mode 100644
index 0000000000..6c8c940dab
--- /dev/null
+++ b/crates/core_arch/src/amdgpu/mod.rs
@@ -0,0 +1,818 @@
+//! amdgpu intrinsics
+//!
+//! The reference is the [LLVM amdgpu guide] and the [LLVM implementation].
+//! The order of intrinsics here follows the order in the [LLVM implementation].
+//!
+//! [LLVM amdgpu guide]: https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics
+//! [LLVM implementation]: https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+
+#[allow(improper_ctypes)]
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.amdgcn.workitem.id.x"]
+    safe fn llvm_workitem_id_x() -> u32;
+    #[link_name = "llvm.amdgcn.workitem.id.y"]
+    safe fn llvm_workitem_id_y() -> u32;
+    #[link_name = "llvm.amdgcn.workitem.id.z"]
+    safe fn llvm_workitem_id_z() -> u32;
+
+    #[link_name = "llvm.amdgcn.workgroup.id.x"]
+    safe fn llvm_workgroup_id_x() -> u32;
+    #[link_name = "llvm.amdgcn.workgroup.id.y"]
+    safe fn llvm_workgroup_id_y() -> u32;
+    #[link_name = "llvm.amdgcn.workgroup.id.z"]
+    safe fn llvm_workgroup_id_z() -> u32;
+
+    #[link_name = "llvm.amdgcn.groupstaticsize"]
+    safe fn llvm_groupstaticsize() -> u32;
+    #[link_name = "llvm.amdgcn.dispatch.id"]
+    safe fn llvm_dispatch_id() -> u64;
+
+    #[link_name = "llvm.amdgcn.wavefrontsize"]
+    safe fn llvm_wavefrontsize() -> u32;
+
+    #[link_name = "llvm.amdgcn.s.barrier"]
+    safe fn llvm_s_barrier();
+    #[link_name = "llvm.amdgcn.s.barrier.signal"]
+    fn llvm_s_barrier_signal(barrier_type: u32);
+    #[link_name = "llvm.amdgcn.s.barrier.signal.isfirst"]
+    fn llvm_s_barrier_signal_isfirst(barrier_type: u32) -> bool;
+    #[link_name = "llvm.amdgcn.s.barrier.wait"]
+    fn llvm_s_barrier_wait(barrier_type: u16);
+    #[link_name = "llvm.amdgcn.s.barrier.leave"]
+    fn llvm_s_barrier_leave(barrier_type: u16);
+    #[link_name = "llvm.amdgcn.s.get.barrier.state"]
+    fn llvm_s_get_barrier_state(barrier_type: u32) -> u32;
+    #[link_name = "llvm.amdgcn.s.wave.barrier"]
+    safe fn llvm_s_wave_barrier();
+    #[link_name = "llvm.amdgcn.sched.barrier"]
+    fn llvm_sched_barrier(mask: u32);
+    #[link_name = "llvm.amdgcn.sched.group.barrier"]
+    fn llvm_sched_group_barrier(mask: u32, size: u32, sync_id: u32);
+
+    #[link_name = "llvm.amdgcn.s.sleep"]
+    safe fn llvm_s_sleep(count: u32);
+
+    #[link_name = "llvm.amdgcn.s.sethalt"]
+    safe fn llvm_s_sethalt(value: u32) -> !;
+
+    #[link_name = "llvm.amdgcn.s.getpc"]
+    safe fn llvm_s_getpc() -> i64;
+
+    #[link_name = "llvm.amdgcn.mbcnt.lo"]
+    safe fn llvm_mbcnt_lo(value: u32, init: u32) -> u32;
+    #[link_name = "llvm.amdgcn.mbcnt.hi"]
+    safe fn llvm_mbcnt_hi(value: u32, init: u32) -> u32;
+
+    #[link_name = "llvm.amdgcn.ballot"]
+    safe fn llvm_ballot(b: bool) -> u64;
+
+    #[link_name = "llvm.amdgcn.inverse.ballot"]
+    safe fn llvm_inverse_ballot(value: u64) -> bool;
+
+    #[link_name = "llvm.amdgcn.wave.reduce.umin"]
+    safe fn llvm_wave_reduce_umin(value: u32, strategy: u32) -> u32;
+    #[link_name = "llvm.amdgcn.wave.reduce.fmin"]
+    safe fn llvm_wave_reduce_fmin(value: f32, strategy: u32) -> f32;
+    #[link_name = "llvm.amdgcn.wave.reduce.min"]
+    safe fn llvm_wave_reduce_min(value: i32, strategy: u32) -> i32;
+    #[link_name = "llvm.amdgcn.wave.reduce.umax"]
+    safe fn llvm_wave_reduce_umax(value: u32, strategy: u32) -> u32;
+    #[link_name = "llvm.amdgcn.wave.reduce.fmax"]
+    safe fn llvm_wave_reduce_fmax(value: f32, strategy: u32) -> f32;
+    #[link_name = "llvm.amdgcn.wave.reduce.max"]
+    safe fn llvm_wave_reduce_max(value: i32, strategy: u32) -> i32;
+    #[link_name = "llvm.amdgcn.wave.reduce.add"]
+    safe fn llvm_wave_reduce_add(value: u32, strategy: u32) -> u32;
+    #[link_name = "llvm.amdgcn.wave.reduce.fadd"]
+    safe fn llvm_wave_reduce_fadd(value: f32, strategy: u32) -> f32;
+    #[link_name = "llvm.amdgcn.wave.reduce.and"]
+    safe fn llvm_wave_reduce_and(value: u32, strategy: u32) -> u32;
+    #[link_name = "llvm.amdgcn.wave.reduce.or"]
+    safe fn llvm_wave_reduce_or(value: u32, strategy: u32) -> u32;
+    #[link_name = "llvm.amdgcn.wave.reduce.xor"]
+    safe fn llvm_wave_reduce_xor(value: u32, strategy: u32) -> u32;
+
+    // The following intrinsics can have multiple sizes
+
+    #[link_name = "llvm.amdgcn.readfirstlane.i32"]
+    safe fn llvm_readfirstlane_u32(value: u32) -> u32;
+    #[link_name = "llvm.amdgcn.readfirstlane.i64"]
+    safe fn llvm_readfirstlane_u64(value: u64) -> u64;
+    #[link_name = "llvm.amdgcn.readlane.i32"]
+    fn llvm_readlane_u32(value: u32, lane: u32) -> u32;
+    #[link_name = "llvm.amdgcn.readlane.i64"]
+    fn llvm_readlane_u64(value: u64, lane: u32) -> u64;
+    #[link_name = "llvm.amdgcn.writelane.i32"]
+    fn llvm_writelane_u32(value: u32, lane: u32, default: u32) -> u32;
+    #[link_name = "llvm.amdgcn.writelane.i64"]
+    fn llvm_writelane_u64(value: u64, lane: u32, default: u64) -> u64;
+
+    #[link_name = "llvm.amdgcn.endpgm"]
+    safe fn llvm_endpgm() -> !;
+
+    #[link_name = "llvm.amdgcn.update.dpp.i32"]
+    fn llvm_update_dpp(
+        old: u32,
+        src: u32,
+        dpp_ctrl: u32,
+        row_mask: u32,
+        bank_mask: u32,
+        bound_control: bool,
+    ) -> u32;
+
+    #[link_name = "llvm.amdgcn.s.memrealtime"]
+    safe fn llvm_s_memrealtime() -> u64;
+
+    #[link_name = "llvm.amdgcn.ds.permute"]
+    fn llvm_ds_permute(lane: u32, value: u32) -> u32;
+    #[link_name = "llvm.amdgcn.ds.bpermute"]
+    fn llvm_ds_bpermute(lane: u32, value: u32) -> u32;
+    #[link_name = "llvm.amdgcn.perm"]
+    fn llvm_perm(src0: u32, src1: u32, selector: u32) -> u32;
+
+    // gfx10
+    #[link_name = "llvm.amdgcn.permlane16.i32"]
+    fn llvm_permlane16_u32(
+        old: u32,
+        src0: u32,
+        src1: u32,
+        src2: u32,
+        fi: bool,
+        bound_control: bool,
+    ) -> u32;
+
+    // gfx10
+    #[link_name = "llvm.amdgcn.permlanex16.i32"]
+    fn llvm_permlanex16_u32(
+        old: u32,
+        src0: u32,
+        src1: u32,
+        src2: u32,
+        fi: bool,
+        bound_control: bool,
+    ) -> u32;
+
+    #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"]
+    safe fn llvm_s_get_waveid_in_workgroup() -> u32;
+
+    // gfx11
+    #[link_name = "llvm.amdgcn.permlane64"]
+    fn llvm_permlane64_u32(value: u32) -> u32;
+
+    // gfx12
+    #[link_name = "llvm.amdgcn.permlane16.var"]
+    fn llvm_permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;
+
+    // gfx12
+    #[link_name = "llvm.amdgcn.permlanex16.var"]
+    fn llvm_permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;
+
+    #[link_name = "llvm.amdgcn.wave.id"]
+    safe fn llvm_wave_id() -> u32;
+
+    // gfx950
+    #[allow(improper_ctypes)]
+    #[link_name = "llvm.amdgcn.permlane16.swap"]
+    fn llvm_permlane16_swap(
+        vdst_old: u32,
+        vsrc_src0: u32,
+        fi: bool,
+        bound_control: bool,
+    ) -> (u32, u32);
+
+    // gfx950
+    #[allow(improper_ctypes)]
+    #[link_name = "llvm.amdgcn.permlane32.swap"]
+    fn llvm_permlane32_swap(
+        vdst_old: u32,
+        vsrc_src0: u32,
+        fi: bool,
+        bound_control: bool,
+    ) -> (u32, u32);
+}
+
+/// Returns the x coordinate of the workitem index within the workgroup.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn workitem_id_x() -> u32 {
+    llvm_workitem_id_x()
+}
+/// Returns the y coordinate of the workitem index within the workgroup.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn workitem_id_y() -> u32 {
+    llvm_workitem_id_y()
+}
+/// Returns the z coordinate of the workitem index within the workgroup.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn workitem_id_z() -> u32 {
+    llvm_workitem_id_z()
+}
+
+/// Returns the x coordinate of the workgroup index within the dispatch.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn workgroup_id_x() -> u32 {
+    llvm_workgroup_id_x()
+}
+/// Returns the y coordinate of the workgroup index within the dispatch.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn workgroup_id_y() -> u32 {
+    llvm_workgroup_id_y()
+}
+/// Returns the z coordinate of the workgroup index within the dispatch.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn workgroup_id_z() -> u32 {
+    llvm_workgroup_id_z()
+}
+
+/// Returns the size of statically allocated shared memory for this program in bytes.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn groupstaticsize() -> u32 {
+    llvm_groupstaticsize()
+}
+/// Returns the id of the dispatch that is currently executed.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn dispatch_id() -> u64 {
+    llvm_dispatch_id()
+}
+
+/// Returns the number of threads in a wavefront.
+///
+/// Is always a power of 2.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wavefrontsize() -> u32 {
+    llvm_wavefrontsize()
+}
+
+/// Synchronize all wavefronts in a workgroup.
+///
+/// Each wavefront in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_barrier() {
+    llvm_s_barrier()
+}
+
+/// Signal a specific barrier type.
+///
+/// Only for non-named barriers.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn s_barrier_signal(barrier_type: u32) {
+    unsafe { llvm_s_barrier_signal(barrier_type) }
+}
+
+/// Signal a specific barrier type.
+///
+/// Only for non-named barriers.
+/// Provides access to the s_barrier_signal_first instruction;
+/// additionally ensures that the result value is valid even when
+/// the intrinsic is used from a wavefront that is not running in a workgroup.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn s_barrier_signal_isfirst(barrier_type: u32) -> bool {
+    unsafe { llvm_s_barrier_signal_isfirst(barrier_type) }
+}
+
+/// Wait for a specific barrier type.
+///
+/// Only for non-named barriers.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn s_barrier_wait(barrier_type: u16) {
+    unsafe { llvm_s_barrier_wait(barrier_type) }
+}
+
+/// Leave a specific barrier type.
+///
+/// Only for non-named barriers.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn s_barrier_leave(barrier_type: u16) {
+    unsafe { llvm_s_barrier_leave(barrier_type) }
+}
+
+/// Get the state of a specific barrier type.
+///
+/// The `barrier_type` argument must be uniform, otherwise behavior is undefined.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn s_get_barrier_state(barrier_type: u32) -> u32 {
+    unsafe { llvm_s_get_barrier_state(barrier_type) }
+}
+
+/// A barrier for only the threads within the current wavefront.
+///
+/// Does not result in an instruction but restricts the compiler.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_wave_barrier() {
+    llvm_s_wave_barrier()
+}
+
+/// Prevent movement of some instruction types.
+///
+/// Controls the types of instructions that may be allowed to cross the intrinsic during instruction scheduling.
+/// The parameter is a mask for the instruction types that can cross the intrinsic.
+///
+/// - 0x0000: No instructions may be scheduled across `sched_barrier`.
+/// - 0x0001: All, non-memory, non-side-effect producing instructions may be scheduled across `sched_barrier`, i.e. allow ALU instructions to pass.
+/// - 0x0002: VALU instructions may be scheduled across `sched_barrier`.
+/// - 0x0004: SALU instructions may be scheduled across `sched_barrier`.
+/// - 0x0008: MFMA/WMMA instructions may be scheduled across `sched_barrier`.
+/// - 0x0010: All VMEM instructions may be scheduled across `sched_barrier`.
+/// - 0x0020: VMEM read instructions may be scheduled across `sched_barrier`.
+/// - 0x0040: VMEM write instructions may be scheduled across `sched_barrier`.
+/// - 0x0080: All DS instructions may be scheduled across `sched_barrier`.
+/// - 0x0100: All DS read instructions may be scheduled across `sched_barrier`.
+/// - 0x0200: All DS write instructions may be scheduled across `sched_barrier`.
+/// - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across `sched_barrier`.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn sched_barrier(mask: u32) {
+    unsafe { llvm_sched_barrier(mask) }
+}
+
+/// Creates schedule groups with specific properties to create custom scheduling pipelines.
+///
+/// The ordering between groups is enforced by the instruction scheduler.
+/// The intrinsic applies to the code that precedes the intrinsic.
+/// The intrinsic takes three values that control the behavior of the schedule groups.
+///
+/// - `mask`: Classify instruction groups using the [`sched_barrier`] mask values.
+/// - `size`: The number of instructions that are in the group.
+/// - `sync_id`: Order is enforced between groups with matching values.
+///
+/// The mask can include multiple instruction types. It is undefined behavior to set values beyond the range of valid masks.
+///
+/// Combining multiple `sched_group_barrier` intrinsics enables an ordering of specific instruction types during instruction scheduling.
+/// For example, the following enforces a sequence of 1 VMEM read, followed by 1 VALU instruction, followed by 5 MFMA instructions.
+///
+/// ```rust
+/// // 1 VMEM read
+/// sched_group_barrier(32, 1, 0)
+/// // 1 VALU
+/// sched_group_barrier(2, 1, 0)
+/// // 5 MFMA
+/// sched_group_barrier(8, 5, 0)
+/// ```
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn sched_group_barrier(mask: u32, size: u32, sync_id: u32) {
+    unsafe { llvm_sched_group_barrier(mask, size, sync_id) }
+}
+
+/// Sleeps for approximately `count * 64` cycles.
+///
+/// `count` must be a constant.
+/// Only the lower 7 bits of `count` are used.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_sleep(count: u32) {
+    llvm_s_sleep(count)
+}
+
+/// Stop execution of the kernel.
+///
+/// This usually signals an error state.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_sethalt(value: u32) -> ! {
+    llvm_s_sethalt(value)
+}
+
+/// Returns the current program counter.
+///
+/// Provides access to the s_getpc_b64 instruction, but with the return value sign-extended
+/// from the width of the underlying PC hardware register even on processors where the
+/// s_getpc_b64 instruction returns a zero-extended value.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_getpc() -> i64 {
+    llvm_s_getpc()
+}
+
+/// Masked bit count, low 32 lanes.
+///
+/// Computes the number of bits set in `value`, masked with a thread mask
+/// which contains 1 for all active threads less than the current thread within a wavefront.
+/// `init` is added to the result.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn mbcnt_lo(value: u32, init: u32) -> u32 {
+    llvm_mbcnt_lo(value, init)
+}
+/// Masked bit count, high 32 lanes.
+///
+/// Computes the number of bits set in `value`, masked with a thread mask
+/// which contains 1 for all active threads less than the current thread within a wavefront.
+/// `init` is added to the result.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn mbcnt_hi(value: u32, init: u32) -> u32 {
+    llvm_mbcnt_hi(value, init)
+}
+
+/// Returns a `u64` bitfield containing the value of `b` for each active lane,
+/// and zero for each inactive lane.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn ballot(b: bool) -> u64 {
+    llvm_ballot(b)
+}
+
+/// Indexes into the `value` with the current lane id and returns for each lane
+/// if the corresponding bit is set.
+///
+/// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`.
+/// This means `inverse_ballot(ballot(b)) == b`.
+/// The converse, `ballot(inverse_ballot(value)) == value`, does not always hold, as inactive lanes are set to zero by `ballot`.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn inverse_ballot(value: u64) -> bool {
+    llvm_inverse_ballot(value)
+}
+
+/// Performs an arithmetic min reduction on the unsigned values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_umin(value: u32, strategy: u32) -> u32 {
+    llvm_wave_reduce_umin(value, strategy)
+}
+/// Performs an arithmetic min reduction on the floating-point values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_fmin(value: f32, strategy: u32) -> f32 {
+    llvm_wave_reduce_fmin(value, strategy)
+}
+/// Performs an arithmetic min reduction on the signed values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_min(value: i32, strategy: u32) -> i32 {
+    llvm_wave_reduce_min(value, strategy)
+}
+
+/// Performs an arithmetic max reduction on the unsigned values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_umax(value: u32, strategy: u32) -> u32 {
+    llvm_wave_reduce_umax(value, strategy)
+}
+/// Performs an arithmetic max reduction on the floating-point values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_fmax(value: f32, strategy: u32) -> f32 {
+    llvm_wave_reduce_fmax(value, strategy)
+}
+/// Performs an arithmetic max reduction on the signed values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_max(value: i32, strategy: u32) -> i32 {
+    llvm_wave_reduce_max(value, strategy)
+}
+
+/// Performs a bitwise AND reduction on the unsigned values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_and(value: u32, strategy: u32) -> u32 {
+    llvm_wave_reduce_and(value, strategy)
+}
+/// Performs a bitwise OR reduction on the unsigned values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_or(value: u32, strategy: u32) -> u32 {
+    llvm_wave_reduce_or(value, strategy)
+}
+/// Performs a bitwise XOR reduction on the unsigned values provided by each lane in the wavefront.
+///
+/// The `strategy` argument is a hint for the reduction strategy.
+/// - 0: Target default preference
+/// - 1: Iterative strategy
+/// - 2: DPP
+///
+/// If target does not support the DPP operations (e.g. gfx6/7), reduction will be performed using default iterative strategy.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_reduce_xor(value: u32, strategy: u32) -> u32 {
+    llvm_wave_reduce_xor(value, strategy)
+}
+
+// The following intrinsics can have multiple sizes
+
+/// Get `value` from the first active lane in the wavefront.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn readfirstlane_u32(value: u32) -> u32 {
+    llvm_readfirstlane_u32(value)
+}
+/// Get `value` from the first active lane in the wavefront.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn readfirstlane_u64(value: u64) -> u64 {
+    llvm_readfirstlane_u64(value)
+}
+/// Get `value` from the lane at index `lane` in the wavefront.
+///
+/// The lane argument must be uniform across the currently active threads
+/// of the current wavefront. Otherwise, the result is undefined.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 {
+    unsafe { llvm_readlane_u32(value, lane) }
+}
+/// Get `value` from the lane at index `lane` in the wavefront.
+///
+/// The lane argument must be uniform across the currently active threads
+/// of the current wavefront. Otherwise, the result is undefined.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 {
+    unsafe { llvm_readlane_u64(value, lane) }
+}
+/// Return `value` for the lane at index `lane` in the wavefront.
+/// Return `default` for all other lanes.
+///
+/// The value to write and lane select arguments must be uniform across the
+/// currently active threads of the current wavefront. Otherwise, the result is
+/// undefined.
+///
+/// `value` is the value returned by `lane`.
+/// `default` is the value returned by all lanes other than `lane`.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
+    unsafe { llvm_writelane_u32(value, lane, default) }
+}
+/// Return `value` for the lane at index `lane` in the wavefront.
+/// Return `default` for all other lanes.
+///
+/// The value to write and lane select arguments must be uniform across the
+/// currently active threads of the current wavefront. Otherwise, the result is
+/// undefined.
+///
+/// `value` is the value returned by `lane`.
+/// `default` is the value returned by all lanes other than `lane`.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
+    unsafe { llvm_writelane_u64(value, lane, default) }
+}
+
+/// Stop execution of the wavefront.
+///
+/// This usually signals the end of a successful execution.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn endpgm() -> ! {
+    llvm_endpgm()
+}
+
+/// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU.
+/// It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control.
+/// This operation is equivalent to a sequence of `v_mov_b32` operations.
+///
+/// `llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
+/// Should be equivalent to:
+/// ```asm
+/// v_mov_b32 <dest> <old>
+/// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
+/// ```
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn update_dpp(
+    old: u32,
+    src: u32,
+    dpp_ctrl: u32,
+    row_mask: u32,
+    bank_mask: u32,
+    bound_control: bool,
+) -> u32 {
+    unsafe { llvm_update_dpp(old, src, dpp_ctrl, row_mask, bank_mask, bound_control) }
+}
+
+/// Measures time based on a fixed frequency.
+///
+/// Provides a real-time clock counter that runs at constant speed (typically 100 MHz) independent of ALU clock speeds.
+/// The clock is consistent across the chip, so can be used for measuring between different wavefronts.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_memrealtime() -> u64 {
+    llvm_s_memrealtime()
+}
+
+/// Scatter data across all lanes in a wavefront.
+///
+/// Writes `value` to the lane `lane`.
+///
+/// Reading from inactive lanes returns `0`.
+/// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 {
+    unsafe { llvm_ds_permute(lane, value) }
+}
+/// Gather data across all lanes in a wavefront.
+///
+/// Returns the `value` given to `ds_bpermute` by lane `lane`.
+///
+/// Reading from inactive lanes returns `0`.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn ds_bpermute(lane: u32, value: u32) -> u32 {
+    unsafe { llvm_ds_bpermute(lane, value) }
+}
+/// Permute a 64-bit value.
+///
+/// `selector` selects between different patterns in which the 64-bit values represented by `src0` and `src1` are permuted.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn perm(src0: u32, src1: u32, selector: u32) -> u32 {
+    unsafe { llvm_perm(src0, src1, selector) }
+}
+
+// gfx10
+/// Performs an arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
+///
+/// The third and fourth inputs must be uniform across the current wavefront.
+/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlane16_u32(
+    old: u32,
+    src0: u32,
+    src1: u32,
+    src2: u32,
+    fi: bool,
+    bound_control: bool,
+) -> u32 {
+    unsafe { llvm_permlane16_u32(old, src0, src1, src2, fi, bound_control) }
+}
+
+// gfx10
+/// Performs an arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
+///
+/// The third and fourth inputs must be uniform across the current wavefront.
+/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlanex16_u32(
+    old: u32,
+    src0: u32,
+    src1: u32,
+    src2: u32,
+    fi: bool,
+    bound_control: bool,
+) -> u32 {
+    unsafe { llvm_permlanex16_u32(old, src0, src1, src2, fi, bound_control) }
+}
+
+/// Get the index of the current wavefront in the workgroup.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn s_get_waveid_in_workgroup() -> u32 {
+    llvm_s_get_waveid_in_workgroup()
+}
+
+// gfx11
+/// Swap `value` between upper and lower 32 lanes in a wavefront.
+///
+/// Does nothing for wave32.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlane64_u32(value: u32) -> u32 {
+    unsafe { llvm_permlane64_u32(value) }
+}
+
+// gfx12
+/// Performs an arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
+///
+/// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32 {
+    unsafe { llvm_permlane16_var(old, src0, src1, fi, bound_control) }
+}
+
+// gfx12
+/// Performs an arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
+///
+/// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlanex16_var(
+    old: u32,
+    src0: u32,
+    src1: u32,
+    fi: bool,
+    bound_control: bool,
+) -> u32 {
+    unsafe { llvm_permlanex16_var(old, src0, src1, fi, bound_control) }
+}
+
+/// Get the index of the current wavefront in the workgroup.
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub fn wave_id() -> u32 {
+    llvm_wave_id()
+}
+
+// gfx950
+/// Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
+///
+/// Swaps the values across lanes of the first two operands.
+/// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes).
+/// Returns a pair for the swapped registers.
+/// The first element of the return corresponds to the swapped element of the first argument.
+#[allow(improper_ctypes)]
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlane16_swap(
+    vdst_old: u32,
+    vsrc_src0: u32,
+    fi: bool,
+    bound_control: bool,
+) -> (u32, u32) {
+    unsafe { llvm_permlane16_swap(vdst_old, vsrc_src0, fi, bound_control) }
+}
+
+// gfx950
+/// Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
+///
+/// Swaps the values across lanes of the first two operands.
+/// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
+/// Returns a pair for the swapped registers.
+/// The first element of the return corresponds to the swapped element of the first argument.
+#[allow(improper_ctypes)]
+#[inline]
+#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+pub unsafe fn permlane32_swap(
+    vdst_old: u32,
+    vsrc_src0: u32,
+    fi: bool,
+    bound_control: bool,
+) -> (u32, u32) {
+    unsafe { llvm_permlane32_swap(vdst_old, vsrc_src0, fi, bound_control) }
+}
diff --git a/crates/core_arch/src/core_arch_docs.md b/crates/core_arch/src/core_arch_docs.md
index 6aea2b4618..7075945754 100644
--- a/crates/core_arch/src/core_arch_docs.md
+++ b/crates/core_arch/src/core_arch_docs.md
@@ -185,6 +185,7 @@ others at:
 * [`x86_64`]
 * [`arm`]
 * [`aarch64`]
+* [`amdgpu`]
 * [`riscv32`]
 * [`riscv64`]
 * [`mips`]
@@ -201,6 +202,7 @@ others at:
 [`x86_64`]: ../../core/arch/x86_64/index.html
 [`arm`]: ../../core/arch/arm/index.html
 [`aarch64`]: ../../core/arch/aarch64/index.html
+[`amdgpu`]: ../../core/arch/amdgpu/index.html
 [`riscv32`]: ../../core/arch/riscv32/index.html
 [`riscv64`]: ../../core/arch/riscv64/index.html
 [`mips`]: ../../core/arch/mips/index.html
diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs
index 2105cca1b4..fbb8562d31 100644
--- a/crates/core_arch/src/mod.rs
+++ b/crates/core_arch/src/mod.rs
@@ -274,6 +274,16 @@ pub mod arch {
         pub use crate::core_arch::nvptx::*;
     }
 
+    /// Platform-specific intrinsics for the `amdgpu` platform.
+    ///
+    /// See the [module documentation](../index.html) for more details.
+    #[cfg(any(target_arch = "amdgpu", doc))]
+    #[doc(cfg(target_arch = "amdgpu"))]
+    #[unstable(feature = "stdarch_amdgpu", issue = "149988")]
+    pub mod amdgpu {
+        pub use crate::core_arch::amdgpu::*;
+    }
+
     /// Platform-specific intrinsics for the `loongarch32` platform.
     ///
     /// See the [module documentation](../index.html) for more details.
@@ -349,6 +359,10 @@ mod powerpc64;
 #[doc(cfg(target_arch = "nvptx64"))]
 mod nvptx;
 
+#[cfg(any(target_arch = "amdgpu", doc))]
+#[doc(cfg(target_arch = "amdgpu"))]
+mod amdgpu;
+
 #[cfg(any(target_arch = "loongarch32", doc))]
 #[doc(cfg(target_arch = "loongarch32"))]
 mod loongarch32;