From ed3f7758b72abccf1ad9765ea1e66621f98edba0 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Tue, 25 Nov 2025 12:01:50 +0000 Subject: [PATCH 1/2] [AArch64][NFC] Add test for vector udiv scalarization --- .../AArch64/udiv-by-const-promoted-ops.ll | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll diff --git a/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll b/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll new file mode 100644 index 0000000000000..ad4bd6b44d9fd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +; This test verifies that udiv by constant works correctly even when type +; legalization promotes constant operands (e.g., i16 -> i32 in BUILD_VECTOR). +; This is a regression test for a bug where v16i16 would be split into two +; v8i16 operations during legalization, the i16 constants would be promoted +; to i32, and then the second DAGCombine round would fail to recognize the +; promoted constants when trying to convert udiv into mul+shift. 
+ +define <8 x i16> @udiv_v8i16_by_255(<8 x i16> %x) { +; CHECK-LABEL: udiv_v8i16_by_255: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32897 // =0x8081 +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #7 +; CHECK-NEXT: ret + %div = udiv <8 x i16> %x, splat (i16 255) + ret <8 x i16> %div +} + +define <16 x i16> @udiv_v16i16_by_255(<16 x i16> %x) { +; CHECK-LABEL: udiv_v16i16_by_255: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: umov w11, v1.h[0] +; CHECK-NEXT: mov w8, #258 // =0x102 +; CHECK-NEXT: movk w8, #257, lsl #16 +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: umov w12, v1.h[1] +; CHECK-NEXT: umov w13, v0.h[2] +; CHECK-NEXT: umov w14, v1.h[2] +; CHECK-NEXT: umull x9, w9, w8 +; CHECK-NEXT: umull x11, w11, w8 +; CHECK-NEXT: umull x10, w10, w8 +; CHECK-NEXT: umull x12, w12, w8 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: umull x13, w13, w8 +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: fmov s3, w11 +; CHECK-NEXT: lsr x12, x12, #32 +; CHECK-NEXT: umull x11, w14, w8 +; CHECK-NEXT: umov w14, v1.h[3] +; CHECK-NEXT: mov v2.h[1], w10 +; CHECK-NEXT: lsr x10, x13, #32 +; CHECK-NEXT: mov v3.h[1], w12 +; CHECK-NEXT: umov w12, v0.h[4] +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: umull x9, w9, w8 +; CHECK-NEXT: umull x13, w14, w8 +; CHECK-NEXT: umov w14, v1.h[4] +; CHECK-NEXT: mov v2.h[2], w10 +; CHECK-NEXT: mov v3.h[2], w11 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: umull x10, w12, w8 +; CHECK-NEXT: lsr x12, x13, #32 +; CHECK-NEXT: umov w11, v0.h[5] +; CHECK-NEXT: umull x13, w14, w8 +; CHECK-NEXT: umov w14, v1.h[5] +; CHECK-NEXT: mov v2.h[3], w9 +; CHECK-NEXT: lsr x9, x10, #32 +; CHECK-NEXT: mov v3.h[3], w12 +; CHECK-NEXT: lsr x12, x13, #32 +; CHECK-NEXT: umull x10, w11, w8 +; CHECK-NEXT: umov w11, v0.h[6] +; CHECK-NEXT: 
umull x13, w14, w8 +; CHECK-NEXT: umov w14, v1.h[6] +; CHECK-NEXT: mov v2.h[4], w9 +; CHECK-NEXT: umov w9, v0.h[7] +; CHECK-NEXT: mov v3.h[4], w12 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: lsr x12, x13, #32 +; CHECK-NEXT: umull x11, w11, w8 +; CHECK-NEXT: umull x13, w14, w8 +; CHECK-NEXT: umov w14, v1.h[7] +; CHECK-NEXT: mov v2.h[5], w10 +; CHECK-NEXT: umull x9, w9, w8 +; CHECK-NEXT: mov v3.h[5], w12 +; CHECK-NEXT: lsr x10, x11, #32 +; CHECK-NEXT: lsr x11, x13, #32 +; CHECK-NEXT: umull x8, w14, w8 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: mov v2.h[6], w10 +; CHECK-NEXT: mov v3.h[6], w11 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: mov v2.h[7], w9 +; CHECK-NEXT: mov v3.h[7], w8 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: ret + %div = udiv <16 x i16> %x, splat (i16 255) + ret <16 x i16> %div +} + +define <8 x i16> @urem_v8i16_by_255(<8 x i16> %x) { +; CHECK-LABEL: urem_v8i16_by_255: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #32897 // =0x8081 +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-NEXT: ushr v1.8h, v1.8h, #7 +; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ret + %rem = urem <8 x i16> %x, splat (i16 255) + ret <8 x i16> %rem +} + +define <16 x i16> @urem_v16i16_by_255(<16 x i16> %x) { +; CHECK-LABEL: urem_v16i16_by_255: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: mov w8, #258 // =0x102 +; CHECK-NEXT: umov w12, v1.h[0] +; CHECK-NEXT: movk w8, #257, lsl #16 +; CHECK-NEXT: umov w11, v1.h[1] +; CHECK-NEXT: umov w17, v0.h[2] +; CHECK-NEXT: umov w18, v1.h[2] +; CHECK-NEXT: umov w0, v0.h[3] +; CHECK-NEXT: umov w1, v1.h[3] +; CHECK-NEXT: umull x13, w9, w8 +; CHECK-NEXT: umull x14, w10, w8 +; CHECK-NEXT: umull x16, w12, w8 +; CHECK-NEXT: umull x15, w11, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: 
lsr x14, x14, #32 +; CHECK-NEXT: lsr x16, x16, #32 +; CHECK-NEXT: sub w13, w13, w13, lsl #8 +; CHECK-NEXT: sub w14, w14, w14, lsl #8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: sub w16, w16, w16, lsl #8 +; CHECK-NEXT: add w9, w9, w13 +; CHECK-NEXT: umull x13, w17, w8 +; CHECK-NEXT: add w10, w10, w14 +; CHECK-NEXT: umull x14, w18, w8 +; CHECK-NEXT: sub w15, w15, w15, lsl #8 +; CHECK-NEXT: add w12, w12, w16 +; CHECK-NEXT: fmov s2, w10 +; CHECK-NEXT: umov w16, v1.h[4] +; CHECK-NEXT: fmov s3, w12 +; CHECK-NEXT: add w11, w11, w15 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: umov w15, v0.h[4] +; CHECK-NEXT: umull x10, w0, w8 +; CHECK-NEXT: umull x12, w1, w8 +; CHECK-NEXT: mov v2.h[1], w9 +; CHECK-NEXT: sub w13, w13, w13, lsl #8 +; CHECK-NEXT: mov v3.h[1], w11 +; CHECK-NEXT: sub w14, w14, w14, lsl #8 +; CHECK-NEXT: umov w9, v0.h[5] +; CHECK-NEXT: add w13, w17, w13 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: umov w11, v1.h[5] +; CHECK-NEXT: add w14, w18, w14 +; CHECK-NEXT: lsr x12, x12, #32 +; CHECK-NEXT: umull x17, w15, w8 +; CHECK-NEXT: umull x18, w16, w8 +; CHECK-NEXT: mov v2.h[2], w13 +; CHECK-NEXT: sub w10, w10, w10, lsl #8 +; CHECK-NEXT: mov v3.h[2], w14 +; CHECK-NEXT: sub w12, w12, w12, lsl #8 +; CHECK-NEXT: umov w13, v0.h[6] +; CHECK-NEXT: lsr x14, x17, #32 +; CHECK-NEXT: add w10, w0, w10 +; CHECK-NEXT: umull x17, w9, w8 +; CHECK-NEXT: lsr x18, x18, #32 +; CHECK-NEXT: add w12, w1, w12 +; CHECK-NEXT: umull x0, w11, w8 +; CHECK-NEXT: mov v2.h[3], w10 +; CHECK-NEXT: umov w10, v1.h[6] +; CHECK-NEXT: sub w14, w14, w14, lsl #8 +; CHECK-NEXT: mov v3.h[3], w12 +; CHECK-NEXT: sub w18, w18, w18, lsl #8 +; CHECK-NEXT: lsr x17, x17, #32 +; CHECK-NEXT: add w14, w15, w14 +; CHECK-NEXT: umov w12, v0.h[7] +; CHECK-NEXT: add w15, w16, w18 +; CHECK-NEXT: lsr x18, x0, #32 +; CHECK-NEXT: umov w16, v1.h[7] +; CHECK-NEXT: mov v2.h[4], w14 +; CHECK-NEXT: umull x14, w13, w8 +; CHECK-NEXT: sub w17, w17, w17, lsl #8 +; CHECK-NEXT: mov v3.h[4], w15 
+; CHECK-NEXT: umull x15, w10, w8 +; CHECK-NEXT: sub w18, w18, w18, lsl #8 +; CHECK-NEXT: add w9, w9, w17 +; CHECK-NEXT: add w11, w11, w18 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: mov v2.h[5], w9 +; CHECK-NEXT: umull x9, w12, w8 +; CHECK-NEXT: mov v3.h[5], w11 +; CHECK-NEXT: umull x8, w16, w8 +; CHECK-NEXT: sub w11, w14, w14, lsl #8 +; CHECK-NEXT: sub w14, w15, w15, lsl #8 +; CHECK-NEXT: add w11, w13, w11 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w10, w10, w14 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: mov v2.h[6], w11 +; CHECK-NEXT: mov v3.h[6], w10 +; CHECK-NEXT: sub w9, w9, w9, lsl #8 +; CHECK-NEXT: sub w8, w8, w8, lsl #8 +; CHECK-NEXT: add w9, w12, w9 +; CHECK-NEXT: add w8, w16, w8 +; CHECK-NEXT: mov v2.h[7], w9 +; CHECK-NEXT: mov v3.h[7], w8 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: ret + %rem = urem <16 x i16> %x, splat (i16 255) + ret <16 x i16> %rem +} From 4db428ca02e61f0a61c738396e1228e06ba8a6db Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Tue, 25 Nov 2025 12:04:21 +0000 Subject: [PATCH 2/2] [DAGCombiner] Allow promoted constants when lowering vector UDIVs --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 23 ++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 7 +- llvm/test/CodeGen/AArch64/rem-by-const.ll | 65 ++---- .../AArch64/udiv-by-const-promoted-ops.ll | 191 +++--------------- 4 files changed, 56 insertions(+), 230 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 70950084ee6b7..caf45aeb38f6e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1065,8 +1065,9 @@ static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) { // Determines if it is a constant integer or a splat/build vector of constant // integers (and undefs). -// Do not permit build vector implicit truncation. 
-static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { +// Do not permit build vector implicit truncation unless AllowTruncation is set. +static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false, + bool AllowTruncation = false) { if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N)) return !(Const->isOpaque() && NoOpaques); if (N.getOpcode() != ISD::BUILD_VECTOR && N.getOpcode() != ISD::SPLAT_VECTOR) @@ -1076,8 +1077,13 @@ static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { if (Op.isUndef()) continue; ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op); - if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth || - (Const->isOpaque() && NoOpaques)) + if (!Const || (Const->isOpaque() && NoOpaques)) + return false; + // When AllowTruncation is true, allow constants that have been promoted + // during type legalization as long as the value fits in the target type. + if ((AllowTruncation && + Const->getAPIntValue().getActiveBits() > BitWidth) || + (!AllowTruncation && Const->getAPIntValue().getBitWidth() != BitWidth)) return false; } return true; @@ -5322,7 +5328,8 @@ SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N->getValueType(0); // fold (udiv x, (1 << c)) -> x >>u c - if (isConstantOrConstantVector(N1, /*NoOpaques*/ true)) { + if (isConstantOrConstantVector(N1, /*NoOpaques=*/true, + /*AllowTruncation=*/true)) { if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { AddToWorklist(LogBase2.getNode()); @@ -5336,7 +5343,8 @@ SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { SDValue N10 = N1.getOperand(0); - if (isConstantOrConstantVector(N10, /*NoOpaques*/ true)) { + if (isConstantOrConstantVector(N10, /*NoOpaques=*/true, + /*AllowTruncation=*/true)) { if (SDValue LogBase2 = BuildLogBase2(N10, DL)) { AddToWorklist(LogBase2.getNode()); @@ -5352,7 +5360,8 @@ SDValue 
DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { // fold (udiv x, c) -> alternate AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); - if (isConstantOrConstantVector(N1) && + if (isConstantOrConstantVector(N1, /*NoOpaques=*/false, + /*AllowTruncation=*/true) && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 783ec4b0bd211..e621f9d83a7b0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6738,7 +6738,9 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, auto BuildUDIVPattern = [&](ConstantSDNode *C) { if (C->isZero()) return false; - const APInt& Divisor = C->getAPIntValue(); + // Truncate the divisor to the target scalar type in case it was promoted + // during type legalization. + APInt Divisor = C->getAPIntValue().trunc(EltBits); SDValue PreShift, MagicFactor, NPQFactor, PostShift; @@ -6779,7 +6781,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, }; // Collect the shifts/magic values from each element. 
- if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern)) + if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern, /*AllowUndefs=*/false, + /*AllowTruncation=*/true)) return SDValue(); SDValue PreShift, PostShift, MagicFactor, NPQFactor; diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index a55aaeb62830f..ffaf045fa45c2 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -1433,35 +1433,13 @@ entry: define <4 x i8> @uv4i8_7(<4 x i8> %d, <4 x i8> %e) { ; CHECK-SD-LABEL: uv4i8_7: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-SD-NEXT: movk w8, #9362, lsl #16 -; CHECK-SD-NEXT: umov w9, v0.h[0] -; CHECK-SD-NEXT: umov w10, v0.h[1] -; CHECK-SD-NEXT: umov w13, v0.h[2] -; CHECK-SD-NEXT: umov w15, v0.h[3] -; CHECK-SD-NEXT: umull x11, w9, w8 -; CHECK-SD-NEXT: umull x12, w10, w8 -; CHECK-SD-NEXT: umull x14, w13, w8 -; CHECK-SD-NEXT: lsr x11, x11, #32 -; CHECK-SD-NEXT: umull x8, w15, w8 -; CHECK-SD-NEXT: lsr x12, x12, #32 -; CHECK-SD-NEXT: sub w11, w11, w11, lsl #3 -; CHECK-SD-NEXT: sub w12, w12, w12, lsl #3 -; CHECK-SD-NEXT: lsr x8, x8, #32 -; CHECK-SD-NEXT: add w9, w9, w11 -; CHECK-SD-NEXT: fmov s0, w9 -; CHECK-SD-NEXT: add w10, w10, w12 -; CHECK-SD-NEXT: lsr x9, x14, #32 -; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 -; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 -; CHECK-SD-NEXT: mov v0.h[1], w10 -; CHECK-SD-NEXT: add w8, w15, w8 -; CHECK-SD-NEXT: add w9, w13, w9 -; CHECK-SD-NEXT: mov v0.h[2], w9 -; CHECK-SD-NEXT: mov v0.h[3], w8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: movi v2.4h, #7 +; CHECK-SD-NEXT: dup v1.4h, w8 +; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: 
uv4i8_7: @@ -1508,32 +1486,13 @@ entry: define <4 x i8> @uv4i8_100(<4 x i8> %d, <4 x i8> %e) { ; CHECK-SD-LABEL: uv4i8_100: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, #23593 // =0x5c29 -; CHECK-SD-NEXT: mov w14, #100 // =0x64 +; CHECK-SD-NEXT: mov w8, #656 // =0x290 ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-SD-NEXT: movk w8, #655, lsl #16 -; CHECK-SD-NEXT: umov w9, v0.h[0] -; CHECK-SD-NEXT: umov w10, v0.h[1] -; CHECK-SD-NEXT: umov w12, v0.h[2] -; CHECK-SD-NEXT: umov w15, v0.h[3] -; CHECK-SD-NEXT: umull x11, w9, w8 -; CHECK-SD-NEXT: umull x13, w10, w8 -; CHECK-SD-NEXT: lsr x11, x11, #32 -; CHECK-SD-NEXT: lsr x13, x13, #32 -; CHECK-SD-NEXT: msub w9, w11, w14, w9 -; CHECK-SD-NEXT: umull x11, w12, w8 -; CHECK-SD-NEXT: msub w10, w13, w14, w10 -; CHECK-SD-NEXT: fmov s0, w9 -; CHECK-SD-NEXT: umull x8, w15, w8 -; CHECK-SD-NEXT: lsr x9, x11, #32 -; CHECK-SD-NEXT: mov v0.h[1], w10 -; CHECK-SD-NEXT: msub w9, w9, w14, w12 -; CHECK-SD-NEXT: lsr x8, x8, #32 -; CHECK-SD-NEXT: msub w8, w8, w14, w15 -; CHECK-SD-NEXT: mov v0.h[2], w9 -; CHECK-SD-NEXT: mov v0.h[3], w8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: movi v2.4h, #100 +; CHECK-SD-NEXT: dup v1.4h, w8 +; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv4i8_100: diff --git a/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll b/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll index ad4bd6b44d9fd..cdd238cdd81ff 100644 --- a/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll +++ b/llvm/test/CodeGen/AArch64/udiv-by-const-promoted-ops.ll @@ -25,74 +25,16 @@ define <8 x i16> @udiv_v8i16_by_255(<8 x i16> %x) { define <16 x i16> @udiv_v16i16_by_255(<16 x i16> %x) { ; CHECK-LABEL: udiv_v16i16_by_255: ; CHECK: // %bb.0: -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w11, v1.h[0] -; CHECK-NEXT: 
mov w8, #258 // =0x102 -; CHECK-NEXT: movk w8, #257, lsl #16 -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w12, v1.h[1] -; CHECK-NEXT: umov w13, v0.h[2] -; CHECK-NEXT: umov w14, v1.h[2] -; CHECK-NEXT: umull x9, w9, w8 -; CHECK-NEXT: umull x11, w11, w8 -; CHECK-NEXT: umull x10, w10, w8 -; CHECK-NEXT: umull x12, w12, w8 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: umull x13, w13, w8 -; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: fmov s3, w11 -; CHECK-NEXT: lsr x12, x12, #32 -; CHECK-NEXT: umull x11, w14, w8 -; CHECK-NEXT: umov w14, v1.h[3] -; CHECK-NEXT: mov v2.h[1], w10 -; CHECK-NEXT: lsr x10, x13, #32 -; CHECK-NEXT: mov v3.h[1], w12 -; CHECK-NEXT: umov w12, v0.h[4] -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: umull x9, w9, w8 -; CHECK-NEXT: umull x13, w14, w8 -; CHECK-NEXT: umov w14, v1.h[4] -; CHECK-NEXT: mov v2.h[2], w10 -; CHECK-NEXT: mov v3.h[2], w11 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: umull x10, w12, w8 -; CHECK-NEXT: lsr x12, x13, #32 -; CHECK-NEXT: umov w11, v0.h[5] -; CHECK-NEXT: umull x13, w14, w8 -; CHECK-NEXT: umov w14, v1.h[5] -; CHECK-NEXT: mov v2.h[3], w9 -; CHECK-NEXT: lsr x9, x10, #32 -; CHECK-NEXT: mov v3.h[3], w12 -; CHECK-NEXT: lsr x12, x13, #32 -; CHECK-NEXT: umull x10, w11, w8 -; CHECK-NEXT: umov w11, v0.h[6] -; CHECK-NEXT: umull x13, w14, w8 -; CHECK-NEXT: umov w14, v1.h[6] -; CHECK-NEXT: mov v2.h[4], w9 -; CHECK-NEXT: umov w9, v0.h[7] -; CHECK-NEXT: mov v3.h[4], w12 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: lsr x12, x13, #32 -; CHECK-NEXT: umull x11, w11, w8 -; CHECK-NEXT: umull x13, w14, w8 -; CHECK-NEXT: umov w14, v1.h[7] -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: umull x9, w9, w8 -; CHECK-NEXT: mov v3.h[5], w12 -; CHECK-NEXT: lsr x10, x11, #32 -; CHECK-NEXT: lsr x11, x13, #32 -; CHECK-NEXT: umull x8, w14, w8 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: mov v2.h[6], w10 -; CHECK-NEXT: mov v3.h[6], w11 -; CHECK-NEXT: 
lsr x8, x8, #32 -; CHECK-NEXT: mov v2.h[7], w9 -; CHECK-NEXT: mov v3.h[7], w8 -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov w8, #32897 // =0x8081 +; CHECK-NEXT: dup v2.8h, w8 +; CHECK-NEXT: umull2 v3.4s, v0.8h, v2.8h +; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h +; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v3.8h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v4.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #7 +; CHECK-NEXT: ushr v1.8h, v1.8h, #7 ; CHECK-NEXT: ret %div = udiv <16 x i16> %x, splat (i16 255) ret <16 x i16> %div @@ -117,106 +59,19 @@ define <8 x i16> @urem_v8i16_by_255(<8 x i16> %x) { define <16 x i16> @urem_v16i16_by_255(<16 x i16> %x) { ; CHECK-LABEL: urem_v16i16_by_255: ; CHECK: // %bb.0: -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: mov w8, #258 // =0x102 -; CHECK-NEXT: umov w12, v1.h[0] -; CHECK-NEXT: movk w8, #257, lsl #16 -; CHECK-NEXT: umov w11, v1.h[1] -; CHECK-NEXT: umov w17, v0.h[2] -; CHECK-NEXT: umov w18, v1.h[2] -; CHECK-NEXT: umov w0, v0.h[3] -; CHECK-NEXT: umov w1, v1.h[3] -; CHECK-NEXT: umull x13, w9, w8 -; CHECK-NEXT: umull x14, w10, w8 -; CHECK-NEXT: umull x16, w12, w8 -; CHECK-NEXT: umull x15, w11, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: lsr x16, x16, #32 -; CHECK-NEXT: sub w13, w13, w13, lsl #8 -; CHECK-NEXT: sub w14, w14, w14, lsl #8 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: sub w16, w16, w16, lsl #8 -; CHECK-NEXT: add w9, w9, w13 -; CHECK-NEXT: umull x13, w17, w8 -; CHECK-NEXT: add w10, w10, w14 -; CHECK-NEXT: umull x14, w18, w8 -; CHECK-NEXT: sub w15, w15, w15, lsl #8 -; CHECK-NEXT: add w12, w12, w16 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w16, v1.h[4] -; CHECK-NEXT: fmov s3, w12 -; CHECK-NEXT: add w11, w11, w15 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: umov w15, v0.h[4] -; CHECK-NEXT: umull x10, w0, w8 -; 
CHECK-NEXT: umull x12, w1, w8 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: sub w13, w13, w13, lsl #8 -; CHECK-NEXT: mov v3.h[1], w11 -; CHECK-NEXT: sub w14, w14, w14, lsl #8 -; CHECK-NEXT: umov w9, v0.h[5] -; CHECK-NEXT: add w13, w17, w13 -; CHECK-NEXT: lsr x10, x10, #32 -; CHECK-NEXT: umov w11, v1.h[5] -; CHECK-NEXT: add w14, w18, w14 -; CHECK-NEXT: lsr x12, x12, #32 -; CHECK-NEXT: umull x17, w15, w8 -; CHECK-NEXT: umull x18, w16, w8 -; CHECK-NEXT: mov v2.h[2], w13 -; CHECK-NEXT: sub w10, w10, w10, lsl #8 -; CHECK-NEXT: mov v3.h[2], w14 -; CHECK-NEXT: sub w12, w12, w12, lsl #8 -; CHECK-NEXT: umov w13, v0.h[6] -; CHECK-NEXT: lsr x14, x17, #32 -; CHECK-NEXT: add w10, w0, w10 -; CHECK-NEXT: umull x17, w9, w8 -; CHECK-NEXT: lsr x18, x18, #32 -; CHECK-NEXT: add w12, w1, w12 -; CHECK-NEXT: umull x0, w11, w8 -; CHECK-NEXT: mov v2.h[3], w10 -; CHECK-NEXT: umov w10, v1.h[6] -; CHECK-NEXT: sub w14, w14, w14, lsl #8 -; CHECK-NEXT: mov v3.h[3], w12 -; CHECK-NEXT: sub w18, w18, w18, lsl #8 -; CHECK-NEXT: lsr x17, x17, #32 -; CHECK-NEXT: add w14, w15, w14 -; CHECK-NEXT: umov w12, v0.h[7] -; CHECK-NEXT: add w15, w16, w18 -; CHECK-NEXT: lsr x18, x0, #32 -; CHECK-NEXT: umov w16, v1.h[7] -; CHECK-NEXT: mov v2.h[4], w14 -; CHECK-NEXT: umull x14, w13, w8 -; CHECK-NEXT: sub w17, w17, w17, lsl #8 -; CHECK-NEXT: mov v3.h[4], w15 -; CHECK-NEXT: umull x15, w10, w8 -; CHECK-NEXT: sub w18, w18, w18, lsl #8 -; CHECK-NEXT: add w9, w9, w17 -; CHECK-NEXT: add w11, w11, w18 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: mov v2.h[5], w9 -; CHECK-NEXT: umull x9, w12, w8 -; CHECK-NEXT: mov v3.h[5], w11 -; CHECK-NEXT: umull x8, w16, w8 -; CHECK-NEXT: sub w11, w14, w14, lsl #8 -; CHECK-NEXT: sub w14, w15, w15, lsl #8 -; CHECK-NEXT: add w11, w13, w11 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w10, w10, w14 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: mov v2.h[6], w11 -; CHECK-NEXT: mov v3.h[6], w10 -; CHECK-NEXT: sub w9, w9, w9, lsl #8 -; CHECK-NEXT: sub w8, 
w8, w8, lsl #8 -; CHECK-NEXT: add w9, w12, w9 -; CHECK-NEXT: add w8, w16, w8 -; CHECK-NEXT: mov v2.h[7], w9 -; CHECK-NEXT: mov v3.h[7], w8 -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov w8, #32897 // =0x8081 +; CHECK-NEXT: dup v2.8h, w8 +; CHECK-NEXT: umull2 v3.4s, v0.8h, v2.8h +; CHECK-NEXT: umull v4.4s, v0.4h, v2.4h +; CHECK-NEXT: umull2 v5.4s, v1.8h, v2.8h +; CHECK-NEXT: umull v2.4s, v1.4h, v2.4h +; CHECK-NEXT: uzp2 v3.8h, v4.8h, v3.8h +; CHECK-NEXT: movi v4.2d, #0xff00ff00ff00ff +; CHECK-NEXT: uzp2 v2.8h, v2.8h, v5.8h +; CHECK-NEXT: ushr v3.8h, v3.8h, #7 +; CHECK-NEXT: ushr v2.8h, v2.8h, #7 +; CHECK-NEXT: mls v0.8h, v3.8h, v4.8h +; CHECK-NEXT: mls v1.8h, v2.8h, v4.8h ; CHECK-NEXT: ret %rem = urem <16 x i16> %x, splat (i16 255) ret <16 x i16> %rem