From 2850ba8793728310cda396014c2468e43e5f5161 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Fri, 28 Nov 2025 14:21:13 +0000 Subject: [PATCH 1/2] [AArch64][NFC] Add test for vector sdiv scalarization --- .../AArch64/sdiv-by-const-promoted-ops.ll | 318 ++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll diff --git a/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll b/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll new file mode 100644 index 0000000000000..c6b1ab871d81f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +define <8 x i16> @sdiv_v8i16_by_7(<8 x i16> %x) { +; CHECK-LABEL: sdiv_v8i16_by_7: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #1 +; CHECK-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-NEXT: ret + %div = sdiv <8 x i16> %x, + ret <8 x i16> %div +} + +define <16 x i16> @sdiv_v16i16_by_7(<16 x i16> %x) { +; CHECK-LABEL: sdiv_v16i16_by_7: +; CHECK: // %bb.0: +; CHECK-NEXT: smov x11, v0.h[1] +; CHECK-NEXT: smov x10, v0.h[0] +; CHECK-NEXT: mov x8, #-56173 // =0xffffffffffff2493 +; CHECK-NEXT: smov x13, v0.h[3] +; CHECK-NEXT: smov x14, v1.h[1] +; CHECK-NEXT: movk x8, #37449, lsl #16 +; CHECK-NEXT: smov x16, v1.h[0] +; CHECK-NEXT: smov w12, v0.h[1] +; CHECK-NEXT: smov w15, v0.h[0] +; CHECK-NEXT: smov x18, v1.h[2] +; CHECK-NEXT: smov w0, v0.h[3] +; CHECK-NEXT: smov w1, v1.h[1] +; CHECK-NEXT: smull x11, w11, w8 +; CHECK-NEXT: smov w2, v1.h[0] +; CHECK-NEXT: smov x9, v0.h[2] +; CHECK-NEXT: smull x10, w10, w8 +; CHECK-NEXT: smov w17, v0.h[2] +; CHECK-NEXT: smov w3, v1.h[2] +; CHECK-NEXT: smull x13, w13, w8 +; CHECK-NEXT: smull x14, w14, w8 +; CHECK-NEXT: add x12, x12, x11, lsr #32 +; CHECK-NEXT: smull x16, w16, w8 +; CHECK-NEXT: add x10, x15, x10, lsr #32 +; CHECK-NEXT: smull x15, w18, w8 +; CHECK-NEXT: add x11, x0, x13, lsr #32 +; CHECK-NEXT: smov x0, v0.h[4] +; CHECK-NEXT: add x13, x1, x14, lsr #32 +; CHECK-NEXT: asr w18, w10, #2 +; CHECK-NEXT: smull x9, w9, w8 +; CHECK-NEXT: add x14, x2, x16, lsr #32 +; CHECK-NEXT: asr w16, w12, #2 +; CHECK-NEXT: smov x2, v1.h[3] +; CHECK-NEXT: add w18, w18, w10, lsr #31 +; CHECK-NEXT: add x15, x3, x15, lsr #32 +; CHECK-NEXT: smov w10, v0.h[5] +; CHECK-NEXT: add w12, w16, w12, lsr #31 +; CHECK-NEXT: asr w16, w14, #2 +; CHECK-NEXT: add x9, x17, x9, lsr #32 +; CHECK-NEXT: fmov s2, w18 +; CHECK-NEXT: smov w17, v0.h[4] +; CHECK-NEXT: smull x0, w0, w8 +; CHECK-NEXT: add w14, w16, w14, lsr #31 +; CHECK-NEXT: asr w16, w13, #2 +; CHECK-NEXT: asr w1, w9, #2 +; CHECK-NEXT: smov x18, v0.h[5] +; CHECK-NEXT: fmov s3, w14 +; CHECK-NEXT: mov v2.h[1], w12 +; CHECK-NEXT: add w12, w16, w13, lsr #31 +; CHECK-NEXT: smov w13, v1.h[3] +; CHECK-NEXT: smov x14, v1.h[4] +; CHECK-NEXT: smull x16, w2, w8 +; CHECK-NEXT: add w1, w1, w9, lsr #31 +; CHECK-NEXT: add x17, x17, x0, lsr #32 +; CHECK-NEXT: asr w0, w15, #2 +; CHECK-NEXT: mov v3.h[1], w12 +; CHECK-NEXT: smov w12, v1.h[4] +; CHECK-NEXT: smull x18, w18, w8 +; CHECK-NEXT: mov v2.h[2], w1 +; CHECK-NEXT: asr w1, w11, #2 +; CHECK-NEXT: add w15, w0, w15, lsr #31 +; CHECK-NEXT: add x13, x13, x16, lsr #32 +; CHECK-NEXT: smov x16, v1.h[5] +; CHECK-NEXT: smull x14, w14, w8 +; CHECK-NEXT: add w11, w1, w11, lsr #31 +; CHECK-NEXT: smov x0, v0.h[6] +; CHECK-NEXT: add x10, x10, x18, lsr #32 +; CHECK-NEXT: asr w1, w13, #2 +; CHECK-NEXT: mov v3.h[2], w15 +; CHECK-NEXT: smov w15, v1.h[5] +; CHECK-NEXT: add x12, x12, x14, lsr #32 +; CHECK-NEXT: mov v2.h[3], w11 +; CHECK-NEXT: asr w11, w17, #2 +; CHECK-NEXT: add w13, w1, w13, lsr #31 +; CHECK-NEXT: smull x16, w16, w8 +; CHECK-NEXT: smov x14, v1.h[6] +; CHECK-NEXT: asr w18, w12, #2 +; CHECK-NEXT: add w11, w11, w17, lsr #31 +; CHECK-NEXT: smov w9, v0.h[6] +; CHECK-NEXT: mov v3.h[3], w13 +; CHECK-NEXT: smull x17, w0, w8 +; CHECK-NEXT: smov x0, v1.h[7] +; CHECK-NEXT: add x13, x15, x16, lsr #32 +; CHECK-NEXT: add w12, w18, w12, lsr #31 +; CHECK-NEXT: smov w16, v1.h[6] +; CHECK-NEXT: mov v2.h[4], w11 +; CHECK-NEXT: smov x11, v0.h[7] +; CHECK-NEXT: smull x14, w14, w8 +; CHECK-NEXT: asr w15, w10, #2 +; CHECK-NEXT: asr w18, w13, #2 +; CHECK-NEXT: smov w1, v0.h[7] +; CHECK-NEXT: mov v3.h[4], w12 +; CHECK-NEXT: add x9, x9, x17, lsr #32 +; CHECK-NEXT: add w10, w15, w10, lsr #31 +; CHECK-NEXT: add w12, w18, w13, lsr #31 +; CHECK-NEXT: add x13, x16, x14, lsr #32 +; CHECK-NEXT: smov w14, v1.h[7] +; CHECK-NEXT: smull x11, w11, w8 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: mov v2.h[5], w10 +; CHECK-NEXT: asr w10, w9, #2 +; CHECK-NEXT: mov v3.h[5], w12 +; CHECK-NEXT: asr w12, w13, #2 +; CHECK-NEXT: add w9, w10, w9, lsr #31 +; CHECK-NEXT: add x10, x1, x11, lsr #32 +; CHECK-NEXT: add w11, w12, w13, lsr #31 +; CHECK-NEXT: add x8, x14, x8, lsr #32 +; CHECK-NEXT: mov v2.h[6], w9 +; CHECK-NEXT: asr w9, w10, #2 +; CHECK-NEXT: mov v3.h[6], w11 +; CHECK-NEXT: asr w11, w8, #2 +; CHECK-NEXT: add w9, w9, w10, lsr #31 +; CHECK-NEXT: add w8, w11, w8, lsr #31 +; CHECK-NEXT: mov v2.h[7], w9 +; CHECK-NEXT: mov v3.h[7], w8 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: ret + %div = sdiv <16 x i16> %x, + ret <16 x i16> %div +} + +define <8 x i16> @srem_v8i16_by_7(<8 x i16> %x) { +; CHECK-LABEL: srem_v8i16_by_7: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-NEXT: movi v2.8h, #7 +; CHECK-NEXT: sshr v1.8h, v1.8h, #1 +; CHECK-NEXT: usra v1.8h, v1.8h, #15 +; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ret + %rem = srem <8 x i16> %x, + ret <8 x i16> %rem +} + +define <16 x i16> @srem_v16i16_by_7(<16 x i16> %x) { +; CHECK-LABEL: srem_v16i16_by_7: +; CHECK: // %bb.0: +; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: smov x10, v0.h[0] +; CHECK-NEXT: smov x9, v0.h[1] +; CHECK-NEXT: mov x8, #-56173 // =0xffffffffffff2493 +; CHECK-NEXT: smov x15, v0.h[2] +; CHECK-NEXT: movk x8, #37449, lsl #16 +; CHECK-NEXT: smov x16, v0.h[3] +; CHECK-NEXT: smov w13, v0.h[0] +; CHECK-NEXT: smov w12, v0.h[1] +; CHECK-NEXT: smov x18, v1.h[0] +; CHECK-NEXT: smov w17, v0.h[2] +; CHECK-NEXT: smov w14, v0.h[3] +; CHECK-NEXT: smov w2, v1.h[0] +; CHECK-NEXT: smull x10, w10, w8 +; CHECK-NEXT: smov x4, v1.h[1] +; CHECK-NEXT: smov x0, v0.h[4] +; CHECK-NEXT: smull x9, w9, w8 +; CHECK-NEXT: smov w11, v0.h[4] +; CHECK-NEXT: smull x15, w15, w8 +; CHECK-NEXT: smull x1, w16, w8 +; CHECK-NEXT: add x10, x13, x10, lsr #32 +; CHECK-NEXT: smov x16, v0.h[5] +; CHECK-NEXT: add x3, x12, x9, lsr #32 +; CHECK-NEXT: smull x18, w18, w8 +; CHECK-NEXT: smov w9, v0.h[5] +; CHECK-NEXT: add x15, x17, x15, lsr #32 +; CHECK-NEXT: asr w6, w10, #2 +; CHECK-NEXT: smull x4, w4, w8 +; CHECK-NEXT: asr w5, w3, #2 +; CHECK-NEXT: add x1, x14, x1, lsr #32 +; CHECK-NEXT: smull x0, w0, w8 +; CHECK-NEXT: add x7, x2, x18, lsr #32 +; CHECK-NEXT: asr w19, w15, #2 +; CHECK-NEXT: smov w18, v1.h[1] +; CHECK-NEXT: add w10, w6, w10, lsr #31 +; CHECK-NEXT: add w3, w5, w3, lsr #31 +; CHECK-NEXT: asr w5, w1, #2 +; CHECK-NEXT: add w6, w19, w15, lsr #31 +; CHECK-NEXT: asr w15, w7, #2 +; CHECK-NEXT: sub w19, w10, w10, lsl #3 +; CHECK-NEXT: add w1, w5, w1, lsr #31 +; CHECK-NEXT: smov x5, v1.h[2] +; CHECK-NEXT: sub w3, w3, w3, lsl #3 +; CHECK-NEXT: add x4, x18, x4, lsr #32 +; CHECK-NEXT: add w7, w15, w7, lsr #31 +; CHECK-NEXT: add w13, w13, w19 +; CHECK-NEXT: sub w6, w6, w6, lsl #3 +; CHECK-NEXT: sub w1, w1, w1, lsl #3 +; CHECK-NEXT: fmov s2, w13 +; CHECK-NEXT: add w12, w12, w3 +; CHECK-NEXT: add x3, x11, x0, lsr #32 +; CHECK-NEXT: smov w0, v1.h[2] +; CHECK-NEXT: asr w13, w4, #2 +; CHECK-NEXT: smull x5, w5, w8 +; CHECK-NEXT: sub w7, w7, w7, lsl #3 +; CHECK-NEXT: add w6, w17, w6 +; CHECK-NEXT: smov w17, v1.h[3] +; CHECK-NEXT: mov v2.h[1], w12 +; CHECK-NEXT: add w13, w13, w4, lsr #31 +; CHECK-NEXT: smov x4, v1.h[3] +; CHECK-NEXT: add w2, w2, w7 +; CHECK-NEXT: smov x7, v1.h[4] +; CHECK-NEXT: add w14, w14, w1 +; CHECK-NEXT: add x5, x0, x5, lsr #32 +; CHECK-NEXT: sub w13, w13, w13, lsl #3 +; CHECK-NEXT: fmov s3, w2 +; CHECK-NEXT: smull x2, w16, w8 +; CHECK-NEXT: smov w16, v1.h[4] +; CHECK-NEXT: asr w12, w3, #2 +; CHECK-NEXT: mov v2.h[2], w6 +; CHECK-NEXT: asr w6, w5, #2 +; CHECK-NEXT: smull x4, w4, w8 +; CHECK-NEXT: add w18, w18, w13 +; CHECK-NEXT: smov x15, v0.h[6] +; CHECK-NEXT: add w3, w12, w3, lsr #31 +; CHECK-NEXT: add w5, w6, w5, lsr #31 +; CHECK-NEXT: smull x6, w7, w8 +; CHECK-NEXT: mov v3.h[1], w18 +; CHECK-NEXT: add x4, x17, x4, lsr #32 +; CHECK-NEXT: add x1, x9, x2, lsr #32 +; CHECK-NEXT: sub w3, w3, w3, lsl #3 +; CHECK-NEXT: sub w18, w5, w5, lsl #3 +; CHECK-NEXT: smov x5, v1.h[5] +; CHECK-NEXT: mov v2.h[3], w14 +; CHECK-NEXT: asr w2, w4, #2 +; CHECK-NEXT: smov w10, v0.h[6] +; CHECK-NEXT: smov x13, v0.h[7] +; CHECK-NEXT: add w18, w0, w18 +; CHECK-NEXT: add x0, x16, x6, lsr #32 +; CHECK-NEXT: add w11, w11, w3 +; CHECK-NEXT: add w2, w2, w4, lsr #31 +; CHECK-NEXT: smov w4, v1.h[5] +; CHECK-NEXT: mov v3.h[2], w18 +; CHECK-NEXT: asr w18, w0, #2 +; CHECK-NEXT: smull x5, w5, w8 +; CHECK-NEXT: smov x3, v1.h[7] +; CHECK-NEXT: sub w14, w2, w2, lsl #3 +; CHECK-NEXT: smov x2, v1.h[6] +; CHECK-NEXT: smull x15, w15, w8 +; CHECK-NEXT: add w18, w18, w0, lsr #31 +; CHECK-NEXT: asr w6, w1, #2 +; CHECK-NEXT: mov v2.h[4], w11 +; CHECK-NEXT: add w14, w17, w14 +; CHECK-NEXT: add x17, x4, x5, lsr #32 +; CHECK-NEXT: smov w12, v0.h[7] +; CHECK-NEXT: mov v3.h[3], w14 +; CHECK-NEXT: sub w14, w18, w18, lsl #3 +; CHECK-NEXT: smov w18, v1.h[6] +; CHECK-NEXT: smull x2, w2, w8 +; CHECK-NEXT: asr w0, w17, #2 +; CHECK-NEXT: add w1, w6, w1, lsr #31 +; CHECK-NEXT: add w11, w16, w14 +; CHECK-NEXT: add x15, x10, x15, lsr #32 +; CHECK-NEXT: smull x13, w13, w8 +; CHECK-NEXT: add w16, w0, w17, lsr #31 +; CHECK-NEXT: smov w17, v1.h[7] +; CHECK-NEXT: smull x8, w3, w8 +; CHECK-NEXT: mov v3.h[4], w11 +; CHECK-NEXT: add x11, x18, x2, lsr #32 +; CHECK-NEXT: sub w14, w1, w1, lsl #3 +; CHECK-NEXT: asr w0, w15, #2 +; CHECK-NEXT: sub w16, w16, w16, lsl #3 +; CHECK-NEXT: add x13, x12, x13, lsr #32 +; CHECK-NEXT: asr w1, w11, #2 +; CHECK-NEXT: add w9, w9, w14 +; CHECK-NEXT: add w14, w0, w15, lsr #31 +; CHECK-NEXT: add w15, w4, w16 +; CHECK-NEXT: add x8, x17, x8, lsr #32 +; CHECK-NEXT: add w11, w1, w11, lsr #31 +; CHECK-NEXT: mov v2.h[5], w9 +; CHECK-NEXT: mov v3.h[5], w15 +; CHECK-NEXT: sub w9, w14, w14, lsl #3 +; CHECK-NEXT: asr w14, w13, #2 +; CHECK-NEXT: asr w15, w8, #2 +; CHECK-NEXT: sub w11, w11, w11, lsl #3 +; CHECK-NEXT: add w9, w10, w9 +; CHECK-NEXT: add w10, w14, w13, lsr #31 +; CHECK-NEXT: add w8, w15, w8, lsr #31 +; CHECK-NEXT: add w11, w18, w11 +; CHECK-NEXT: mov v2.h[6], w9 +; CHECK-NEXT: mov v3.h[6], w11 +; CHECK-NEXT: sub w9, w10, w10, lsl #3 +; CHECK-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-NEXT: add w9, w12, w9 +; CHECK-NEXT: add w8, w17, w8 +; CHECK-NEXT: mov v2.h[7], w9 +; CHECK-NEXT: mov v3.h[7], w8 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %rem = srem <16 x i16> %x, + ret <16 x i16> %rem +} From a758c8a3b76905deb31671aa2edc196047ac8f22 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Fri, 28 Nov 2025 14:22:01 +0000 Subject: [PATCH 2/2] [DAGCombiner] Allow promoted constants when lowering vector SDIVs --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 8 +- llvm/test/CodeGen/AArch64/rem-by-const.ll | 89 +----- .../AArch64/sdiv-by-const-promoted-ops.ll | 295 ++---------------- 4 files changed, 53 insertions(+), 345 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5377f22e5c61f..0f539655ff0ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5186,7 +5186,8 @@ static bool isDivisorPowerOfTwo(SDValue Divisor) { return false; }; - return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo); + return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo, /*AllowUndefs=*/false, + /*AllowTruncation=*/true); } SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { @@ -5250,7 +5251,8 @@ SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { // alternate sequence. Targets may check function attributes for size/speed // trade-offs. AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); - if (isConstantOrConstantVector(N1) && + if (isConstantOrConstantVector(N1, /*NoOpaques=*/false, + /*AllowTruncation=*/true) && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1e71937372159..e8110ed549653 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6562,8 +6562,9 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, auto BuildSDIVPattern = [&](ConstantSDNode *C) { if (C->isZero()) return false; - - const APInt &Divisor = C->getAPIntValue(); + // Truncate the divisor to the target scalar type in case it was promoted + // during type legalization. + APInt Divisor = C->getAPIntValue().trunc(EltBits); SignedDivisionByConstantInfo magics = SignedDivisionByConstantInfo::get(Divisor); int NumeratorFactor = 0; int ShiftMask = -1; @@ -6593,7 +6594,8 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); // Collect the shifts / magic values from each element. - if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern)) + if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern, /*AllowUndefs=*/false, + /*AllowTruncation=*/true)) return SDValue(); SDValue MagicFactor, Factor, Shift, ShiftMask; diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index ffaf045fa45c2..c19ded18c94c9 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -893,46 +893,15 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) { ; CHECK-SD-LABEL: sv4i8_7: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-SD-NEXT: mov x8, #-56173 // =0xffffffffffff2493 -; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: movi v2.4h, #7 +; CHECK-SD-NEXT: dup v1.4h, w8 ; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-SD-NEXT: smov x10, v0.h[0] -; CHECK-SD-NEXT: smov x9, v0.h[1] -; CHECK-SD-NEXT: smov w12, v0.h[0] -; CHECK-SD-NEXT: smov w11, v0.h[1] -; CHECK-SD-NEXT: smov x13, v0.h[2] -; CHECK-SD-NEXT: smov w14, v0.h[2] -; CHECK-SD-NEXT: smov x17, v0.h[3] -; CHECK-SD-NEXT: smull x10, w10, w8 -; CHECK-SD-NEXT: smull x9, w9, w8 -; CHECK-SD-NEXT: smull x13, w13, w8 -; CHECK-SD-NEXT: add x10, x12, x10, lsr #32 -; CHECK-SD-NEXT: smull x8, w17, w8 -; CHECK-SD-NEXT: add x9, x11, x9, lsr #32 -; CHECK-SD-NEXT: asr w16, w10, #2 -; CHECK-SD-NEXT: add x13, x14, x13, lsr #32 -; CHECK-SD-NEXT: asr w15, w9, #2 -; CHECK-SD-NEXT: add w10, w16, w10, lsr #31 -; CHECK-SD-NEXT: asr w16, w13, #2 -; CHECK-SD-NEXT: add w9, w15, w9, lsr #31 -; CHECK-SD-NEXT: smov w15, v0.h[3] -; CHECK-SD-NEXT: sub w10, w10, w10, lsl #3 -; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 -; CHECK-SD-NEXT: add w10, w12, w10 -; CHECK-SD-NEXT: fmov s0, w10 -; CHECK-SD-NEXT: add w9, w11, w9 -; CHECK-SD-NEXT: add w10, w16, w13, lsr #31 -; CHECK-SD-NEXT: add x8, x15, x8, lsr #32 -; CHECK-SD-NEXT: mov v0.h[1], w9 -; CHECK-SD-NEXT: sub w9, w10, w10, lsl #3 -; CHECK-SD-NEXT: asr w10, w8, #2 -; CHECK-SD-NEXT: add w9, w14, w9 -; CHECK-SD-NEXT: add w8, w10, w8, lsr #31 -; CHECK-SD-NEXT: mov v0.h[2], w9 -; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 -; CHECK-SD-NEXT: add w8, w15, w8 -; CHECK-SD-NEXT: mov v0.h[3], w8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #17 +; CHECK-SD-NEXT: xtn v1.4h, v1.4s +; CHECK-SD-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sv4i8_7: @@ -978,39 +947,15 @@ define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) { ; CHECK-SD-LABEL: sv4i8_100: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: mov w14, #100 // =0x64 -; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: sshr v1.4h, v0.4h, #8 -; CHECK-SD-NEXT: smov x9, v1.h[0] -; CHECK-SD-NEXT: smov x10, v1.h[1] -; CHECK-SD-NEXT: smov x11, v1.h[2] -; CHECK-SD-NEXT: smov w12, v1.h[0] -; CHECK-SD-NEXT: smov x13, v1.h[3] -; CHECK-SD-NEXT: smov w15, v1.h[1] -; CHECK-SD-NEXT: smull x9, w9, w8 -; CHECK-SD-NEXT: smull x10, w10, w8 -; CHECK-SD-NEXT: smull x11, w11, w8 -; CHECK-SD-NEXT: asr x9, x9, #37 -; CHECK-SD-NEXT: smull x8, w13, w8 -; CHECK-SD-NEXT: asr x10, x10, #37 -; CHECK-SD-NEXT: add w9, w9, w9, lsr #31 -; CHECK-SD-NEXT: asr x11, x11, #37 -; CHECK-SD-NEXT: add w10, w10, w10, lsr #31 -; CHECK-SD-NEXT: asr x8, x8, #37 -; CHECK-SD-NEXT: msub w9, w9, w14, w12 -; CHECK-SD-NEXT: msub w10, w10, w14, w15 -; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 -; CHECK-SD-NEXT: fmov s0, w9 -; CHECK-SD-NEXT: add w9, w11, w11, lsr #31 -; CHECK-SD-NEXT: smov w11, v1.h[2] -; CHECK-SD-NEXT: msub w9, w9, w14, w11 -; CHECK-SD-NEXT: mov v0.h[1], w10 -; CHECK-SD-NEXT: smov w10, v1.h[3] -; CHECK-SD-NEXT: msub w8, w8, w14, w10 -; CHECK-SD-NEXT: mov v0.h[2], w9 -; CHECK-SD-NEXT: mov v0.h[3], w8 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: mov w8, #5243 // =0x147b +; CHECK-SD-NEXT: movi v2.4h, #100 +; CHECK-SD-NEXT: dup v1.4h, w8 +; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #19 +; CHECK-SD-NEXT: xtn v1.4h, v1.4s +; CHECK-SD-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sv4i8_100: diff --git a/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll b/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll index c6b1ab871d81f..053cbc0616454 100644 --- a/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll +++ b/llvm/test/CodeGen/AArch64/sdiv-by-const-promoted-ops.ll @@ -19,122 +19,18 @@ define <8 x i16> @sdiv_v8i16_by_7(<8 x i16> %x) { define <16 x i16> @sdiv_v16i16_by_7(<16 x i16> %x) { ; CHECK-LABEL: sdiv_v16i16_by_7: ; CHECK: // %bb.0: -; CHECK-NEXT: smov x11, v0.h[1] -; CHECK-NEXT: smov x10, v0.h[0] -; CHECK-NEXT: mov x8, #-56173 // =0xffffffffffff2493 -; CHECK-NEXT: smov x13, v0.h[3] -; CHECK-NEXT: smov x14, v1.h[1] -; CHECK-NEXT: movk x8, #37449, lsl #16 -; CHECK-NEXT: smov x16, v1.h[0] -; CHECK-NEXT: smov w12, v0.h[1] -; CHECK-NEXT: smov w15, v0.h[0] -; CHECK-NEXT: smov x18, v1.h[2] -; CHECK-NEXT: smov w0, v0.h[3] -; CHECK-NEXT: smov w1, v1.h[1] -; CHECK-NEXT: smull x11, w11, w8 -; CHECK-NEXT: smov w2, v1.h[0] -; CHECK-NEXT: smov x9, v0.h[2] -; CHECK-NEXT: smull x10, w10, w8 -; CHECK-NEXT: smov w17, v0.h[2] -; CHECK-NEXT: smov w3, v1.h[2] -; CHECK-NEXT: smull x13, w13, w8 -; CHECK-NEXT: smull x14, w14, w8 -; CHECK-NEXT: add x12, x12, x11, lsr #32 -; CHECK-NEXT: smull x16, w16, w8 -; CHECK-NEXT: add x10, x15, x10, lsr #32 -; CHECK-NEXT: smull x15, w18, w8 -; CHECK-NEXT: add x11, x0, x13, lsr #32 -; CHECK-NEXT: smov x0, v0.h[4] -; CHECK-NEXT: add x13, x1, x14, lsr #32 -; CHECK-NEXT: asr w18, w10, #2 -; CHECK-NEXT: smull x9, w9, w8 -; CHECK-NEXT: add x14, x2, x16, lsr #32 -; CHECK-NEXT: asr w16, w12, #2 -; CHECK-NEXT: smov x2, v1.h[3] -; CHECK-NEXT: add w18, w18, w10, lsr #31 -; CHECK-NEXT: add x15, x3, x15, lsr #32 -; CHECK-NEXT: smov w10, v0.h[5] -; CHECK-NEXT: add w12, w16, w12, lsr #31 -; CHECK-NEXT: asr w16, w14, #2 -; CHECK-NEXT: add x9, x17, x9, lsr #32 -; CHECK-NEXT: fmov s2, w18 -; CHECK-NEXT: smov w17, v0.h[4] -; CHECK-NEXT: smull x0, w0, w8 -; CHECK-NEXT: add w14, w16, w14, lsr #31 -; CHECK-NEXT: asr w16, w13, #2 -; CHECK-NEXT: asr w1, w9, #2 -; CHECK-NEXT: smov x18, v0.h[5] -; CHECK-NEXT: fmov s3, w14 -; CHECK-NEXT: mov v2.h[1], w12 -; CHECK-NEXT: add w12, w16, w13, lsr #31 -; CHECK-NEXT: smov w13, v1.h[3] -; CHECK-NEXT: smov x14, v1.h[4] -; CHECK-NEXT: smull x16, w2, w8 -; CHECK-NEXT: add w1, w1, w9, lsr #31 -; CHECK-NEXT: add x17, x17, x0, lsr #32 -; CHECK-NEXT: asr w0, w15, #2 -; CHECK-NEXT: mov v3.h[1], w12 -; CHECK-NEXT: smov w12, v1.h[4] -; CHECK-NEXT: smull x18, w18, w8 -; CHECK-NEXT: mov v2.h[2], w1 -; CHECK-NEXT: asr w1, w11, #2 -; CHECK-NEXT: add w15, w0, w15, lsr #31 -; CHECK-NEXT: add x13, x13, x16, lsr #32 -; CHECK-NEXT: smov x16, v1.h[5] -; CHECK-NEXT: smull x14, w14, w8 -; CHECK-NEXT: add w11, w1, w11, lsr #31 -; CHECK-NEXT: smov x0, v0.h[6] -; CHECK-NEXT: add x10, x10, x18, lsr #32 -; CHECK-NEXT: asr w1, w13, #2 -; CHECK-NEXT: mov v3.h[2], w15 -; CHECK-NEXT: smov w15, v1.h[5] -; CHECK-NEXT: add x12, x12, x14, lsr #32 -; CHECK-NEXT: mov v2.h[3], w11 -; CHECK-NEXT: asr w11, w17, #2 -; CHECK-NEXT: add w13, w1, w13, lsr #31 -; CHECK-NEXT: smull x16, w16, w8 -; CHECK-NEXT: smov x14, v1.h[6] -; CHECK-NEXT: asr w18, w12, #2 -; CHECK-NEXT: add w11, w11, w17, lsr #31 -; CHECK-NEXT: smov w9, v0.h[6] -; CHECK-NEXT: mov v3.h[3], w13 -; CHECK-NEXT: smull x17, w0, w8 -; CHECK-NEXT: smov x0, v1.h[7] -; CHECK-NEXT: add x13, x15, x16, lsr #32 -; CHECK-NEXT: add w12, w18, w12, lsr #31 -; CHECK-NEXT: smov w16, v1.h[6] -; CHECK-NEXT: mov v2.h[4], w11 -; CHECK-NEXT: smov x11, v0.h[7] -; CHECK-NEXT: smull x14, w14, w8 -; CHECK-NEXT: asr w15, w10, #2 -; CHECK-NEXT: asr w18, w13, #2 -; CHECK-NEXT: smov w1, v0.h[7] -; CHECK-NEXT: mov v3.h[4], w12 -; CHECK-NEXT: add x9, x9, x17, lsr #32 -; CHECK-NEXT: add w10, w15, w10, lsr #31 -; CHECK-NEXT: add w12, w18, w13, lsr #31 -; CHECK-NEXT: add x13, x16, x14, lsr #32 -; CHECK-NEXT: smov w14, v1.h[7] -; CHECK-NEXT: smull x11, w11, w8 -; CHECK-NEXT: smull x8, w0, w8 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: asr w10, w9, #2 -; CHECK-NEXT: mov v3.h[5], w12 -; CHECK-NEXT: asr w12, w13, #2 -; CHECK-NEXT: add w9, w10, w9, lsr #31 -; CHECK-NEXT: add x10, x1, x11, lsr #32 -; CHECK-NEXT: add w11, w12, w13, lsr #31 -; CHECK-NEXT: add x8, x14, x8, lsr #32 -; CHECK-NEXT: mov v2.h[6], w9 -; CHECK-NEXT: asr w9, w10, #2 -; CHECK-NEXT: mov v3.h[6], w11 -; CHECK-NEXT: asr w11, w8, #2 -; CHECK-NEXT: add w9, w9, w10, lsr #31 -; CHECK-NEXT: add w8, w11, w8, lsr #31 -; CHECK-NEXT: mov v2.h[7], w9 -; CHECK-NEXT: mov v3.h[7], w8 -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: dup v2.8h, w8 +; CHECK-NEXT: smull2 v3.4s, v0.8h, v2.8h +; CHECK-NEXT: smull v0.4s, v0.4h, v2.4h +; CHECK-NEXT: smull2 v4.4s, v1.8h, v2.8h +; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v3.8h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v4.8h +; CHECK-NEXT: sshr v0.8h, v0.8h, #1 +; CHECK-NEXT: sshr v1.8h, v1.8h, #1 +; CHECK-NEXT: usra v0.8h, v0.8h, #15 +; CHECK-NEXT: usra v1.8h, v1.8h, #15 ; CHECK-NEXT: ret %div = sdiv <16 x i16> %x, ret <16 x i16> %div @@ -160,158 +56,21 @@ define <8 x i16> @srem_v8i16_by_7(<8 x i16> %x) { define <16 x i16> @srem_v16i16_by_7(<16 x i16> %x) { ; CHECK-LABEL: srem_v16i16_by_7: ; CHECK: // %bb.0: -; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w19, -16 -; CHECK-NEXT: smov x10, v0.h[0] -; CHECK-NEXT: smov x9, v0.h[1] -; CHECK-NEXT: mov x8, #-56173 // =0xffffffffffff2493 -; CHECK-NEXT: smov x15, v0.h[2] -; CHECK-NEXT: movk x8, #37449, lsl #16 -; CHECK-NEXT: smov x16, v0.h[3] -; CHECK-NEXT: smov w13, v0.h[0] -; CHECK-NEXT: smov w12, v0.h[1] -; CHECK-NEXT: smov x18, v1.h[0] -; CHECK-NEXT: smov w17, v0.h[2] -; CHECK-NEXT: smov w14, v0.h[3] -; CHECK-NEXT: smov w2, v1.h[0] -; CHECK-NEXT: smull x10, w10, w8 -; CHECK-NEXT: smov x4, v1.h[1] -; CHECK-NEXT: smov x0, v0.h[4] -; CHECK-NEXT: smull x9, w9, w8 -; CHECK-NEXT: smov w11, v0.h[4] -; CHECK-NEXT: smull x15, w15, w8 -; CHECK-NEXT: smull x1, w16, w8 -; CHECK-NEXT: add x10, x13, x10, lsr #32 -; CHECK-NEXT: smov x16, v0.h[5] -; CHECK-NEXT: add x3, x12, x9, lsr #32 -; CHECK-NEXT: smull x18, w18, w8 -; CHECK-NEXT: smov w9, v0.h[5] -; CHECK-NEXT: add x15, x17, x15, lsr #32 -; CHECK-NEXT: asr w6, w10, #2 -; CHECK-NEXT: smull x4, w4, w8 -; CHECK-NEXT: asr w5, w3, #2 -; CHECK-NEXT: add x1, x14, x1, lsr #32 -; CHECK-NEXT: smull x0, w0, w8 -; CHECK-NEXT: add x7, x2, x18, lsr #32 -; CHECK-NEXT: asr w19, w15, #2 -; CHECK-NEXT: smov w18, v1.h[1] -; CHECK-NEXT: add w10, w6, w10, lsr #31 -; CHECK-NEXT: add w3, w5, w3, lsr #31 -; CHECK-NEXT: asr w5, w1, #2 -; CHECK-NEXT: add w6, w19, w15, lsr #31 -; CHECK-NEXT: asr w15, w7, #2 -; CHECK-NEXT: sub w19, w10, w10, lsl #3 -; CHECK-NEXT: add w1, w5, w1, lsr #31 -; CHECK-NEXT: smov x5, v1.h[2] -; CHECK-NEXT: sub w3, w3, w3, lsl #3 -; CHECK-NEXT: add x4, x18, x4, lsr #32 -; CHECK-NEXT: add w7, w15, w7, lsr #31 -; CHECK-NEXT: add w13, w13, w19 -; CHECK-NEXT: sub w6, w6, w6, lsl #3 -; CHECK-NEXT: sub w1, w1, w1, lsl #3 -; CHECK-NEXT: fmov s2, w13 -; CHECK-NEXT: add w12, w12, w3 -; CHECK-NEXT: add x3, x11, x0, lsr #32 -; CHECK-NEXT: smov w0, v1.h[2] -; CHECK-NEXT: asr w13, w4, #2 -; CHECK-NEXT: smull x5, w5, w8 -; CHECK-NEXT: sub w7, w7, w7, lsl #3 -; CHECK-NEXT: add w6, w17, w6 -; CHECK-NEXT: smov w17, v1.h[3] -; CHECK-NEXT: mov v2.h[1], w12 -; CHECK-NEXT: add w13, w13, w4, lsr #31 -; CHECK-NEXT: smov x4, v1.h[3] -; CHECK-NEXT: add w2, w2, w7 -; CHECK-NEXT: smov x7, v1.h[4] -; CHECK-NEXT: add w14, w14, w1 -; CHECK-NEXT: add x5, x0, x5, lsr #32 -; CHECK-NEXT: sub w13, w13, w13, lsl #3 -; CHECK-NEXT: fmov s3, w2 -; CHECK-NEXT: smull x2, w16, w8 -; CHECK-NEXT: smov w16, v1.h[4] -; CHECK-NEXT: asr w12, w3, #2 -; CHECK-NEXT: mov v2.h[2], w6 -; CHECK-NEXT: asr w6, w5, #2 -; CHECK-NEXT: smull x4, w4, w8 -; CHECK-NEXT: add w18, w18, w13 -; CHECK-NEXT: smov x15, v0.h[6] -; CHECK-NEXT: add w3, w12, w3, lsr #31 -; CHECK-NEXT: add w5, w6, w5, lsr #31 -; CHECK-NEXT: smull x6, w7, w8 -; CHECK-NEXT: mov v3.h[1], w18 -; CHECK-NEXT: add x4, x17, x4, lsr #32 -; CHECK-NEXT: add x1, x9, x2, lsr #32 -; CHECK-NEXT: sub w3, w3, w3, lsl #3 -; CHECK-NEXT: sub w18, w5, w5, lsl #3 -; CHECK-NEXT: smov x5, v1.h[5] -; CHECK-NEXT: mov v2.h[3], w14 -; CHECK-NEXT: asr w2, w4, #2 -; CHECK-NEXT: smov w10, v0.h[6] -; CHECK-NEXT: smov x13, v0.h[7] -; CHECK-NEXT: add w18, w0, w18 -; CHECK-NEXT: add x0, x16, x6, lsr #32 -; CHECK-NEXT: add w11, w11, w3 -; CHECK-NEXT: add w2, w2, w4, lsr #31 -; CHECK-NEXT: smov w4, v1.h[5] -; CHECK-NEXT: mov v3.h[2], w18 -; CHECK-NEXT: asr w18, w0, #2 -; CHECK-NEXT: smull x5, w5, w8 -; CHECK-NEXT: smov x3, v1.h[7] -; CHECK-NEXT: sub w14, w2, w2, lsl #3 -; CHECK-NEXT: smov x2, v1.h[6] -; CHECK-NEXT: smull x15, w15, w8 -; CHECK-NEXT: add w18, w18, w0, lsr #31 -; CHECK-NEXT: asr w6, w1, #2 -; CHECK-NEXT: mov v2.h[4], w11 -; CHECK-NEXT: add w14, w17, w14 -; CHECK-NEXT: add x17, x4, x5, lsr #32 -; CHECK-NEXT: smov w12, v0.h[7] -; CHECK-NEXT: mov v3.h[3], w14 -; CHECK-NEXT: sub w14, w18, w18, lsl #3 -; CHECK-NEXT: smov w18, v1.h[6] -; CHECK-NEXT: smull x2, w2, w8 -; CHECK-NEXT: asr w0, w17, #2 -; CHECK-NEXT: add w1, w6, w1, lsr #31 -; CHECK-NEXT: add w11, w16, w14 -; CHECK-NEXT: add x15, x10, x15, lsr #32 -; CHECK-NEXT: smull x13, w13, w8 -; CHECK-NEXT: add w16, w0, w17, lsr #31 -; CHECK-NEXT: smov w17, v1.h[7] -; CHECK-NEXT: smull x8, w3, w8 -; CHECK-NEXT: mov v3.h[4], w11 -; CHECK-NEXT: add x11, x18, x2, lsr #32 -; CHECK-NEXT: sub w14, w1, w1, lsl #3 -; CHECK-NEXT: asr w0, w15, #2 -; CHECK-NEXT: sub w16, w16, w16, lsl #3 -; CHECK-NEXT: add x13, x12, x13, lsr #32 -; CHECK-NEXT: asr w1, w11, #2 -; CHECK-NEXT: add w9, w9, w14 -; CHECK-NEXT: add w14, w0, w15, lsr #31 -; CHECK-NEXT: add w15, w4, w16 -; CHECK-NEXT: add x8, x17, x8, lsr #32 -; CHECK-NEXT: add w11, w1, w11, lsr #31 -; CHECK-NEXT: mov v2.h[5], w9 -; CHECK-NEXT: mov v3.h[5], w15 -; CHECK-NEXT: sub w9, w14, w14, lsl #3 -; CHECK-NEXT: asr w14, w13, #2 -; CHECK-NEXT: asr w15, w8, #2 -; CHECK-NEXT: sub w11, w11, w11, lsl #3 -; CHECK-NEXT: add w9, w10, w9 -; CHECK-NEXT: add w10, w14, w13, lsr #31 -; CHECK-NEXT: add w8, w15, w8, lsr #31 -; CHECK-NEXT: add w11, w18, w11 -; CHECK-NEXT: mov v2.h[6], w9 -; CHECK-NEXT: mov v3.h[6], w11 -; CHECK-NEXT: sub w9, w10, w10, lsl #3 -; CHECK-NEXT: sub w8, w8, w8, lsl #3 -; CHECK-NEXT: add w9, w12, w9 -; CHECK-NEXT: add w8, w17, w8 -; CHECK-NEXT: mov v2.h[7], w9 -; CHECK-NEXT: mov v3.h[7], w8 -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: dup v2.8h, w8 +; CHECK-NEXT: smull2 v3.4s, v0.8h, v2.8h +; CHECK-NEXT: smull v4.4s, v0.4h, v2.4h +; CHECK-NEXT: smull2 v5.4s, v1.8h, v2.8h +; CHECK-NEXT: smull v2.4s, v1.4h, v2.4h +; CHECK-NEXT: uzp2 v3.8h, v4.8h, v3.8h +; CHECK-NEXT: movi v4.8h, #7 +; CHECK-NEXT: uzp2 v2.8h, v2.8h, v5.8h +; CHECK-NEXT: sshr v3.8h, v3.8h, #1 +; CHECK-NEXT: sshr v2.8h, v2.8h, #1 +; CHECK-NEXT: usra v3.8h, v3.8h, #15 +; CHECK-NEXT: usra v2.8h, v2.8h, #15 +; CHECK-NEXT: mls v0.8h, v3.8h, v4.8h +; CHECK-NEXT: mls v1.8h, v2.8h, v4.8h ; CHECK-NEXT: ret %rem = srem <16 x i16> %x, ret <16 x i16> %rem