expandFMINIMUMNUM_FMAXIMUMNUM: Improve compare between zeros #140193
expandFMINIMUMNUM_FMAXIMUMNUM: Improve compare between zeros #140193
Conversation
|
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86 Author: YunQiang Su (wzssyqa) Changes
Patch is 2.87 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140193.diff 7 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f17d6a2787889..5166bc0ecd3b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8681,13 +8681,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
RHS = DAG.getSelectCC(DL, RHS, RHS, LHS, RHS, ISD::SETUO);
}
+ // Keep preferring RHS on equality; the zero fixup below relies on it.
SDValue MinMax =
DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
- // If MinMax is NaN, let's quiet it.
- if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) &&
- !DAG.isKnownNeverNaN(RHS)) {
- MinMax = DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags);
- }
// Fixup signed zero behavior.
if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() ||
@@ -8698,13 +8694,27 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
- SDValue LCmp = DAG.getSelect(
- DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+ unsigned BitSize = VT.getScalarSizeInBits();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize);
+ EVT FloatVT = EVT::getFloatingPointVT(32);
+ if (VT.isVector()) {
+ IntVT =
+ EVT::getVectorVT(*DAG.getContext(), IntVT, VT.getVectorElementCount());
+ FloatVT = EVT::getVectorVT(*DAG.getContext(), FloatVT,
+ VT.getVectorElementCount());
+ }
+ SDValue LHSTrunc = LHS;
+ if (!isOperationLegal(ISD::BITCAST, IntVT) &&
+ !isOperationLegal(ISD::IS_FPCLASS, VT)) {
+ LHSTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, LHS,
+ DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+ }
+ // It's OK to select from LHS and MinMax, with only one ISD::IS_FPCLASS, as
+ // we preferred RHS when generating MinMax, if the operands are equal.
+ SDValue RetZero = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHSTrunc, TestZero), LHS,
MinMax, Flags);
- SDValue RCmp = DAG.getSelect(
- DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, LCmp,
- Flags);
- return DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
+ return DAG.getSelect(DL, VT, IsZero, RetZero, MinMax, Flags);
}
/// Returns a true value if if this FPClassTest can be performed with an ordered
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index d458bb2492f23..02ea2cc2a1919 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1696,23 +1696,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -1722,22 +1712,14 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
-; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1753,22 +1735,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v3, v4
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX900-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX900-SDAG-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-SDAG-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX900-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX900-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -1779,21 +1752,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX900-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX900-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1812,9 +1777,6 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX950-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v3, v4
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX950-SDAG-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
; GFX950-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
@@ -1838,9 +1800,6 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX950-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX950-SDAG-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
; GFX950-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -1866,22 +1825,14 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -1891,21 +1842,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1921,61 +1864,38 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1997,72 +1917,47 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v3
-; GFX12-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: ...
[truncated]
|
|
@llvm/pr-subscribers-llvm-selectiondag Author: YunQiang Su (wzssyqa) Changes
Patch is 2.87 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140193.diff 7 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f17d6a2787889..5166bc0ecd3b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8681,13 +8681,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
RHS = DAG.getSelectCC(DL, RHS, RHS, LHS, RHS, ISD::SETUO);
}
+ // Keep preferring RHS on equality; the zero fixup below relies on it.
SDValue MinMax =
DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
- // If MinMax is NaN, let's quiet it.
- if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) &&
- !DAG.isKnownNeverNaN(RHS)) {
- MinMax = DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags);
- }
// Fixup signed zero behavior.
if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() ||
@@ -8698,13 +8694,27 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
- SDValue LCmp = DAG.getSelect(
- DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+ unsigned BitSize = VT.getScalarSizeInBits();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize);
+ EVT FloatVT = EVT::getFloatingPointVT(32);
+ if (VT.isVector()) {
+ IntVT =
+ EVT::getVectorVT(*DAG.getContext(), IntVT, VT.getVectorElementCount());
+ FloatVT = EVT::getVectorVT(*DAG.getContext(), FloatVT,
+ VT.getVectorElementCount());
+ }
+ SDValue LHSTrunc = LHS;
+ if (!isOperationLegal(ISD::BITCAST, IntVT) &&
+ !isOperationLegal(ISD::IS_FPCLASS, VT)) {
+ LHSTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, LHS,
+ DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+ }
+ // It's OK to select from LHS and MinMax, with only one ISD::IS_FPCLASS, as
+ // we preferred RHS when generating MinMax, if the operands are equal.
+ SDValue RetZero = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHSTrunc, TestZero), LHS,
MinMax, Flags);
- SDValue RCmp = DAG.getSelect(
- DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS, LCmp,
- Flags);
- return DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
+ return DAG.getSelect(DL, VT, IsZero, RetZero, MinMax, Flags);
}
/// Returns a true value if if this FPClassTest can be performed with an ordered
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index d458bb2492f23..02ea2cc2a1919 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1696,23 +1696,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -1722,22 +1712,14 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
-; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1753,22 +1735,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v3, v4
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX900-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX900-SDAG-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-SDAG-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX900-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX900-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -1779,21 +1752,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX900-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX900-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX900-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX900-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2
-; GFX900-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX900-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-SDAG-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-SDAG-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1812,9 +1777,6 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX950-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v3, v4
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
-; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX950-SDAG-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
; GFX950-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
@@ -1838,9 +1800,6 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX950-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
-; GFX950-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX950-SDAG-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
; GFX950-SDAG-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -1866,22 +1825,14 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -1891,21 +1842,13 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1921,61 +1864,38 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX11-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1997,72 +1917,47 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v3
-; GFX12-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc_lo
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT: ...
[truncated]
|
b035f20 to
cd71163
Compare
|
@nikic any idea about how to sync FMAXIMUMNUM and FMAXIMUM? FP_ROUND seems to perform better since it doesn't need stack operations. |
…m, r=tgross35
Always use the pure Rust fallback instead of `llvm.{maximum,minimum}`
While llvm/llvm-project#142170 was merged, it was reverted, and the next attempt (llvm/llvm-project#140193) at fixing the LLVM implementation seems to have stalled, so let's revert back to pure Rust with the LLVM codegen.
cc [#t-compiler/llvm > `llvm.minimum`/`llvm.maximum` issues @ 💬](https://rust-lang.zulipchat.com/#narrow/channel/187780-t-compiler.2Fllvm/topic/.60llvm.2Eminimum.60.2F.60llvm.2Emaximum.60.20issues/near/527044712)
Fixes rust-lang#141087
r? `@tgross35`
Rollup merge of #143395 - Urgau:llvm-fallback-minimum-maximum, r=tgross35 Always use the pure Rust fallback instead of `llvm.{maximum,minimum}` While llvm/llvm-project#142170 was merged, it was reverted, and the next attempt (llvm/llvm-project#140193) at fixing the LLVM implementation seems to have stalled, so let's revert back to pure Rust with the LLVM codegen. cc [#t-compiler/llvm > `llvm.minimum`/`llvm.maximum` issues @ 💬](https://rust-lang.zulipchat.com/#narrow/channel/187780-t-compiler.2Fllvm/topic/.60llvm.2Eminimum.60.2F.60llvm.2Emaximum.60.20issues/near/527044712) Fixes #141087 r? `@tgross35`
…ss35
Always use the pure Rust fallback instead of `llvm.{maximum,minimum}`
While llvm/llvm-project#142170 was merged, it was reverted, and the next attempt (llvm/llvm-project#140193) at fixing the LLVM implementation seems to have stalled, so let's revert back to pure Rust with the LLVM codegen.
cc [#t-compiler/llvm > `llvm.minimum`/`llvm.maximum` issues @ 💬](https://rust-lang.zulipchat.com/#narrow/channel/187780-t-compiler.2Fllvm/topic/.60llvm.2Eminimum.60.2F.60llvm.2Emaximum.60.20issues/near/527044712)
Fixes rust-lang/rust#141087
r? `@tgross35`
…m, r=tgross35
Always use the pure Rust fallback instead of `llvm.{maximum,minimum}`
While llvm/llvm-project#142170 was merged, it was reverted, and the next attempt (llvm/llvm-project#140193) at fixing the LLVM implementation seems to have stalled, so let's revert back to pure Rust with the LLVM codegen.
cc [#t-compiler/llvm > `llvm.minimum`/`llvm.maximum` issues @ 💬](https://rust-lang.zulipchat.com/#narrow/channel/187780-t-compiler.2Fllvm/topic/.60llvm.2Eminimum.60.2F.60llvm.2Emaximum.60.20issues/near/527044712)
Fixes rust-lang#141087
r? `@tgross35`
1. On GPR32 platforms, expandIS_FPCLASS may fail because the ISD::BITCAST from double to int64 may fail. Let's FP_ROUND double to float first. Since we only use it when MinMax is zero, the flushing won't break anything. 2. Only one IS_FPCLASS is needed. MinMax will always be RHS if the operands compare equal, so we can select between LHS and MinMax. This is safe even if FP_ROUND flushes a small LHS: if LHS is not zero, then MinMax won't be zero, so we will always use MinMax.
8c25184 to
5735e20
Compare
Co-authored-by: Nikita Popov <github@npopov.com>
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
nikic
left a comment
There was a problem hiding this comment.
The implementation LGTM, though I did not review the asm diffs very carefully.
Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/129/builds/34317 Here is the relevant piece of the build log for the reference |
…0193) 1. On GPR32 platforms, expandIS_FPCLASS may fail because the ISD::BITCAST from double to int64 may fail. Let's FP_ROUND double to float first. Since we only use it when MinMax is zero, the flushing won't break anything. 2. Only one IS_FPCLASS is needed. MinMax will always be RHS if the operands compare equal, so we can select between LHS and MinMax. This is safe even if FP_ROUND flushes a small LHS: if LHS is not zero, then MinMax won't be zero, so we will always use MinMax. --------- Co-authored-by: Nikita Popov <github@npopov.com> Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
On GPR32 platforms, expandIS_FPCLASS may fail because the ISD::BITCAST
from double to int64 may fail. Let's FP_ROUND double to float first.
Since we only use it when MinMax is zero, the flushing won't
break anything.
Only one IS_FPCLASS is needed. MinMax will always be RHS if the operands
compare equal, so we can select between LHS and MinMax.
This is safe even if FP_ROUND flushes a small LHS: if LHS is not zero,
then MinMax won't be zero, so we will always use MinMax.