Skip to content

Commit 4d16376

Browse files
Optimize Vector128 and Vector256.Create methods (#35857)
* Updating Vector128.Create(T value) to be intrinsic * Updating Vector128.Create(T, ..., T) to be intrinsic * Updating Vector256.Create(T) and Vector256.Create(T, ..., T) to be intrinsic * Applying formatting patch * Adding additional comments explaining how the Vector128.Create and Vector256.Create calls are lowered * Add an assert that argCnt is as expected for LowerHWIntrinsicCreate * Applying formatting patch * Use unsigned rather than signed types in LowerHWIntrinsicCreate * Have HWIntrinsicInfo::lookupId take the number of arguments into account * Adjusting Vector128.Create(T, ..., T) to not be recursive on ARM64 * Fixing MyICJI::allocMem in superpmi to respect certain CorJitAllocMemFlag * Avoid an implicit downcast
1 parent eb7198f commit 4d16376

12 files changed

Lines changed: 1491 additions & 495 deletions

File tree

src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1578,6 +1578,18 @@ bool MyICJI::runWithErrorTrap(void (*function)(void*), void* param)
15781578
return RunWithErrorTrap(function, param);
15791579
}
15801580

1581+
// Ideally we'd just use the copies of this in standardmacros.h
1582+
// however, superpmi is missing various other dependencies as well
1583+
static size_t ALIGN_UP_SPMI(size_t val, size_t alignment)
1584+
{
1585+
return (val + (alignment - 1)) & ~(alignment - 1);
1586+
}
1587+
1588+
static void* ALIGN_UP_SPMI(void* val, size_t alignment)
1589+
{
1590+
return (void*)ALIGN_UP_SPMI((size_t)val, alignment);
1591+
}
1592+
15811593
// get a block of memory for the code, readonly data, and read-write data
15821594
void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
15831595
ULONG coldCodeSize, /* IN */
@@ -1590,13 +1602,46 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
15901602
)
15911603
{
15921604
jitInstance->mc->cr->AddCall("allocMem");
1593-
// TODO-Cleanup: investigate if we need to check roDataBlock as well. Could hot block size be ever 0?
1605+
1606+
// TODO-Cleanup: Could hot block size be ever 0?
15941607
*hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize);
1608+
15951609
if (coldCodeSize > 0)
15961610
*coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize);
15971611
else
15981612
*coldCodeBlock = nullptr;
1599-
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataSize);
1613+
1614+
if (roDataSize > 0)
1615+
{
1616+
size_t roDataAlignment = sizeof(void*);
1617+
size_t roDataAlignedSize = static_cast<size_t>(roDataSize);
1618+
1619+
if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0)
1620+
{
1621+
roDataAlignment = 32;
1622+
}
1623+
else if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0)
1624+
{
1625+
roDataAlignment = 16;
1626+
}
1627+
else if (roDataSize >= 8)
1628+
{
1629+
roDataAlignment = 8;
1630+
}
1631+
1632+
// We need to round the roDataSize up to the alignment size and then
1633+
// overallocate by at most alignment - sizeof(void*) to ensure that
1634+
// we can offset roDataBlock to be an aligned address and that the
1635+
// allocation contains at least the originally requested size after
1636+
1637+
roDataAlignedSize = ALIGN_UP_SPMI(roDataAlignedSize, roDataAlignment);
1638+
roDataAlignedSize = roDataAlignedSize + (roDataAlignment - sizeof(void*));
1639+
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataAlignedSize);
1640+
*roDataBlock = ALIGN_UP_SPMI(*roDataBlock, roDataAlignment);
1641+
}
1642+
else
1643+
*roDataBlock = nullptr;
1644+
16001645
jitInstance->mc->cr->recAllocMem(hotCodeSize, coldCodeSize, roDataSize, xcptnsCount, flag, hotCodeBlock,
16011646
coldCodeBlock, roDataBlock);
16021647
}

src/coreclr/src/jit/emit.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align)
53395339
{
53405340
// Data can have any size but since alignment is deduced from the size there's no
53415341
// way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment.
5342-
// 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem).
5343-
assert(size <= 16);
5342+
// As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte
5343+
// alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem).
5344+
assert((size <= 16) || ((size % 16) == 0));
53445345

5345-
if (size == 16)
5346+
if (size >= 16)
53465347
{
53475348
emitConsDsc.align16 = true;
53485349
}

src/coreclr/src/jit/emitxarch.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
1076510765
}
1076610766

1076710767
// Check that the offset is properly aligned (i.e. the ddd in [ddd])
10768-
assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0));
10768+
assert((emitChkAlign == false) || (ins == INS_lea) ||
10769+
((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0));
1076910770
}
1077010771
else
1077110772
{

src/coreclr/src/jit/hwintrinsic.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -292,16 +292,19 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, va
292292
// lookupId: Gets the NamedIntrinsic for a given method name and InstructionSet
293293
//
294294
// Arguments:
295+
// comp -- The compiler
296+
// sig -- The signature of the intrinsic
295297
// className -- The name of the class associated with the HWIntrinsic to lookup
296298
// methodName -- The name of the method associated with the HWIntrinsic to lookup
297299
// enclosingClassName -- The name of the enclosing class of X64 classes
298300
//
299301
// Return Value:
300302
// The NamedIntrinsic associated with methodName and isa
301-
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
302-
const char* className,
303-
const char* methodName,
304-
const char* enclosingClassName)
303+
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
304+
CORINFO_SIG_INFO* sig,
305+
const char* className,
306+
const char* methodName,
307+
const char* enclosingClassName)
305308
{
306309
// TODO-Throughput: replace sequential search by binary search
307310
CORINFO_InstructionSet isa = lookupIsa(className, enclosingClassName);
@@ -324,14 +327,23 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
324327

325328
for (int i = 0; i < (NI_HW_INTRINSIC_END - NI_HW_INTRINSIC_START - 1); i++)
326329
{
330+
const HWIntrinsicInfo& intrinsicInfo = hwIntrinsicInfoArray[i];
331+
327332
if (isa != hwIntrinsicInfoArray[i].isa)
328333
{
329334
continue;
330335
}
331336

332-
if (strcmp(methodName, hwIntrinsicInfoArray[i].name) == 0)
337+
int numArgs = static_cast<unsigned>(intrinsicInfo.numArgs);
338+
339+
if ((numArgs != -1) && (sig->numArgs != static_cast<unsigned>(intrinsicInfo.numArgs)))
340+
{
341+
continue;
342+
}
343+
344+
if (strcmp(methodName, intrinsicInfo.name) == 0)
333345
{
334-
return hwIntrinsicInfoArray[i].id;
346+
return intrinsicInfo.id;
335347
}
336348
}
337349

src/coreclr/src/jit/hwintrinsic.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -258,10 +258,11 @@ struct HWIntrinsicInfo
258258

259259
static const HWIntrinsicInfo& lookup(NamedIntrinsic id);
260260

261-
static NamedIntrinsic lookupId(Compiler* comp,
262-
const char* className,
263-
const char* methodName,
264-
const char* enclosingClassName);
261+
static NamedIntrinsic lookupId(Compiler* comp,
262+
CORINFO_SIG_INFO* sig,
263+
const char* className,
264+
const char* methodName,
265+
const char* enclosingClassName);
265266
static CORINFO_InstructionSet lookupIsa(const char* className, const char* enclosingClassName);
266267

267268
static unsigned lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORINFO_SIG_INFO* sig);

src/coreclr/src/jit/hwintrinsiclistxarch.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2,
4343
HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
4444
HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
4545
HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
46+
HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
4647
HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
4748
// The instruction generated for float/double depends on which ISAs are supported
4849
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
@@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256,
7677
HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
7778
HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
7879
HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
80+
HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
7981
HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
8082
HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
8183
HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)

src/coreclr/src/jit/hwintrinsicxarch.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
521521
{
522522
GenTree* retNode = nullptr;
523523
GenTree* op1 = nullptr;
524+
GenTree* op2 = nullptr;
524525

525526
if (!featureSIMD)
526527
{
@@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
747748
break;
748749
}
749750

751+
case NI_Vector128_Create:
752+
case NI_Vector256_Create:
753+
{
754+
#if defined(TARGET_X86)
755+
if (varTypeIsLong(baseType))
756+
{
757+
// TODO-XARCH-CQ: It may be beneficial to emit the movq
758+
// instruction, which takes a 64-bit memory address and
759+
// works on 32-bit x86 systems.
760+
break;
761+
}
762+
#endif // TARGET_X86
763+
764+
// We shouldn't handle this as an intrinsic if the
765+
// respective ISAs have been disabled by the user.
766+
767+
if (intrinsic == NI_Vector256_Create)
768+
{
769+
if (!compExactlyDependsOn(InstructionSet_AVX))
770+
{
771+
break;
772+
}
773+
}
774+
else if (baseType == TYP_FLOAT)
775+
{
776+
if (!compExactlyDependsOn(InstructionSet_SSE))
777+
{
778+
break;
779+
}
780+
}
781+
else if (!compExactlyDependsOn(InstructionSet_SSE2))
782+
{
783+
break;
784+
}
785+
786+
if (sig->numArgs == 1)
787+
{
788+
op1 = impPopStack().val;
789+
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
790+
}
791+
else if (sig->numArgs == 2)
792+
{
793+
op2 = impPopStack().val;
794+
op1 = impPopStack().val;
795+
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
796+
}
797+
else
798+
{
799+
assert(sig->numArgs >= 3);
800+
801+
GenTreeArgList* tmp = nullptr;
802+
803+
for (unsigned i = 0; i < sig->numArgs; i++)
804+
{
805+
tmp = gtNewArgList(impPopStack().val);
806+
tmp->gtOp2 = op1;
807+
op1 = tmp;
808+
}
809+
810+
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
811+
}
812+
break;
813+
}
814+
750815
case NI_Vector128_CreateScalarUnsafe:
751816
{
752817
assert(sig->numArgs == 1);

src/coreclr/src/jit/importer.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4504,7 +4504,10 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)
45044504

45054505
if ((namespaceName[0] == '\0') || (strcmp(namespaceName, platformNamespaceName) == 0))
45064506
{
4507-
result = HWIntrinsicInfo::lookupId(this, className, methodName, enclosingClassName);
4507+
CORINFO_SIG_INFO sig;
4508+
info.compCompHnd->getMethodSig(method, &sig);
4509+
4510+
result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName);
45084511
}
45094512
else if (strcmp(methodName, "get_IsSupported") == 0)
45104513
{

src/coreclr/src/jit/lower.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ class Lowering final : public Phase
315315
#ifdef FEATURE_HW_INTRINSICS
316316
void LowerHWIntrinsic(GenTreeHWIntrinsic* node);
317317
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
318+
void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
318319
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
319320
#endif // FEATURE_HW_INTRINSICS
320321

0 commit comments

Comments
 (0)