Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

x86: Add disableAVX2/512 options and check XCR0 for OS support #7602

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions compiler/control/OMROptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,9 @@ TR::OptionTable OMR::Options::_jitOptions[] = {
{"disableAsyncCheckVersioning", "O\tdisable versioning of loops wrt async checks", SET_OPTION_BIT(TR_DisableAsyncCheckVersioning), "F"},
{"disableAsyncCompilation", "M\tdisable asynchronous compilation", SET_OPTION_BIT(TR_DisableAsyncCompilation), "F"},
{"disableAutoSIMD", "M\tdisable automatic vectorization of loops", SET_OPTION_BIT(TR_DisableAutoSIMD), "F"},
{"disableAVX", "C\tdisable avx and newer on x86", TR::Options::disableCPUFeatures, TR_DisableAVX, 0, "F"},
{"disableAVX2", "C\tdisable avx2 and newer on x86", TR::Options::disableCPUFeatures, TR_DisableAVX2, 0, "F"},
{"disableAVX512", "C\tdisable avx512 on x86", TR::Options::disableCPUFeatures, TR_DisableAVX512, 0, "F"},
{"disableBasicBlockExtension", "O\tdisable basic block extension", TR::Options::disableOptimization, basicBlockExtension, 0, "P"},
{"disableBasicBlockPeepHole", "O\tdisable basic blocks peepHole", SET_OPTION_BIT(TR_DisableBasicBlockPeepHole), "F"},
{"disableBCDArithChildOrdering", "O\tstress testing option -- do not reorder children of BCD arithmetic nodes", SET_OPTION_BIT(TR_DisableBCDArithChildOrdering), "F" },
Expand Down Expand Up @@ -554,6 +557,9 @@ TR::OptionTable OMR::Options::_jitOptions[] = {
{"disableSIMDUTF16BEEncoder", "M\tdisable inlining of SIMD UTF16 Big Endian encoder", SET_OPTION_BIT(TR_DisableSIMDUTF16BEEncoder), "F"},
{"disableSIMDUTF16LEEncoder", "M\tdisable inlining of SIMD UTF16 Little Endian encoder", SET_OPTION_BIT(TR_DisableSIMDUTF16LEEncoder), "F"},
{"disableSmartPlacementOfCodeCaches", "O\tdisable placement of code caches in memory so they are near each other and the DLLs", SET_OPTION_BIT(TR_DisableSmartPlacementOfCodeCaches), "F", NOT_IN_SUBSET},
{"disableSSE3", "C\tdisable sse 3 and newer on x86", TR::Options::disableCPUFeatures, TR_DisableSSE3, 0, "F"},
{"disableSSE4_1", "C\tdisable sse 4.1 and newer on x86", TR::Options::disableCPUFeatures, TR_DisableSSE4_1, 0, "F"},
{"disableSSE4_2", "C\tdisable sse 4.2 and newer on x86", TR::Options::disableCPUFeatures, TR_DisableSSE4_2, 0, "F"},
{"disableStableAnnotations", "M\tdisable recognition of @Stable", SET_OPTION_BIT(TR_DisableStableAnnotations), "F"},
{"disableStaticFinalFieldFolding", "O\tdisable generic static final field folding", TR::Options::disableOptimization, staticFinalFieldFolding, 0, "P"},
{"disableStoreOnCondition", "O\tdisable store on condition (STOC) code gen", SET_OPTION_BIT(TR_DisableStoreOnCondition), "F"},
Expand Down Expand Up @@ -4943,6 +4949,35 @@ OMR::Options::configureOptReporting(const char *option, void *base, TR::OptionTa
return option;
}

const char *
OMR::Options::disableCPUFeatures(const char *option, void *base, TR::OptionTable *entry)
{
TR::Options *options = (TR::Options*)base;
TR_CompilationOptions co = (TR_CompilationOptions)entry->parm1;
options->setOption(co);

/* When disabling SIMD, disable newer features too */

switch (co)
{
case TR_DisableSSE3:
options->setOption(TR_DisableSSE3);
case TR_DisableSSE4_1:
options->setOption(TR_DisableSSE4_1);
case TR_DisableSSE4_2:
options->setOption(TR_DisableSSE4_2);
case TR_DisableAVX:
options->setOption(TR_DisableAVX);
case TR_DisableAVX2:
options->setOption(TR_DisableAVX2);
case TR_DisableAVX512:
options->setOption(TR_DisableAVX512);
default:
break;
}

return option;
}

const char *OMR::Options::_verboseOptionNames[TR_NumVerboseOptions] =
{
Expand Down
13 changes: 7 additions & 6 deletions compiler/control/OMROptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,13 +400,13 @@ enum TR_CompilationOptions
TR_EnableVectorAPIBoxing = 0x00010000 + 10,
TR_EnableSequentialLoadStoreWarm = 0x00020000 + 10,
TR_EnableSequentialLoadStoreCold = 0x00040000 + 10,
// Available = 0x00080000 + 10,
// Available = 0x00100000 + 10,
// Available = 0x00200000 + 10,
TR_DisableAVX = 0x00080000 + 10,
TR_DisableAVX2 = 0x00100000 + 10,
TR_DisableAVX512 = 0x00200000 + 10,
TR_ConservativeCompilation = 0x00400000 + 10,
// Available = 0x00800000 + 10,
// Available = 0x01000000 + 10,
// Available = 0x02000000 + 10,
TR_DisableSSE3 = 0x00800000 + 10,
TR_DisableSSE4_1 = 0x01000000 + 10,
TR_DisableSSE4_2 = 0x02000000 + 10,
TR_DisableNewX86VolatileSupport = 0x04000000 + 10,
// Available = 0x08000000 + 10,
// Available = 0x10000000 + 10,
Expand Down Expand Up @@ -2295,6 +2295,7 @@ class OMR_EXTENSIBLE Options
static const char *clearBitsFromStringSet(const char *option, void *base, TR::OptionTable *entry);

static const char *configureOptReporting(const char *option, void *base, TR::OptionTable *entry);
static const char *disableCPUFeatures(const char *option, void *base, TR::OptionTable *entry);

// Option processing helper functions
//
Expand Down
3 changes: 0 additions & 3 deletions compiler/x/codegen/OMRCodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ OMR::X86::CodeGenerator::initializeX86(TR::Compilation *comp)
static bool disableX86TRTO = feGetEnv("TR_disableX86TRTO") != NULL;
if (!disableX86TRTO)
{
TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1) == self()->getX86ProcessorInfo().supportsSSE4_1(), "supportsSSE4_1() failed\n");
if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1))
{
self()->setSupportsArrayTranslateTRTO();
Expand All @@ -433,8 +432,6 @@ OMR::X86::CodeGenerator::initializeX86(TR::Compilation *comp)
static bool disableX86TROT = feGetEnv("TR_disableX86TROT") != NULL;
if (!disableX86TROT)
{
TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1) == self()->getX86ProcessorInfo().supportsSSE4_1(), "supportsSSE4_1() failed\n");
TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE2) == self()->getX86ProcessorInfo().supportsSSE2(), "supportsSSE4_1() failed\n");
if (comp->target().cpu.supportsFeature(OMR_FEATURE_X86_SSE4_1))
{
self()->setSupportsArrayTranslateTROT();
Expand Down
2 changes: 0 additions & 2 deletions compiler/x/codegen/OMRInstOpCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,6 @@ template <typename TBuffer> typename TBuffer::cursor_t OMR::X86::InstOpCode::OpC
TR::Instruction::REX rex(rexbits);
rex.W = rex_w;

TR_ASSERT_FATAL(comp->compileRelocatableCode() || comp->isOutOfProcessCompilation() || comp->compilePortableCode() || comp->target().cpu.supportsAVX() == TR::CodeGenerator::getX86ProcessorInfo().supportsAVX(), "supportsAVX() failed\n");

if (enc != VEX_L___)
{
if (enc >> 2 && enc != VEX_LZ)
Expand Down
1 change: 0 additions & 1 deletion compiler/x/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5818,7 +5818,6 @@ TR::Register* OMR::X86::TreeEvaluator::floatingPointBinaryArithmeticEvaluator(TR
TR::Node* operandNode0 = node->getChild(0);
TR::Node* operandNode1 = node->getChild(1);

TR_ASSERT_FATAL(cg->comp()->compileRelocatableCode() || cg->comp()->isOutOfProcessCompilation() || cg->comp()->compilePortableCode() || cg->comp()->target().cpu.supportsAVX() == TR::CodeGenerator::getX86ProcessorInfo().supportsAVX(), "supportsAVX() failed\n");
bool useRegMemForm = cg->comp()->target().cpu.supportsAVX();

if (useRegMemForm)
Expand Down
2 changes: 0 additions & 2 deletions compiler/x/codegen/X86BinaryEncoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1547,8 +1547,6 @@ TR::X86RegInstruction::enlarge(int32_t requestedEnlargementSize, int32_t maxEnla
if (disableRexExpansion || cg()->comp()->getOption(TR_DisableZealousCodegenOpts))
return OMR::X86::EnlargementResult(0, 0);

TR_ASSERT_FATAL(cg()->comp()->compileRelocatableCode() || cg()->comp()->isOutOfProcessCompilation() || cg()->comp()->compilePortableCode() || cg()->comp()->target().cpu.supportsAVX() == cg()->getX86ProcessorInfo().supportsAVX(), "supportsAVX() failed\n");

if (getOpCode().info().supportsAVX() && cg()->comp()->target().cpu.supportsAVX())
return OMR::X86::EnlargementResult(0, 0); // REX expansion isn't allowed for AVX instructions

Expand Down
84 changes: 78 additions & 6 deletions compiler/x/env/OMRCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,39 @@ OMR::X86::CPU::detect(OMRPortLibrary * const omrPortLib)
processorDescription.features[i] &= featureMasks.features[i];
}

bool disableAVX = true;
bool disableAVX512 = true;

// Check XCR0 register for OS support of xmm/ymm/zmm
if (TRUE == omrsysinfo_processor_has_feature(&processorDescription, OMR_FEATURE_X86_OSXSAVE))
{
static const bool disableAVX = feGetEnv("TR_DisableAVX") != NULL;
if (((6 & _xgetbv(0)) != 6) || disableAVX) // '6' = mask for XCR0[2:1]='11b' (XMM state and YMM state are enabled)
{
// Unset OSXSAVE if not enabled via CR0
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_OSXSAVE, FALSE);
}
// '6' = mask for XCR0[2:1]='11b' (XMM state and YMM state are enabled)
disableAVX = ((6 & _xgetbv(0)) != 6);
// 'e6' = (mask for XCR0[7:5]='111b' (Opmask, ZMM_Hi256, Hi16_ZMM) + XCR0[2:1]='11b' (XMM/YMM))
disableAVX512 = ((0xe6 & _xgetbv(0)) != 0xe6);
}

if(disableAVX)
{
// Unset AVX/AVX2 if not enabled via CR0 or otherwise disabled
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX2, FALSE);
}

if (disableAVX512)
{
// Unset AVX-512 if not enabled via CR0 or otherwise disabled
// If other AVX-512 extensions are supported in the port library, they need to be disabled here
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512F, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512VL, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512BW, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512CD, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512DQ, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512_BITALG, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512_VBMI, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512_VBMI2, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512_VNNI, FALSE);
omrsysinfo_processor_set_feature(&processorDescription, OMR_FEATURE_X86_AVX512_VPOPCNTDQ, FALSE);
}

return TR::CPU(processorDescription);
Expand Down Expand Up @@ -254,9 +279,56 @@ OMR::X86::CPU::is(OMRProcessorArchitecture p)
return _processorDescription.processor == p;
}

bool
OMR::X86::CPU::is_feature_disabled(uint32_t feature)
{
TR_CompilationOptions option = (TR_CompilationOptions) 0;
TR::Compilation *comp = TR::comp();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to be convinced this is the best way to do this because pulling the compilation object from TLS has never proven to be cheaper than retrieving it from elsewhere or passing it in as a parameter. It is really useful for debugging, in places where compile-time performance is not a concern (e.g., the ASSERT macro), or in code that is difficult to modify to introduce a compilation object. supports_feature (which immediately calls this function) is called from a number of places, so some compile-time benchmarking is required.

Did you consider passing in the compilation object to supports_feature or caching the compilation object in the CPU object itself? If so, please comment why forgoing those alternatives justifies the compile-time cost of using TR::comp().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Passing the compilation object to supports_feature would require changes in both OpenJ9 and OMR, including at every call to this function. That would introduce a large number of changes that would need to be introduced with a coordinated merge between OpenJ9 and OMR.

I don't think it makes sense for TR::CPU to cache the compilation object either. It would need to change with each compilation, and my understanding is that environment information is shared across all compilations.

The only other thing I can think of is to use TR::comp(), cache the result of comp->getOption() in a static field, and hope that gcc can optimize it. But doing this would have other drawbacks, in that you can't disable features for specific methods only, and the CLI would probably need to verify that these options are only applied globally.


switch (feature)
{
case OMR_FEATURE_X86_SSE3:
option = TR_DisableSSE3;
break;
case OMR_FEATURE_X86_SSE4_1:
option = TR_DisableSSE4_1;
break;
case OMR_FEATURE_X86_SSE4_2:
option = TR_DisableSSE4_2;
break;
case OMR_FEATURE_X86_AVX:
option = TR_DisableAVX;
break;
case OMR_FEATURE_X86_AVX2:
option = TR_DisableAVX2;
break;
case OMR_FEATURE_X86_AVX512F:
case OMR_FEATURE_X86_AVX512VL:
case OMR_FEATURE_X86_AVX512BW:
case OMR_FEATURE_X86_AVX512CD:
case OMR_FEATURE_X86_AVX512DQ:
case OMR_FEATURE_X86_AVX512ER:
case OMR_FEATURE_X86_AVX512PF:
case OMR_FEATURE_X86_AVX512_BITALG:
case OMR_FEATURE_X86_AVX512_IFMA:
case OMR_FEATURE_X86_AVX512_VBMI:
case OMR_FEATURE_X86_AVX512_VBMI2:
case OMR_FEATURE_X86_AVX512_VNNI:
case OMR_FEATURE_X86_AVX512_VPOPCNTDQ:
option = TR_DisableAVX512;
default:
break;
}

return option && comp && comp->getOption(option);
}

bool
OMR::X86::CPU::supportsFeature(uint32_t feature)
{
if (is_feature_disabled(feature))
return false;

if (TR::Compiler->omrPortLib == NULL)
return self()->supports_feature_old_api(feature);

Expand Down
1 change: 1 addition & 0 deletions compiler/x/env/OMRCPU.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ class OMR_EXTENSIBLE CPU : public OMR::CPU
bool supportsFeature(uint32_t feature);
bool supports_feature_old_api(uint32_t feature);
bool supports_feature_test(uint32_t feature);
bool is_feature_disabled(uint32_t feature);

/**
* @brief Returns name of the current processor
Expand Down
38 changes: 31 additions & 7 deletions compiler/x/runtime/X86Runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,39 @@ inline bool jitGetCPUID(TR_X86CPUIDBuffer* pBuffer)
pBuffer->_featureFlags8 = CPUInfo[EBX];
pBuffer->_featureFlags10 = CPUInfo[ECX];

// Check for XSAVE
bool disableAVX = true;
bool disableAVX512 = true;

// Check XCR0 register for OS support of xmm/ymm/zmm
if(pBuffer->_featureFlags2 & TR_OSXSAVE)
{
static const bool disableAVX = feGetEnv("TR_DisableAVX") != NULL;
if(((6 & _xgetbv(0)) != 6) || disableAVX) // '6' = mask for XCR0[2:1]='11b' (XMM state and YMM state are enabled)
{
// Unset OSXSAVE if not enabled via CR0
pBuffer->_featureFlags2 &= ~TR_OSXSAVE;
}
// '6' = mask for XCR0[2:1]='11b' (XMM state and YMM state are enabled)
disableAVX = ((6 & _xgetbv(0)) != 6);
// 'e6' = (mask for XCR0[7:5]='111b' (Opmask, ZMM_Hi256, Hi16_ZMM) + XCR0[2:1]='11b' (XMM/YMM))
disableAVX512 = ((0xe6 & _xgetbv(0)) != 0xe6);
}

if(disableAVX)
{
// Unset AVX/AVX2 if not enabled via CR0 or otherwise disabled
pBuffer->_featureFlags2 &= ~TR_AVX;
pBuffer->_featureFlags8 &= ~TR_AVX2;
}

if (disableAVX512)
{
// Unset AVX-512 if not enabled via CR0 or otherwise disabled
// If other AVX-512 extensions are supported in the old cpuid API, they need to be disabled here
pBuffer->_featureFlags8 &= ~TR_AVX512F;
pBuffer->_featureFlags8 &= ~TR_AVX512VL;
pBuffer->_featureFlags8 &= ~TR_AVX512BW;
pBuffer->_featureFlags8 &= ~TR_AVX512CD;
pBuffer->_featureFlags8 &= ~TR_AVX512DQ;
pBuffer->_featureFlags10 &= ~TR_AVX512_BITALG;
pBuffer->_featureFlags10 &= ~TR_AVX512_VBMI;
pBuffer->_featureFlags10 &= ~TR_AVX512_VBMI2;
pBuffer->_featureFlags10 &= ~TR_AVX512_VNNI;
pBuffer->_featureFlags10 &= ~TR_AVX512_VPOPCNTDQ;
}

/* Mask out the bits the compiler does not care about.
Expand Down