Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
herobuxx committed Nov 1, 2024
2 parents e4b34ac + da083e3 commit dc387dc
Show file tree
Hide file tree
Showing 79 changed files with 3,553 additions and 177 deletions.
1 change: 1 addition & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,7 @@ X86 Support
- Supported intrinsics for ``MOVRS AND AVX10.2``.
* Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
- Support ISA of ``AMX-FP8``.
- Support ISA of ``AMX-TRANSPOSE``.

Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
11 changes: 11 additions & 0 deletions clang/include/clang/Basic/BuiltinsX86_64.def
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i",
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
Expand All @@ -148,6 +153,12 @@ TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")
TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex")
TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex")

TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")

TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -6301,6 +6301,8 @@ def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group<m_x86_Features_Group>;
def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
def mno_cmpccxadd : Flag<["-"], "mno-cmpccxadd">, Group<m_x86_Features_Group>;
def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
Expand Down
11 changes: 8 additions & 3 deletions clang/lib/Basic/DiagnosticIDs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,12 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc,
DiagID != diag::fatal_too_many_errors && Diag.FatalsAsError)
Result = diag::Severity::Error;

// The rest of the mappings are only applicable to diagnostics associated
// with a SourceLocation; bail out early for others.
if (!Diag.hasSourceManager())
return Result;

const auto &SM = Diag.getSourceManager();
// Custom diagnostics are always emitted in system headers.
bool ShowInSystemHeader =
!GetDiagInfo(DiagID) || GetDiagInfo(DiagID)->WarnShowInSystemHeader;
Expand All @@ -583,15 +589,14 @@ DiagnosticIDs::getDiagnosticSeverity(unsigned DiagID, SourceLocation Loc,
// because we also want to ignore extensions and warnings in -Werror and
// -pedantic-errors modes, which *map* warnings/extensions to errors.
if (State->SuppressSystemWarnings && !ShowInSystemHeader && Loc.isValid() &&
Diag.getSourceManager().isInSystemHeader(
Diag.getSourceManager().getExpansionLoc(Loc)))
SM.isInSystemHeader(SM.getExpansionLoc(Loc)))
return diag::Severity::Ignored;

// We also ignore warnings due to system macros
bool ShowInSystemMacro =
!GetDiagInfo(DiagID) || GetDiagInfo(DiagID)->WarnShowInSystemMacro;
if (State->SuppressSystemWarnings && !ShowInSystemMacro && Loc.isValid() &&
Diag.getSourceManager().isInSystemMacro(Loc))
SM.isInSystemMacro(Loc))
return diag::Severity::Ignored;

return Result;
Expand Down
10 changes: 8 additions & 2 deletions clang/lib/Basic/Targets/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXCOMPLEX = true;
} else if (Feature == "+amx-fp8") {
HasAMXFP8 = true;
} else if (Feature == "+amx-transpose") {
HasAMXTRANSPOSE = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
Expand Down Expand Up @@ -951,6 +953,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_COMPLEX__");
if (HasAMXFP8)
Builder.defineMacro("__AMX_FP8__");
if (HasAMXTRANSPOSE)
Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
Expand Down Expand Up @@ -1079,9 +1083,10 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-bf16", true)
.Case("amx-complex", true)
.Case("amx-fp16", true)
.Case("amx-fp8", true)
.Case("amx-int8", true)
.Case("amx-tile", true)
.Case("amx-fp8", true)
.Case("amx-transpose", true)
.Case("avx", true)
.Case("avx10.1-256", true)
.Case("avx10.1-512", true)
Expand Down Expand Up @@ -1198,9 +1203,10 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-bf16", HasAMXBF16)
.Case("amx-complex", HasAMXCOMPLEX)
.Case("amx-fp16", HasAMXFP16)
.Case("amx-fp8", HasAMXFP8)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
.Case("amx-fp8", HasAMXFP8)
.Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
.Case("avx10.1-256", HasAVX10_1)
.Case("avx10.1-512", HasAVX10_1_512)
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Basic/Targets/X86.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXBF16 = false;
bool HasAMXCOMPLEX = false;
bool HasAMXFP8 = false;
bool HasAMXTRANSPOSE = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
Expand Down
52 changes: 52 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16994,6 +16994,58 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// instruction, but it will create a memset that won't be optimized away.
return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
}
// Corresponding to intrinsics which will return 2 tiles (tile0_tile1).
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: {
Intrinsic::ID IID;
switch (BuiltinID) {
default:
llvm_unreachable("Unsupported intrinsic!");
case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
IID = Intrinsic::x86_t2rpntlvwz0_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
IID = Intrinsic::x86_t2rpntlvwz1_internal;
break;
case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
break;
}

// Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
{Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});

auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
assert(PtrTy && "arg3 must be of pointer type");
QualType PtreeTy = PtrTy->getPointeeType();
llvm::Type *TyPtee = ConvertType(PtreeTy);

// Bitcast amx type (x86_amx) to vector type (256 x i32)
// Then store tile0 into DstPtr0
Value *T0 = Builder.CreateExtractValue(Call, 0);
Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
{TyPtee}, {T0});
Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);

// Then store tile1 into DstPtr1
Value *T1 = Builder.CreateExtractValue(Call, 1);
Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
{TyPtee}, {T1});
Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);

// Note: Here we avoid directly using x86_tilestored64_internal to store
// the results, because it cannot guarantee the scope of the memory written.
// That may cause shape reloads after the first amx intrinsic, which the
// current amx register allocation has no ability to handle.

return Store;
}
case X86::BI__ud2:
// llvm.trap makes a ud2a instruction on x86.
return EmitTrapCall(Intrinsic::trap);
Expand Down
3 changes: 2 additions & 1 deletion clang/lib/Headers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,9 @@ set(x86_files
ammintrin.h
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
amxfp8intrin.h
amxintrin.h
amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Headers/amxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// bytes. Since there is no 2D type in llvm IR, we use vector type to
/// represent 2D tile and the fixed size is maximum amx tile register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
typedef int _tile1024i_1024a
__attribute__((__vector_size__(1024), __aligned__(1024)));

/// This is internal intrinsic. C/C++ user should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
Expand Down
Loading

0 comments on commit dc387dc

Please sign in to comment.