Skip to content

Commit

Permalink
Merge pull request #1687 from AlexBrownAMD/release/rocm-rel-5.5
Browse files Browse the repository at this point in the history
Hotfix: Fix offset calculation to prevent overflow if offset is really large
  • Loading branch information
AlexBrownAMD authored Mar 28, 2023
2 parents e8a3c7d + 38d444a commit d3bbb8b
Show file tree
Hide file tree
Showing 29 changed files with 8,460 additions and 4,160 deletions.
8 changes: 4 additions & 4 deletions HostLibraryTests/hip/HipSolutionAdapter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ TEST(HipSolutionAdapterTest, BetaOnlyKernel_Zero)

k.args.append<float*>("D", d_d);
k.args.append<float const*>("C", c_d);
k.args.append<uint64_t>("offsetD", desc.offset());
k.args.append<uint64_t>("offsetC", desc.offset());
k.args.append<unsigned int>("strideD1", desc.strides()[1]);
k.args.append<unsigned int>("strideD2", desc.strides()[2]);
k.args.append<unsigned int>("strideC1", desc.strides()[1]);
k.args.append<unsigned int>("strideC2", desc.strides()[2]);
k.args.append<unsigned int>("size0", desc.sizes()[0]);
k.args.append<unsigned int>("size1", desc.sizes()[1]);
k.args.append<unsigned int>("size2", desc.sizes()[2]);
k.args.append<unsigned int>("offsetD", desc.offset());
k.args.append<unsigned int>("offsetC", desc.offset());
k.args.append<float>("beta", 0.0f);

hip::SolutionAdapter adapter(false);
Expand Down Expand Up @@ -149,15 +149,15 @@ TEST(HipSolutionAdapterTest, BetaOnlyKernel_Nonzero)

k.args.append<float*>("D", d_d);
k.args.append<float const*>("C", c_d);
k.args.append<uint64_t>("offsetD", desc.offset());
k.args.append<uint64_t>("offsetC", desc.offset());
k.args.append<unsigned int>("strideD1", desc.strides()[1]);
k.args.append<unsigned int>("strideD2", desc.strides()[2]);
k.args.append<unsigned int>("strideC1", desc.strides()[1]);
k.args.append<unsigned int>("strideC2", desc.strides()[2]);
k.args.append<unsigned int>("size0", desc.sizes()[0]);
k.args.append<unsigned int>("size1", desc.sizes()[1]);
k.args.append<unsigned int>("size2", desc.sizes()[2]);
k.args.append<unsigned int>("offsetD", desc.offset());
k.args.append<unsigned int>("offsetC", desc.offset());
k.args.append<float>("beta", beta);

hip::SolutionAdapter adapter(false);
Expand Down
4 changes: 2 additions & 2 deletions HostLibraryTests/ocl/OclSolutionAdapter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,15 @@ KernelInvocation initKernelParams(Tensile::TensorDescriptor const& desc,
// k.args.append<cl::Buffer>("C", buffer_C);
k.args.append<float*>("D", device_d);
k.args.append<float const*>("C", device_c);
k.args.append<uint64_t>("offsetD", desc.offset());
k.args.append<uint64_t>("offsetC", desc.offset());
k.args.append<unsigned int>("strideD1", desc.strides()[1]);
k.args.append<unsigned int>("strideD2", desc.strides()[2]);
k.args.append<unsigned int>("strideC1", desc.strides()[1]);
k.args.append<unsigned int>("strideC2", desc.strides()[2]);
k.args.append<unsigned int>("size0", desc.sizes()[0]);
k.args.append<unsigned int>("size1", desc.sizes()[1]);
k.args.append<unsigned int>("size2", desc.sizes()[2]);
k.args.append<unsigned int>("offsetD", desc.offset());
k.args.append<unsigned int>("offsetC", desc.offset());
k.args.append<float>("beta", beta);

return k;
Expand Down
29 changes: 14 additions & 15 deletions Tensile/Common.py

Large diffs are not rendered by default.

65 changes: 5 additions & 60 deletions Tensile/Components/MAC_F32.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

from ..Component import Component, MAC
from ..DataType import DataType
import queue

class MAC_F32_Plain(MAC):
"""
Expand All @@ -48,10 +47,6 @@ def __call__(self, writer, m, innerUnroll):
else:
raise RuntimeError("FMA instruction specified but not supported on {}".format(kernel["ISA"]))

dualMacEnable = 0
if writer.asmCaps["v_dual_fmac_f32"] and kernel["WavefrontSize"] == 32:
dualMacEnable = 1

if not writer.asmCaps[instruction]:
raise RuntimeError("{} instruction specified but not supported on {}".format(instruction, kernel["ISA"]))

Expand All @@ -72,8 +67,6 @@ def __call__(self, writer, m, innerUnroll):
priority = Component.Priority.find(writer)
macIdx = 0

instQ = queue.Queue()

for iui in range(0, innerUnroll):
for idx1 in range(0, kernel["ThreadTile1"]):
for idx0 in range(0, kernel["ThreadTile0"]):
Expand All @@ -87,67 +80,19 @@ def __call__(self, writer, m, innerUnroll):
vars["aStr"] = "v[vgprValuA_X{m}_I{iui} + {a}]".format_map(vars)
vars["bStr"] = "v[vgprValuB_X{m}_I{iui} + {b}]".format_map(vars)

if dualMacEnable == 1:
instVars = {}
instVars["endLine"] = writer.endLine
instVars["cStr"] = vars["cStr"]
instVars["aStr"] = vars["aStr"]
instVars["bStr"] = vars["bStr"]
instVars["a"] = vars["a"]
instVars["b"] = vars["b"]
instVars["instruction"] = instruction

if instQ.empty():
instQ.put(instVars)
else:
# pop instruction
prevVars = instQ.queue[0]

if self.isLegal(instVars, prevVars):
# make dual fmac
kStr += "v_dual_fmac_f32 {cStr}, {aStr}, {bStr}".format_map(prevVars) + " :: v_dual_fmac_f32 {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
kStr += priority(writer, 1, "Raise priority while processing macs")
instQ.get()
else:
# push instruction
instQ.put(instVars)

if instruction == "v_fma_f32":
kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars)
else:
if instruction == "v_fma_f32":
kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars)
else:
kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)

kStr += priority(writer, 1, "Raise priority while processing macs")

if macIdx == kernel["PerformanceWaitLocation"]:
kStr += self.popAllInstructions(instruction, instQ, priority, writer)
kStr += "s_waitcnt lgkmcnt({PerformanceWaitCount}) // extra wait for performance{endLine}".format_map(vars)
if macIdx == kernel["PerformanceSyncLocation"]:
kStr += self.popAllInstructions(instruction, instQ, priority, writer)
kStr += "s_barrier // extra barrier for performance{endLine}".format_map(vars)
macIdx += 1

kStr += self.popAllInstructions(instruction, instQ, priority, writer)
kStr += priority(writer, 0, "Reset priority after macs")

return kStr

def popAllInstructions(self, inst, instructionQueue, priority, writer):
# pop all instructions
kStr = ""
while instructionQueue.qsize() > 0:
prevVars = instructionQueue.get()
if inst == "v_fma_f32":
kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(prevVars)
else:
kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(prevVars)
kStr += priority(writer, 1, "Raise priority while processing macs")
return kStr

def isLegal(self, instVars0, instVars1):
# VOPD has some restrictions.
# To avoid VGPR source-cache port limits, guarantee at least 1 duplicated SRC.
if instVars0["cStr"] == instVars1["cStr"]:
return False
if instVars0["a"] == instVars1["a"] or instVars0["b"] == instVars1["b"]:
return True
return False
10 changes: 5 additions & 5 deletions Tensile/Components/Signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def __call__(self, writer):
kStr += self.addArgument( 'A', '8', offset, "global_buffer", srcValueType, "generic"); offset += 8
kStr += self.addArgument( 'B', '8', offset, "global_buffer", srcValueType, "generic"); offset += 8

kStr += self.addArgument("OffsetD", '8', offset, "by_value", "u64"); offset += 8
kStr += self.addArgument("OffsetC", '8', offset, "by_value", "u64"); offset += 8
kStr += self.addArgument("OffsetA", '8', offset, "by_value", "u64"); offset += 8
kStr += self.addArgument("OffsetB", '8', offset, "by_value", "u64"); offset += 8

useSize = max(4, cptByte)
kStr += self.addArgument( "alpha", useSize, offset, "by_value", cptValueType); offset += useSize
if kernel["ProblemType"]["UseBeta"]:
Expand Down Expand Up @@ -238,11 +243,6 @@ def __call__(self, writer):
kStr += self.addArgument( "WgmRemainder1", '4', offset, "by_value", "u32"); offset += 4
kStr += self.addArgument( "MagicNumberWgmRemainder1", '4', offset, "by_value", "u32"); offset += 4

kStr += self.addArgument("OffsetD", '4', offset, "by_value", "u32"); offset += 4
kStr += self.addArgument("OffsetC", '4', offset, "by_value", "u32"); offset += 4
kStr += self.addArgument("OffsetA", '4', offset, "by_value", "u32"); offset += 4
kStr += self.addArgument("OffsetB", '4', offset, "by_value", "u32"); offset += 4

kStr += self.addArgument( "padding", '4', offset, "by_value", "u32"); offset += 4
kStr += " .group_segment_fixed_size: %u%s" % ( group_segment_size, writer.endLine ) #XXXXXX
kStr += " .kernarg_segment_align: %u%s" % ( 8, writer.endLine )
Expand Down
Loading

0 comments on commit d3bbb8b

Please sign in to comment.