Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
amcamd committed Oct 11, 2018
2 parents caacebd + 5cd8a4c commit 7556393
Show file tree
Hide file tree
Showing 120 changed files with 12,995 additions and 1,356 deletions.
24 changes: 19 additions & 5 deletions Tensile/Common.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,9 +267,21 @@
# If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class)
"AssertFree0ElementMultiple" : [1,2,4,8],

# When creating the kernel, assume that the 'second' free index size is some
# multiple of the element size.
# "second" free index is FreeIndex[1] and usually letter "J"
# 1 indicates no restriction (since all sizes are multiples of 1)
# If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class)
#"AssertFree1ElementMultiple" : [1,2,4,8],
"AssertFree1ElementMultiple" : [1], # TODO, support broader range here

# Generate code inside kernel to check assertions above on Tensor dimensions
"CheckTensorDimAsserts": [False, True],

# Generate code inside kernel to check several dimension overflow cases, in particular around use of 32-bit calcs
# 0 = no check, 1=checks for cases that should be avoided through assertions and kernel selection, 2=checks for cases that should never happen
"CheckDimOverflow": [0,1,2],

# For Block Mapping type:
# 0 : Use hardware-assigned wg number with no remapping.
# N : WG block width. "Wrap" to a new wg1 "row" assignment after N WGs assigned in that row.
Expand Down Expand Up @@ -429,11 +441,13 @@
{"BufferLoad": [ True ] },
{"BufferStore": [ True ] },
{"DirectToLds": [ True ] },
{"PreciseBoundsCheck": [ False ] },
{"PreciseBoundsCheck": [ True ] },
{"UseSgprForGRO": [ -1 ] },
{"AssertSummationElementMultiple": [ 1 ] },
{"AssertFree0ElementMultiple": [ 1 ] },
{"AssertFree1ElementMultiple": [ 1 ] },
{"CheckTensorDimAsserts" : [ False ] },
{"CheckDimOverflow" : [ 0 ] },

{"GlobalSplitU": [ 1 ] },
{"GlobalSplitUSummationAssignmentRoundRobin": [ True ] },
Expand Down Expand Up @@ -675,9 +689,11 @@ def tryAssembler(isaVersion, asmString):
if result != "":
return 0 # stdout and stderr must be empty
except subprocess.CalledProcessError, e:
if globalParameters["PrintLevel"] >=2:
print "CalledProcessError", e
return 0 # error, not supported

return 1 # syntax works for
return 1 # syntax works


################################################################################
Expand Down Expand Up @@ -737,13 +753,11 @@ def assignGlobalParameters( config ):
for (v) in globalParameters["SupportedISA"]:
globalParameters["AsmCaps"][v] = {}
isaVersion = "gfx" + "".join(map(str,v))
asmCmd = "%s -x assembler -target amdgcn-amdhsa -mcpu=%s -" \
% (globalParameters["AssemblerPath"], isaVersion)
# This doesn't work since assembler politely falls back to default with an unsupported mcpu argument:
globalParameters["AsmCaps"][v]["SupportedIsa"] = tryAssembler(isaVersion, "")
globalParameters["AsmCaps"][v]["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,v0")
globalParameters["AsmCaps"][v]["HasDirectToLds"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds")
globalParameters["AsmCaps"][v]["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2")
globalParameters["AsmCaps"][v]["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34")
caps = ""
for k in globalParameters["AsmCaps"][v]:
caps += " %s=%u" % (k, globalParameters["AsmCaps"][v][k])
Expand Down
19 changes: 18 additions & 1 deletion Tensile/Configs/miopen/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ P=problems
DEEPBENCH_CONV_1x1=$P/nn/deepbench_conv_1x1_batchN.yml $P/nn/deepbench_conv_1x1_batch1.yml
RESNET=$P/nn/resnet_batch64_B.yml

# Override SCHED as vega10, vega20
SCHED=vega10

# commonly-used headers and footers:
HEADER=boiler/header.yml
FOOTER=boiler/library_logic_vega10_only.yml
FOOTER=boiler/library_logic_$(SCHED)_only.yml

# Override TYPE as sgemm, hgemm (hgemm_hpa, dgemm, etc in future)
TYPE=sgemm
Expand All @@ -25,6 +28,9 @@ SOLUTION_SKINNY=solutions/$(TYPE)_skinny_explore_$(EXPLORE_LEVEL).yml

all: \
$(TYPE)_resnet.yaml \
$(TYPE)_resnet50_nn.yaml \
$(TYPE)_resnet50_nt.yaml \
$(TYPE)_resnet50_tn.yaml \
$(TYPE)_deepbench_conv1x1.yaml \
$(TYPE)_deepbench_gemm_nn.yaml \
$(TYPE)_deepbench_gemm_nt.yaml \
Expand All @@ -35,6 +41,17 @@ $(TYPE)_resnet.yaml: $(HEADER) types/$(TYPE)_nn.yml \
$(SOLUTION_SKINNY) $(RESNET) \
$(FOOTER)

# Resnet50
$(TYPE)_resnet50_nn.yaml: $(HEADER) types/$(TYPE)_nn.yml \
$(SOLUTION_SKINNY) $P/nn/resnet50_all.yml \
$(FOOTER)
$(TYPE)_resnet50_nt.yaml: $(HEADER) types/$(TYPE)_nt.yml \
$(SOLUTION_SKINNY) $P/nt/resnet50_all.yml \
$(FOOTER)
$(TYPE)_resnet50_tn.yaml: $(HEADER) types/$(TYPE)_tn.yml \
$(SOLUTION_SKINNY) $P/tn/resnet50_all.yml \
$(FOOTER)

# DeepBench Convolution:
$(TYPE)_deepbench_conv1x1.yaml: $(HEADER) types/$(TYPE)_nn.yml \
$(SOLUTION_SKINNY) $(DEEPBENCH_CONV_1x1) \
Expand Down
33 changes: 33 additions & 0 deletions Tensile/Configs/miopen/archives/resnet50/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Start with the 6 asm_full logic files

- vega20_Cijk_Ailk_Bjlk_HB.yaml
- vega20_Cijk_Ailk_Bljk_HB.yaml
- vega20_Cijk_Alik_Bljk_HB.yaml
- vega20_Cijk_Ailk_Bjlk_SB.yaml
- vega20_Cijk_Ailk_Bljk_SB.yaml
- vega20_Cijk_Alik_Bljk_SB.yaml

from

- rocBLAS commit a85df88648587a0d2880a74c6c57964366ab02a1 for HGEMM
- rocBLAS commit 0ceb1ad64c8bda5473a1e1c3a74ab9ff204acbf8 for SGEMM

We then merge the 6 Resnet50-specific logic files archived in the "logic"
directory into the corresponding asm_full logic files of the same name,
resulting in the 6 combined asm_full logic files in

- rocBLAS commit ea27b3aba339b4fd48795153995d24dd96cd6457 for HGEMM+SGEMM

The 6 YAML configuration files used to generate the Resnet50-specific logic
files are archived in the "config" directory under the following names:

- hgemm_resnet50_nt.yaml
- hgemm_resnet50_nn.yaml
- hgemm_resnet50_tn.yaml
- sgemm_resnet50_nt.yaml
- sgemm_resnet50_nn.yaml
- sgemm_resnet50_tn.yaml

Note that we explicitly purged the 6 sizes with either n=49 or k=49 from
the Resnet50-specific logic files for HGEMM because they won't be using
the assembly kernels.
115 changes: 115 additions & 0 deletions Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
GlobalParameters:
MinimumRequiredVersion: 4.2.0
PrintLevel: 1
ForceRedoBenchmarkProblems: True
ForceRedoLibraryLogic: True
ForceRedoLibraryClient: True
CMakeBuildType: Release
EnqueuesPerSync: 1
SyncsPerBenchmark: 1
LibraryPrintDebug: False
NumElementsToValidate: 0
ValidationMaxToPrint: 4
ValidationPrintValids: False
ShortNames: False
MergeFiles: True
Platform: 0
Device: 0
KernelTime: True
PinClocks: True
SleepPercent: 200
DataInitTypeBeta : 0
CodeFromFiles: 1
SolutionSelectionAlg: 1
PrintWinnersOnly: 1

BenchmarkProblems:
########################################
# NN - standard
########################################
-
- # ProblemType
OperationType: GEMM
DataType: h
TransposeA: False
TransposeB: False
UseBeta: True
Batched: True
########################################
# Explore large number of ~10K half solutions
########################################
- # Benchmark Group
InitialSolutionParameters:
BenchmarkCommonParameters:
- EdgeType: ["ShiftPtr"]
- LoopTail: [True]
- KernelLanguage: ["Assembly"]
ForkParameters:
- FractionalLoad: [1]
- PrefetchGlobalRead: [ False, True ]
- PrefetchLocalRead: [ False, True]
- ThreadTile:
- [ 4, 4 ]
- [ 8, 4 ]
- [ 8, 8 ]
- [ 16, 8 ]
- [ 8, 16 ]
- [ 16, 16 ]
- WorkGroup:
- [ 16, 8, 2 ]
- [ 16, 4, 4 ]
- [ 16, 8, 1 ]
- [ 8, 32, 1 ]
- [ 16, 16, 1 ]
- [ 32, 8, 1 ]
- GlobalSplitU: [1,3,5]
- WorkGroupMapping: [1,8,64]
- DepthU: [ 8,16,24,32 ]
- VectorWidth: [4,8]
- GlobalReadVectorWidth: [1,8]
- LdsPadB: [0, -1 ]
- AssertSummationElementMultiple: [2]
- AssertFree0ElementMultiple: [2]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
# Resnet50 NN
- Exact: [ 784 , 128 , 64, 512 ] # beta= 0
- Exact: [ 784 , 512 , 64, 128 ] # beta= 0
- Exact: [ 3136 , 64 , 64, 64 ] # beta= 0
- Exact: [ 3136 , 64 , 64, 256 ] # beta= 0
- Exact: [ 3136 , 256 , 64, 64 ] # beta= 0
- Exact: [ 784 , 128 , 128, 512 ] # beta= 0
- Exact: [ 784 , 512 , 128, 128 ] # beta= 0
- Exact: [ 3136 , 64 , 128, 64 ] # beta= 0
- Exact: [ 3136 , 64 , 128, 256 ] # beta= 0
- Exact: [ 3136 , 256 , 128, 64 ] # beta= 0
- Exact: [ 3136 , 512 , 1, 2048 ] # beta= 0
- Exact: [ 3136 , 2048 , 1, 512 ] # beta= 0
- Exact: [ 12544 , 256 , 1, 1024 ] # beta= 0
- Exact: [ 12544 , 1024 , 1, 256 ] # beta= 0

LibraryLogic:
ScheduleName: "vega20"
DeviceNames: ["Device 66a0", "Device 66a7"]
ArchitectureName: "gfx906"

# ScheduleName: "vega10"
# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
# ArchitectureName: "gfx900"

# ScheduleName: "mi25"
# DeviceNames: ["Device 6860"]
# ArchitectureName: "gfx900"

# ScheduleName: "r9nano"
# DeviceNames: ["Device 7300"]
# ArchitectureName: "gfx803"

# ScheduleName: "hip"
# DeviceNames: ["Device 0000"]
# ArchitectureName: "fallback"

LibraryClient:
119 changes: 119 additions & 0 deletions Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
GlobalParameters:
MinimumRequiredVersion: 4.2.0
PrintLevel: 1
ForceRedoBenchmarkProblems: True
ForceRedoLibraryLogic: True
ForceRedoLibraryClient: True
CMakeBuildType: Release
EnqueuesPerSync: 1
SyncsPerBenchmark: 1
LibraryPrintDebug: False
NumElementsToValidate: 0
ValidationMaxToPrint: 4
ValidationPrintValids: False
ShortNames: False
MergeFiles: True
Platform: 0
Device: 0
KernelTime: True
PinClocks: True
SleepPercent: 200
DataInitTypeBeta : 0
CodeFromFiles: 1
SolutionSelectionAlg: 1
PrintWinnersOnly: 1

BenchmarkProblems:
########################################
# NT - standard
########################################
-
- # ProblemType
OperationType: GEMM
DataType: h
TransposeA: False
TransposeB: True
UseBeta: True
Batched: True
########################################
# Explore large number of ~10K half solutions
########################################
- # Benchmark Group
InitialSolutionParameters:
BenchmarkCommonParameters:
- EdgeType: ["ShiftPtr"]
- LoopTail: [True]
- KernelLanguage: ["Assembly"]
ForkParameters:
- FractionalLoad: [1]
- PrefetchGlobalRead: [ False, True ]
- PrefetchLocalRead: [ False, True]
- ThreadTile:
- [ 4, 4 ]
- [ 8, 4 ]
- [ 8, 8 ]
- [ 16, 8 ]
- [ 8, 16 ]
- [ 16, 16 ]
- WorkGroup:
- [ 16, 8, 2 ]
- [ 16, 4, 4 ]
- [ 16, 8, 1 ]
- [ 8, 32, 1 ]
- [ 16, 16, 1 ]
- [ 32, 8, 1 ]
- GlobalSplitU: [1,3,5]
- WorkGroupMapping: [1,8,64]
- DepthU: [ 8,16,24,32 ]
- VectorWidth: [4,8]
- GlobalReadVectorWidth: [1,8]
- LdsPadB: [0, -1 ]
- AssertSummationElementMultiple: [2]
- AssertFree0ElementMultiple: [2]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
# Resnet50 NT
- Exact: [ 49 , 512 , 64, 2048 ] # beta= 0
- Exact: [ 49 , 2048 , 64, 512 ] # beta= 0
- Exact: [ 196 , 256 , 64, 1024 ] # beta= 0
- Exact: [ 196 , 1024 , 64, 256 ] # beta= 0
- Exact: [ 784 , 128 , 64, 512 ] # beta= 0
- Exact: [ 784 , 512 , 64, 128 ] # beta= 0
- Exact: [ 3136 , 64 , 64, 64 ] # beta= 0
- Exact: [ 3136 , 256 , 64, 64 ] # beta= 0
- Exact: [ 3136 , 64 , 64, 256 ] # beta= 0
- Exact: [ 49 , 512 , 128, 2048 ] # beta= 0
- Exact: [ 49 , 2048 , 128, 512 ] # beta= 0
- Exact: [ 196 , 256 , 128, 1024 ] # beta= 0
- Exact: [ 196 , 1024 , 128, 256 ] # beta= 0
- Exact: [ 784 , 128 , 128, 512 ] # beta= 0
- Exact: [ 784 , 512 , 128, 128 ] # beta= 0
- Exact: [ 3136 , 64 , 128, 64 ] # beta= 0
- Exact: [ 3136 , 64 , 128, 256 ] # beta= 0
- Exact: [ 3136 , 256 , 128, 64 ] # beta= 0

LibraryLogic:
ScheduleName: "vega20"
DeviceNames: ["Device 66a0", "Device 66a7"]
ArchitectureName: "gfx906"

# ScheduleName: "vega10"
# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
# ArchitectureName: "gfx900"

# ScheduleName: "mi25"
# DeviceNames: ["Device 6860"]
# ArchitectureName: "gfx900"

# ScheduleName: "r9nano"
# DeviceNames: ["Device 7300"]
# ArchitectureName: "gfx803"

# ScheduleName: "hip"
# DeviceNames: ["Device 0000"]
# ArchitectureName: "fallback"

LibraryClient:
Loading

0 comments on commit 7556393

Please sign in to comment.