diff --git a/Tensile/Common.py b/Tensile/Common.py index 6f8f0ec42b..39cde127c2 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -267,9 +267,21 @@ # If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class) "AssertFree0ElementMultiple" : [1,2,4,8], + # When creating the kernel, assume that the 'second' free index size is some + # multiple of the element size. + # "second" free index is FreeIndex[1] and usually letter "J" + # 1 indicates no restriction (since all sizes are multiples of 1) + # If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class) + #"AssertFree1ElementMultiple" : [1,2,4,8], + "AssertFree1ElementMultiple" : [1], # TODO, support broader range here + # Generate code inside kernel to check assertions above on Tensor dimensions "CheckTensorDimAsserts": [False, True], + # Generate code inside kernel to check several dimension overflow cases, in particular around use of 32-bit calcs + # 0 = no check, 1=checks for cases that should be avoided through assertions and kernel selection, 2=checks for cases that should never happen + "CheckDimOverflow": [0,1,2], + # For Block Mapping type: # 0 : Use hardware-assigned wg number with no remapping. # N : WG block width. "Wrap" to a new wg1 "row" assignment after N WGs assigned in that row. 
@@ -429,11 +441,13 @@ {"BufferLoad": [ True ] }, {"BufferStore": [ True ] }, {"DirectToLds": [ True ] }, - {"PreciseBoundsCheck": [ False ] }, + {"PreciseBoundsCheck": [ True ] }, {"UseSgprForGRO": [ -1 ] }, {"AssertSummationElementMultiple": [ 1 ] }, {"AssertFree0ElementMultiple": [ 1 ] }, + {"AssertFree1ElementMultiple": [ 1 ] }, {"CheckTensorDimAsserts" : [ False ] }, + {"CheckDimOverflow" : [ 0 ] }, {"GlobalSplitU": [ 1 ] }, {"GlobalSplitUSummationAssignmentRoundRobin": [ True ] }, @@ -675,9 +689,11 @@ def tryAssembler(isaVersion, asmString): if result != "": return 0 # stdout and stderr must be empty except subprocess.CalledProcessError, e: + if globalParameters["PrintLevel"] >=2: + print "CalledProcessError", e return 0 # error, not supported - return 1 # syntax works for + return 1 # syntax works ################################################################################ @@ -737,13 +753,11 @@ def assignGlobalParameters( config ): for (v) in globalParameters["SupportedISA"]: globalParameters["AsmCaps"][v] = {} isaVersion = "gfx" + "".join(map(str,v)) - asmCmd = "%s -x assembler -target amdgcn-amdhsa -mcpu=%s -" \ - % (globalParameters["AssemblerPath"], isaVersion) - # This doesn't work since assembler politely falls back to default with an unsupported mcpu argument: globalParameters["AsmCaps"][v]["SupportedIsa"] = tryAssembler(isaVersion, "") globalParameters["AsmCaps"][v]["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,v0") globalParameters["AsmCaps"][v]["HasDirectToLds"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds") globalParameters["AsmCaps"][v]["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2") + globalParameters["AsmCaps"][v]["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34") caps = "" for k in globalParameters["AsmCaps"][v]: caps += " %s=%u" % (k, globalParameters["AsmCaps"][v][k]) diff --git a/Tensile/Configs/miopen/Makefile 
b/Tensile/Configs/miopen/Makefile index d20aacc2ce..14425d04cf 100644 --- a/Tensile/Configs/miopen/Makefile +++ b/Tensile/Configs/miopen/Makefile @@ -2,9 +2,12 @@ P=problems DEEPBENCH_CONV_1x1=$P/nn/deepbench_conv_1x1_batchN.yml $P/nn/deepbench_conv_1x1_batch1.yml RESNET=$P/nn/resnet_batch64_B.yml +# Override SCHED as vega10, vega20 +SCHED=vega10 + # commonly-used headers and footers: HEADER=boiler/header.yml -FOOTER=boiler/library_logic_vega10_only.yml +FOOTER=boiler/library_logic_$(SCHED)_only.yml # Override TYPE as sgemm, hgemm (hgemm_hpa, dgemm, etc in future) TYPE=sgemm @@ -25,6 +28,9 @@ SOLUTION_SKINNY=solutions/$(TYPE)_skinny_explore_$(EXPLORE_LEVEL).yml all: \ $(TYPE)_resnet.yaml \ + $(TYPE)_resnet50_nn.yaml \ + $(TYPE)_resnet50_nt.yaml \ + $(TYPE)_resnet50_tn.yaml \ $(TYPE)_deepbench_conv1x1.yaml \ $(TYPE)_deepbench_gemm_nn.yaml \ $(TYPE)_deepbench_gemm_nt.yaml \ @@ -35,6 +41,17 @@ $(TYPE)_resnet.yaml: $(HEADER) types/$(TYPE)_nn.yml \ $(SOLUTION_SKINNY) $(RESNET) \ $(FOOTER) +# Resnet50 +$(TYPE)_resnet50_nn.yaml: $(HEADER) types/$(TYPE)_nn.yml \ + $(SOLUTION_SKINNY) $P/nn/resnet50_all.yml \ + $(FOOTER) +$(TYPE)_resnet50_nt.yaml: $(HEADER) types/$(TYPE)_nt.yml \ + $(SOLUTION_SKINNY) $P/nt/resnet50_all.yml \ + $(FOOTER) +$(TYPE)_resnet50_tn.yaml: $(HEADER) types/$(TYPE)_tn.yml \ + $(SOLUTION_SKINNY) $P/tn/resnet50_all.yml \ + $(FOOTER) + # DeepBench Convolution: $(TYPE)_deepbench_conv1x1.yaml: $(HEADER) types/$(TYPE)_nn.yml \ $(SOLUTION_SKINNY) $(DEEPBENCH_CONV_1x1) \ diff --git a/Tensile/Configs/miopen/archives/resnet50/README.md b/Tensile/Configs/miopen/archives/resnet50/README.md new file mode 100644 index 0000000000..ac1eeeb3f9 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/README.md @@ -0,0 +1,33 @@ +Start with the 6 asm_full logic files + + - vega20_Cijk_Ailk_Bjlk_HB.yaml + - vega20_Cijk_Ailk_Bljk_HB.yaml + - vega20_Cijk_Alik_Bljk_HB.yaml + - vega20_Cijk_Ailk_Bjlk_SB.yaml + - vega20_Cijk_Ailk_Bljk_SB.yaml + - vega20_Cijk_Alik_Bljk_SB.yaml 
+ +from + + - rocBLAS commit a85df88648587a0d2880a74c6c57964366ab02a1 for HGEMM + - rocBLAS commit 0ceb1ad64c8bda5473a1e1c3a74ab9ff204acbf8 for SGEMM + +we merge the 6 Resnet50-specific logic files archived in the "logic" directory +into the corresponding asm_full logic files of the same name, resulting in the +6 combined asm_full logic files in + + - rocBLAS commit ea27b3aba339b4fd48795153995d24dd96cd6457 for HGEMM+SGEMM + +The 6 YAML configuration files used to generate the Resnet50-specific logic +files are archived in the "config" directory correspondingly named + + - hgemm_resnet50_nt.yaml + - hgemm_resnet50_nn.yaml + - hgemm_resnet50_tn.yaml + - sgemm_resnet50_nt.yaml + - sgemm_resnet50_nn.yaml + - sgemm_resnet50_tn.yaml + +Note that we explicitly purged the 6 sizes with either n=49 or k=49 from +the Resnet50-specific logic files for HGEMM because they won't be using +the assembly kernels. diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml new file mode 100644 index 0000000000..e3d2c7c8af --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml @@ -0,0 +1,115 @@ +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + CodeFromFiles: 1 + SolutionSelectionAlg: 1 + PrintWinnersOnly: 1 + +BenchmarkProblems: + ######################################## + # NN - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: 
False + UseBeta: True + Batched: True + ######################################## + # Explore large number of ~10K half solutions + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False, True] + - ThreadTile: + - [ 4, 4 ] + - [ 8, 4 ] + - [ 8, 8 ] + - [ 16, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] + - [ 16, 8, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1,3,5] + - WorkGroupMapping: [1,8,64] + - DepthU: [ 8,16,24,32 ] + - VectorWidth: [4,8] + - GlobalReadVectorWidth: [1,8] + - LdsPadB: [0, -1 ] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: +# Resnet50 NN + - Exact: [ 784 , 128 , 64, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 64, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 64, 64 ] # beta= 0 + - Exact: [ 784 , 128 , 128, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 128, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 512 , 1, 2048 ] # beta= 0 + - Exact: [ 3136 , 2048 , 1, 512 ] # beta= 0 + - Exact: [ 12544 , 256 , 1, 1024 ] # beta= 0 + - Exact: [ 12544 , 1024 , 1, 256 ] # beta= 0 + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# ArchitectureName: "gfx900" + +# 
ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml new file mode 100644 index 0000000000..6ec161cf0c --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml @@ -0,0 +1,119 @@ +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + CodeFromFiles: 1 + SolutionSelectionAlg: 1 + PrintWinnersOnly: 1 + +BenchmarkProblems: + ######################################## + # NT - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + ######################################## + # Explore large number of ~10K half solutions + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False, True] + - ThreadTile: + - [ 4, 4 ] + - [ 8, 4 ] + - [ 8, 8 ] + - [ 16, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] + - [ 16, 8, 1 ] + - [ 8, 32, 
1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1,3,5] + - WorkGroupMapping: [1,8,64] + - DepthU: [ 8,16,24,32 ] + - VectorWidth: [4,8] + - GlobalReadVectorWidth: [1,8] + - LdsPadB: [0, -1 ] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: +# Resnet50 NT + - Exact: [ 49 , 512 , 64, 2048 ] # beta= 0 + - Exact: [ 49 , 2048 , 64, 512 ] # beta= 0 + - Exact: [ 196 , 256 , 64, 1024 ] # beta= 0 + - Exact: [ 196 , 1024 , 64, 256 ] # beta= 0 + - Exact: [ 784 , 128 , 64, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 64, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 256 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 256 ] # beta= 0 + - Exact: [ 49 , 512 , 128, 2048 ] # beta= 0 + - Exact: [ 49 , 2048 , 128, 512 ] # beta= 0 + - Exact: [ 196 , 256 , 128, 1024 ] # beta= 0 + - Exact: [ 196 , 1024 , 128, 256 ] # beta= 0 + - Exact: [ 784 , 128 , 128, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 128, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 128, 64 ] # beta= 0 + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_tn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_tn.yaml new file mode 
100644 index 0000000000..d77badde25 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_tn.yaml @@ -0,0 +1,110 @@ +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + CodeFromFiles: 1 + SolutionSelectionAlg: 1 + PrintWinnersOnly: 1 + +BenchmarkProblems: + ######################################## + # TN - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + ######################################## + # Explore large number of ~10K half solutions + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False, True] + - ThreadTile: + - [ 4, 4 ] + - [ 8, 4 ] + - [ 8, 8 ] + - [ 16, 8 ] + - [ 8, 16 ] + - [ 16, 16 ] + - WorkGroup: + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] + - [ 16, 8, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1,3,5] + - WorkGroupMapping: [1,8,64] + - DepthU: [ 8,16,24,32 ] + - VectorWidth: [4,8] + - GlobalReadVectorWidth: [1,8] + - LdsPadB: [0, -1 ] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: +# Resnet50 TN + - Exact: [ 64 , 64 , 1, 3136 ] # beta= 
1 + - Exact: [ 64 , 256 , 1, 3136 ] # beta= 1 + - Exact: [ 128 , 512 , 1, 784 ] # beta= 1 + - Exact: [ 256 , 64 , 1, 3136 ] # beta= 1 + - Exact: [ 256 , 1024 , 1, 196 ] # beta= 1 + - Exact: [ 512 , 128 , 1, 784 ] # beta= 1 + - Exact: [ 512 , 2048 , 1, 49 ] # beta= 1 + - Exact: [ 1024 , 256 , 1, 196 ] # beta= 1 + - Exact: [ 2048 , 512 , 1, 49 ] # beta= 1 + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nn.yaml new file mode 100644 index 0000000000..9c3bb3f71a --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nn.yaml @@ -0,0 +1,111 @@ +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + CodeFromFiles: 1 + SolutionSelectionAlg: 1 + PrintWinnersOnly: 1 + +BenchmarkProblems: + ######################################## + # NN - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + 
DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + ######################################## + # Explore large number of ~10K solutions + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False, True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 4 ] + - WorkGroup: + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] + - [ 16, 8, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1,4,8] + - WorkGroupMapping: [1,8,64] + - DepthU: [ 8,16,32 ] + - VectorWidth: [1,2,4] + - GlobalReadVectorWidth: [1,-1] + - LdsPadA: [0] + - LdsPadB: [0] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: +# Resnet50 NN + - Exact: [ 784 , 128 , 64, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 64, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 64, 64 ] # beta= 0 + - Exact: [ 784 , 128 , 128, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 128, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 512 , 1, 2048 ] # beta= 0 + - Exact: [ 3136 , 2048 , 1, 512 ] # beta= 0 + - Exact: [ 12544 , 256 , 1, 1024 ] # beta= 0 + - Exact: [ 12544 , 1024 , 1, 256 ] # beta= 0 + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: 
"gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nt.yaml b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nt.yaml new file mode 100644 index 0000000000..b1ab3043a1 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nt.yaml @@ -0,0 +1,115 @@ +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + CodeFromFiles: 1 + SolutionSelectionAlg: 1 + PrintWinnersOnly: 1 + +BenchmarkProblems: + ######################################## + # NT - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + ######################################## + # Explore large number of ~10K solutions + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False, True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 4 ] + - WorkGroup: + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] + - [ 16, 8, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1,3,5] + - WorkGroupMapping: [1,8,64] + - DepthU: [ 8,16,32 ] + - 
VectorWidth: [1,2,4] + - GlobalReadVectorWidth: [1,4] + - LdsPadA: [0, -1 ] + - LdsPadB: [0, -1 ] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: +# Resnet50 NT + - Exact: [ 49 , 512 , 64, 2048 ] # beta= 0 + - Exact: [ 49 , 2048 , 64, 512 ] # beta= 0 + - Exact: [ 196 , 256 , 64, 1024 ] # beta= 0 + - Exact: [ 196 , 1024 , 64, 256 ] # beta= 0 + - Exact: [ 784 , 128 , 64, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 64, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 256 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 256 ] # beta= 0 + - Exact: [ 49 , 512 , 128, 2048 ] # beta= 0 + - Exact: [ 49 , 2048 , 128, 512 ] # beta= 0 + - Exact: [ 196 , 256 , 128, 1024 ] # beta= 0 + - Exact: [ 196 , 1024 , 128, 256 ] # beta= 0 + - Exact: [ 784 , 128 , 128, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 128, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 128, 64 ] # beta= 0 + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_tn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_tn.yaml new file mode 100644 index 0000000000..1da57d40d9 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_tn.yaml @@ -0,0 +1,106 @@ +GlobalParameters: + 
MinimumRequiredVersion: 4.2.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 0 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + CodeFromFiles: 1 + SolutionSelectionAlg: 1 + PrintWinnersOnly: 0 + +BenchmarkProblems: + ######################################## + # TN - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + ######################################## + # Explore large number of ~10K solutions + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False, True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 4 ] + - WorkGroup: + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] + - [ 16, 8, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1,4,8] + - WorkGroupMapping: [1,8,64] + - DepthU: [ 8,16,32 ] + - VectorWidth: [1,2,4] + - GlobalReadVectorWidth: [1,-1] + - LdsPadA: [-1] + - LdsPadB: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: +# Resnet50 TN + - Exact: [ 64 , 64 , 1, 3136 ] # beta= 1 + - Exact: [ 64 , 256 , 1, 3136 ] # beta= 1 + - Exact: [ 128 , 512 , 1, 784 ] # beta= 1 + - Exact: [ 256 , 64 , 1, 3136 ] # beta= 1 + - Exact: [ 256 , 1024 , 1, 196 ] # beta= 1 + - Exact: [ 512 , 128 , 1, 784 ] # beta= 1 + - Exact: [ 512 , 2048 , 1, 49 ] # beta= 1 + - Exact: [ 1024 
, 256 , 1, 196 ] # beta= 1 + - Exact: [ 2048 , 512 , 1, 49 ] # beta= 1 + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml new file mode 100644 index 0000000000..5e394efecf --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml @@ -0,0 +1,1312 @@ +- {MinimumRequiredVersion: 4.3.0} +- vega20 +- gfx906 +- [Device 66a0, Device 66a7] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false +- - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + 
CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsNumElements: 16384 + LdsOffsetA: 0 + LdsOffsetB: 8192 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: 
[3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT256x256x32_DTL0_GRVW08_GSU05_PGR0_PLR1_TT16_16_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + 
LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT032x128x16_DTL0_GRVW08_GSU01_PGR0_PLR1_TT04_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + 
AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 256 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + 
Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT064x256x08_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_08_VW08_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 8 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + 
LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x064x08_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_08_VW08_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + 
VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + 
PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT032x128x16_DTL0_GRVW08_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + 
GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x064x16_DTL0_GRVW08_GSU01_PGR0_PLR1_TT08_04_VW04_WG16_16_01_WGM01 + 
SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + 
NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT064x128x16_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true 
+ GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + 
TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x064x16_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + 
MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT032x128x16_DTL0_GRVW08_GSU01_PGR0_PLR1_TT04_04_VW04_WG08_32_01_WGM64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 +- [2, 3, 0, 1] +- - - [3136, 256, 64, 64] + - [2, 16074.9] + - - [784, 512, 64, 128] + - [2, 15833.4] + - - [49, 2048, 128, 512] + - [0, 0.0] + - - [784, 128, 64, 512] + - [6, 16500.8] + - - [196, 1024, 64, 256] + - [1, 13742.4] + - - [3136, 64, 128, 64] 
+ - [3, 14617.2] + - - [784, 512, 128, 128] + - [2, 16298.0] + - - [196, 1024, 128, 256] + - [8, 14030.9] + - - [196, 256, 64, 1024] + - [4, 14105.5] + - - [3136, 64, 64, 64] + - [5, 13485.4] + - - [3136, 64, 64, 256] + - [7, 16852.6] + - - [3136, 64, 128, 256] + - [5, 17346.9] + - - [784, 128, 128, 512] + - [6, 16988.5] + - - [49, 2048, 64, 512] + - [0, 0.0] + - - [196, 256, 128, 1024] + - [4, 14648.5] + - - [49, 512, 64, 2048] + - [0, 0.0] + - - [49, 512, 128, 2048] + - [0, 0.0] + - - [3136, 256, 128, 64] + - [2, 16431.6] +- null diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 0000000000..fd8f6eefe3 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,1467 @@ +- {MinimumRequiredVersion: 4.3.0} +- vega20 +- gfx906 +- [Device 66a0, Device 66a7] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false +- - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + 
EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + 
NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + 
LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 
+ AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + 
HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 
+ LVPB: 4 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + 
ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 
+ PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW02_WG16_16_01_WGM64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + 
GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: 
true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + 
MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: 
false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + 
IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_16_01_WGM64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true 
+ LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + 
fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + 
ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM08 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 +- [2, 3, 0, 1] +- - - [3136, 256, 64, 64] + - [1, 8023.34] + - - [784, 512, 64, 128] + - [0, 8118.42] + - - [49, 2048, 128, 512] + - [5, 6709.69] + - - [784, 128, 64, 512] + - [7, 8457.53] + - - [196, 1024, 64, 256] + - [8, 7259.55] + - - [3136, 64, 128, 64] + - [6, 7772.99] + - - [784, 512, 128, 128] + - [0, 8225.65] + - - [196, 1024, 128, 256] + - [9, 7369.53] + - - [196, 256, 64, 1024] + - [2, 7306.01] + - - [3136, 64, 64, 64] + - [6, 7555.78] + - - [3136, 64, 64, 256] + - [1, 8713.92] + - - [3136, 64, 128, 256] + - [1, 8912.31] + - - [784, 128, 128, 512] + - [0, 8609.88] + - - [49, 2048, 64, 512] + - [4, 6575.52] + - - [196, 256, 128, 1024] + - [2, 7483.58] + - - [49, 512, 64, 2048] + - [3, 6562.92] + - - [49, 512, 128, 2048] + - [1, 6808.05] + - - [3136, 256, 128, 64] + - [1, 8120.83] +- null diff --git 
a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_HB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_HB.yaml new file mode 100644 index 0000000000..30f0f3e969 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_HB.yaml @@ -0,0 +1,1443 @@ +- {MinimumRequiredVersion: 4.3.0} +- vega20 +- gfx906 +- [Device 66a0, Device 66a7] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 
+ LVPA: 2 + LVPB: 16 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_DTL0_GRVW08_GSU01_LPB00_PGR1_PLR1_TT08_08_VW08_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + 
ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + 
PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_DTL0_GRVW08_GSU01_LPB00_PGR1_PLR1_TT08_08_VW08_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + 
GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 
+ TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x032x16_DTL0_GRVW08_GSU01_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_08_02_WGM64 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + 
MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x16_DTL0_GRVW08_GSU01_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: 
false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + 
IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x16_DTL0_GRVW08_GSU01_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdsNumElements: 6272 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: 
true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x32_DTL0_GRVW08_GSU05_LPB04_PGR0_PLR1_TT08_04_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - 
AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 4 + LVPB: 16 + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + 
Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT064x128x16_DTL0_GRVW08_GSU01_LPB00_PGR0_PLR1_TT08_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 32 + LdsNumElements: 6720 + LdsNumElementsAlignedA: 2048 + 
LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 8 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x064x08_DTL0_GRVW08_GSU01_LPB08_PGR1_PLR1_TT08_08_VW08_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true 
+ VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 4 + LVPB: 16 + LdsNumElements: 3136 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: false + 
ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT064x128x16_DTL0_GRVW08_GSU01_LPB04_PGR0_PLR0_TT08_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + 
GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsNumElements: 3136 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x16_DTL0_GRVW08_GSU01_LPB04_PGR0_PLR1_TT08_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [8, 4] + ThreadTile0: 
8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 +- [2, 3, 0, 1] +- - - [3136, 64, 128, 64] + - [7, 14524.3] + - - [784, 512, 64, 128] + - [6, 15406.1] + - - [3136, 256, 64, 64] + - [0, 15900.8] + - - [784, 128, 128, 512] + - [0, 15815.2] + - - [784, 128, 64, 512] + - [1, 15050.8] + - - [3136, 512, 1, 2048] + - [5, 14833.5] + - - [12544, 256, 1, 1024] + - [2, 15072.8] + - - [3136, 64, 128, 256] + - [9, 17266.8] + - - [3136, 64, 64, 256] + - [3, 16825.1] + - - [3136, 2048, 1, 512] + - [4, 16285.1] + - - [784, 512, 128, 128] + - [8, 15812.1] + - - [3136, 64, 64, 64] + - [7, 13432.6] + - - [12544, 1024, 1, 256] + - [3, 16901.1] + - - [3136, 256, 128, 64] + - [0, 16310.9] +- null diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_SB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 0000000000..3a7cd4990c --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,1465 @@ +- {MinimumRequiredVersion: 4.4.0} +- vega20 +- gfx906 +- [Device 66a0, Device 66a7] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false 
+ TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + 
AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW02_GSU04_PGR1_PLR1_TT04_04_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: &id004 [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: &id002 [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + 
GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 1 + SolutionNameMin: 
Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: &id003 [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 32 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 
+ NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: &id001 [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + 
EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + 
NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + 
LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG32_08_01_WGM08 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: &id005 [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + 
fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: false + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: 
false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR0_PLR0_TT04_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id004 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + 
LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id004 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + 
ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + 
PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG32_08_01_WGM01 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: *id003 + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id005 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + 
GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + 
TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 
+ MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: *id001 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id002 + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 +- [2, 3, 0, 1] +- - - [3136, 64, 128, 64] + - [5, 7734.96] + - - [784, 512, 64, 128] + - [2, 7939.64] + - - [3136, 256, 64, 64] + - [5, 7970.44] + - - [784, 128, 128, 512] + - [9, 7502.7] + - - [784, 128, 64, 512] + - [8, 
7371.52] + - - [3136, 512, 1, 2048] + - [0, 6769.36] + - - [12544, 256, 1, 1024] + - [8, 7289.15] + - - [3136, 64, 128, 256] + - [3, 8635.2] + - - [3136, 64, 64, 256] + - [3, 8461.01] + - - [3136, 2048, 1, 512] + - [7, 7924.34] + - - [784, 512, 128, 128] + - [1, 8065.06] + - - [3136, 64, 64, 64] + - [6, 7408.73] + - - [12544, 1024, 1, 256] + - [4, 8637.02] + - - [3136, 256, 128, 64] + - [5, 8065.85] +- null diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_HB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_HB.yaml new file mode 100644 index 0000000000..7758863d15 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_HB.yaml @@ -0,0 +1,1028 @@ +- {MinimumRequiredVersion: 4.3.0} +- vega20 +- gfx906 +- [Device 66a0, Device 66a7] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: 
true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsNumElements: 16512 + LdsOffsetA: 0 + LdsOffsetB: 8192 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + 
TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x32_GRVW08_GSU05_LPB04_PGR0_PLR1_TT16_16_VW04_WG16_16_01_WGM08 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 24 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 24 + LSCB: 24 + LSPA: 64 + LSPB: 16 + LVCA: 3 + LVCB: 12 + LVPA: 8 + LVPB: 8 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 3584 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 6 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + 
MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x24_GRVW08_GSU05_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: 
false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + 
IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x032x16_GRVW08_GSU01_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM64 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: 
true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x32_GRVW08_GSU05_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM08 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + 
fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 5 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true 
+ Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x32_GRVW08_GSU05_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + 
KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x032x16_GRVW08_GSU01_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + 
SubGroupB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 2 + AssertSummationElementMultiple: 2 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 3 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 4 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + 
NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x32_GRVW08_GSU03_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 2 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 +- [2, 3, 0, 1] +- - - [512, 2048, 1, 49] + - [0, 0.0] + - - [512, 128, 1, 784] + - [6, 4014.08] + - - [2048, 512, 1, 49] + - [0, 0.0] + - - [1024, 256, 1, 196] + - [2, 5179.46] + - - [256, 64, 1, 3136] + - [1, 3243.7] + - - [256, 1024, 1, 196] + - [5, 5264.37] + - - [64, 256, 1, 3136] + - [4, 3227.3] + - - [128, 512, 1, 784] + - [6, 4170.47] + - - [64, 64, 1, 3136] + - [3, 810.925] +- null diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_SB.yaml 
b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 0000000000..fb602a76e4 --- /dev/null +++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,1179 @@ +- {MinimumRequiredVersion: 4.4.0} +- vega20 +- gfx906 +- [Device 66a0, Device 66a7] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false +- - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + 
LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: &id001 [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + 
UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: &id002 [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsNumElements: 6752 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 1 + LdsPadB: 1 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + 
PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_VW01_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + 
GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsNumElements: 3456 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + 
TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + 
MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: &id003 [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + 
CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 
2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM64 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: *id003 + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdsNumElements: 3664 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + 
LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x08_GRVW02_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_VW02_WG32_08_01_WGM64 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + 
VectorWidth: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2112 + LdsNumElementsAlignedB: 576 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2112 + LdsOffsetB_Blk: 6208 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + 
PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW02_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_04_VW02_WG16_04_04_WGM01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + ThreadTile: *id001 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: *id002 + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AssertFree0ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + 
GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsNumElements: 3424 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 320 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 3136 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PreciseBoundsCheck: false + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + 
SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x08_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 +- [2, 3, 0, 1] +- - - [512, 2048, 1, 49] + - [7, 3733.89] + - - [512, 128, 1, 784] + - [6, 2906.12] + - - [2048, 512, 1, 49] + - [5, 3528.86] + - - [1024, 256, 1, 196] + - [4, 3691.11] + - - [256, 64, 1, 3136] + - [1, 2643.02] + - - [256, 1024, 1, 196] + - [3, 3964.52] + - - [64, 256, 1, 3136] + - [1, 2732.99] + - - [128, 512, 1, 784] + - [0, 3132.94] + - - [64, 64, 1, 3136] + - [2, 1016.18] +- null diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml b/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml index 5e59af509b..c0f6ebcf0f 100644 --- a/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml +++ b/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml @@ -1,7 +1,7 @@ LibraryLogic: ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml b/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml new file mode 100644 index 0000000000..9d77fd954c --- /dev/null +++ b/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml @@ -0,0 +1,23 @@ + +LibraryLogic: + ScheduleName: "vega20" + DeviceNames: ["Device 66a0", 
"Device 66a1", "Device 66a7"] + ArchitectureName: "gfx906" + +# ScheduleName: "vega10" +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] +# ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/miopen/make_all.sh b/Tensile/Configs/miopen/make_all.sh index a431303696..c50511caba 100755 --- a/Tensile/Configs/miopen/make_all.sh +++ b/Tensile/Configs/miopen/make_all.sh @@ -2,3 +2,6 @@ make TYPE=hgemm make TYPE=sgemm #make TYPE=hgemm SOLUTION_SKINNY=solutions/hgemm_quick.yml #make TYPE=sgemm =solutions/hgemm_quick.yml +##make SCHED=vega20 TYPE=hgemm +##make SCHED=vega20 TYPE=sgemm +##make SCHED=vega20 TYPE=sgemm EXPLORE_LEVEL=7 diff --git a/Tensile/Configs/miopen/problems/nn/resnet50_all.yml b/Tensile/Configs/miopen/problems/nn/resnet50_all.yml new file mode 100644 index 0000000000..0682508008 --- /dev/null +++ b/Tensile/Configs/miopen/problems/nn/resnet50_all.yml @@ -0,0 +1,15 @@ +# Resnet50 NN + - Exact: [ 784 , 128 , 64, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 64, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 64, 64 ] # beta= 0 + - Exact: [ 784 , 128 , 128, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 128, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 512 , 1, 2048 ] # beta= 0 + - Exact: [ 3136 , 2048 , 1, 512 ] # beta= 0 + - Exact: [ 12544 , 256 , 1, 1024 ] # beta= 0 + - Exact: [ 12544 , 1024 , 1, 256 ] # beta= 0 diff --git 
a/Tensile/Configs/miopen/problems/nt/resnet50_all.yml b/Tensile/Configs/miopen/problems/nt/resnet50_all.yml new file mode 100644 index 0000000000..42d15bc1af --- /dev/null +++ b/Tensile/Configs/miopen/problems/nt/resnet50_all.yml @@ -0,0 +1,19 @@ +# Resnet50 NT + - Exact: [ 49 , 512 , 64, 2048 ] # beta= 0 + - Exact: [ 49 , 2048 , 64, 512 ] # beta= 0 + - Exact: [ 196 , 256 , 64, 1024 ] # beta= 0 + - Exact: [ 196 , 1024 , 64, 256 ] # beta= 0 + - Exact: [ 784 , 128 , 64, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 64, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 256 , 64, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 64, 256 ] # beta= 0 + - Exact: [ 49 , 512 , 128, 2048 ] # beta= 0 + - Exact: [ 49 , 2048 , 128, 512 ] # beta= 0 + - Exact: [ 196 , 256 , 128, 1024 ] # beta= 0 + - Exact: [ 196 , 1024 , 128, 256 ] # beta= 0 + - Exact: [ 784 , 128 , 128, 512 ] # beta= 0 + - Exact: [ 784 , 512 , 128, 128 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 64 ] # beta= 0 + - Exact: [ 3136 , 64 , 128, 256 ] # beta= 0 + - Exact: [ 3136 , 256 , 128, 64 ] # beta= 0 diff --git a/Tensile/Configs/miopen/problems/tn/resnet50_all.yml b/Tensile/Configs/miopen/problems/tn/resnet50_all.yml new file mode 100644 index 0000000000..0dc47be699 --- /dev/null +++ b/Tensile/Configs/miopen/problems/tn/resnet50_all.yml @@ -0,0 +1,10 @@ +# Resnet50 TN + - Exact: [ 64 , 64 , 1, 3136 ] # beta= 1 + - Exact: [ 64 , 256 , 1, 3136 ] # beta= 1 + - Exact: [ 128 , 512 , 1, 784 ] # beta= 1 + - Exact: [ 256 , 64 , 1, 3136 ] # beta= 1 + - Exact: [ 256 , 1024 , 1, 196 ] # beta= 1 + - Exact: [ 512 , 128 , 1, 784 ] # beta= 1 + - Exact: [ 512 , 2048 , 1, 49 ] # beta= 1 + - Exact: [ 1024 , 256 , 1, 196 ] # beta= 1 + - Exact: [ 2048 , 512 , 1, 49 ] # beta= 1 diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml b/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml index 343e6c3c01..efae0606c4 100644 --- a/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml +++ 
b/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml @@ -20,9 +20,10 @@ - DepthU: [ 16, 24, 32 ] - VectorWidth: [8] - GlobalReadVectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] BenchmarkForkParameters: JoinParameters: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml b/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml index 343e6c3c01..efae0606c4 100644 --- a/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml +++ b/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml @@ -20,9 +20,10 @@ - DepthU: [ 16, 24, 32 ] - VectorWidth: [8] - GlobalReadVectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] BenchmarkForkParameters: JoinParameters: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/hgemm_quick.yml b/Tensile/Configs/miopen/solutions/hgemm_quick.yml index 1885450c33..d5ff1be3da 100644 --- a/Tensile/Configs/miopen/solutions/hgemm_quick.yml +++ b/Tensile/Configs/miopen/solutions/hgemm_quick.yml @@ -1,7 +1,7 @@ ######################################## # Explore small number of half solns ######################################## - - # NN workloads + - # Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -23,9 +23,10 @@ - WorkGroupMapping: [8] - DepthU: [ 16 ] - VectorWidth: [2,8] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] BenchmarkForkParameters: JoinParameters: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml index 5a4086f5bd..b1bfb16316 100644 --- a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml +++ 
b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml @@ -1,7 +1,7 @@ ######################################## # Explore large number of ~10K half solutions ######################################## - - # NN workloads + - # Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -18,21 +18,21 @@ - [ 8, 16 ] - [ 16, 16 ] - WorkGroup: - #- [ 16, 8, 2 ] LSU broken for Half? - #- [ 16, 4, 4 ] + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] - [ 16, 8, 1 ] - [ 8, 32, 1 ] - [ 16, 16, 1 ] - [ 32, 8, 1 ] - #- GlobalSplitU: [1,3,5] - - GlobalSplitU: [1] + - GlobalSplitU: [1,3,5] - WorkGroupMapping: [1,8] - DepthU: [ 16,32 ] - VectorWidth: [8] - GlobalReadVectorWidth: [1,8] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] BenchmarkForkParameters: JoinParameters: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml index 263fd76470..9f5c49dc5a 100644 --- a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml +++ b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml @@ -1,7 +1,7 @@ ######################################## # Explore large number of ~10K half solutions ######################################## - - # NN workloads + - # Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -19,22 +19,22 @@ - [ 8, 16 ] - [ 16, 16 ] - WorkGroup: - #- [ 16, 8, 2 ] LSU broken for Half? 
- #- [ 16, 4, 4 ] + - [ 16, 8, 2 ] + - [ 16, 4, 4 ] - [ 16, 8, 1 ] - [ 8, 32, 1 ] - [ 16, 16, 1 ] - [ 32, 8, 1 ] - #- GlobalSplitU: [1,3,5] - - GlobalSplitU: [1] + - GlobalSplitU: [1,3,5] - WorkGroupMapping: [1,8,64] - - DepthU: [ 8,16,32 ] + - DepthU: [ 8,16,24,32 ] - VectorWidth: [4,8] - GlobalReadVectorWidth: [1,8] - LdsPadB: [0, -1 ] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] BenchmarkForkParameters: JoinParameters: BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml b/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml index 74cf6bd9f4..ae53569327 100644 --- a/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml +++ b/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml @@ -25,4 +25,3 @@ BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml b/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml index 4914674343..5c84c18c20 100644 --- a/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml +++ b/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml @@ -27,4 +27,3 @@ BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml b/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml new file mode 100644 index 0000000000..5c84c18c20 --- /dev/null +++ b/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml @@ -0,0 +1,29 @@ + # Explore set of parms appropriate for large matrixes with large tiles: + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + - GlobalSplitU: [1] + ForkParameters: + - PrefetchGlobalRead: [False, True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 6, 8 ] + - [ 8, 4 ] + - [ 8, 6 
] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - WorkGroupMapping: [1, 8, 64] + - DepthU: [ 16, 24, 32 ] + - VectorWidth: [-1] + - GlobalReadVectorWidth: [1,4] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: diff --git a/Tensile/Configs/miopen/solutions/sgemm_quick.yml b/Tensile/Configs/miopen/solutions/sgemm_quick.yml index 91be1807ce..d269364b36 100644 --- a/Tensile/Configs/miopen/solutions/sgemm_quick.yml +++ b/Tensile/Configs/miopen/solutions/sgemm_quick.yml @@ -1,7 +1,7 @@ ######################################## # Explore small number of half solns ######################################## - - # NN workloads + - # Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -25,4 +25,3 @@ BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml index 4bad8e3cea..ca3b73da28 100644 --- a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml +++ b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml @@ -1,7 +1,7 @@ ######################################## # Explore large number of ~10K solutions ######################################## - - # NN workloads + - # Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -32,4 +32,3 @@ BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml index c8bd8eb4b5..b3e3c533e0 100644 --- a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml +++ b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml @@ -1,7 +1,7 @@ ######################################## # Explore large number of ~10K solutions ######################################## - - # NN workloads + - # 
Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -35,4 +35,3 @@ BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml index fba6e29558..c1c43f6eca 100644 --- a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml +++ b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml @@ -1,7 +1,7 @@ ######################################## # Explore large number of ~10K solutions ######################################## - - # NN workloads + - # Benchmark Group InitialSolutionParameters: BenchmarkCommonParameters: - EdgeType: ["ShiftPtr"] @@ -34,4 +34,3 @@ BenchmarkJoinParameters: BenchmarkFinalParameters: - ProblemSizes: - diff --git a/Tensile/Configs/miopen/types/hgemm_nt.yml b/Tensile/Configs/miopen/types/hgemm_nt.yml index 96920277b9..2289690eca 100644 --- a/Tensile/Configs/miopen/types/hgemm_nt.yml +++ b/Tensile/Configs/miopen/types/hgemm_nt.yml @@ -1,6 +1,6 @@ BenchmarkProblems: ######################################## - # NN - standard + # NT - standard ######################################## - - # ProblemType diff --git a/Tensile/Configs/miopen/types/hgemm_tn.yml b/Tensile/Configs/miopen/types/hgemm_tn.yml index dfaa22f225..b1fa3a944e 100644 --- a/Tensile/Configs/miopen/types/hgemm_tn.yml +++ b/Tensile/Configs/miopen/types/hgemm_tn.yml @@ -1,6 +1,6 @@ BenchmarkProblems: ######################################## - # NN - standard + # TN - standard ######################################## - - # ProblemType diff --git a/Tensile/Configs/miopen/types/hgemm_tt.yml b/Tensile/Configs/miopen/types/hgemm_tt.yml index 7581d52f39..fc655d9ed6 100644 --- a/Tensile/Configs/miopen/types/hgemm_tt.yml +++ b/Tensile/Configs/miopen/types/hgemm_tt.yml @@ -1,6 +1,6 @@ BenchmarkProblems: ######################################## - # NN - standard + # TT - standard 
######################################## - - # ProblemType diff --git a/Tensile/Configs/miopen/types/sgemm_nt.yml b/Tensile/Configs/miopen/types/sgemm_nt.yml index 08931341ed..1dd119538e 100644 --- a/Tensile/Configs/miopen/types/sgemm_nt.yml +++ b/Tensile/Configs/miopen/types/sgemm_nt.yml @@ -1,6 +1,6 @@ BenchmarkProblems: ######################################## - # NN - standard + # NT - standard ######################################## - - # ProblemType diff --git a/Tensile/Configs/miopen/types/sgemm_tn.yml b/Tensile/Configs/miopen/types/sgemm_tn.yml index dccba30c73..59c783fa09 100644 --- a/Tensile/Configs/miopen/types/sgemm_tn.yml +++ b/Tensile/Configs/miopen/types/sgemm_tn.yml @@ -1,6 +1,6 @@ BenchmarkProblems: ######################################## - # NN - standard + # TN - standard ######################################## - - # ProblemType diff --git a/Tensile/Configs/miopen/types/sgemm_tt.yml b/Tensile/Configs/miopen/types/sgemm_tt.yml index 403fffc475..fbcaf50a2d 100644 --- a/Tensile/Configs/miopen/types/sgemm_tt.yml +++ b/Tensile/Configs/miopen/types/sgemm_tt.yml @@ -1,6 +1,6 @@ BenchmarkProblems: ######################################## - # NN - standard + # TT - standard ######################################## - - # ProblemType diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml index a3f1f9e583..c0a6f4511c 100644 --- a/Tensile/Configs/rocblas_cgemm.yaml +++ b/Tensile/Configs/rocblas_cgemm.yaml @@ -178,7 +178,7 @@ BenchmarkProblems: LibraryLogic: ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_dgemm_asm_full.yaml b/Tensile/Configs/rocblas_dgemm_asm_full.yaml 
index b6edf01851..45c1544710 100644 --- a/Tensile/Configs/rocblas_dgemm_asm_full.yaml +++ b/Tensile/Configs/rocblas_dgemm_asm_full.yaml @@ -45,7 +45,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -71,7 +70,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -99,7 +97,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -156,7 +153,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -180,7 +176,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] ForkParameters: - PrefetchGlobalRead: [True] - ThreadTile: @@ -212,7 +207,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -240,7 +234,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -297,7 +290,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -323,7 +315,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -363,7 +354,6 @@ BenchmarkProblems: - 
WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -389,7 +379,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -408,11 +397,11 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega20" -# DeviceNames: ["Device 66a0", "Device 66a7"] +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] # ArchitectureName: "gfx906" ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_dgemm_asm_lite.yaml b/Tensile/Configs/rocblas_dgemm_asm_lite.yaml index c24fd9e820..bdf4ae1e66 100644 --- a/Tensile/Configs/rocblas_dgemm_asm_lite.yaml +++ b/Tensile/Configs/rocblas_dgemm_asm_lite.yaml @@ -66,7 +66,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -115,7 +114,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -176,7 +174,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -225,7 +222,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: 
@@ -287,7 +283,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -336,7 +331,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -397,7 +391,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -446,7 +439,6 @@ BenchmarkProblems: - WorkGroupMapping: [8] - PrefetchLocalRead: [True] - PrefetchGlobalRead: [True] - - PreciseBoundsCheck: [False] - VectorWidth: [-1] ForkParameters: - ThreadTile: @@ -466,11 +458,11 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega20" -# DeviceNames: ["Device 66a0", "Device 66a7"] +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] # ArchitectureName: "gfx906" ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_dgemm_hip_lite.yaml b/Tensile/Configs/rocblas_dgemm_hip_lite.yaml index c818484ad7..080601a7f1 100644 --- a/Tensile/Configs/rocblas_dgemm_hip_lite.yaml +++ b/Tensile/Configs/rocblas_dgemm_hip_lite.yaml @@ -164,7 +164,7 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega10" -# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega 
[Radeon RX Vega]"] # ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_hgemm_asm_full.yaml b/Tensile/Configs/rocblas_hgemm_asm_full.yaml index 3887d18331..81c6d43548 100644 --- a/Tensile/Configs/rocblas_hgemm_asm_full.yaml +++ b/Tensile/Configs/rocblas_hgemm_asm_full.yaml @@ -47,7 +47,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchLocalRead: [True] - WorkGroupMapping: [1] ForkParameters: @@ -134,7 +133,6 @@ BenchmarkProblems: # - EdgeType: ["ShiftPtr"] # - LoopTail: [True] # - KernelLanguage: ["Assembly"] -# - PreciseBoundsCheck: [False] # - PrefetchGlobalRead: [True] # - PrefetchLocalRead: [True] # - WorkGroupMapping: [1] @@ -186,7 +184,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] ForkParameters: @@ -298,7 +295,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchLocalRead: [True] - GlobalSplitU: [1] ForkParameters: @@ -422,7 +418,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - WorkGroupMapping: [1] @@ -456,7 +451,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [ True ] - PrefetchLocalRead: [ True ] ForkParameters: @@ -505,7 +499,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] ForkParameters: - PrefetchGlobalRead: [ False, True ] - PrefetchLocalRead: [ False] @@ -632,7 +625,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - 
PreciseBoundsCheck: [False] - PrefetchLocalRead: [True] - WorkGroupMapping: [1] ForkParameters: @@ -684,7 +676,6 @@ BenchmarkProblems: # - EdgeType: ["ShiftPtr"] # - LoopTail: [True] # - KernelLanguage: ["Assembly"] -# - PreciseBoundsCheck: [False] # - PrefetchGlobalRead: [True] # - PrefetchLocalRead: [True] # - WorkGroupMapping: [1] @@ -730,7 +721,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] ForkParameters: @@ -804,7 +794,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - GlobalSplitU: [1] @@ -923,7 +912,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - WorkGroupMapping: [1] @@ -1040,7 +1028,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] ForkParameters: @@ -1084,7 +1071,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - GlobalSplitU: [1] @@ -1128,7 +1114,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - WorkGroupMapping: [1] @@ -1245,7 +1230,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] ForkParameters: @@ -1287,7 +1271,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - 
KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchLocalRead: [True] - GlobalSplitU: [1] ForkParameters: @@ -1324,7 +1307,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - WorkGroupMapping: [-1] @@ -1420,11 +1402,11 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega20" -# DeviceNames: ["Device 66a0", "Device 66a7"] +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] # ArchitectureName: "gfx906" ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_hgemm_asm_lite.yaml b/Tensile/Configs/rocblas_hgemm_asm_lite.yaml index fa2de7c5e4..d878511989 100644 --- a/Tensile/Configs/rocblas_hgemm_asm_lite.yaml +++ b/Tensile/Configs/rocblas_hgemm_asm_lite.yaml @@ -305,11 +305,11 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega20" -# DeviceNames: ["Device 66a0", "Device 66a7"] +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] # ArchitectureName: "gfx906" ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_hgemm_hip_lite.yaml b/Tensile/Configs/rocblas_hgemm_hip_lite.yaml index bfcd3192a6..9daba38670 100644 --- a/Tensile/Configs/rocblas_hgemm_hip_lite.yaml +++ 
b/Tensile/Configs/rocblas_hgemm_hip_lite.yaml @@ -275,7 +275,7 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega10" -# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] # ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_asm_full.yaml deleted file mode 100644 index 4450bbe3b9..0000000000 --- a/Tensile/Configs/rocblas_hpa_hgemm_asm_full.yaml +++ /dev/null @@ -1,304 +0,0 @@ -GlobalParameters: - MinimumRequiredVersion: 4.4.0 - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - CMakeBuildType: Release - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - LibraryPrintDebug: False - NumElementsToValidate: 0 - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - Platform: 0 - Device: 0 - KernelTime: True - PinClocks: True - SleepPercent: 200 - DataInitTypeBeta : 0 - -BenchmarkProblems: - - ######################################## - # NN - ######################################## - - - - # ProblemType - OperationType: GEMM - DataType: h - HighPrecisionAccumulate: True - TransposeA: False - TransposeB: False - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 2, 4 ] - - [ 4, 8 ] - - [ 16, 8 ] - - WorkGroup: - - [ 32, 4, 1 ] - - [ 8, 8, 1 ] - - DepthU: [8] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - 
BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 4, 2 ] - - [ 4, 8 ] - - [ 16, 16 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [16] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - ######################################## - # NT - ######################################## - - - - # ProblemType - OperationType: GEMM - DataType: h - HighPrecisionAccumulate: True - TransposeA: False - TransposeB: True - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 4, 2 ] - - [ 4, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 32, 4, 1 ] - - DepthU: [8] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 8, 2 ] - - [ 2, 8 ] - - [ 16, 2 ] - - [ 2, 16 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [16] - - VectorWidth: [-1] 
- BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - -# ######################################## -# # TN -# ######################################## - - - - # ProblemType - OperationType: GEMM - DataType: h - HighPrecisionAccumulate: True - TransposeA: True - TransposeB: False - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 4, 2 ] - - [ 4, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 32, 4, 1 ] - - DepthU: [8] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 8, 2 ] - - [ 2, 8 ] - - [ 16, 2 ] - - [ 2, 16 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [16] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - -# ######################################## -# # TT - standard -# ######################################## - - - - # ProblemType - OperationType: GEMM - DataType: h - HighPrecisionAccumulate: True - TransposeA: True - TransposeB: True - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - 
BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 16, 4 ] - - [ 16, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [32] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 8, 2 ] - - [ 2, 2 ] - - [ 4, 2 ] - - [ 8, 4 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [16] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - -LibraryLogic: - ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] - ArchitectureName: "gfx900" - -# ScheduleName: "mi25" -# DeviceNames: ["Device 6860"] -# ArchitectureName: "gfx900" - -# ScheduleName: "r9nano" -# DeviceNames: ["Device 7300"] -# ArchitectureName: "gfx803" - -# ScheduleName: "hip" -# DeviceNames: ["Device 0000"] -# ArchitectureName: "fallback" - -LibraryClient: diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml b/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml index 4450bbe3b9..8edb7d64dd 100644 --- a/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml +++ b/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml @@ -286,7 +286,7 @@ BenchmarkProblems: LibraryLogic: ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", 
"Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml b/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml index 619a1c63ee..cf75d26477 100644 --- a/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml +++ b/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml @@ -43,15 +43,15 @@ BenchmarkProblems: - KernelLanguage: ["Source"] ForkParameters: - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] - ThreadTile: - - [ 2, 4 ] + - [ 4, 2 ] - [ 4, 8 ] - - [ 16, 8 ] + - [ 8, 8 ] - WorkGroup: + - [ 16, 16, 1 ] - [ 32, 4, 1 ] - - [ 8, 8, 1 ] - DepthU: [8] - VectorWidth: [-1] BenchmarkForkParameters: @@ -69,13 +69,13 @@ BenchmarkProblems: ForkParameters: - KernelLanguage: ["Source"] - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] - ThreadTile: - - [ 4, 2 ] - - [ 4, 8 ] - - [ 16, 16 ] - - [ 8, 8 ] + - [ 8, 2 ] + - [ 2, 8 ] + - [ 16, 2 ] + - [ 2, 16 ] - WorkGroup: - [ 16, 16, 1 ] - [ 8, 8, 1 ] @@ -96,13 +96,13 @@ BenchmarkProblems: ForkParameters: - KernelLanguage: ["Source"] - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] - ThreadTile: - - [ 4, 2 ] - - [ 4, 8 ] - - [ 16, 16 ] - - [ 8, 8 ] + - [ 8, 2 ] + - [ 2, 8 ] + - [ 16, 2 ] + - [ 2, 16 ] - WorkGroup: - [ 16, 16, 1 ] - [ 8, 8, 1 ] @@ -405,7 +405,7 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega10" -# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] +# DeviceNames: ["Device 6863", "Device 6862", 
"Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] # ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml new file mode 100644 index 0000000000..97df90c3e9 --- /dev/null +++ b/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml @@ -0,0 +1,628 @@ +GlobalParameters: + MinimumRequiredVersion: 4.4.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 256 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + ExitOnFails: 0 + +BenchmarkProblems: + + ######################################## + ######################################## + ### + ### NN + ### + ######################################## + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + ######################################## + # NN - Super Skinny + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [1] + ForkParameters: + - PrefetchGlobalRead: [False, True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 2 ] + - [ 8, 2 ] + - [ 4, 4 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 8, 16, 1 ] + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - [ 32, 4, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 32, 64 ] + - VectorWidth: [-1] + - 
AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 512, 1, 1, 500000 ] + - Exact: [ 512, 2, 1, 500000 ] + - Exact: [ 512, 4, 1, 500000 ] + - Exact: [ 512, 8, 1, 500000 ] + - Exact: [ 512, 16, 1, 500000 ] + - Exact: [ 1024, 1, 1, 500000 ] + - Exact: [ 1024, 2, 1, 500000 ] + - Exact: [ 1024, 4, 1, 500000 ] + - Exact: [ 1024, 8, 1, 500000 ] + - Exact: [ 1024, 16, 1, 500000 ] + - Exact: [ 64, 1, 1, 1216 ] + - Exact: [ 128, 1, 1, 1024 ] + - Exact: [ 128, 1, 1, 1408 ] + - Exact: [ 512, 1, 1, 512 ] + - Exact: [ 3072, 1, 1, 128 ] + - Exact: [ 512, 2, 1, 512 ] + - Exact: [ 1024, 1, 1, 512 ] + - Exact: [ 4224, 1, 1, 128 ] + - Exact: [ 512, 4, 1, 512 ] + - Exact: [ 1024, 2, 1, 512 ] + - Exact: [ 1024, 4, 1, 512 ] + - Exact: [ 3072, 1, 1, 1024 ] + - Exact: [ 3072, 1, 1, 1024 ] + - Exact: [ 512, 16, 1, 512 ] + - Exact: [ 3072, 2, 1, 1024 ] + - Exact: [ 4608, 1, 1, 1536 ] + - Exact: [ 1024, 16, 1, 512 ] + - Exact: [ 3072, 4, 1, 1024 ] + - Exact: [ 4608, 2, 1, 1536 ] + - Exact: [ 6144, 1, 1, 2560 ] + - Exact: [ 7680, 1, 1, 2560 ] + - Exact: [ 8448, 1, 1, 2816 ] + - Exact: [ 4608, 4, 1, 1536 ] + - Exact: [ 6144, 2, 1, 2560 ] + - Exact: [ 7680, 2, 1, 2560 ] + - Exact: [ 8448, 2, 1, 2816 ] + - Exact: [ 1760, 16, 1, 1760 ] + - Exact: [ 3072, 16, 1, 1024 ] + - Exact: [ 6144, 4, 1, 2560 ] + - Exact: [ 2048, 16, 1, 2048 ] + - Exact: [ 7680, 4, 1, 2560 ] + - Exact: [ 8448, 4, 1, 2816 ] + - Exact: [ 2560, 16, 1, 2560 ] + - Exact: [ 4608, 16, 1, 1536 ] + - Exact: [ 6144, 16, 1, 2560 ] + - Exact: [ 4096, 16, 1, 4096 ] + - Exact: [ 7680, 16, 1, 2560 ] + - Exact: [ 8448, 16, 1, 2816 ] + +# ######################################## +# # NN - 3 Waves/WG +# ######################################## +# - # Benchmark Group +# InitialSolutionParameters: +# BenchmarkCommonParameters: +# - EdgeType: ["ShiftPtr"] +# - LoopTail: [True] +# - 
KernelLanguage: ["Assembly"] +# - PrefetchGlobalRead: [True] +# - PrefetchLocalRead: [True] +# - WorkGroupMapping: [1] +# ForkParameters: +# - ThreadTile: +# - [ 3, 3 ] +# - [ 3, 4 ] +# - [ 3, 6 ] +# - [ 4, 3 ] +# - [ 4, 4 ] +# - [ 4, 6 ] +# - [ 6, 3 ] +# - [ 6, 4 ] +# - [ 6, 6 ] +# - [ 8, 8 ] +# - WorkGroup: +# - [ 24, 8, 1 ] +# - [ 12, 16, 1 ] +# - [ 6, 32, 1 ] +# - [ 8, 8, 1 ] +# - [ 8, 24, 1 ] +# - [ 16, 16, 1 ] +# - GlobalSplitU: [1] +# - DepthU: [ 8, 16, 24, 32, 64 ] +# - VectorWidth: [ 2, 4, 8 ] +# - AssertSummationElementMultiple: [2] +# - AssertFree0ElementMultiple: [2] +# BenchmarkForkParameters: +# JoinParameters: +# BenchmarkJoinParameters: +# BenchmarkFinalParameters: +# - ProblemSizes: +# - Exact: [ 35, 700, 1, 2048 ] +# - Exact: [ 35, 700, 1, 2048 ] +# - Exact: [ 35, 700, 1, 2560 ] +# - Exact: [ 35, 1500, 1, 2048 ] +# - Exact: [ 35, 1500, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 1760 ] +# - Exact: [ 35, 8457, 1, 2048 ] +# - Exact: [ 35, 8457, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 4096 ] + + ######################################## + # NN - Small or Skinny + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 2 ] + - [ 4, 4 ] + - [ 8, 4 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 32, 8, 1 ] + - [ 16, 4, 1 ] + - [ 32, 4, 1 ] + - WorkGroupMapping: [1, 8] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64, 128], [1], [256, 1024, 1024, 
4096] ] # skinny-1 + - Range: [ [64, 64, 64, 700], [64, 64, 64, 700], [1], [256, 1024, 1024, 4096] ] # small + - Exact: [ 64, 1, 1, 1216 ] + - Exact: [ 128, 1, 1, 1024 ] + - Exact: [ 128, 1, 1, 1408 ] + - Exact: [ 512, 1, 1, 512 ] + - Exact: [ 3072, 1, 1, 128 ] + - Exact: [ 512, 2, 1, 512 ] + - Exact: [ 1024, 1, 1, 512 ] + - Exact: [ 4224, 1, 1, 128 ] + - Exact: [ 512, 4, 1, 512 ] + - Exact: [ 1024, 2, 1, 512 ] + - Exact: [ 1024, 4, 1, 512 ] + - Exact: [ 3072, 1, 1, 1024 ] + - Exact: [ 3072, 1, 1, 1024 ] + - Exact: [ 512, 16, 1, 512 ] + - Exact: [ 3072, 2, 1, 1024 ] + - Exact: [ 4608, 1, 1, 1536 ] + - Exact: [ 512, 32, 1, 512 ] + - Exact: [ 1024, 16, 1, 512 ] + - Exact: [ 3072, 4, 1, 1024 ] + - Exact: [ 4608, 2, 1, 1536 ] + - Exact: [ 6144, 1, 1, 2560 ] + - Exact: [ 1024, 32, 1, 512 ] + - Exact: [ 7680, 1, 1, 2560 ] + - Exact: [ 8448, 1, 1, 2816 ] + - Exact: [ 4608, 4, 1, 1536 ] + - Exact: [ 6144, 2, 1, 2560 ] + - Exact: [ 7680, 2, 1, 2560 ] + - Exact: [ 8448, 2, 1, 2816 ] + - Exact: [ 1760, 16, 1, 1760 ] +# - Exact: [ 35, 700, 1, 2048 ] +# - Exact: [ 35, 700, 1, 2048 ] + - Exact: [ 3072, 16, 1, 1024 ] +# - Exact: [ 35, 700, 1, 2560 ] + - Exact: [ 6144, 4, 1, 2560 ] + - Exact: [ 2048, 16, 1, 2048 ] + - Exact: [ 7680, 4, 1, 2560 ] + - Exact: [ 8448, 4, 1, 2816 ] + - Exact: [ 1760, 32, 1, 1760 ] + - Exact: [ 3072, 32, 1, 1024 ] + - Exact: [ 2560, 16, 1, 2560 ] +# - Exact: [ 35, 1500, 1, 2048 ] + - Exact: [ 4608, 16, 1, 1536 ] + - Exact: [ 2048, 32, 1, 2048 ] +# - Exact: [ 35, 1500, 1, 2560 ] + - Exact: [ 1760, 64, 1, 1760 ] + - Exact: [ 3072, 64, 1, 1024 ] + - Exact: [ 2560, 32, 1, 2560 ] + - Exact: [ 4608, 32, 1, 1536 ] + - Exact: [ 128, 1500, 1, 1280 ] + - Exact: [ 6144, 16, 1, 2560 ] + - Exact: [ 2048, 64, 1, 2048 ] + - Exact: [ 4096, 16, 1, 4096 ] + - Exact: [ 7680, 16, 1, 2560 ] + - Exact: [ 176, 1500, 1, 1408 ] + - Exact: [ 8448, 16, 1, 2816 ] + - Exact: [ 1760, 128, 1, 1760 ] + - Exact: [ 3072, 128, 1, 1024 ] + - Exact: [ 2560, 64, 1, 2560 ] + - Exact: [ 6144, 
32, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 1760 ] + - Exact: [ 2048, 128, 1, 2048 ] + - Exact: [ 4096, 32, 1, 4096 ] +# - Exact: [ 35, 8457, 1, 2048 ] + - Exact: [ 7680, 32, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 2560 ] + - Exact: [ 8448, 32, 1, 2816 ] + - Exact: [ 2560, 128, 1, 2560 ] + - Exact: [ 4096, 64, 1, 4096 ] +# - Exact: [ 35, 8457, 1, 4096 ] + - Exact: [ 7680, 64, 1, 2560 ] + - Exact: [ 4096, 128, 1, 4096 ] + - Exact: [ 7680, 128, 1, 2560 ] + + ######################################## + # NN - Large + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + - GlobalSplitU: [1] + ForkParameters: + - PrefetchGlobalRead: [False, True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 6, 8 ] + - [ 8, 4 ] + - [ 8, 6 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - WorkGroupMapping: [8] # 1 removed for training performance + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large + - Exact: [ 1024, 1024, 1, 1024 ] + - Exact: [ 1024, 700, 1, 512 ] + - Exact: [ 1024, 700, 1, 512 ] + - Exact: [ 3072, 128, 1, 1024 ] + - Exact: [ 3072, 1500, 1, 128 ] + - Exact: [ 2560, 128, 1, 2560 ] + - Exact: [ 4224, 1500, 1, 176 ] + - Exact: [ 512, 1500, 1, 1536 ] + - Exact: [ 512, 1500, 1, 2048 ] + - Exact: [ 512, 1500, 1, 2560 ] + - Exact: [ 4096, 128, 1, 4096 ] + - Exact: [ 512, 1500, 1, 2816 ] + - Exact: [ 512, 3000, 1, 1536 ] + - Exact: [ 1024, 1500, 1, 1536 ] + - Exact: [ 7680, 128, 1, 2560 ] + - Exact: [ 512, 3000, 1, 2048 ] + - Exact: [ 1024, 1500, 1, 2048 ] + - Exact: [ 512, 3000, 1, 2560 ] + - Exact: [ 
1024, 1500, 1, 2560 ] + - Exact: [ 512, 3000, 1, 2816 ] + - Exact: [ 1024, 1500, 1, 2816 ] + - Exact: [ 512, 6000, 1, 1536 ] + - Exact: [ 1024, 3000, 1, 1536 ] + - Exact: [ 3072, 1500, 1, 1024 ] + - Exact: [ 3072, 1500, 1, 1024 ] + - Exact: [ 512, 6000, 1, 2048 ] + - Exact: [ 1024, 3000, 1, 2048 ] + - Exact: [ 5124, 700, 1, 2048 ] + - Exact: [ 5124, 700, 1, 2048 ] + - Exact: [ 512, 6000, 1, 2560 ] + - Exact: [ 1024, 3000, 1, 2560 ] + - Exact: [ 512, 6000, 1, 2816 ] + - Exact: [ 1024, 3000, 1, 2816 ] + - Exact: [ 5124, 700, 1, 2560 ] + - Exact: [ 1024, 6000, 1, 1536 ] + - Exact: [ 3072, 3000, 1, 1024 ] + - Exact: [ 4608, 1500, 1, 1536 ] + - Exact: [ 1024, 6000, 1, 2048 ] + - Exact: [ 1024, 6000, 1, 2560 ] + - Exact: [ 5124, 1500, 1, 2048 ] + - Exact: [ 1024, 6000, 1, 2816 ] + - Exact: [ 512, 24000, 1, 1536 ] + - Exact: [ 3072, 6000, 1, 1024 ] + - Exact: [ 5124, 1500, 1, 2560 ] + - Exact: [ 4608, 3000, 1, 1536 ] + - Exact: [ 1760, 7000, 1, 1760 ] + - Exact: [ 6144, 1500, 1, 2560 ] + - Exact: [ 512, 24000, 1, 2048 ] + - Exact: [ 2048, 7000, 1, 2048 ] + - Exact: [ 7680, 1500, 1, 2560 ] + - Exact: [ 512, 24000, 1, 2560 ] + - Exact: [ 512, 24000, 1, 2816 ] + - Exact: [ 8448, 1500, 1, 2816 ] + - Exact: [ 512, 48000, 1, 1536 ] + - Exact: [ 1024, 24000, 1, 1536 ] + - Exact: [ 4608, 6000, 1, 1536 ] + - Exact: [ 2560, 7000, 1, 2560 ] + - Exact: [ 6144, 3000, 1, 2560 ] + - Exact: [ 512, 48000, 1, 2048 ] + - Exact: [ 1024, 24000, 1, 2048 ] + - Exact: [ 7680, 3000, 1, 2560 ] + - Exact: [ 512, 48000, 1, 2560 ] + - Exact: [ 1024, 24000, 1, 2560 ] + - Exact: [ 512, 48000, 1, 2816 ] + - Exact: [ 1024, 24000, 1, 2816 ] + - Exact: [ 8448, 3000, 1, 2816 ] + - Exact: [ 1024, 48000, 1, 1536 ] + - Exact: [ 3072, 24000, 1, 1024 ] + - Exact: [ 5124, 9124, 1, 1760 ] + - Exact: [ 6144, 6000, 1, 2560 ] + - Exact: [ 5124, 9124, 1, 2048 ] + - Exact: [ 1024, 48000, 1, 2048 ] + - Exact: [ 4096, 7000, 1, 4096 ] + - Exact: [ 7680, 6000, 1, 2560 ] + - Exact: [ 5124, 9124, 1, 2560 ] + - Exact: [ 1024, 
48000, 1, 2560 ] + - Exact: [ 1024, 48000, 1, 2816 ] + - Exact: [ 8448, 6000, 1, 2816 ] + - Exact: [ 3072, 48000, 1, 1024 ] + - Exact: [ 4608, 24000, 1, 1536 ] + - Exact: [ 5124, 9124, 1, 4096 ] + - Exact: [ 4608, 48000, 1, 1536 ] + - Exact: [ 6144, 24000, 1, 2560 ] + - Exact: [ 7680, 24000, 1, 2560 ] + - Exact: [ 8448, 24000, 1, 2816 ] + - Exact: [ 6144, 48000, 1, 2560 ] + - Exact: [ 7680, 48000, 1, 2560 ] + - Exact: [ 8448, 48000, 1, 2816 ] + + ######################################## + # NN - VectorWidth Correctness + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [1] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64], [64], [1], [256, 1024, 1024, 4096] ] # corner + - Range: [ [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64], [1], [256, 1024, 1024, 4096] ] # skinny-1 + + ######################################## + # NN - VGPR refactor + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [ True ] + - PrefetchLocalRead: [ True ] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 2, 4 ] + - [ 4, 2 ] + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 4 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - [ 32, 16, 1 ] + - 
WorkGroupMapping: [1, 8] + - DepthU: [ 8, 16, 24, 32, 64, 128 ] + - VectorWidth: [2, 4, 8] +# - GlobalReadVectorWidth: [4] + - LdsPadB: [0,1,2,4] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 256, 193600, 1, 64 ] + - Exact: [ 64, 193600, 1, 64 ] + - Exact: [ 64, 193600, 1, 256 ] + - Exact: [ 512, 50176, 1, 128 ] + - Exact: [ 128, 50176, 1, 512 ] + - Exact: [ 256, 12544, 1, 1024 ] + - Exact: [ 1024, 12544, 1, 256 ] + - Exact: [ 2048, 3136, 1, 512 ] + - Exact: [ 512, 3136, 1, 2048 ] + + ######################################## + # NN - Batch + ######################################## + - # Benchmark Group - ResNet 1x1: + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + ForkParameters: + - PrefetchGlobalRead: [ False, True ] + - PrefetchLocalRead: [ False] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 4 ] + - WorkGroup: + - [ 16, 8, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 8, 1 ] + - WorkGroupMapping: [8] + - DepthU: [ 16 ] + - VectorWidth: [2, 4, 8] +# - GlobalReadVectorWidth: [1, 4] + - LdsPadB: [0,1,2,4] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 196, 256, 64, 1024 ] + - Exact: [ 784, 512, 64, 128 ] +# - Exact: [ 49, 512, 64, 2048 ] + - Exact: [ 3136, 512, 1, 2048 ] + - Exact: [ 196, 1024, 64, 256 ] +# - Exact: [ 49, 2048, 64, 512 ] + - Exact: [ 3136, 2048, 1, 512 ] +# - Exact: [ 3025, 256, 64, 64 ] +# - Exact: [ 3025, 64, 64, 64 ] + + ######################################## + # NN - Source kernels + ######################################## + - # BenchmarkProblemSizeGroup - VW=2 for m,n,k<=4 + InitialSolutionParameters: + 
BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [2] + - WorkGroupMapping: [8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [32], [32], [1], [32] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [32], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [32] ] + + - # BenchmarkProblemSizeGroup - VW=1 for m,n,k==1 + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [1] + - WorkGroupMapping: [8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [1], [1], [1], [1] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [1], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [1] ] + +LibraryLogic: +# ScheduleName: "vega20" +# DeviceNames: ["Device 
66a0", "Device 66a1", "Device 66a7"] +# ArchitectureName: "gfx906" + + ScheduleName: "vega10" + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] + ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml new file mode 100644 index 0000000000..ca2455c4b0 --- /dev/null +++ b/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml @@ -0,0 +1,252 @@ +GlobalParameters: + MinimumRequiredVersion: 4.4.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 256 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + ExitOnFails: 0 + +BenchmarkProblems: + + ######################################## + ######################################## + ### + ### NT + ### + ######################################## + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + ######################################## + # NT - Small or Skinny + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - 
PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 2 ] + - [ 4, 4 ] + - [ 8, 4 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 32, 8, 1 ] + - [ 16, 4, 1 ] + - [ 32, 4, 1 ] + - WorkGroupMapping: [1, 8] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1 + - Range: [ [64, 64, 64, 700], [64, 64, 64, 700], [1], [256, 1024, 1024, 4096] ] # small + - Exact: [ 512, 16, 1, 512 ] + - Exact: [ 512, 32, 1, 512 ] + - Exact: [ 1024, 16, 1, 512 ] + - Exact: [ 1024, 32, 1, 512 ] + + ######################################## + # NT - Large + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - GlobalSplitU: [1] + ForkParameters: + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 6, 8 ] + - [ 8, 4 ] + - [ 8, 6 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - WorkGroupMapping: [8] +# - WorkGroupMapping: [1, 8] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large + - Exact: [ 1024, 1024, 1, 1024 ] + - Exact: [ 1760, 7133, 1, 1760 ] + - Exact: [ 2048, 7133, 1, 2048 ] + - Exact: [ 2560, 7133, 1, 2560 ] + - 
Exact: [ 3072, 7435, 1, 1024 ] + - Exact: [ 4096, 7133, 1, 4096 ] + - Exact: [ 7680, 5481, 1, 2560 ] + + ######################################## + # NT - VectorWidth Correctness + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [1] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64], [64], [1], [256, 1024, 1024, 4096] ] # corner + - Range: [ [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64], [1], [256, 1024, 1024, 4096] ] # skinny-1 + + ######################################## + # NT - Source kernels + ######################################## + - # BenchmarkProblemSizeGroup - VW=2 for m,n,k<=4 + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [2] + - WorkGroupMapping: [8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [32], [32], [1], [32] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 
64, 7000], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [32], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [32] ] + + - # BenchmarkProblemSizeGroup - VW=1 for m,n,k==1 + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [1] + - WorkGroupMapping: [8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [1], [1], [1], [1] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [1], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [1] ] + +LibraryLogic: +# ScheduleName: "vega20" +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] +# ArchitectureName: "gfx906" + + ScheduleName: "vega10" + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] + ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml new file mode 100644 index 
0000000000..cc9457c969 --- /dev/null +++ b/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml @@ -0,0 +1,453 @@ +GlobalParameters: + MinimumRequiredVersion: 4.4.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 256 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + ExitOnFails: 0 + +BenchmarkProblems: + + ######################################## + ######################################## + ### + ### TN + ### + ######################################## + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + ######################################## + # TN - Super Skinny + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [1] + ForkParameters: + - PrefetchGlobalRead: [False, True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 2 ] + - [ 8, 2 ] + - [ 4, 4 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 8, 16, 1 ] + - [ 16, 4, 1 ] + - [ 16, 8, 1 ] + - [ 16, 16, 1 ] + - [ 32, 4, 1 ] + - [ 32, 8, 1 ] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 512, 8, 1, 500000 ] + - Exact: [ 512, 16, 1, 500000 ] + - Exact: [ 1024, 8, 1, 500000 ] + - Exact: [ 1024, 16, 1, 500000 ] 
+ - Exact: [ 1760, 16, 1, 1760 ] + - Exact: [ 3072, 16, 1, 1024 ] + - Exact: [ 2048, 16, 1, 2048 ] + - Exact: [ 2560, 16, 1, 2560 ] + - Exact: [ 4608, 16, 1, 1536 ] + - Exact: [ 6144, 16, 1, 2560 ] + - Exact: [ 4096, 16, 1, 4096 ] + - Exact: [ 7680, 16, 1, 2560 ] + - Exact: [ 8448, 16, 1, 2816 ] + +# ######################################## +# # TN - 3 Waves/WG +# ######################################## +# - # Benchmark Group +# InitialSolutionParameters: +# BenchmarkCommonParameters: +# - EdgeType: ["ShiftPtr"] +# - LoopTail: [True] +# - KernelLanguage: ["Assembly"] +# - PrefetchGlobalRead: [True] +# - PrefetchLocalRead: [True] +# - WorkGroupMapping: [1] +# ForkParameters: +# - ThreadTile: +# - [ 3, 3 ] +# - [ 3, 4 ] +# - [ 3, 6 ] +# - [ 4, 3 ] +# - [ 4, 4 ] +# - [ 4, 6 ] +# - [ 6, 3 ] +# - [ 6, 4 ] +# - [ 6, 6 ] +# - WorkGroup: +# - [ 24, 8, 1 ] +# - [ 12, 16, 1 ] +# - [ 6, 32, 1 ] +# - [ 8, 8, 1 ] +# - [ 8, 24, 1 ] +# - [ 16, 16, 1 ] +# - GlobalSplitU: [1] +# - DepthU: [ 8, 16, 24, 32, 64 ] +# - VectorWidth: [2, 4, 8] +# - AssertSummationElementMultiple: [2] +# - AssertFree0ElementMultiple: [2] +# BenchmarkForkParameters: +# JoinParameters: +# BenchmarkJoinParameters: +# BenchmarkFinalParameters: +# - ProblemSizes: +# - Exact: [ 35, 8457, 1, 1760 ] +# - Exact: [ 35, 8457, 1, 2048 ] +# - Exact: [ 35, 8457, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 4096 ] + + ######################################## + # TN - Small or Skinny + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 2 ] + - [ 4, 4 ] + - [ 8, 4 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 8, 1 ] + - [ 32, 8, 1 ] + - [ 16, 4, 1 ] + - [ 32, 4, 1 ] + - WorkGroupMapping: [1, 8] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - 
VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1 + - Range: [ [64, 64, 64, 700], [64, 64, 64, 700], [1], [256, 1024, 1024, 4096] ] # small + - Exact: [ 1760, 16, 1, 1760 ] + - Exact: [ 3072, 16, 1, 1024 ] + - Exact: [ 2048, 16, 1, 2048 ] + - Exact: [ 1760, 32, 1, 1760 ] + - Exact: [ 3072, 32, 1, 1024 ] + - Exact: [ 2560, 16, 1, 2560 ] + - Exact: [ 4608, 16, 1, 1536 ] + - Exact: [ 2048, 32, 1, 2048 ] + - Exact: [ 1760, 64, 1, 1760 ] + - Exact: [ 3072, 64, 1, 1024 ] + - Exact: [ 2560, 32, 1, 2560 ] + - Exact: [ 4608, 32, 1, 1536 ] + - Exact: [ 6144, 16, 1, 2560 ] + - Exact: [ 2048, 64, 1, 2048 ] + - Exact: [ 4096, 16, 1, 4096 ] + - Exact: [ 7680, 16, 1, 2560 ] + - Exact: [ 8448, 16, 1, 2816 ] + - Exact: [ 1760, 128, 1, 1760 ] + - Exact: [ 3072, 128, 1, 1024 ] + - Exact: [ 2560, 64, 1, 2560 ] + - Exact: [ 6144, 32, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 1760 ] + - Exact: [ 2048, 128, 1, 2048 ] + - Exact: [ 4096, 32, 1, 4096 ] +# - Exact: [ 35, 8457, 1, 2048 ] + - Exact: [ 7680, 32, 1, 2560 ] +# - Exact: [ 35, 8457, 1, 2560 ] + - Exact: [ 8448, 32, 1, 2816 ] + - Exact: [ 2560, 128, 1, 2560 ] + - Exact: [ 4096, 64, 1, 4096 ] +# - Exact: [ 35, 8457, 1, 4096 ] + - Exact: [ 7680, 64, 1, 2560 ] + - Exact: [ 4096, 128, 1, 4096 ] + - Exact: [ 7680, 128, 1, 2560 ] + + ######################################## + # TN - Large + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - GlobalSplitU: [1] + ForkParameters: + - ThreadTile: + - [ 4, 4 ] 
+ - [ 4, 8 ] + - [ 6, 8 ] + - [ 8, 4 ] + - [ 8, 6 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - WorkGroupMapping: [8] +# - WorkGroupMapping: [1, 8] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large + - Exact: [ 1024, 1024, 1, 1024 ] +# MIOpen sizes + - Exact: [ 1760, 800, 1, 1760 ] + - Exact: [ 1760, 1600, 1, 1760 ] + - Exact: [ 1760, 3200, 1, 1760 ] + - Exact: [ 1760, 6400, 1, 1760 ] + - Exact: [ 2048, 800, 1, 2048 ] + - Exact: [ 2048, 1600, 1, 2048 ] + - Exact: [ 2048, 3200, 1, 2048 ] + - Exact: [ 2048, 6400, 1, 2048 ] + - Exact: [ 2560, 800, 1, 2560 ] + - Exact: [ 2560, 1600, 1, 2560 ] + - Exact: [ 2560, 3200, 1, 2560 ] + - Exact: [ 2560, 6400, 1, 2560 ] + - Exact: [ 2048, 400, 1, 512 ] + - Exact: [ 2048, 800, 1, 512 ] + - Exact: [ 2048, 1600, 1, 512 ] + - Exact: [ 2048, 3200, 1, 512 ] + - Exact: [ 4096, 400, 1, 1024 ] + - Exact: [ 4096, 800, 1, 1024 ] + - Exact: [ 4096, 1600, 1, 1024 ] + - Exact: [ 4096, 3200, 1, 1024 ] + - Exact: [ 8192, 400, 1, 2048 ] + - Exact: [ 8192, 800, 1, 2048 ] + - Exact: [ 8192, 1600, 1, 2048 ] + - Exact: [ 8192, 3200, 1, 2048 ] + - Exact: [ 16384, 400, 1, 4096 ] + - Exact: [ 16384, 800, 1, 4096 ] + - Exact: [ 16384, 1600, 1, 4096 ] + - Exact: [ 16384, 3200, 1, 4096 ] + - Exact: [ 8448, 48000, 1, 2816 ] + - Exact: [ 8448, 24000, 1, 2816 ] + - Exact: [ 8448, 12000, 1, 2816 ] + - Exact: [ 8448, 5984, 1, 2816 ] + - Exact: [ 6144, 48000, 1, 2048 ] + - Exact: [ 6144, 24000, 1, 2048 ] + - Exact: [ 6144, 12000, 1, 2048 ] + - Exact: [ 6144, 5984, 1, 2048 ] + - Exact: [ 4608, 48000, 1, 1536 ] + - Exact: [ 4608, 24000, 1, 1536 ] + - Exact: [ 4608, 12000, 1, 1536 ] + - Exact: [ 4608, 5984, 1, 1536 ] + - Exact: [ 7680, 48000, 1, 2560 ] + 
- Exact: [ 7680, 24000, 1, 2560 ] + - Exact: [ 7680, 12000, 1, 2560 ] + - Exact: [ 7680, 5984, 1, 2560 ] +# Deepbench sizes + - Exact: [ 1024, 700, 1, 512 ] + - Exact: [ 512, 24000, 1, 1536 ] + - Exact: [ 1760, 7000, 1, 1760 ] + - Exact: [ 512, 24000, 1, 2048 ] + - Exact: [ 2048, 7000, 1, 2048 ] + - Exact: [ 512, 24000, 1, 2560 ] + - Exact: [ 512, 24000, 1, 2816 ] + - Exact: [ 512, 48000, 1, 1536 ] + - Exact: [ 1024, 24000, 1, 1536 ] + - Exact: [ 2560, 7000, 1, 2560 ] + - Exact: [ 512, 48000, 1, 2048 ] + - Exact: [ 1024, 24000, 1, 2048 ] + - Exact: [ 512, 48000, 1, 2560 ] + - Exact: [ 1024, 24000, 1, 2560 ] + - Exact: [ 512, 48000, 1, 2816 ] + - Exact: [ 1024, 24000, 1, 2816 ] + - Exact: [ 1024, 48000, 1, 1536 ] + - Exact: [ 3072, 24000, 1, 1024 ] + - Exact: [ 5124, 9124, 1, 1760 ] + - Exact: [ 5124, 9124, 1, 2048 ] + - Exact: [ 1024, 48000, 1, 2048 ] + - Exact: [ 4096, 7000, 1, 4096 ] + - Exact: [ 5124, 9124, 1, 2560 ] + - Exact: [ 1024, 48000, 1, 2560 ] + - Exact: [ 1024, 48000, 1, 2816 ] + - Exact: [ 3072, 48000, 1, 1024 ] + - Exact: [ 4608, 24000, 1, 1536 ] + - Exact: [ 5124, 9124, 1, 4096 ] + - Exact: [ 4608, 48000, 1, 1536 ] + - Exact: [ 6144, 24000, 1, 2560 ] + - Exact: [ 7680, 24000, 1, 2560 ] + - Exact: [ 8448, 24000, 1, 2816 ] + - Exact: [ 6144, 48000, 1, 2560 ] + - Exact: [ 7680, 48000, 1, 2560 ] + - Exact: [ 8448, 48000, 1, 2816 ] + + ######################################## + # TN - VectorWidth Correctness + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [1] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + 
BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64], [64], [1], [256, 1024, 1024, 4096] ] # corner + - Range: [ [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64], [1], [256, 1024, 1024, 4096] ] # skinny-1 + + ######################################## + # TN - Source kernels + ######################################## + - # BenchmarkProblemSizeGroup - VW=2 for m,n,k<=4 + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [2] + - WorkGroupMapping: [8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [32], [32], [1], [32] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [32], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [32] ] + + - # BenchmarkProblemSizeGroup - VW=1 for m,n,k==1 + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [1] + - WorkGroupMapping: [8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [1] + BenchmarkForkParameters: + 
JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [1], [1], [1], [1] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [1], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [1] ] + +LibraryLogic: +# ScheduleName: "vega20" +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] +# ArchitectureName: "gfx906" + + ScheduleName: "vega10" + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] + ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml new file mode 100644 index 0000000000..2e313a427a --- /dev/null +++ b/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml @@ -0,0 +1,245 @@ +GlobalParameters: + MinimumRequiredVersion: 4.4.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 256 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeBeta : 0 + ExitOnFails: 0 + +BenchmarkProblems: + + ######################################## + ######################################## + ### + 
### TT + ### + ######################################## + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: h + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + ######################################## + # TT - Small or Skinny + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 2, 4 ] + - [ 4, 4 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 4, 32, 1 ] + - [ 8, 8, 1 ] + - [ 8, 16, 1 ] + - [ 8, 32, 1 ] + - [ 16, 16, 1 ] + - [ 32, 4, 1 ] + - [ 32, 8, 1 ] + - WorkGroupMapping: [-1, -4] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1 + - Range: [ [64, 64, 64, 700], [64, 64, 64, 700], [1], [256, 1024, 1024, 4096] ] # small + + ######################################## + # TT - Large + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + - GlobalSplitU: [1] + ForkParameters: + - PrefetchGlobalRead: [False, True] + - ThreadTile: + - [ 4, 4 ] + - [ 4, 8 ] + - [ 6, 8 ] + - [ 8, 4 ] + - [ 8, 6 ] + - [ 8, 8 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - WorkGroupMapping: [-1, -4] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - 
AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large + - Exact: [ 1024, 1024, 1, 1024 ] + + ######################################## + # TT - VectorWidth Correctness + ######################################## + - # Benchmark Group + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - KernelLanguage: ["Assembly"] + - PrefetchGlobalRead: [True] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [-1] + ForkParameters: + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - WorkGroup: + - [ 8, 8, 1 ] + - [ 16, 16, 1 ] + - GlobalSplitU: [1] + - DepthU: [ 8, 16, 24, 32, 64 ] + - VectorWidth: [-1] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [64], [64], [1], [256, 1024, 1024, 4096] ] # corner + - Range: [ [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0 + - Range: [ [64, 64, 64, 7000], [64], [1], [256, 1024, 1024, 4096] ] # skinny-1 + + ######################################## + # TT - Source kernels + ######################################## + - # BenchmarkProblemSizeGroup - VW=2 for m,n,k<=4 + InitialSolutionParameters: + - WorkGroupMapping: [-1] + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [2] + - WorkGroupMapping: [-8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: 
+ BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [32], [32], [1], [32] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [32], [32], [1], [256, 1024, 1024, 4096] ] + - Range: [ [32], [64, 64, 64, 7000], [1], [32] ] + - Range: [ [64, 64, 64, 7000], [32], [1], [32] ] + + - # BenchmarkProblemSizeGroup - VW=1 for m,n,k==1 + InitialSolutionParameters: + - WorkGroupMapping: [-1] + BenchmarkCommonParameters: + - KernelLanguage: ["Source"] + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - VectorWidth: [1] + - WorkGroupMapping: [-8] + ForkParameters: + - ThreadTile: + - [ 8, 8 ] + - [ 4, 8 ] + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] +# - DepthU: [ 4, 8, 16, 32 ] + - DepthU: [ 4, 8, 16 ] + - VectorWidth: [1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [1], [1], [1], [1] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [1], [1], [1], [256, 1024, 1024, 4096] ] + - Range: [ [1], [64, 64, 64, 7000], [1], [1] ] + - Range: [ [64, 64, 64, 7000], [1], [1], [1] ] + +LibraryLogic: +# ScheduleName: "vega20" +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] +# ArchitectureName: "gfx906" + + ScheduleName: "vega10" + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] + ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: 
"gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Configs/rocblas_sgemm_asm_full.yaml b/Tensile/Configs/rocblas_sgemm_asm_full.yaml index b90762ed2d..523d7ca02a 100644 --- a/Tensile/Configs/rocblas_sgemm_asm_full.yaml +++ b/Tensile/Configs/rocblas_sgemm_asm_full.yaml @@ -45,7 +45,6 @@ BenchmarkProblems: - LoopTail: [True] - KernelLanguage: ["Assembly"] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [1] ForkParameters: - PrefetchGlobalRead: [False, True] @@ -129,7 +128,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [1] ForkParameters: - ThreadTile: @@ -210,7 +208,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] ForkParameters: - ThreadTile: - [ 2, 2 ] @@ -348,7 +345,6 @@ BenchmarkProblems: - LoopTail: [True] - KernelLanguage: ["Assembly"] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - GlobalSplitU: [1] ForkParameters: - PrefetchGlobalRead: [False, True] @@ -497,7 +493,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [1] ForkParameters: - ThreadTile: @@ -531,7 +526,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [ True ] - PrefetchLocalRead: [ True ] - - PreciseBoundsCheck: [False] ForkParameters: - ThreadTile: - [ 2, 2 ] @@ -576,7 +570,6 @@ BenchmarkProblems: - EdgeType: ["ShiftPtr"] - LoopTail: [True] - KernelLanguage: ["Assembly"] - - PreciseBoundsCheck: [False] ForkParameters: - PrefetchGlobalRead: [ False, True ] - PrefetchLocalRead: [ False] @@ -635,7 +628,6 @@ BenchmarkProblems: - LoopTail: [True] - KernelLanguage: ["Assembly"] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: 
[False] - WorkGroupMapping: [1] ForkParameters: - PrefetchGlobalRead: [False, True] @@ -683,7 +675,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [1] ForkParameters: - ThreadTile: @@ -760,7 +751,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] ForkParameters: - ThreadTile: - [ 2, 2 ] @@ -861,7 +851,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - GlobalSplitU: [1] ForkParameters: - ThreadTile: @@ -1003,7 +992,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [1] ForkParameters: - ThreadTile: @@ -1086,7 +1074,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] ForkParameters: - ThreadTile: - [ 2, 2 ] @@ -1157,7 +1144,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - GlobalSplitU: [1] ForkParameters: - ThreadTile: @@ -1224,7 +1210,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [1] ForkParameters: - ThreadTile: @@ -1308,7 +1293,6 @@ BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] ForkParameters: - ThreadTile: - [ 2, 2 ] @@ -1374,7 +1358,6 @@ BenchmarkProblems: - LoopTail: [True] - KernelLanguage: ["Assembly"] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - GlobalSplitU: [1] ForkParameters: - PrefetchGlobalRead: [False, True] @@ -1436,7 +1419,6 @@ 
BenchmarkProblems: - KernelLanguage: ["Assembly"] - PrefetchGlobalRead: [True] - PrefetchLocalRead: [True] - - PreciseBoundsCheck: [False] - WorkGroupMapping: [-1] ForkParameters: - ThreadTile: @@ -1461,11 +1443,11 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega20" -# DeviceNames: ["Device 66a0", "Device 66a7"] +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] # ArchitectureName: "gfx906" ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_sgemm_asm_lite.yaml b/Tensile/Configs/rocblas_sgemm_asm_lite.yaml index 3272e85653..3aa2ec8b04 100644 --- a/Tensile/Configs/rocblas_sgemm_asm_lite.yaml +++ b/Tensile/Configs/rocblas_sgemm_asm_lite.yaml @@ -552,11 +552,11 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega20" -# DeviceNames: ["Device 66a0", "Device 66a7"] +# DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"] # ArchitectureName: "gfx906" ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_sgemm_hip_lite.yaml b/Tensile/Configs/rocblas_sgemm_hip_lite.yaml index 3eb1cfc46f..aef828980b 100644 --- a/Tensile/Configs/rocblas_sgemm_hip_lite.yaml +++ b/Tensile/Configs/rocblas_sgemm_hip_lite.yaml @@ -183,7 +183,7 @@ BenchmarkProblems: LibraryLogic: # ScheduleName: "vega10" -# DeviceNames: ["Device 6863", "Device 6862", "Device 687f", 
"Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] # ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml index 1014a00d2a..6ea62ca075 100644 --- a/Tensile/Configs/rocblas_zgemm.yaml +++ b/Tensile/Configs/rocblas_zgemm.yaml @@ -178,7 +178,7 @@ BenchmarkProblems: LibraryLogic: ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"] + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] ArchitectureName: "gfx900" # ScheduleName: "mi25" diff --git a/Tensile/KernelWriter.py b/Tensile/KernelWriter.py index c8eb10819a..f187887dea 100644 --- a/Tensile/KernelWriter.py +++ b/Tensile/KernelWriter.py @@ -20,7 +20,7 @@ ################################################################################ from SolutionStructs import Solution -from Common import globalParameters, printExit, CHeader +from Common import globalParameters, CHeader import abc import os from os import path, chmod @@ -72,6 +72,20 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): kStr += self.comment3("Allocate Resources") kStr += self.allocateResources(kernel) + if kernel["ProblemType"]["TLUA"]: + # TODO - enable more aggressive path + #guaranteeeNoPartialA = kernel["AssertFree0ElementMultiple"]%kernel["GlobalLoadVectorWidthA"]==0 + guaranteeeNoPartialA = kernel["GlobalLoadVectorWidthA"]==1 + else: + guaranteeeNoPartialA = True + + if kernel["ProblemType"]["TLUB"]: + # TODO - enable more aggressive path + #guaranteeeNoPartialB = kernel["AssertFree1ElementMultiple"]%kernel["GlobalLoadVectorWidthB"]==0 + 
guaranteeeNoPartialB = kernel["GlobalLoadVectorWidthB"]==1 + else: + guaranteeeNoPartialB = True + if self.enable["PreLoop"]: #################################### # Global Read Addresses @@ -668,12 +682,19 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): # Shift Vector Components #################################### if kernel["EdgeType"] == "ShiftPtr": + + # noPartial means each component in the vector loads is always valid. In this case we + # don't need the awkward unshift code + # TODO : the unshift code is complex and currently appears broken. Long-term want to use + # the Assert*ElementMultiple>glvw code as often as possible, or use buffer-load-x1 + # in cases where it can't be used. Then can remove this path. + # shift vector components d0 - if self.readTileDimVectorA and kernel["GlobalLoadVectorWidthA"] > 1: + if not guaranteeeNoPartialA and self.readTileDimVectorA: kStr += self.comment("shift vector components d0") kStr += self.shiftVectorComponents(kernel, tensorParametersA) # shift vector components d1 - if self.readTileDimVectorB and kernel["GlobalLoadVectorWidthB"] > 1: + if not guaranteeeNoPartialB and self.readTileDimVectorB: kStr += self.comment("shift vector components d1") kStr += self.shiftVectorComponents(kernel, tensorParametersB) @@ -938,7 +959,8 @@ def initKernel(self, kernel, tensorParametersA, tensorParametersB ): self.writeUnrollDimComponentsA = False # Scalar if kernel["LocalDotLayout"]>1: self.writeTileDimComponentsA = kernel["GlobalReadVectorWidth"] > 1 # Components - writeCoal = True + # LDS writes with LDL>1 will never be coalesced + writeCoal = False else: self.writeTileDimComponentsA = kernel["GlobalReadVectorWidth"] > 1 # Components writeCoal = False @@ -1037,7 +1059,8 @@ def initKernel(self, kernel, tensorParametersA, tensorParametersB ): self.writeUnrollDimComponentsB = False if kernel["LocalDotLayout"]>1: self.writeTileDimComponentsB = kernel["GlobalReadVectorWidth"] > 1 # Components - writeCoal = True + # 
LDS writes with LDL>1 will never be coalesced + writeCoal = False else: self.writeTileDimComponentsB = kernel["GlobalReadVectorWidth"] > 1 # Components writeCoal = False @@ -1819,33 +1842,33 @@ def getSourceFileString(self, kernel): assemblerProcess = Popen(assemblerCommand, \ cwd=asmPath ) assemblerProcess.communicate() - if assemblerProcess.returncode: - printExit("Assembler process returned with code %u" \ - % assemblerProcess.returncode) - # read code object file fileString = "" - if not globalParameters["CodeFromFiles"]: - codeObjectFile = open(codeObjectFileName, "r") - codeObjectByteArray = bytearray(codeObjectFile.read()) - codeObjectFile.close() - - # write code object byte array - fileString += self.comment("code object byte array") - fileString += "const unsigned char %s_coba[%u] = {\n" % (kernelName, len(codeObjectByteArray)) - for byteIdx in range(0, len(codeObjectByteArray)): - byte = codeObjectByteArray[byteIdx] - fileString += "0x%02x" % byte - if byteIdx < len(codeObjectByteArray)-1: - fileString += "," - else: - fileString += "};\n" - if byteIdx % 16 == 15: - fileString += "\n" - - - # read code-object file and convert to c++ representable uchar* - # return string of code-object byte array + if assemblerProcess.returncode: + error = -1 + else: + # read code object file + if not globalParameters["CodeFromFiles"]: + codeObjectFile = open(codeObjectFileName, "r") + codeObjectByteArray = bytearray(codeObjectFile.read()) + codeObjectFile.close() + + # write code object byte array + fileString += self.comment("code object byte array") + fileString += "const unsigned char %s_coba[%u] = {\n" % (kernelName, len(codeObjectByteArray)) + for byteIdx in range(0, len(codeObjectByteArray)): + byte = codeObjectByteArray[byteIdx] + fileString += "0x%02x" % byte + if byteIdx < len(codeObjectByteArray)-1: + fileString += "," + else: + fileString += "};\n" + if byteIdx % 16 == 15: + fileString += "\n" + + + # read code-object file and convert to c++ representable 
uchar* + # return string of code-object byte array return (error, fileString) diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index 2604cfba38..e980e41766 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -339,7 +339,6 @@ def __init__( self, kernelMinNaming, kernelSerialNaming ): self.printedAssertCnt = 0 self.initLdsValue = 0xFFFFFFFF # Value to use for LDS Init, if enabled - self.db["CheckDimOverflow"] = False # check for tensor dims that exceed assumptions in code, in particular around use of 32-bit calcs # Check A and B values loaded from memory to ensure they are 1 # Requires DataInitTypeAB=1. @@ -357,6 +356,7 @@ def __init__( self, kernelMinNaming, kernelSerialNaming ): self.localReadDoCnt = 0 self.localWriteDoCnt = 0 + self.maxVgprs = 256 self.maxSgprs = 99 @@ -541,6 +541,21 @@ def initKernel(self, kernel, tPA, tPB ): # which parts of the code were changed to support the new mode. self.globalReadIncsUseVgpr = False if kernel["BufferLoad"] else True + # If True, GRO are expressed as offsets from the beginning of the macro-tile, and the SRD + # is set to the beginning of the macro-tile. + # If False, GRO are expressed as offsets from the beginning of the lowest 2 dimensions + # in the tensor. + # True can allow Buffer-Based logic to have significantly higher range and handle larger tensors + # But does not work with the PointerShift logic. 
+ # Can be enabled with PBC (does not use branch logic) or if assertions guarantee no shift needed + # groOffsetInMacroTile doesn't work with pointer-shift because it sets the SRD to point to the + # start of the macro-tile - if we overhang by small number of elements ( 1 else 0) self.defineSgpr("AddressC", numSgprAddressC) @@ -1178,10 +1203,14 @@ def initKernel(self, kernel, tPA, tPB ): self.defineSgpr("PerpOverhangVccA", 2, 2) if kernel["fractionalPerpOverhangB"]: self.defineSgpr("PerpOverhangVccB", 2, 2) + if self.use64bPbcLimit: + self.defineSgpr("SrdShadowLimitA", 2, 2) + self.defineSgpr("SrdShadowLimitB", 2, 2) if globalParameters["DebugKernel"]: self.defineSgpr("AddressDbg", self.numSgprAddressDbg) self.defineSgpr("DebugKernelItems", 1) + #------------------------ # Registers defined below this point are not available in the post-loop # (we reclaim them to use as temps, typically for execmasks) @@ -1193,6 +1222,7 @@ def initKernel(self, kernel, tPA, tPB ): self.defineSgpr("OffsetA", numSgprOffsetA) self.defineSgpr("OffsetB", numSgprOffsetB) + self.defineSgpr("GlobalReadIncsA", numSgprGlobalReadIncsA) self.defineSgpr("GlobalReadIncsB", numSgprGlobalReadIncsB) @@ -1257,7 +1287,7 @@ def initKernel(self, kernel, tPA, tPB ): if self.db["CheckValue1B"] : print ("\n***WARNING: CheckValue1B enabled, may impact performance\n") if self.db["PrintRP"] : print ("\n***WARNING: PrintRP enabled, may generate verbose output\n") if kernel["CheckTensorDimAsserts"] : print ("\n***WARNING: CheckTensorDimAsserts enabled, may impact performance\n") - if self.db["CheckDimOverflow"] : print ("\n***WARNING: CheckDimOverflow enabled, may impact performance\n") + if kernel["CheckDimOverflow"] : print ("\n***WARNING: CheckDimOverflow enabled, may impact performance\n") ############################################################################## @@ -1313,7 +1343,7 @@ def defineMACMacro(self, kernel, innerUnroll): # we treat HighPrecisionAccumulate as expanded packed math b = blockB*2 a 
= blockA*2 - if kernel["LocalDotLayout"] > 1: # Only supports LocalDotLayout == 2 for now + if kernel["LocalDotLayout"] > 1 and innerUnroll == 2: # Only supports LocalDotLayout == 2 for now lcldot = kernel["LocalDotLayout"] iua = blockA / ((kernel["ThreadTileA"]/2) / lcldot) iub = blockB / ((kernel["ThreadTileB"]/2) / lcldot) @@ -1409,7 +1439,7 @@ def defineMACMacro(self, kernel, innerUnroll): # we treat HighPrecisionAccumulate as expanded packed math b = blockB*2 a = blockA*2 - if kernel["LocalDotLayout"] > 1: # Only supports LocalDotLayout == 2 for now + if kernel["LocalDotLayout"] > 1 and innerUnroll == 2: # Only supports LocalDotLayout == 2 for now lcldot = kernel["LocalDotLayout"] iua = blockA / ((kernel["ThreadTileA"]/2) / lcldot) iub = blockB / ((kernel["ThreadTileB"]/2) / lcldot) @@ -1422,28 +1452,28 @@ def defineMACMacro(self, kernel, innerUnroll): % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot) bStr = "v[%s+%u]" \ % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot) - kStr += "v_dot2_f32_f16 %s, %s, %s, %s //ValuC[%u] iua=%u iub=%u%s" % (cStr, aStr, bStr, cStr, cidx, iua, iub, self.endLine) + kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //ValuC[%u] iua=%u iub=%u%s" % (cStr, aStr, bStr, cStr, cidx, iua, iub, self.endLine) cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + 1 aStr = "v[%s+%u]" \ % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot+1) bStr = "v[%s+%u]" \ % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot) cStr = "v[%s+%u*2+%u*%u*2+0*2+1]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"]) # *2 b/c of fp32 - kStr += "v_dot2_f32_f16 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) + kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 0 aStr = "v[%s+%u]" \ % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot) bStr = "v[%s+%u]" \ % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot+1) 
cStr = "v[%s+%u*2+%u*%u*2+%u*2+0]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"], kernel["ThreadTile0"]/2) - kStr += "v_dot2_f32_f16 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) + kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 1 aStr = "v[%s+%u]" \ % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot+1) bStr = "v[%s+%u]" \ % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot+1) cStr = "v[%s+%u*2+%u*%u*2+%u*2+1]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"], kernel["ThreadTile0"]/2) - kStr += "v_dot2_f32_f16 %s, %s, %s, %s //valuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) + kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //valuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine) #kStr += self.bomb(-13) """ ignore this, not quite correct for mixed precision @@ -1611,7 +1641,6 @@ def functionSignature(self, kernel ): # lds size #kStr += " compute_pgm_rsrc2_lds_size = 1 // ?%s" % self.endLine # don't use, it eats up 512 bytes of LDS -#jgolds which bpe should we use? 
assuming A kStr += " workgroup_group_segment_byte_size = %u // lds bytes%s" \ % ( kernel["LdsNumElements"] * self.bpeAB, self.endLine ) @@ -1679,8 +1708,6 @@ def functionSignature(self, kernel ): kStr += " v_add_u32 \dst, vcc, \src0, \src1" + self.endLine kStr += ".endm" + self.endLine - - ######################################## # VGPR Macros ######################################## @@ -1752,9 +1779,8 @@ def functionSignature(self, kernel ): if kernel["BufferLoad"] or kernel["BufferStore"]: - if not kernel["PreciseBoundsCheck"]: - kStr += self.comment3("2GB limit - set offsets to -1 to exceed this and clamp") - kStr += self.macroRegister("BufferLimit", "0x80000000") + kStr += self.comment3("2GB limit - set offsets to -1 to exceed this and clamp") + kStr += self.macroRegister("BufferLimit", "0x80000000") kStr += self.comment3("Bits 127:96 of SRD. Set DataFormat = 32 bit") kStr += self.macroRegister("Srd127_96", "0x0020000") #TODO-64 : This is max 32-bit negative value, the tail loop @@ -1772,12 +1798,18 @@ def functionSignature(self, kernel ): ("C", range(0, kernel["ProblemType"]["NumIndicesC"]), kernel["BufferStore"]), \ ("A", kernel["ProblemType"]["IndexAssignmentsA"], kernel["BufferLoad"]), \ ("B", kernel["ProblemType"]["IndexAssignmentsB"], kernel["BufferLoad"]) ]: + + # BufferStore does not use this macro so don't generate it: + if tensorChar == "C" and kernel["BufferStore"]: + continue + kStr += self.comment("Global Offset %s"%tensorChar) numDim = len(indices) idxChars = [] for i in indices: idxChars.append(self.indexChars[i]) + # macro declaration kStr += ".macro GLOBAL_OFFSET_%s vgprAddr"%tensorChar for i in range(0, numDim): @@ -1787,7 +1819,7 @@ def functionSignature(self, kernel ): or indices[i] == kernel["ProblemType"]["IndexUnroll"]: kStr += " vgprOffset%s" % idxChars[i] # other c index sgpr - elif indices[i] < kernel["ProblemType"]["NumIndicesC"]: + elif indices[i] < kernel["ProblemType"]["NumIndicesC"] and not justOffset32: kStr += " sgprOffset%s" % 
idxChars[i] # other sum index else: @@ -1829,7 +1861,7 @@ def functionSignature(self, kernel ): "mul d%u upper"%i) needAdd = 1 # other c index sgpr - elif indices[i] < kernel["ProblemType"]["NumIndicesC"]: + elif indices[i] < kernel["ProblemType"]["NumIndicesC"] and not justOffset32: kStr += inst("v_mov_b32", \ "v[\\vgprTmp+2]", \ "s[\\sgprOffset%s]"%idxChars[i], \ @@ -1839,7 +1871,7 @@ def functionSignature(self, kernel ): "v[\\vgprTmp+0]", \ sgpr("Strides%s+%u"%(tensorChar,i-1)), \ "v[\\vgprTmp+2]", \ - "mul d%u lower"%i) + "other stride mul d%u lower"%i) if not justOffset32: kStr += inst("v_mul_hi_u32", \ "v[\\vgprTmp+1]", \ @@ -1852,10 +1884,11 @@ def functionSignature(self, kernel ): # don't even need to add b/c offset=zero needAdd = 0 + destLo = "v[\\vgprAddr+0]" if needAdd: # addr += offset * stride (lo) kStr += inst("_v_add_co_u32", \ - "v[\\vgprAddr+0]", \ + destLo, \ "vcc", \ "v[\\vgprTmp+0]", \ offset, \ @@ -1870,15 +1903,15 @@ def functionSignature(self, kernel ): "vcc", \ "accumulate d%u upper"%i) else: - kStr += inst("v_mov_b32", "v[\\vgprAddr+0]", offset, "d0 lower") + if destLo != offset: + kStr += inst("v_mov_b32", destLo, offset, "setup d0 lower") if not justOffset32: kStr += inst("v_mov_b32", "v[\\vgprAddr+1]", hex(0), "d0 upper") # Change offset for subsequent dims (if needed) - offset = "v[\\vgprAddr+0]" + offset = destLo # addr *= bytes/element -#jgolds which bpe should we use? assuming A if justOffset32: kStr += inst("v_lshlrev_b32", \ "v[\\vgprAddr+0]", \ @@ -1958,7 +1991,6 @@ def allocateResources(self, kernel): if self.do["PreLoop"]: # set m0 -#jgolds which bpe here? 
Using A for now kStr += inst("s_mov_b32", "m0", hex(kernel["LdsNumElements"] \ * self.bpeAB), "LDS clamp at %u bytes" \ %(kernel["LdsNumElements"] * self.bpeAB) ) @@ -1977,6 +2009,22 @@ def allocateResources(self, kernel): kStr += inst("s_load_dword", sgpr("AddressDbg+1"), \ sgpr("KernArgAddress",2), hex(kernArgOffset), "load addr debug + 1" ) kernArgOffset += 1*4 + kStr += inst("s_load_dword", sgpr("Tensor2dSizeC+0"), \ + sgpr("KernArgAddress",2), hex(kernArgOffset+0), "load tensor size" ) + kStr += inst("s_load_dword", sgpr("Tensor2dSizeC+1"), \ + sgpr("KernArgAddress",2), hex(kernArgOffset+4), "load tensor size" ) + kernArgOffset += 2*4 + kStr += inst("s_load_dword", sgpr("Tensor2dSizeA+0"), \ + sgpr("KernArgAddress",2), hex(kernArgOffset+0), "load tensor size" ) + kStr += inst("s_load_dword", sgpr("Tensor2dSizeA+1"), \ + sgpr("KernArgAddress",2), hex(kernArgOffset+4), "load tensor size" ) + kernArgOffset += 2*4 + kStr += inst("s_load_dword", sgpr("Tensor2dSizeB+0"), \ + sgpr("KernArgAddress",2), hex(kernArgOffset+0), "load tensor size" ) + kStr += inst("s_load_dword", sgpr("Tensor2dSizeB+1"), \ + sgpr("KernArgAddress",2), hex(kernArgOffset+4), "load tensor size" ) + kernArgOffset += 2*4 + kStr += inst("s_load_dword", sgpr("AddressC"), \ sgpr("KernArgAddress",2), hex(kernArgOffset), "load addr c" ) kernArgOffset += 1*4 @@ -2042,13 +2090,18 @@ def allocateResources(self, kernel): kernArgOffset += 1*4 for i in range(0, self.numSgprSizesSum): kStr += inst("s_load_dword", sgpr("SizesSum+%u"%i), \ - sgpr("KernArgAddress",2), hex(kernArgOffset), "load size free %u"%i ) + sgpr("KernArgAddress",2), hex(kernArgOffset), "load size sum %u"%i ) kernArgOffset += 1*4 + + + kStr += inst("s_waitcnt", "lgkmcnt(0)", \ "wait for %u bytes of kern args" % kernArgOffset ) else: kStr += ".if 0\n" + #kStr += self.bomb() + ######################################## # Apply User Offsets kStr += self.comment("User Offsets") @@ -2158,6 +2211,8 @@ def allocateResources(self, kernel): 
kernel["AssertSummationElementMultiple"], 0x1001) kStr += self.assert_multiple_b32(sgpr("SizesFree+0"), kernel["AssertFree0ElementMultiple"], 0x1002) + kStr += self.assert_multiple_b32(sgpr("SizesFree+1"), + kernel["AssertFree1ElementMultiple"], 0x1003) return kStr @@ -2357,7 +2412,7 @@ def graSubgroup(self, kernel): ############################################################################## # Global Read Addresses: Tile Assignment A/B - # stores to v1,2 + # global read addresses: tile offset assignment (message from .s) ############################################################################## def graTileAssignment(self, kernel, tP): kStr = "" @@ -2388,9 +2443,19 @@ def graTileAssignment(self, kernel, tP): uReg = rReg tOpStr = "/" uOpStr = "%" - tReg2 = self.vgprPool.checkOut(1) - kStr += self.comment1("%s = gro%s-tile = serial%s%s + (wg%s*MT%s)" \ - % (vgpr(tReg2), tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) ) + kStr += self.comment1("%s = %u" % (divisorName, kernel[divisorName])) + if self.groOffsetInMacroTile: + tReg2 = tReg + # treg2 and treg same register and value - we store the 'static' + # part of the address calculation in the SRD to maximize the + # range of the 32-bit GRO + kStr += self.comment1("%s = (local)gro%s-tile = serial%s%s (note (wg%s*MT%s) will be added to SRD)" \ + % (vgpr(tReg2), tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) ) + else: + tReg2 = self.vgprPool.checkOut(1) + kStr += self.comment1("%s = gro%s-tile = serial%s%s + (wg%s*MT%s)" \ + % (vgpr(tReg2), tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) ) + kStr += self.comment1("%s = gro%s-unroll = serial%s%s" \ % (vgpr(uReg), tP["tensorChar"], uOpStr, divisorName) ) dividendReg = "Serial" # local serial @@ -2406,10 +2471,13 @@ def graTileAssignment(self, kernel, tP): else: kStr += self.comment1("gro-unroll *= glvw") kStr += staticMultiply(vgpr(uReg), vgpr(uReg), tP["glvw"], sgpr(tmpSgpr)) - kStr 
+= staticMultiply(vgpr(tmpVgpr), sgpr(tP["wg"]), kernel[tP["mt"]]) - kStr += inst("_v_add_co_u32", vgpr(tReg2), "vcc", vgpr(tmpVgpr), \ - vgpr(tReg), "gro%s-tile = serial%s%s*VW + (wg%s*MT%s)" \ - % (tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) ) + if not self.groOffsetInMacroTile: + # Buffer Load will set the SRD to start of the MacroTile + # So don't add the static wg-related component here - save for later. + kStr += staticMultiply(vgpr(tmpVgpr), sgpr(tP["wg"]), kernel[tP["mt"]]) # workgroup + kStr += inst("_v_add_co_u32", vgpr(tReg2), "vcc", vgpr(tmpVgpr), \ + vgpr(tReg), "gro%s-tile = serial%s%s*VW + (wg%s*MT%s)" \ + % (tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) ) if kernel["GlobalSplitU"] > 1: uReg2 = self.vgprPool.checkOut(1) @@ -2429,7 +2497,8 @@ def graTileAssignment(self, kernel, tP): ############################################################################## def graUnrollAssignment(self, kernel, tP): kStr = "" - if kernel["GlobalSplitU"] > 1: + # note groOffsetInMacroTile rolls these into SRD so don't change here: + if not self.groOffsetInMacroTile and kernel["GlobalSplitU"] > 1: gsuOffset = self.vgprPool.checkOut(1) kStr += inst("v_mov_b32", vgpr(gsuOffset), sgpr("GSUSumIdx"), "=gsuSumIdx") if kernel["GlobalSplitUSummationAssignmentRoundRobin"]: @@ -2454,6 +2523,7 @@ def graUnrollAssignment(self, kernel, tP): kStr += inst("v_mul_lo_u32", vgpr(gsuOffset), vgpr(quotient), \ vgpr(gsuOffset), "gsuOffset=gsuSumIdx*(SizeU/GSU)") self.vgprPool.checkIn(quotient) + kStr += inst("_v_add_co_u32", vgpr(tP["gpr"]["uReg"]), "vcc", \ vgpr(gsuOffset), vgpr(tP["gpr"]["uReg"]), \ "graUnrollAssignment += gsuOffset") @@ -2487,38 +2557,46 @@ def graOtherSummationAssignments(self, kernel): ############################################################################## def graTileOffsets(self, kernel, tP): kStr = "" - numTileOffsets = tP["nrt"] - if tP["rtc"]: - numTileOffsets *= tP["glvw"] - 
tP["vgprTileOffsets"] = self.vgprPool.checkOut(numTileOffsets) - v = tP["vgprTileOffsets"] - strideIdx = tP["lsc"] if tP["tlu"] else tP["lsp"] - stride = kernel[strideIdx] - if tP["rtc"]: - # l=0, s=0 - kStr += inst("v_mov_b32", vgpr(v), \ - vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, 0) ) - # l=0, s>0 - for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \ - vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, s) ) - for l in range(1, tP["nrt"]): - # l>0, s=0 - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \ - vgpr(v+(l-1)*tP["glvw"]), \ - "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], tP["tileChar"], l, 0, strideIdx) ) - # l>0, s>0 - for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \ - 1, vgpr(v+l*tP["glvw"]+(s-1)), \ - "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], l, s) ) + if kernel["UseSgprForGRO"]: + # Let the vgprTileOffsets checkin handle tReg, don't need to do it here + tP["vgprTileOffsets"] = tP["gpr"]["tReg"] else: - kStr += inst("v_mov_b32", vgpr(v), \ - vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u"%(tP["tensorChar"], tP["tileChar"], 0) ) - for l in range(1, tP["nrt"]): - kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \ - vgpr(v+l-1), "gro%s%s_%u += %s"%(tP["tensorChar"], tP["tileChar"], l, strideIdx) ) - self.vgprPool.checkIn(tP["gpr"]["tReg"]) + numTileOffsets = tP["nrt"] + if tP["rtc"]: + numTileOffsets *= tP["glvw"] + tP["vgprTileOffsets"] = self.vgprPool.checkOut(numTileOffsets) + v = tP["vgprTileOffsets"] + strideIdx = tP["lsc"] if tP["tlu"] else tP["lsp"] + stride = kernel[strideIdx] + if tP["rtc"]: + # l=0, s=0 + kStr += inst("v_mov_b32", vgpr(v), \ + vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, 0) ) + # l=0, s>0 + for s in range(1, tP["glvw"]): + kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \ + vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], 
tP["tileChar"], 0, s) ) + for l in range(1, tP["nrt"]): + # l>0, s=0 + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \ + vgpr(v+(l-1)*tP["glvw"]), \ + "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], tP["tileChar"], l, 0, strideIdx) ) + # l>0, s>0 + for s in range(1, tP["glvw"]): + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \ + 1, vgpr(v+l*tP["glvw"]+(s-1)), \ + "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], l, s) ) + else: + kStr += inst("v_mov_b32", vgpr(v), \ + vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u"%(tP["tensorChar"], tP["tileChar"], 0) ) + for l in range(1, tP["nrt"]): + kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \ + vgpr(v+l-1), "gro%s%s_%u += %s"%(tP["tensorChar"], tP["tileChar"], l, strideIdx) ) + + # groOffsetInMacroTile uses same register for both of these, don't free it here: + if tP["gpr"]["lwoT"] != tP["gpr"]["tReg"] : + self.vgprPool.checkIn(tP["gpr"]["tReg"]) + tP["gpr"]["tReg"] = None return kStr @@ -2527,38 +2605,41 @@ def graTileOffsets(self, kernel, tP): ############################################################################## def graUnrollOffsets(self, kernel, tP): kStr = "" - numUnrollOffsets = tP["nru"] - if tP["ruc"]: - numUnrollOffsets *= tP["glvw"] - tP["gpr"]["unrollOffsets"] = self.vgprPool.checkOut(numUnrollOffsets) - v = tP["gpr"]["unrollOffsets"] - strideIdx = (tP["lsp"] if tP["tlu"] else tP["lsc"]) - stride = kernel[strideIdx] - if tP["ruc"]: - # l=0, s=0 - kStr += inst("v_mov_b32", vgpr(v), \ - vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, 0) ) - # l=0, s>0 - for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \ - vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) ) - for l in range(1, tP["nru"]): - # l>0, s=0 - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \ - vgpr(v+(l-1)*tP["glvw"]), \ - "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], self.unrollChar, l, 0, strideIdx) ) - # l>0, 
s>0 - for s in range(1, tP["glvw"]): - kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \ - 1, vgpr(v+l*tP["glvw"]+(s-1)), \ - "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) ) + if kernel["UseSgprForGRO"]: + tP["gpr"]["unrollOffsets"] = tP["gpr"]["uReg"] else: - kStr += inst("v_mov_b32", vgpr(v), \ - vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u"%(tP["tensorChar"], self.unrollChar, 0) ) - for l in range(1, tP["nru"]): - kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \ - vgpr(v+l-1), "gro%s%s_%u + %s"%(tP["tensorChar"], self.unrollChar, l, strideIdx) ) - #self.vgprPool.checkIn(tP["gpr"]["uReg"]) + numUnrollOffsets = tP["nru"] + if tP["ruc"]: + numUnrollOffsets *= tP["glvw"] + tP["gpr"]["unrollOffsets"] = self.vgprPool.checkOut(numUnrollOffsets) + v = tP["gpr"]["unrollOffsets"] + strideIdx = (tP["lsp"] if tP["tlu"] else tP["lsc"]) + stride = kernel[strideIdx] + if tP["ruc"]: + # l=0, s=0 + kStr += inst("v_mov_b32", vgpr(v), \ + vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, 0) ) + # l=0, s>0 + for s in range(1, tP["glvw"]): + kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \ + vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) ) + for l in range(1, tP["nru"]): + # l>0, s=0 + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \ + vgpr(v+(l-1)*tP["glvw"]), \ + "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], self.unrollChar, l, 0, strideIdx) ) + # l>0, s>0 + for s in range(1, tP["glvw"]): + kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \ + 1, vgpr(v+l*tP["glvw"]+(s-1)), \ + "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) ) + else: + kStr += inst("v_mov_b32", vgpr(v), \ + vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u"%(tP["tensorChar"], self.unrollChar, 0) ) + for l in range(1, tP["nru"]): + kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \ + vgpr(v+l-1), "gro%s%s_%u + %s"%(tP["tensorChar"], self.unrollChar, l, strideIdx) ) + 
#self.vgprPool.checkIn(tP["gpr"]["uReg"]) return kStr @@ -2570,43 +2651,54 @@ def graBranch(self, kernel, tP): ############################################################################## # Global Read Addresses: Shift A/B + # See if the load (including vw) will extend past the 'free' dim of the + # tensor. If so clip to the last legal value which is inside the array ############################################################################## def graShift(self, kernel, tP): - if kernel["PreciseBoundsCheck"]: return "" + # PBC doesn't shift pointers, uses a difference edge detect mechanism + # FractionalLoad maps addresses in a different way? + if kernel["PreciseBoundsCheck"] : return "" kStr = "" # edge value margin = tP["glvw"] if tP["rtv"] else 1 edge = self.vgprPool.checkOut(1) - - if kernel["BufferLoad"] and kernel["PreciseBoundsCheck"]: - # Go to the edge. we can rely on preciseboundscheck to keep things inline - # Results in more loads of 0 which is better for power and debug - kStr += inst("v_mov_b32", vgpr(edge), sgpr("SizesFree+%u"%tP["idx"]), \ - "edge = Size%s"%(tP["tileChar"]) ) + if self.groOffsetInMacroTile: + # Subtract the static component from SizesFree: + # TODO - this code is dead since PreciseBoundsCheck returns above + tmpSgpr = self.getTmpSgpr(1) + kStr += inst("s_mul_i32", sgpr(tmpSgpr), sgpr(tP["wg"]), kernel[tP["mt"]], "WorkGroup[01] * MT") + kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr("SizesFree+%u"%tP["idx"]), sgpr(tmpSgpr), \ + "edge = Size%s - WG*MT"%(tP["tileChar"])) + kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr(tmpSgpr), margin, "edge -= margin") + kStr += inst("v_mov_b32", vgpr(edge), sgpr(tmpSgpr), \ + "edge vgpr = Size%s-%u"%(tP["tileChar"], margin) ) else: tmpSgpr = self.getTmpSgpr(1) - kStr += inst("s_add_u32", sgpr(tmpSgpr), hex(-margin), sgpr("SizesFree+%u"%tP["idx"]), \ + kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr("SizesFree+%u"%tP["idx"]), margin, \ "edge = Size%s-%u"%(tP["tileChar"], margin) ) kStr += 
inst("v_mov_b32", vgpr(edge), sgpr(tmpSgpr), \ - "edge = Size%s-%u"%(tP["tileChar"], margin) ) + "edge vgpr = Size%s-%u"%(tP["tileChar"], margin) ) + + if kernel["CheckDimOverflow"]: + # if tensor is really skinnty (SizesFree is less then glvw) then shifting fails- + # can detect here if the computed edge after subtracting marging is <0 + kStr += self.assert_ge_i32(vgpr(edge), 0) + #kStr += self.assert_ne(sgpr("WorkGroup0"),1) # shift offsets v = tP["vgprTileOffsets"] tmpSgpr = self.getTmpSgpr(2) for l in range(0, tP["nrt"]): # compare - #kStr += dump(vgpr(v+l)) kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,2), vgpr(v+l), vgpr(edge), "offset < edge" ) # shift kStr += inst("v_cndmask_b32", vgpr(v+l), vgpr(edge), vgpr(v+l), sgpr(tmpSgpr,2), "offset = (offset < edge) ? offset : edge" ) - #kStr += dump(vgpr(v+l)) self.vgprPool.checkIn(edge) #if tP["isB"]: # kStr += "s_endpgm\n" - return kStr ############################################################################## @@ -2651,7 +2743,7 @@ def graFinalOffsets(self, kernel, tP): if i < kernel["ProblemType"]["NumIndicesC"]: if i == tP["tileIdx"]: kStr += ", %2u" % vgprTile - else: # just a group index + elif not kernel["BufferLoad"]: # just a group index kStr += ", sgprWorkGroup%u"%i else: # summation index if i == kernel["ProblemType"]["IndexUnroll"]: @@ -2693,7 +2785,7 @@ def graFinalOffsets(self, kernel, tP): if tP["tlu"]: tileStride = kernel[tP["lsc"]] * (para*tVW + sPara*tVS) unrollStride = kernel[tP["lsp"]] * (perp*uVW + sPerp*uVS) - kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s"%tc), unrollStride, \ + kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s+0"%tc), unrollStride, \ "compute offset diff (scaled unrollDim)") if tileStride: kStr += inst("s_add_u32", sgpr(scalarGro), sgpr(scalarGro), tileStride, \ @@ -2701,7 +2793,7 @@ def graFinalOffsets(self, kernel, tP): else: tileStride = kernel[tP["lsp"]] * (perp*tVW + sPara*tVS) unrollStride = kernel[tP["lsc"]] * (para*uVW + sPerp*uVS) - kStr += 
inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s"%tc), tileStride, \ + kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s+0"%tc), tileStride, \ "compute offset diff (scaled tileDim)") if unrollStride: kStr += inst("s_add_u32", sgpr(scalarGro), sgpr(scalarGro), unrollStride, \ @@ -2717,15 +2809,14 @@ def graFinalOffsets(self, kernel, tP): if self.checkGRO: # Debug mode to verify that the computed offsets are offset by the expected scalar - print tc, "tileStride=", tileStride, "unrollStride=", unrollStride, \ "Strides%s="%tc kStr += self.assert_vector_diff(vgpr("GlobalReadOffset%s+%u"%(tc,0)), \ vgpr("GlobalReadOffset%s+%u"%(tc,graIdx)), \ sgpr(scalarGro)) - #-- End UseSgprForGRO + #-- End UseSgprForGRO # dump final offsets # BufferLoad flavor: #if tP["isA"]: @@ -2734,9 +2825,18 @@ def graFinalOffsets(self, kernel, tP): #kStr += dump(vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx))) #kStr += dump(vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx))) graIdx += self.rpgo if kernel["BufferLoad"] else self.rpga - self.vgprPool.checkIn(tileOffsets) - self.vgprPool.checkIn(unrollOffsets) + + if not kernel["UseSgprForGRO"]: + self.vgprPool.checkIn(tP["vgprTileOffsets"]) + tP["vgprTileOffsets"] = None + # UseSgprForGRO uses same vgpr for ureg and unrollOffsets so + # let checkin(ureg) do the deallocating + # vgprTileOffsets is renamed version of treg/lwo so deallocate + # it here + self.vgprPool.checkIn(unrollOffsets) self.vgprPool.checkIn(tmp) + #if tP["isB"]: + # kStr += self.bomb(0x100) if kernel["FractionalLoad"] and kernel["fractionalPerpOverhang%s"%tc]: overhang = kernel["fractionalPerpOverhang%s"%tc] @@ -2749,7 +2849,6 @@ def graFinalOffsets(self, kernel, tP): sgpr("PerpOverhangVcc%s"%tc), \ "fractional-overhang: some wi write to harmless LDS location") - return kStr ############################################################################## @@ -2760,8 +2859,155 @@ def graApplyUserOffsets(self, kernel): kStr += self.comment1("moved earlier") 
return kStr + ############################################################################## - # Global Read Addresses: Addresses A/B + # Add the constant offsets to the specified srd. + # Srd is set to point to the base of the tile. All offsets except lowest-order + # 2d dims are computed into the SRD. + # GRO are offset from the tile SRD and the first GRO will be 0 + # Only called for BufferLoad=1 (or eventually BufferStore=1) + ############################################################################## + def computeSrd(self, kernel, tP, tc, indices, bpe): + kStr = "" + + stmp = self.getTmpSgpr(6+2) # bozo, remove +1 + tileStart = stmp+4 + wroteTileStart = False + + #--- + # Compute tileStart #elements from the 2D array start + # Add tile (and unroll if GSU) component into SRD - SRD will point to beginning of the macro-tile: + if self.groOffsetInMacroTile: + wroteTileStart = True + startStride = 1 if kernel["ProblemType"]["UseInitialStrides"] else 0 + + # This is guaranteed to fit in 32-bit since the WG*MT is a number of elements in some unsigned direction: + kStr += self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr(tP["wg"]), kernel[tP["mt"]], "WorkGroup[01] * MT") + if kernel["CheckDimOverflow"] >=2: + kStr += self.assert_eq(sgpr(tileStart+1),0) + if not tP["tlu"]: # transpose case, tile is in perp dim and should be scaled by Stride + kStr += self.s_mul_u64_u32(sgpr(tileStart), sgpr(tileStart+1), sgpr(tileStart+0), \ + sgpr("Strides%s+%u"%(tc,startStride)), "tlu=0, scaled tile-offset by stride") + + if kernel["GlobalSplitU"] > 1: + # Only GlobalSplitUSummationAssignmentRoundRobin supported for groOffsetInMacroTile - would need different math here for start: + assert(kernel["GlobalSplitUSummationAssignmentRoundRobin"]) + + kStr += self.s_mul_u64_u32(sgpr(stmp+0), sgpr(stmp+1), kernel["DepthU"], sgpr("GSUSumIdx"), "gsuOffset = DepthU*bpe*GSUSumIdx") + if kernel["CheckDimOverflow"] >=2: + kStr += self.assert_eq(sgpr(stmp+1),0) + if tP["tlu"]: # 
transpose case, tile is in perp dim and should be scaled by Stride + kStr += self.s_mul_u64_u32(sgpr(stmp), sgpr(stmp+1), sgpr(stmp+0), \ + sgpr("Strides%s+%u"%(tc,startStride)), "tlu=1, scaled unroll-offset by stride") + + kStr += inst("s_add_u32", sgpr(tileStart+0), sgpr(tileStart+0), sgpr(stmp+0), "accum GsuOffet term to tilestart") + kStr += inst("s_addc_u32", sgpr(tileStart+1), sgpr(tileStart+1), sgpr(stmp+1), "accum GsuOffet term to tilestart") + + + # Output : tileStart[0:1] have offset in elements from the 2D start of the tile. + # if groOffsetInMacroTile=1, 2DStart + tileStart gives the the start of the macro-tile; + # This is used to compute the limit. + # Later we modify tileStart to include batch and higher-order dims and add this to SRD. + + #--- + # Compute BUFFER Limit: + if kernel["PreciseBoundsCheck"]: + if not wroteTileStart: + kStr += inst("s_mov_b32", sgpr(tileStart+0), 0, "set default tileStart") + kStr += inst("s_mov_b32", sgpr(tileStart+1), 0, "set default tileStart") + + startStride = 1 if kernel["ProblemType"]["UseInitialStrides"] else 0 + if self.use64bPbcLimit: + limitTmp0 = "SrdShadowLimit%s+0"%tc + limitTmp1 = "SrdShadowLimit%s+1"%tc + else: + limitTmp0 = stmp+0 + limitTmp1 = stmp+1 + + kStr += inst("s_sub_u32", sgpr(limitTmp0), sgpr("Tensor2dSize%s"%tc), sgpr(tileStart+0), "sub tileStart") + kStr += inst("s_subb_u32", sgpr(limitTmp1), sgpr("Tensor2dSize%s+1"%tc), sgpr(tileStart+1), "sub tileStart") + + if self.use64bPbcLimit: + # Set initial buffer limit + # if the limit is >64bit, incrementSrd decrements the shadow as the SRD increments, and when we get within 32-bit we start to step down the SRD + # if the limit is <32bits, set it accurately here: + # Note lshl_b64 the higher-numbered SGPR has the upper 32-bits + kStr += inst("s_lshl_b64", sgpr("SrdShadowLimit%s"%tc,2), sgpr("SrdShadowLimit%s"%tc,2), hex(log2(tP["bpe"])), "Set limit to use bytes") + kStr += inst("s_cmp_eq_u32", sgpr("SrdShadowLimit%s+1"%tc), 0, "are we within 2^32?") 
+ kStr += inst("s_cselect_b32", sgpr("Srd%s+2"%tc), sgpr("SrdShadowLimit%s+0"%tc), "BufferLimit", "Move shadow to real if we are within 2^32") + + else: + # put limit directly into SRD: + kStr += inst("s_lshl_b32", sgpr("Srd%s+2"%tc), sgpr(stmp+0), hex(log2(tP["bpe"])), "Set limit to use bytes") + else: + # PreciseBoundsCheck=0, just pick a large max - later conditionally set some offsets to -1 to force OOB + kStr += inst("s_mov_b32", sgpr("Srd%s+2"%tc), "BufferLimit", "") + kStr += "\n" + + + # Apply any high-order address components to the tileStart and eventually the SRD - these include batch idx for batched gemm, >4D tensors, etc + numDim = len(indices) + for i in range(1, numDim): + idx = indices[i] + if idx == kernel["ProblemType"]["Index0"] \ + or idx == kernel["ProblemType"]["Index1"] \ + or idx == kernel["ProblemType"]["IndexUnroll"]: + continue # these will be captured in GRO not the SRD + else: + if not wroteTileStart: + kStr += self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr("Strides%s+%u"%(tc,i-1)), sgpr("WorkGroup%u"%i), "Stride*WG") + wroteTileStart = True + else: + kStr += self.s_mul_u64_u32(sgpr(stmp+0), sgpr(stmp+1), sgpr("Strides%s+%u"%(tc,i-1)), sgpr("WorkGroup%u"%i), "Stride*WG") + kStr += inst("s_add_u32", sgpr(tileStart+0), sgpr(tileStart+0), sgpr(stmp+0), "accum wg term to tilestart") + kStr += inst("s_addc_u32", sgpr(tileStart+1), sgpr(tileStart+1), sgpr(stmp+1), "accum wg term to tilestart") + + + # Add the tile start to the SRD + if wroteTileStart: + kStr += inst("s_lshl_b64", sgpr(tileStart,2), sgpr(tileStart,2), log2(bpe), "tileStart *= BPE") + kStr += inst("s_add_u32", sgpr("Srd%s+0"%tc), sgpr("Address%s+0"%tc), sgpr(tileStart+0), "SRD_base = Address+ tileStart0") + kStr += inst("s_addc_u32", sgpr("Srd%s+1"%tc), sgpr("Address%s+1"%tc), sgpr(tileStart+1), "SRD_base = Address+ tileStart1"); + else: + kStr += inst("s_mov_b32", sgpr("Srd%s+0"%tc), sgpr("Address%s+0"%tc), "init SRD base address (lower )" ) + kStr += 
inst("s_mov_b32", sgpr("Srd%s+1"%tc), sgpr("Address%s+1"%tc), "init SRD base address (upper) + other fields" ) + + kStr += inst("s_mov_b32", sgpr("Srd%s+3"%tc), "Srd127_96", "Set bits 127_96 in SRD") + + #if tP["isB"]: + # kStr += self.assert_ne(sgpr("WorkGroup2"), 1) + + + if kernel["PreciseBoundsCheck"] and kernel["CheckDimOverflow"]>=2: + # double-check to make sure the SRD limit is inside the allowed tensor: + # (only works in PBC mode since otherwise we set the limit to BufferLimit) + # - compute size of tensor in elements (including all dimensions) + # - subtract the SRD base and SRD buffer limit + # - Make sure the 64bit result is >0 + kStr += inst("s_lshl_b64", sgpr(stmp,2), sgpr("Tensor2dSize%s"%tc,2), log2(bpe), "tensor size in bytes") + kStr += inst("s_add_u32", sgpr(stmp+0), sgpr(stmp+0), sgpr("Address%s+0"%tc), "add start ptr to compute tensor%s bot-right"%tc) + kStr += inst("s_addc_u32", sgpr(stmp+1), sgpr(stmp+1), sgpr("Address%s+1"%tc), "add start ptr to compute tensor%s bot-right"%tc) + kStr += inst("s_sub_u32", sgpr(stmp+0), sgpr(stmp+0), sgpr("Srd%s+0"%tc), "sub SRD base") + kStr += inst("s_subb_u32", sgpr(stmp+1), sgpr(stmp+1), sgpr("Srd%s+1"%tc), "sub SRD base") + if self.use64bPbcLimit: + kStr += inst("s_sub_u32", sgpr(stmp+0), sgpr(stmp+0), sgpr("SrdShadowLimit%s+0"%tc), "sub buffer size") + kStr += inst("s_subb_u32", sgpr(stmp+1), sgpr(stmp+1), sgpr("SrdShadowLimit%s+1"%tc), "sub buffer size") + else: + kStr += inst("s_sub_u32", sgpr(stmp+0), sgpr(stmp+0), sgpr("Srd%s+2"%tc), "sub buffer limit") + + kStr += self.assert_eq(sgpr(stmp+1), 0) # must be 0 or we are way OOB + kStr += self.assert_ge_u32(sgpr(stmp+0), 0) # diff greater than zero + if 0 and tP["isB"]: + t = self.vgprPool.checkOut(1) + kStr += inst("s_add_u32", sgpr(stmp+6), sgpr("WorkGroup1"), sgpr("WorkGroup2"), "bozo, debug") + kStr += inst("v_mov_b32", vgpr(t), 0x54, "") + kStr += self.assert_ne(sgpr(stmp+6), vgpr(t) ) + self.vgprPool.checkIn(t) + + return kStr + 
+############################################################################## +# Global Read Addresses: Addresses A/B ############################################################################## def graAddresses(self, kernel, tP): kStr = "" @@ -2771,31 +3017,10 @@ def graAddresses(self, kernel, tP): if kernel["BufferLoad"]: # maxAddrSgpr = size[n] * stride[n-1] kStr += self.comment1("max read offset = size[n] * stride[n-1]") - dim = len(tP["ia"])-1 # dim - strideIdx = dim-1 # largest stride - sizeIdx = tP["ia"][dim] - sizeIdxIsSum = sizeIdx in kernel["ProblemType"]["IndicesSummation"] - if sizeIdxIsSum: - sizeIdx -= kernel["ProblemType"]["NumIndicesC"] + kStr += self.computeSrd(kernel, tP, tc, kernel["ProblemType"]["IndexAssignments%s"%tc], tP["bpe"]) - # Buffer-load uses one base read pointer stored in the SRD - set it here: - kStr += inst("s_mov_b32", sgpr("Srd%s+0"%tc), sgpr("Address%s+0"%tc), "init SRD base address (lower)" ) - kStr += inst("s_mov_b32", sgpr("Srd%s+1"%tc), sgpr("Address%s+1"%tc), "init SRD base address (upper) + other fields" ) - if kernel["PreciseBoundsCheck"]: - kStr += inst("s_mul_i32", \ - sgpr("Srd%s+2"%tc), \ - sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)), \ - sgpr("Strides%s+%u"%(tc,strideIdx)), \ - "set limit to bottom-right corner of array") - kStr += inst("s_lshl_b32", - sgpr("Srd%s+2"%tc), \ - sgpr("Srd%s+2"%tc), \ - hex(log2(tP["bpe"])), \ - "Size in bytes") #TODO-64B - else: - kStr += inst("s_mov_b32", sgpr("Srd%s+2"%tc), "BufferLimit", "") - kStr += inst("s_mov_b32", sgpr("Srd%s+3"%tc), "Srd127_96", "Set bits 127_96 in SRD") + #kStr += self.bomb(0x13) # after addresses and SRD set else: tmp = self.vgprPool.checkOut(2) kStr += inst("v_mov_b32", vgpr(tmp+0), sgpr("Address%s+0"%tP["tensorChar"]), "" ) @@ -2845,7 +3070,6 @@ def graIncrements(self, kernel, loopIdx, tP): if tP["tlu"]: if self.globalReadIncsUseVgpr: tmpSgpr = self.getTmpSgpr(1) -#jgolds which bpe here? 
assuming tP kStr += inst("s_mul_i32", sgpr(tmpSgpr+0), \ hex(depthU*tP["bpe"]), sgpr("Strides%s"%tP["tensorChar"]), \ "incr = stride*%u*bytes"%depthU ) @@ -2869,7 +3093,6 @@ def graIncrements(self, kernel, loopIdx, tP): sgpr(tmpSgpr+1), \ "" ) else: # not globalReadIncsUseVgpr, ie use SGPR -#jgolds which bpe here? assuming tP kStr += inst("s_mul_i32", sgpr("GlobalReadIncs%s+0"%tP["tensorChar"]), \ hex(depthU*tP["bpe"]), sgpr("Strides%s"%tP["tensorChar"]), \ "incr = stride*%u*bytes"%depthU ) @@ -2886,7 +3109,6 @@ def graIncrements(self, kernel, loopIdx, tP): "(carry)") else: # transposed -#jgolds which bpe here? assuming tP if self.globalReadIncsUseVgpr: kStr += inst("v_mov_b32", vgpr("GlobalReadIncs%s+0"%tP["tensorChar"]), \ hex(depthU*tP["bpe"]), \ @@ -2905,6 +3127,8 @@ def graIncrements(self, kernel, loopIdx, tP): #kStr += dump(vgpr("GlobalReadIncs%s"%tP["tensorChar"])) #kStr += "s_endpgm\n" + #if tP["isB"]: + # kStr += self.bomb(0x100) return kStr ############################################################################## @@ -2961,7 +3185,7 @@ def lwaFirstOffset(self, kernel, tP): vgpr(uReg), \ ~(kernel["LocalDotLayout"]-1), \ vgpr(uReg), \ - "uReg & LDL") + "uReg & ~LDL") kStr += inst("v_mul_u32_u24", \ vgpr(uReg), \ hex(kernel["MacroTile%s"%tP["tensorChar"]] + kernel["LdsPad%s"%tc]), \ @@ -2997,6 +3221,7 @@ def lwaFirstOffset(self, kernel, tP): "lwFOB = lwB%s + lwB%s*MT%s + LDS_OFFSET_B=%u*%u" % (tP["tileChar"], \ self.unrollChar, tP["tileChar"], kernel["LdsOffsetB"], self.bpeAB) ) self.vgprPool.checkIn(tP["gpr"]["lwoT"]) + tP["gpr"]["lwoT"] = None self.vgprPool.checkIn(tP["gpr"]["uReg"]) if kernel["GlobalSplitU"] > 1: self.vgprPool.checkIn(tP["gpr"]["uReg2"]) @@ -3142,7 +3367,6 @@ def lraFinalOffset(self, kernel, tP): #if tP["isA"]: # kStr += self.bomb(113) - # dump lra final offset #if tP["isA"]: # kStr += dump(vgpr("LocalReadAddr%s"%tP["tensorChar"])) @@ -3162,7 +3386,6 @@ def lraDeclareAddresses(self, kernel, tP): if tP["isA"]: return 
self.comment1("N/A") else: -#jgolds which bpe here? Looks like tP, which is B return inst("_v_add_co_u32", \ vgpr("LocalReadAddr%s+0"%tP["tensorChar"]), \ "vcc", \ @@ -3459,13 +3682,26 @@ def incrementSrd(self, kernel, tP, incLower, incUpper): # also have to move the boundary since we change the base # so less buffers to the edge: - # TODO-64 if kernel["PreciseBoundsCheck"]: - kStr += inst("s_sub_u32 ", \ - sgpr("Srd%s+2"%(tc)), \ - sgpr("Srd%s+2"%(tc)), \ - incLower, \ - "limit -= inc)" ) + if self.use64bPbcLimit: + kStr += inst("s_sub_u32", \ + sgpr("SrdShadowLimit%s+0"%tc), \ + sgpr("SrdShadowLimit%s+0"%tc), \ + incLower, \ + "limit -= inc)") + kStr += inst("s_subb_u32", \ + sgpr("SrdShadowLimit%s+1"%tc), \ + sgpr("SrdShadowLimit%s+1"%tc), \ + incUpper, \ + "limit -= inc)" ) + kStr += inst("s_cmp_eq_u32", sgpr("SrdShadowLimit%s+1"%tc), 0, "are we within 2^32?") + kStr += inst("s_cmov_b32", sgpr("Srd%s+2"%tc), sgpr("SrdShadowLimit%s+0"%tc), "Move shadow to real if we are within 2^32") + else: + kStr += inst("s_sub_u32", \ + sgpr("Srd%s+2"%(tc)), \ + sgpr("Srd%s+2"%(tc)), \ + incLower, \ + "limit -= inc)" ) return kStr @@ -3569,11 +3805,10 @@ def globalReadDo(self, kernel, guardK, tP): ######################################## # Calculate Max Addr ######################################## - maxAddrSgpr = self.getTmpSgpr(2) # 3+6 = 9 sgprs available - tmpSgpr = maxAddrSgpr + 2 # 7 sgprs available + maxAddrSgpr = self.getTmpSgpr(4) + tmpSgpr = maxAddrSgpr + 2 #dumpVgpr = self.vgprPool.checkOut(1) - # TODO-64B: # Assumes the product of the two sizes is <4GB here. # We would need to slide the SRD if this is not the case. 
kStr += self.comment1("max read address = size[n] * stride[n-1]") @@ -3582,8 +3817,45 @@ def globalReadDo(self, kernel, guardK, tP): sizeIdx = tP["ia"][dim] sizeIdxIsSum = sizeIdx in kernel["ProblemType"]["IndicesSummation"] if sizeIdxIsSum: - sizeIdx -= kernel["ProblemType"]["NumIndicesC"] - if kernel["BufferLoad"] and not kernel["PreciseBoundsCheck"]: + sizeIdx -= kernel["ProblemType"]["NumIndicesC"] + + if not kernel["PreciseBoundsCheck"]: + # PBC moves the limit as SRD moves forward so don't need to reset boundary + # Else find the edge of the matrix and compute bounds + + if 1: + kStr += self.s_mul_u64_u32(sgpr(maxAddrSgpr+0), sgpr(maxAddrSgpr+1), \ + sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)), \ + sgpr("Strides%s+%u"%(tP["tensorChar"],strideIdx)), \ + "64b tensor%s size in elements"%tc) + kStr += inst("s_lshl_b64", \ + sgpr(maxAddrSgpr,2), \ + sgpr(maxAddrSgpr,2), \ + hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc) + else: + if kernel["ProblemType"]["NumIndicesC"] == 2: + kStr += inst("s_lshl_b64", \ + sgpr(maxAddrSgpr,2), \ + sgpr("Tensor2dSize%s"%tc,2), \ + hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc) + elif kernel["ProblemType"]["NumIndicesC"] == 3: + # TODO - hardcored for two batches, remove when PBC code goes + kStr += self.s_mul_u64_u32(sgpr(maxAddrSgpr+0), sgpr(maxAddrSgpr+1), \ + sgpr("Tensor2dSize%s")%tc, \ + sgpr("SizesFree+2"), "scale Tensor2D by numBatches") + kStr += inst("s_lshl_b64", \ + sgpr(maxAddrSgpr,2), \ + sgpr(maxAddrSgpr,2), \ + hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc) + else: + assert(0) # unsupported number of Free dims, should use PBC=1 instead + kStr += inst("s_lshl_b64", \ + sgpr(maxAddrSgpr,2), \ + sgpr(maxAddrSgpr,2), \ + hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc) + + + if kernel["BufferLoad"]: # Set maxAddrSgpr to max allowed byte offset # maxAddrSgpr = size[n] * stride[n-1] * bpe # SRD has moved ahead for each tile so subtract original A to see if we are OOB: @@ 
-3594,17 +3866,11 @@ def globalReadDo(self, kernel, guardK, tP): sgpr("Address%s+0"%tc), \ "Compute distance of SRD from original array in bytes") - kStr += inst("s_mul_i32", \ - sgpr(maxAddrSgpr+0), \ - sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)), \ - sgpr("Strides%s+%u"%(tP["tensorChar"],strideIdx)), \ - "Array size") - - kStr += inst("s_lshl_b32", - sgpr(maxAddrSgpr+0), \ - sgpr(maxAddrSgpr+0), \ - hex(log2(tP["bpe"])), \ - "Array size in bytes") + kStr += inst("s_subb_u32", \ + sgpr(tmpSgpr+1), \ + sgpr("Srd%s++1"%tc), \ + sgpr("Address%s+1"%tc), \ + "Compute distance of SRD from original array in bytes") kStr += inst("s_sub_u32", \ sgpr(maxAddrSgpr), \ @@ -3612,56 +3878,53 @@ def globalReadDo(self, kernel, guardK, tP): sgpr(tmpSgpr), \ "Max byte offset = MaxSize - SRD_Distance") - if not kernel["BufferLoad"]: - kStr += inst("s_mul_i32", \ - sgpr(maxAddrSgpr+0), \ - sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)), \ - sgpr("Strides%s+%u"%(tP["tensorChar"],strideIdx)), \ - "mul d%u lower"%dim) + kStr += inst("s_subb_u32", \ + sgpr(maxAddrSgpr+1), \ + sgpr(maxAddrSgpr+1), \ + sgpr(tmpSgpr+1), \ + "Max byte offset = MaxSize - SRD_Distance") - kStr += inst("s_mov_b32", sgpr(maxAddrSgpr+1), hex(0), "zero (upper)") - # maxAddrSgpr *= bytes/element + if kernel["CheckDimOverflow"]>=2: + kStr += self.assert_eq(sgpr(maxAddrSgpr+1), 0) - kStr += inst("s_lshl_b64", \ - sgpr(maxAddrSgpr,2), \ - sgpr(maxAddrSgpr,2), \ - hex(log2(tP["bpe"])), "offset *= bytes/element") - # maxAddrSgpr += initial address - kStr += inst("s_add_u32", \ - sgpr(maxAddrSgpr+0), \ - sgpr(self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"]), \ - sgpr(maxAddrSgpr+0), \ - "prepend address lower") - kStr += inst("s_addc_u32", \ - sgpr(maxAddrSgpr+1), \ - sgpr((self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"])+1), \ - sgpr(maxAddrSgpr+1), \ - "prepend address upper") - # sgpr->vgpr - maxAddrVgpr = self.vgprPool.checkOut(2, "maxAddrVgpr") - 
kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+0), sgpr(maxAddrSgpr+0), "sgpr->vgpr") - kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+1), sgpr(maxAddrSgpr+1), "sgpr->vgpr") - - # full exec mask - fullExec = tmpSgpr - kStr += inst("s_mov_b64", sgpr(fullExec,2), \ - "0xFFFFFFFFFFFFFFFF", "to restore all threads active") - bpeVgpr = self.vgprPool.checkOut(1, "bpeVgpr") - kStr += inst("v_mov_b32", vgpr(bpeVgpr), hex(tP["bpe"]), "bpe") - - # can remove this? - zeroVgpr = self.vgprPool.checkOut(1) - kStr += inst("v_mov_b32", vgpr(zeroVgpr), hex(0), "zero") + else: # not BufferLoad + kStr += inst("s_add_u32", \ + sgpr(maxAddrSgpr+0), \ + sgpr(self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"]), \ + sgpr(maxAddrSgpr+0), \ + "prepend address lower") + kStr += inst("s_addc_u32", \ + sgpr(maxAddrSgpr+1), \ + sgpr((self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"])+1), \ + sgpr(maxAddrSgpr+1), \ + "prepend address upper") + # sgpr->vgpr + maxAddrVgpr = self.vgprPool.checkOut(2, "maxAddrVgpr") + kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+0), sgpr(maxAddrSgpr+0), "sgpr->vgpr") + kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+1), sgpr(maxAddrSgpr+1), "sgpr->vgpr") + + # full exec mask + fullExec = tmpSgpr + kStr += inst("s_mov_b64", sgpr(fullExec,2), \ + "0xFFFFFFFFFFFFFFFF", "to restore all threads active") + bpeVgpr = self.vgprPool.checkOut(1, "bpeVgpr") + kStr += inst("v_mov_b32", vgpr(bpeVgpr), hex(tP["bpe"]), "bpe") + + # can remove this? 
+ zeroVgpr = self.vgprPool.checkOut(1) + kStr += inst("v_mov_b32", vgpr(zeroVgpr), hex(0), "zero") # End if guardK directToLdsLoads = 0 + loopCnt = -1 for perp in range(0, tP["nrp"]): for sPerp in range(0, tP["nrpv"]): for para in range(0, tP["nrc"]): for sPara in range(0, tP["nrcv"]/tP["nrcvpi"]): i = sPara + (tP["nrcv"]/tP["nrcvpi"]) * (para + tP["nrc"] * (sPerp + tP["nrpv"] * perp)) + loopCnt += 1 graIdx = i * self.rpgo if kernel["BufferLoad"] else i * self.rpga g2lIdx = i * loadWidth if guardK: @@ -3837,7 +4100,8 @@ def globalReadDo(self, kernel, guardK, tP): # Get offset (for checking, see comment below) and comment: (checkOffset, iDummy, comment) = \ - self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP) + self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP, 0) + # Direct to LDS always writes consecutive LDS locations at m0 + 4 * TidInWave # Therefore we double-check here to ensure the desired LDS write offset # is moving at NumThreads*4. This should already be guaranteed since @@ -3922,7 +4186,6 @@ def localWriteSwapOffsets(self, kernel, tP): if not self.do["LocalWrite"]: return "" kStr = "" tc = tP["tensorChar"] -#jgolds which bpe here? assuming tP #fixme-iui need to use wrapping increment for double or triple buffering: if kernel["LocalWriteUseSgpr%s"%tc]: kStr += inst("s_xor_b32", \ @@ -3947,7 +4210,6 @@ def localWriteResetOffsets(self, kernel, tP): kStr = "" resetMask = hex(kernel["LdsOffsetA_Blk"]*tP["bpe"]-1 | self.LdsOOB) tc = tP["tensorChar"] -#jgolds which bpe here? assuming tP if kernel["LocalWriteUseSgpr%s"%tc]: kStr += inst("s_and_b32", \ sgpr("LocalWriteAddr%s"%tP["tensorChar"]), \ @@ -3997,8 +4259,9 @@ def localWriteInitPointers(self, kernel, tP): # i : ? 
# comment : Comment with the text version of the formula ############################################################################# - def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP): + def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP, localWriteCnt): tc = tP["tensorChar"] + ldl = kernel["LocalDotLayout"] lscaOffset = para * kernel[tP["lsc"]] lspaOffset = perp * kernel[tP["lsp"]] @@ -4015,7 +4278,7 @@ def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP): i = sPara + (tP["nrcv"]/tP["nrcvpi"]) * (para * tP["glvw"] + tP["nrc"] * (sPerp + tP["glvw"] * tP["nrpv"] * perp )) - if kernel["LocalDotLayout"] > 1: + if ldl > 1: # apply interleave for LocalDot: # Else they complement the address calculation to place adjacent-in-u data # so adjacent-in-lds. @@ -4023,7 +4286,7 @@ def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP): "wtc=", tP["wtc"], "wuc=", tP["wuc"], "grcv=", tP["grcv"], \ "lscaOffset=", lscaOffset, "lspaOffset=", lspaOffset spacing = tP["glvw"] - lscaOffset += (lspaOffset % spacing) * kernel["LocalDotLayout"] + lscaOffset += (lspaOffset % spacing) * ldl lspaOffset /= spacing print " After LDL: lscaOffset=", lscaOffset, "lspaOffset=", lspaOffset @@ -4048,7 +4311,16 @@ def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP): #print "2lscaOffset", lscaOffset offsetElements = (lspaOffset + lscaOffset) #print "offsetElements", offsetElements - offsetBytes = offsetElements*tP["bpe"] + if not tP["tlu"] and ldl > 1: +#jgolds HACK +#Need to clean this up. 
Does not follow usual paradigm, but works for cases we care about with dot2 + rem = (localWriteCnt) % ldl + quo = (localWriteCnt) / ldl + #print "quo %u, rem %u, MT %u"%(quo, rem, kernel["MacroTile%u"%tP["tensorIdx"]]) + offsetBytes = (quo * kernel["MacroTile%u"%tP["tensorIdx"]] * ldl + rem)*tP["bpe"] + else: + offsetBytes = offsetElements*tP["bpe"] + #print "offsetBytes", offsetBytes #print "offset", offset @@ -4123,7 +4395,8 @@ def localWriteDo(self, kernel, tP): sgpr("PerpOverhangVcc%s"%tc,2), \ "Mask load so out-of-gr-tile bounds returns 0. Note 1.0f=0x3f80000 which is large non-neg int") lwa = tmpLocalWriteAddr - +#jgolds HACK + loopCnt = 0 for para in range(0, tP["nrc"]): for s in range(0, max(tP["nwcv"],tP["nwpv"])/tP["nwcvpi"]): @@ -4140,8 +4413,9 @@ def localWriteDo(self, kernel, tP): elif tP["wuc"] == tP["grcv"]: sPerp = s - (offset, i, comment) = self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP) + (offset, i, comment) = self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP, loopCnt) g2lIdx = i*blockWidth + loopCnt+=1 paramList = [] @@ -4176,7 +4450,7 @@ def localWriteDo(self, kernel, tP): if 0: kStr += inst("s_barrier", "temp debug wait to check sync issue" ) - if 0 and tP["isA"]: + if 0 and tP["isB"]: #if 0 and self.localWriteDoCnt >= 0: kStr += "s_waitcnt lgkmcnt(0) & vmcnt(0)\n" kStr += inst("s_barrier", "dump LDS" ) @@ -4191,7 +4465,6 @@ def localReadSwapOffsets(self, kernel, tP): tc=tP["tensorChar"] if not self.do["LocalRead%s"%tc]: return "" kStr = "" -#jgolds which bpe here? assuming tP kStr += inst("v_xor_b32", \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ hex(kernel["LdsOffsetA_Blk"]*tP["bpe"]), \ @@ -4211,7 +4484,6 @@ def localReadResetOffsets(self, kernel, tP): tP["localReadOffset"] = 0 tP["localReadElementOffset"] = 0 kStr += self.comment1("handled internally") -#jgolds which bpe here? 
assuming tP kStr += inst("v_and_b32", \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ hex(kernel["LdsOffsetA_Blk"]*tP["bpe"]-1), \ @@ -4231,7 +4503,6 @@ def localReadInitPointers(self, kernel, tP): tP["localReadElementOffset"] = 0 kStr += self.comment1("N/A") else: -#jgolds which bpe here? assuming tP kStr += inst("v_and_b32", \ vgpr("LocalReadAddr%s"%tP["tensorChar"]), \ hex(kernel["LdsOffset%s_Blk"%tP["tensorChar"]]*tP["bpe"]-1), \ @@ -4247,9 +4518,17 @@ def localReadInc(self, kernel, iui, tP): if not self.do["LocalRead%s"%tc]: return "" kStr = "" tc = tP["tensorChar"] + ldl = kernel["LocalDotLayout"] + tt = tP["tt"] + partialInc = kernel[tt] if self.inTailLoop: -#jgolds which bpe here? assuming tP - inc = kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]]+kernel["LdsPad%s"%tc])*tP["bpe"] + if ldl > 1: + if iui < (kernel["InnerUnroll"] - 1): + inc = partialInc*tP["bpe"] + else: + inc = (ldl * kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]] + kernel["LdsPad%s"%tc]) - partialInc * (ldl - 1))*tP["bpe"] + else: + inc = kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]]+kernel["LdsPad%s"%tc])*tP["bpe"] tmpSgpr = self.getTmpSgpr(1) kStr += inst("s_mov_b32", sgpr(tmpSgpr), hex(inc), "inc") kStr += inst("_v_add_co_u32", \ @@ -4260,11 +4539,7 @@ def localReadInc(self, kernel, iui, tP): "lr%s += %u (LSU*(MT+PAD)*bpe)"%(tP["tensorChar"], inc) ) else: if tP["localReadInstruction"].numOffsets == 1: - ldl = kernel["LocalDotLayout"] if ldl > 1: - #jgolds - #HACK just hard coding to verify it works for the case I am testing - partialInc = 8 # in elements if iui < (kernel["InnerUnroll"] - 1): tP["localReadOffset"] += partialInc else: @@ -4446,7 +4721,7 @@ def shiftVectorComponents(self, kernel, tP): kStr += inst("_v_add_co_u32", vgpr(vReg), "vcc", vgpr(mvReg), vgpr(vReg), "vId = 2 components") self.vgprPool.checkIn(mvReg) self.vgprPool.checkIn(vRegD) - + kStr += inst("v_cmp_eq_u32", sgpr(tmpSgpr,2), vgpr(thread), \ vgpr(eReg), "mask" ) kStr += 
inst("v_mov_b32", vgpr(tmpVgpr+0), sgpr(tmpSgpr+0), "") @@ -4485,7 +4760,7 @@ def shiftVectorComponents(self, kernel, tP): for vectorIdx in range(0, numVectors): kStr += self.comment("shift d%u r=%u v=%u"%(tP["idx"], r, vectorIdx)) kStr += "label_%04u:%s" % (sviLabels[r-1][vectorIdx], self.endLine) - # mask if last thread in thread-tile column + # mask if last thread in thread#-tile column kStr += inst("v_cmpx_eq_u32", sgpr(tmpSgpr,2), vgpr(thread), \ vgpr(eReg), "serial % SG == (wgMT/VECTOR_WIDTH)%SG" ) tto = kernel["ThreadTile%u"%((tP["idx"]+1)%2)] # thread tile orthogonal @@ -4754,7 +5029,6 @@ def localSplitULocalRead(self, kernel): kStr = "" tmpSgpr = self.getTmpSgpr(1) baseAddr = self.vgprPool.checkOut(1) -#jgolds which bpe should we use? kStr += staticMultiply(vgpr(baseAddr), vgpr("Serial"), kernel["GlobalWriteVectorWidth"]*self.bpeAB, sgpr(tmpSgpr)) (elementStep, useDwordX2) = self.getLocalSplitUElementStep(kernel, True) # Load values for each subgroup @@ -5324,7 +5598,6 @@ def globalWriteElements(self, kernel, lsu, vectorWidths, elements): # Use bpeCexternal for all external values numVgprsPerAddr = self.rpgo if kernel["BufferStore"] else self.rpga -#jgolds which bpe should we use? numVgprsPerDataPerVI = 0 @@ -5432,9 +5705,7 @@ def globalWriteElements(self, kernel, lsu, vectorWidths, elements): elementStartIdx = batchIdx * numElementsPerBatch elementStopIdx = min( elementStartIdx + numElementsPerBatch, len(elements[edgeI]) ) elementsThisBatch = elements[edgeI][elementStartIdx:elementStopIdx] - numElementsThisBatch = len(elementsThisBatch) - numElementVgprs = int(numElementsThisBatch * ceil(numVgprsPerElement)) - #print "BATCH[%u/%u]: elements[edgeI][%u:%u] VGPRs=%u" % (batchIdx, numBatches, elementStartIdx, elementStopIdx, numElementVgprs) + #print "BATCH[%u/%u]: elements[edgeI][%u:%u] VGPRs=%u" % (batchIdx, numBatches, elementStartIdx, elementStopIdx ) # elementVgprs can be large and should be perfectly tuned to the number of available # VGPRS. 
We do not want to accidentally overflow and grow the pool here: kStr += self.globalWriteBatch(kernel, beta, edge, lsu, atomic, gwvw, atomicW, \ @@ -5735,7 +6006,7 @@ def globalWriteBatch(self, kernel, beta, edge, lsu, atomic, gwvw, atomicW, \ #kStr += inst("v_mov_b32", vgpr(addr), 0x0, "bozo") if edge: # Set address to -1 if OOB on either dimension - # TODO - for PreciseBoundsCheck we could set bounds on C to tile dim + # TODO - for PreciseBoundsCheckStore we could set bounds on C to tile dim # and only check the x/coord0 index here, save a couple inst kStr += inst("v_cmp_lt_u32", sgpr(tmpS01,2), vgpr( coordVgpr0), sgpr("SizesFree+0"), "coord0 < size0" ) kStr += inst("v_cmp_lt_u32", sgpr(tmpS23,2), vgpr(self.coordVgpr1), sgpr("SizesFree+1"), "coord1 < size1" ) @@ -6113,7 +6384,7 @@ def globalWriteBatch(self, kernel, beta, edge, lsu, atomic, gwvw, atomicW, \ # src2 = sumIdxV = f32 = opsel 00 dataCExternal = elementData[elementIdx] + vi/2 hi16 = sumIdxV%2 - kStr += inst("v_mad_mix_f32", vgpr("ValuC+%u"%sumIdxV), sgpr("Beta"), \ + kStr += inst(self.mixinst, vgpr("ValuC+%u"%sumIdxV), sgpr("Beta"), \ vgpr(dataCExternal), vgpr("ValuC+%u"%sumIdxV), \ "op_sel:[0,%u,0] op_sel_hi:[0,1,0]" % (hi16), \ "//C*=beta") @@ -6327,7 +6598,6 @@ def dumpLds(self, kernel, startU, numU): kStr += inst("s_barrier", "dump LDS" ) tmp = self.vgprPool.checkOut(1) tmpAddr = self.vgprPool.checkOut(1) -#jgolds which bpe should we use? 
kStr += inst("v_lshlrev_b32", \ vgpr(tmpAddr), \ hex(log2(self.bpeAB)), \ @@ -6392,8 +6662,37 @@ def kernelBodyBetaOnly(self, kernel): return kStr + # Perform 32-bit scalar mul and save u64 result in two SGPR + # src0 and src1 are 32-bit unsigned ints in scalar sgpr or small int constants (<64?)) + # return retuns in dst0:dest (lower 32-bit in dst0, high 64-bit in dst1)) + def s_mul_u64_u32 (self, dst0, dst1, src0, src1, comment): + kStr = "" + assert(dst1 != src0) # no worky since dst1 overwritten by first mul operations + assert(dst1 != src1) # no worky since dst1 overwritten by first mul operations + # the else path below has less restrictions but prefer consistency + if globalParameters["AsmCaps"][self.version]["HasSMulHi"]: + kStr += inst("s_mul_hi_u32", dst1, src0, src1, comment) + kStr += inst("s_mul_i32", dst0, src0, src1, comment) + else: + if type(src1) != 'str' or not src1.startswith("s"): + # Swap operands, need a scalar sgpr in src1 (not a constant) + t = src0 + src0 = src1 + src1 = t + vtmp0 = self.vgprPool.checkOut(2) + vtmp1 = vtmp0+1 + kStr += inst("v_mov_b32", vgpr(vtmp0), src0, comment) + kStr += inst("v_mul_hi_u32", vgpr(vtmp1), vgpr(vtmp0), src1, comment) + kStr += inst("v_readfirstlane_b32", dst1, vgpr(vtmp1), comment) + kStr += inst("v_mul_lo_u32", vgpr(vtmp1), vgpr(vtmp0), src1, comment) + kStr += inst("v_readfirstlane_b32", dst0, vgpr(vtmp1), comment) + self.vgprPool.checkIn(vtmp0) + return kStr + + + ############################################################################## - # Cause a GPUVM fault. + # Cause a GPUVM fault. # Instruction after the bomb will write the cookie to SGPR0, so you can see the cookie in the # backtrace. 
Useful for locating which spot in code generated the bomb # vgprAddr controls which vgpr to overwrite with the null pointer address @@ -6437,12 +6736,13 @@ def assertCommon(self, cookie=-1): ############################################################################## # assertCmpCommon : Common routine for all assert comparison functions ############################################################################## - def assertCmpCommon(self, c, val0, val1, cookie=-1): + def assertCmpCommon(self, cond, val0, val1, cookie=-1): kStr = "" if self.db["EnableAsserts"]: kStr += inst("s_or_saveexec_b64", sgpr("SaveExecMask",2), 0, \ "assert: saved execmask") - kStr += inst("v_cmpx_%s_u32"%c, "vcc", val0, val1, "v_cmp" ) + + kStr += inst("v_cmpx_%s"%cond, "vcc", val0, val1, "v_cmp" ) kStr += self.assertCommon(cookie) @@ -6457,22 +6757,48 @@ def assertCmpCommon(self, c, val0, val1, cookie=-1): # Asserts currently modify vcc ############################################################################## def assert_eq(self, val0, val1, cookie=-1): - return self.assertCmpCommon("ne", val0, val1, cookie) + return self.assertCmpCommon("ne_u32", val0, val1, cookie) def assert_ne(self, val0, val1, cookie=-1): - return self.assertCmpCommon("eq", val0, val1, cookie) + return self.assertCmpCommon("eq_u32", val0, val1, cookie) - def assert_lt(self, val0, val1, cookie=-1): - return self.assertCmpCommon("ge", val0, val1, cookie) + def assert_lt_u32(self, val0, val1, cookie=-1): + return self.assertCmpCommon("ge_u32", val0, val1, cookie) - def assert_gt(self, val0, val1, cookie=-1): - return self.assertCmpCommon("le", val0, val1, cookie) + def assert_gt_u32(self, val0, val1, cookie=-1): + return self.assertCmpCommon("le_u32", val0, val1, cookie) + + def assert_le_u32(self, val0, val1, cookie=-1): + return self.assertCmpCommon("gt_u32", val0, val1, cookie) + + def assert_ge_u32(self, val0, val1, cookie=-1): + return self.assertCmpCommon("lt_u32", val0, val1, cookie) + + def 
assert_ge_i32(self, val0, val1, cookie=-1): + return self.assertCmpCommon("lt_i32", val0, val1, cookie) + + # can left shift w/o losing non-zero bits: + def assert_no_shift_of(self, val0, shift, stmp, cookie=-1): + kStr = "" + # TODO - use BFE here: + kStr += inst ("s_mov_b32", stmp, hex((shift-1) << (32-log2(shift))), "assert_no_shift_of - compute mask") + kStr += inst ("s_and_b32", stmp, stmp, val0, "assert_no_shift_of") + kStr += self.assert_eq(stmp, 0, cookie) + return kStr + + + def bomb_at_wg3d(self, wg0, wg1, wg2, cookie=-1): + kStr = "" + tmp0 = sgpr("SaveExecMask") + tmp1 = sgpr("SaveExecMask"+1) + kStr += inst("s_cmp_u32", tmp0, sgpr("WorkGroup0"), wg0) + kStr += inst("s_cmp_u32", tmp1, sgpr("WorkGroup1"), wg1) + kStr += inst("s_or_b32", tmp0, tmp0, tmp1, "") + kStr += inst("s_cmp_u32", tmp1, sgpr("WorkGroup2"), wg2) + kStr += inst("s_or_b32", tmp0, tmp0, tmp1, "") + kStr += "WIP" - def assert_le(self, val0, val1, cookie=-1): - return self.assertCmpCommon("gt", val0, val1, cookie) - def assert_ge(self, val0, val1, cookie=-1): - return self.assertCmpCommon("lt", val0, val1, cookie) # asserts if val0 is not an integer multiple of multiple2 # multiple2 must be a constant and power of 2 diff --git a/Tensile/KernelWriterSource.py b/Tensile/KernelWriterSource.py index 9b8e7c081c..e0b836bd3f 100644 --- a/Tensile/KernelWriterSource.py +++ b/Tensile/KernelWriterSource.py @@ -367,7 +367,10 @@ def functionPrefix(self, kernel): if self.language == "OCL": kStr += "#define MAC(A,B,DST) mad(A,B,DST)" else: - kStr += "#define MAC(A,B,DST) DST += A*B" + if kernel["ProblemType"]["HighPrecisionAccumulate"] and kernel["ProblemType"]["DataType"].isHalf(): + kStr += "#define MAC(A,B,DST) DST += static_cast(A) * static_cast(B)" + else: + kStr += "#define MAC(A,B,DST) DST += A*B" kStr += self.endLine if self.language == "HIP" and kernel["ProblemType"]["DataType"].isComplex(): diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py index faf48df197..eaac031802 100644 
--- a/Tensile/SolutionStructs.py +++ b/Tensile/SolutionStructs.py @@ -1396,6 +1396,15 @@ def assignDerivedParameters(state): reject(state, "InnerUnroll only supported on assembly") state["LoopUnroll"] /= state["InnerUnroll"] + # HACK! + # For now, LocalDotLayout > 1 only works if the thread tile is a square and VectorWidth is equal to the + # thread tile size + ldl = state["LocalDotLayout"] + if ldl > 1 and \ + (state["ThreadTile0"] != state["VectorWidth"] or state["ThreadTile1"] != state["VectorWidth"] or state["AssertSummationElementMultiple"] % ldl != 0): + reject(state, "LocalDotLayout > 1 only supports square thread tiles and VectorWidth equal to ThreadTile0/1 size and ASEM a multiple of LDL") + return + if 0: print "info: ", pvar(state, "LoopUnroll"), " LDS Stats:", pvar(state, "LdsOffsetA"), pvar(state, "LdsOffsetB") print "info: ", pvar(state["ProblemType"], "TLUA"), \ @@ -1475,14 +1484,26 @@ def assignDerivedParameters(state): # check is used since this is faster and also for computation we only # need to ensure that none of the loads fault. threads which are # computing bogus sections of the C tile will later be ignored. - # precise checking only works for vectorloads<=AssertSummationElementMultiple - # else if the vload crosses boundary we ignore all components not just the - # ones that are OOB. + # precise checking only works when all elements of the load are in-bounds + # since if the vload crosses boundary we ignore all components not just the + # ones that are OOB. 
See comments for groOffsetInMacroTile + # So check for the cases where the unroll loop can + # generate partial loads here and reject PBC solutions: + # For non-TLU the free dim is in perp dim so loads can't be partially OOB + # so those always guaranteeeNoPartial*=True + if state["ProblemType"]["TLUA"]: + guaranteeeNoPartialA = state["AssertFree0ElementMultiple"]%state["GlobalLoadVectorWidthA"]==0 + else: + guaranteeeNoPartialA = state["AssertSummationElementMultiple"]%state["GlobalLoadVectorWidthA"]==0 + + if state["ProblemType"]["TLUB"]: + guaranteeNoPartialB = state["AssertFree1ElementMultiple"]%state["GlobalLoadVectorWidthB"]==0 + else: + guaranteeNoPartialB = state["AssertSummationElementMultiple"]%state["GlobalLoadVectorWidthB"]==0 + + #-- if state["PreciseBoundsCheck"]: - if state["GlobalLoadVectorWidthA"] > \ - state["AssertSummationElementMultiple"] \ - or state["GlobalLoadVectorWidthB"] > \ - state["AssertSummationElementMultiple"]: + if not guaranteeeNoPartialA or not guaranteeNoPartialB: state["PreciseBoundsCheck"] = False # Use SGPR to store an offset from GlobalReadOffsetA+0. @@ -1491,13 +1512,15 @@ def assignDerivedParameters(state): # individual vector registers doing bounds compares. 
if not state["PreciseBoundsCheck"]: state["UseSgprForGRO"] = 0 + if state["FractionalLoad"]: + reject(state, "Fractional currently requires PreciseBoundsCheck") # Move to PBC always if state["UseSgprForGRO"] == -1: - # Don't use SGPR if it looks like we might not have enough: + # Don't use SGPR if it looks like we might not have enough - better to leave PBC enabled even if we have to use VGPR # 40 is based on current SGPR usage, this may need to be tuned in the future: numLoadsA = state["NumLoadsCoalescedA"]*state["NumLoadsPerpendicularA"] numLoadsB = state["NumLoadsCoalescedB"]*state["NumLoadsPerpendicularB"] - if numLoadsA + numLoadsB > 40: + if numLoadsA + numLoadsB > 35: #print "info: Disabling UseSgprForGRO since predicting too many SGPR will be used" state["UseSgprForGRO"] = 0 else: diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py index 3565e5c666..8cf32e253e 100644 --- a/Tensile/SolutionWriter.py +++ b/Tensile/SolutionWriter.py @@ -85,7 +85,6 @@ def getSourceString(self, solution, kernelsWithBuildErrs): if not globalParameters["MergeFiles"]: solutionName = self.getSolutionName(solution) s += "#include \"%s.h\"\n" % solutionName - #s += "#include \"MathTemplates.h\"\n" s += "\n" # solution function signature @@ -107,6 +106,12 @@ def getSourceString(self, solution, kernelsWithBuildErrs): t += " " if globalParameters["DebugKernel"]: s += "%sunsigned int *debugBuffer;\n" % t + # Tensor sizes in bytes, excluding batch dims and accounting for zero strides + # Do these first since they are 64-bits and want to avoid any unneeded padding: + s += "%s// Size of lowest Tensor's lowest 2 dims, in bytes. 
Does not include bath dim or higher (>2) order dimensions\n" % t + s += "%suint64_t tensor2dSizeC;\n" % t + s += "%suint64_t tensor2dSizeA;\n" % t + s += "%suint64_t tensor2dSizeB;\n" % t solutionArgs = self.getArgList(solution["ProblemType"], True, False, False) for arg in solutionArgs: if arg[0] == "TensileHalf": @@ -133,7 +138,6 @@ def getSourceString(self, solution, kernelsWithBuildErrs): # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu - # kernels s += "\n%s/* kernels */\n" % (t) s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (t, len(kernels)) @@ -271,6 +275,82 @@ def getSourceString(self, solution, kernelsWithBuildErrs): s += "%ssizes[%u][0][%u] = size%s;\n" \ % (t, kernelIdx, i, self.indexChars[i]) + # Tensor2DSizes - size excluding the batch dimension, accounts for cases where one of strides is 0 + problemType = solution["ProblemType"] + #print "IndexAssignmentsA=", problemType["IndexAssignmentsA"], "Batch=", problemType["IndicesBatch"] + firstStride = 0 if problemType["UseInitialStrides"] else 1 + del i + + numIdx = problemType["NumIndicesC"] + printMe = printedFree = 0 + s += "%suint64_t tensor2dSizeC = %s" % \ + (t, "1" if firstStride==1 else "strideC%u%s"% (0,self.indexChars[0])) + for idx in range(0,numIdx): + # Multiply only by first free and first summation + if idx in problemType["IndicesFree"] and printedFree<2: + printedFree += 1 + printMe = True + else: + printMe = False + + if printMe: + if idx < firstStride: + strideIdx = problemType["IndexAssignmentsA"][idx+1] + s += " * std::max(size%s, strideA%u%s)" % \ + (self.indexChars[idx], idx+1, self.indexChars[strideIdx]) + else: + s += " * size%s" % (self.indexChars[idx]) + s += ";\n" + + numIdx = len(problemType["IndexAssignmentsA"]) + printMe = printedStride = 
printedFree = printedSum = False + s += "%suint64_t tensor2dSizeA = %s" % (t, "1" if firstStride==1 else "strideA%u%s"% (0,self.indexChars[0])) + for i in range(0,numIdx): + idx = problemType["IndexAssignmentsA"][i] + + # Multiply only by first free and first summation + if idx in problemType["IndicesFree"] and not printedFree: + printMe = printedFree = True + elif idx in problemType["IndicesSummation"] and not printedSum: + printMe = printedSum = True + else: + printMe = False + + if printMe: + if not printedStride: + printedStride = True + strideIdx = problemType["IndexAssignmentsA"][i+1] + s += " * std::max(size%s, strideA%u%s)" % \ + (self.indexChars[idx], i+1, self.indexChars[strideIdx]) + else: + s += " * size%s" % (self.indexChars[idx]) + s += ";\n" + + numIdx = len(problemType["IndexAssignmentsB"]) + printMe = printedStride = printedFree = printedSum = False + s += "%suint64_t tensor2dSizeB = %s" % (t, "1" if firstStride==1 else "strideB%u%s"% (0,self.indexChars[0])) + for i in range(0,numIdx): + idx = problemType["IndexAssignmentsB"][i] + + # Multiply only by first free and first summation + if idx in problemType["IndicesFree"] and not printedFree: + printMe = printedFree = True + elif idx in problemType["IndicesSummation"] and not printedSum: + printMe = printedSum = True + else: + printMe = False + + if printMe: + if not printedStride: + printedStride = True + strideIdx = problemType["IndexAssignmentsB"][i+1] + s += " * std::max(size%s, strideB%u%s)" % \ + (self.indexChars[idx], i+1, self.indexChars[strideIdx]) + else: + s += " * size%s" % (self.indexChars[idx]) + s += ";\n" + + #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n" s += "\n" @@ -451,6 +531,9 @@ def getSourceString(self, solution, kernelsWithBuildErrs): # sizes for i in range(0, solution["ProblemType"]["TotalIndices"]): s += "%sprintf(\" 
sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (t, i, i ) + s += "%sprintf(\" tensor2dSizeC== %%lu\\n\", tensor2dSizeC );\n" % (t) + s += "%sprintf(\" tensor2dSizeA== %%lu\\n\", tensor2dSizeA );\n" % (t) + s += "%sprintf(\" tensor2dSizeB== %%lu\\n\", tensor2dSizeB );\n" % (t) ######################################## # OpenCL Runtime @@ -581,6 +664,10 @@ def getSourceString(self, solution, kernelsWithBuildErrs): s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \ % (t, globalParameters["IndexChars"][i], i ) + s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t) + s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t) + s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t) + if solution["PersistentKernel"]: # pass in the number of groups since not available in WG s += "%shipFunctionArgs.numGroupTiles0 = totalWorkGroups0;\n" % (t) diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h index 7e30a0885a..aaa535bacd 100644 --- a/Tensile/Source/Client.h +++ b/Tensile/Source/Client.h @@ -210,7 +210,7 @@ void specializeData( const unsigned int numIndicesSummation = totalIndices - numIndicesC; - const unsigned int db = 1; // 0x1=header, 0x2=offset/value on each store, 0x4=loop debug + const unsigned int db = 0; // 0x1=header, 0x2=offset/value on each store, 0x4=loop debug TensorDims td("specialize_matrix", numIndicesAB, numIndicesC, allSizes, indexAssignments); if (db & 0x1) { diff --git a/Tensile/Source/SolutionHelper.h b/Tensile/Source/SolutionHelper.h index aab9902f0c..0f93740c51 100644 --- a/Tensile/Source/SolutionHelper.h +++ b/Tensile/Source/SolutionHelper.h @@ -28,6 +28,7 @@ #include #include #include +#include /******************************************************************************* * Kernel Cache diff --git a/Tensile/Source/TensileTypes.h b/Tensile/Source/TensileTypes.h index cdd83a34e7..978be1c232 100644 --- a/Tensile/Source/TensileTypes.h +++ 
b/Tensile/Source/TensileTypes.h @@ -70,8 +70,9 @@ TensileStatus tensileTeardown(); #define tensileStatusCheck(RET) { \ TensileStatus tensileCheckStatusTmp = RET; \ if(tensileCheckStatusTmp != tensileStatusSuccess) { \ - printf("TensileStatusFailure %i on line %u of %s\n", \ + fprintf(stderr, "ERROR: TensileStatusFailure %i on line %u of %s\n", \ tensileCheckStatusTmp, __LINE__, __FILE__); \ + abort();\ } } template diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py index cd0b4c409e..670f4898da 100644 --- a/Tensile/TensileCreateLibrary.py +++ b/Tensile/TensileCreateLibrary.py @@ -26,7 +26,7 @@ from SolutionWriter import SolutionWriter from KernelWriterSource import KernelWriterSource from KernelWriterAssembly import KernelWriterAssembly -import multiprocessing, copy +import multiprocessing import os import sys @@ -159,6 +159,7 @@ def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \ kiStart, kiStop, child) t = multiprocessing.Process(target=processKernelSourceChunk, args=args) t.start() + child.close() # close child pipe in the parent process threads.append([t,kiStart,kiStop, parentConn]) if processLaunchProgressBar: processLaunchProgressBar.increment(kiStop-kiStart) @@ -174,7 +175,11 @@ def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \ someError = 0 for (t,kiStart,kiStop,parentConn) in threads: - results = parentConn.recv() + try: + results = parentConn.recv() + except EOFError as pipeErr: + print "*** warning: process", t, "returned pipe EOF",t,pipeErr + t.join() e = t.exitcode if e != 0 : @@ -262,6 +267,7 @@ def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \ solutionSourceFile.write(CHeader) solutionHeaderFile.write(CHeader) solutionSourceFile.write("#include \"Solutions.h\"\n") + solutionSourceFile.write("#include \n") solutionHeaderFile.write("#include \"TensileTypes.h\"\n") solutionHeaderFile.write("#include \"Kernels.h\"\n") 
solutionHeaderFile.write("#include \"SolutionHelper.h\"\n") @@ -563,6 +569,23 @@ def writeLogic(outputPath, logicData, solutionWriter ): s += " hipGetDeviceProperties(&deviceProperties, deviceId);\n" s += " std::string name = deviceProperties.name;\n" + if problemType["DataType"].isDouble() : + s += "\n" + s += "// intercept schedule selection and call HIP (source) kernel\n" + s += " if((strideA2K == 0) || (strideB2K == 0))\n" + s += " {\n" + numSchedules = len(schedules) + schedule = reordered_schedules[numSchedules-1] + scheduleName = schedule[0] + s += " return tensileGetSolution%s_%s_%s(" \ + % ( returnType, scheduleName, problemType) + for i in range(0, len(argListSizes)): + s += "%s%s" \ + % (argListSizes[i][1], + ", " if i < len(argListSizes)-1 else ");\n") + s += " }\n" + s += "\n" + if problemType["DataType"].isHalf() : # "first" free index, usually the letter "I" free0Index = problemType["IndicesFree"][0] diff --git a/Tensile/Tests/bugs/fractional_plus_pbc.yaml b/Tensile/Tests/bugs/fractional_plus_pbc.yaml new file mode 100644 index 0000000000..e9fb949047 --- /dev/null +++ b/Tensile/Tests/bugs/fractional_plus_pbc.yaml @@ -0,0 +1,68 @@ +GlobalParameters: + MinimumRequiredVersion: 4.0.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: False + NumElementsToValidate: 1000 + ValidationMaxToPrint: 10 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 200 + DataInitTypeAlpha : 1 + DataInitTypeA : 3 + DataInitTypeB : 3 + PrintSolutionRejectionReason : 0 + +BenchmarkProblems: + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + ######################################## + # NN - Batch + ######################################## + - # 
Benchmark Group - ResNet 1x1: + InitialSolutionParameters: + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + + ForkParameters: + - FractionalLoad: [1] + - PrefetchGlobalRead: [0] + - PrefetchLocalRead: [1] + - ThreadTile: + - [ 8, 6 ] + - WorkGroup: + - [ 32, 8, 1 ] + #- [ 16, 16, 1 ] + - WorkGroupMapping: [64] + # - DepthU: [3,5,7,9,16] # some bugs with odd unroll dims + - DepthU: [16] + - VectorWidth: [2] + - GlobalReadVectorWidth: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + #- Range: [ [127,1,127], 0, [1], [64,1,64] ] + +# to repro, disable this in SolutionStructs.py +# 1506 reject(state, "Fractional currently requires PreciseBoundsCheck") # debug this later +# Symptom is GPUVM fault, diff --git a/Tensile/Tests/create_tests.py b/Tensile/Tests/create_tests.py index 9438fc6c67..7e2a715940 100755 --- a/Tensile/Tests/create_tests.py +++ b/Tensile/Tests/create_tests.py @@ -1,6 +1,6 @@ #!/usr/bin/python -# Create a test_py script for all test*yaml files in specified directory +# Create a test_py script for all *.yaml files in specified directory # usage: create_tests.py TEST_DIR # Run from the Tensile/Tests directory, output script goes in the TEST_DIR/test_TEST_DIR.py @@ -14,7 +14,7 @@ print "info: writing test script to %s" % targetFile outfile = open(targetFile, "w" ) outfile.write("import Tensile.Tensile as Tensile\n\n") -for f in glob.glob("%s/*yaml"%targetDir): +for f in glob.glob("%s/*aml"%targetDir): baseName = os.path.basename(f) testName = os.path.splitext(baseName)[0] if not testName.startswith("test_"): diff --git a/Tensile/Tests/disabled/hgemm_nn_source.yaml b/Tensile/Tests/disabled/hgemm_nn_source.yaml new file mode 100644 index 0000000000..0eafc0eb1e --- /dev/null +++ b/Tensile/Tests/disabled/hgemm_nn_source.yaml @@ -0,0 +1,44 @@ +# Sweep across different vector widths and global vector widths +GlobalParameters: + 
MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + KernelTime: True + SleepPercent: 0 + +BenchmarkProblems: + + - # + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Source + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Source"] + - GlobalSplitU: [1,3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 8, 16 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 16, 1 ] + - DepthU: [32] + - VectorWidth: [1,2,4,8] + - GlobalReadVectorWidth: [1,2,4,8] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [4], [63,1,65] ] diff --git a/Tensile/Tests/disabled/test_disabled.py b/Tensile/Tests/disabled/test_disabled.py index 100f0c00fc..108ec8ae02 100644 --- a/Tensile/Tests/disabled/test_disabled.py +++ b/Tensile/Tests/disabled/test_disabled.py @@ -6,3 +6,5 @@ def test_create_library(tmpdir): def test_assertion_selection(tmpdir): Tensile.Tensile([Tensile.TensileTestPath("disabled/test_assertion_selection.yaml"), tmpdir.strpath]) +def test_hgemm_nn_source(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("disabled/hgemm_nn_source.yaml"), tmpdir.strpath]) diff --git a/Tensile/Tests/nightly/big_tensor/bigskinny_nt.yaml b/Tensile/Tests/nightly/big_tensor/bigskinny_nt.yaml new file mode 100644 index 0000000000..e9d2a8897f --- /dev/null +++ b/Tensile/Tests/nightly/big_tensor/bigskinny_nt.yaml @@ -0,0 +1,63 @@ +GlobalParameters: + MinimumRequiredVersion: 4.0.0 + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + CMakeBuildType: Release + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + LibraryPrintDebug: True + NumElementsToValidate: 1000 + ValidationMaxToPrint: 100 
+ ValidationPrintValids: False + ShortNames: False + MergeFiles: True + Platform: 0 + Device: 0 + KernelTime: True + PinClocks: True + SleepPercent: 0 + PrintSolutionRejectionReason : 1 + +BenchmarkProblems: + ######################################## + # NT - standard + ######################################## + - + - # ProblemType + OperationType: GEMM + DataType: d + TransposeA: False + TransposeB: False + UseBeta: True + Batched: False + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - CheckDimOverflow: [0, 1, 2] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 4, 4 ] + - WorkGroup: + - [ 16, 16, 1 ] + - WorkGroupMapping: [64] + - GlobalSplitU: [1] + - DepthU: [ 4 ] + - VectorWidth: [2] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - MinStride: [1296, 24296, 31296] + - Exact: [804, 20732, 184] + + ######################################## + +# Cijk_Ailk_Bljk_DB_MT064x064x04_AF0EM01_ASEM01_BL1_GRVW02_GSU01_ISA900_K1_KLA_LPB00_MGWVW01_NLCA01_NLCB01_PBC1_PGR1_PLR1_TT04_04_USFGRO00_VAW01_VW02_WG16_16_01_WGM08 +# ./rocblas-bench -f gemm -r d -m 11804 -n 25732 -k 384 --lda 31296 --ldb 31296 --ldc 31296 --transposeB N --transposeA N -v 1 + diff --git a/Tensile/Tests/nightly/big_tensor/test_big_tensor.py b/Tensile/Tests/nightly/big_tensor/test_big_tensor.py new file mode 100644 index 0000000000..03e1ff64b6 --- /dev/null +++ b/Tensile/Tests/nightly/big_tensor/test_big_tensor.py @@ -0,0 +1,5 @@ +import Tensile.Tensile as Tensile + +def test_bigskinny_nt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/big_tensor/bigskinny_nt.yaml"), tmpdir.strpath]) + diff --git a/Tensile/Tests/nightly/classic_source/test_classic_source.py b/Tensile/Tests/nightly/classic_source/test_classic_source.py index 8fd9fb7da8..2b819e9439 100644 --- 
a/Tensile/Tests/nightly/classic_source/test_classic_source.py +++ b/Tensile/Tests/nightly/classic_source/test_classic_source.py @@ -1,7 +1,10 @@ import Tensile.Tensile as Tensile -def test_hgemm(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm.yaml"), tmpdir.strpath]) +def test_hgemm_nn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm_nn.yaml"), tmpdir.strpath]) + +def test_hgemm_tn_tt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm_tn_tt.yaml"), tmpdir.strpath]) def test_sgemm_vectors(tmpdir): Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_sgemm_vectors.yaml"), tmpdir.strpath]) @@ -12,6 +15,9 @@ def test_hgemm_vectors(tmpdir): def test_dgemm(tmpdir): Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_dgemm.yaml"), tmpdir.strpath]) +def test_hgemm_nt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm_nt.yaml"), tmpdir.strpath]) + def test_sgemm(tmpdir): Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_sgemm.yaml"), tmpdir.strpath]) diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm.yaml deleted file mode 100644 index 188dbab557..0000000000 --- a/Tensile/Tests/nightly/classic_source/test_hgemm.yaml +++ /dev/null @@ -1,319 +0,0 @@ -GlobalParameters: - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - PrintSolutionRejectionReason: False - MinimumRequiredVersion: 4.2.0 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - DataInitTypeAB: 1 - DataInitTypeC: 1 - ExitOnFails: 0 # Some solutions fail so just ensure we find one good solution - -BenchmarkProblems: - - ############################################################################ - # NN - ############################################################################ - - - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: 
False - TransposeB: False - UseBeta: False - Batched: True - - - # Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - MacroTileShapeMax: [64] - ForkParameters: - - ThreadTile: - - [8, 8] - - [7, 4] - - [3, 5] - - [2, 6] - - [1, 1] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 2, 16 ] - - [ 16, 12, 1 ] - - DepthU: [ 2, 16, 64 ] - - GlobalSplitU: [1, 4] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Exact: [ 1, 1, 1, 1 ] - - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ] - - - - # Non-Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - WorkGroup: [ [8, 8, 2 ] ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 16 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - PrefetchGlobalRead: [False, True] - - PrefetchLocalRead: [False, True] - - VectorWidth: [1] - - GlobalSplitU: [1, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ] - - - # Branches - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["Branch"] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [2, 8] - - WorkGroup: - - [ 8, 4, 4 ] - - [ 16, 16, 1 ] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ] - - - ############################################################################ - # NT - ############################################################################ - - - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: False - TransposeB: True - UseBeta: False - - - # Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - MacroTileShapeMax: [64] - ForkParameters: - - ThreadTile: - - [8, 8] - - [7, 4] - - [3, 5] - - [2, 6] - - [1, 1] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 2, 16 ] - - [ 16, 12, 1 ] - - DepthU: [ 2, 16, 64 ] 
- - GlobalSplitU: [1, 4] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Exact: [ 1, 1, 1 ] - - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] - - - - # Non-Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - WorkGroup: [ [8, 8, 2 ] ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 16 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - PrefetchGlobalRead: [False, True] - - PrefetchLocalRead: [False, True] - - VectorWidth: [1] - - GlobalSplitU: [1, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] - - - # Branches - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["Branch"] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [2, 8] - - WorkGroup: - - [ 8, 4, 4 ] - - [ 16, 16, 1 ] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - - ############################################################################ - # TN - ############################################################################ - - - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: False - UseBeta: True - - - # Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - MacroTileShapeMax: [64] - ForkParameters: - - ThreadTile: - - [8, 8] - - [7, 4] - - [3, 5] - - [2, 6] - - [1, 1] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 2, 16 ] - - [ 16, 12, 1 ] - - DepthU: [ 2, 16, 64 ] - - GlobalSplitU: [1, 4] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Exact: [ 1, 1, 1 ] - - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] - - - - # Non-Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - WorkGroup: [ [8, 8, 2 ] ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 16 ] - ForkParameters: - - 
GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - PrefetchGlobalRead: [False, True] - - PrefetchLocalRead: [False, True] - - VectorWidth: [1] - - GlobalSplitU: [1, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] - - - # Branches - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["Branch"] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [2, 8] - - WorkGroup: - - [ 8, 4, 4 ] - - [ 16, 16, 1 ] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - - - ############################################################################ - # TT - ############################################################################ - - - - # ProblemType - OperationType: GEMM - DataType: h - TransposeA: True - TransposeB: True - UseBeta: False - - - # Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - MacroTileShapeMax: [64] - ForkParameters: - - ThreadTile: - - [8, 8] - - [7, 4] - - [3, 5] - - [2, 6] - - [1, 1] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 2, 16 ] - - [ 16, 12, 1 ] - - DepthU: [ 2, 16, 64 ] - - GlobalSplitU: [1, 4] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Exact: [ 1, 1, 1 ] - - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] - - - - # Non-Tile Sizes - BenchmarkCommonParameters: - - EdgeType: ["ShiftPtr"] - - LoopTail: [True] - - WorkGroup: [ [8, 8, 2 ] ] - - ThreadTile: [ [4, 8] ] - - DepthU: [ 16 ] - ForkParameters: - - GlobalReadCoalesceGroupA: [False, True] - - GlobalReadCoalesceGroupB: [False, True] - - PrefetchGlobalRead: [False, True] - - PrefetchLocalRead: [False, True] - - VectorWidth: [1] - - GlobalSplitU: [1, 4] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] - - - # Branches - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: 
["Branch"] - - DepthU: [ 16 ] - ForkParameters: - - ThreadTile: - - [8, 8] - - [2, 8] - - WorkGroup: - - [ 8, 4, 4 ] - - [ 16, 16, 1 ] - - VectorWidth: [1] - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] - -LibraryLogic: - ScheduleName: "vega10" - DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861"] - ArchitectureName: "gfx900" - -# ScheduleName: "mi25" -# DeviceNames: ["Device 6860"] -# ArchitectureName: "gfx900" - -# ScheduleName: "r9nano" -# DeviceNames: ["Device 7300"] -# ArchitectureName: "gfx803" - -# ScheduleName: "hip" -# DeviceNames: ["Device 0000"] -# ArchitectureName: "fallback" - -LibraryClient: diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm_nn.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm_nn.yaml new file mode 100644 index 0000000000..c1f4baef13 --- /dev/null +++ b/Tensile/Tests/nightly/classic_source/test_hgemm_nn.yaml @@ -0,0 +1,85 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + ExitOnFails: 0 # Some solutions fail so just ensure we find one good solution + +BenchmarkProblems: + + ############################################################################ + # NN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: False + Batched: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - 
ProblemSizes: + - Exact: [ 1, 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ] + diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm_nt.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm_nt.yaml new file mode 100644 index 0000000000..2a048da245 --- /dev/null +++ b/Tensile/Tests/nightly/classic_source/test_hgemm_nt.yaml @@ -0,0 +1,87 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + ExitOnFails: 0 # Some solutions fail so just ensure we find one good solution + +BenchmarkProblems: + + ############################################################################ + # NT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: True + UseBeta: False + +## disabled for now due to hanging +## +## - # Tile Sizes +## BenchmarkCommonParameters: +## - EdgeType: ["ShiftPtr"] +## - LoopTail: [True] 
+## - MacroTileShapeMax: [64] +## ForkParameters: +## - ThreadTile: +## - [8, 8] +## - [7, 4] +## - [3, 5] +## - [2, 6] +## - [1, 1] +## - WorkGroup: +## - [ 16, 16, 1 ] +## - [ 8, 2, 16 ] +## - [ 16, 12, 1 ] +## - DepthU: [ 2, 16, 64 ] +## - GlobalSplitU: [1, 4] +## - VectorWidth: [1] +## BenchmarkFinalParameters: +## - ProblemSizes: +## - Exact: [ 1, 1, 1 ] +## - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm_tn_tt.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm_tn_tt.yaml new file mode 100644 index 0000000000..41bd3c4626 --- /dev/null +++ b/Tensile/Tests/nightly/classic_source/test_hgemm_tn_tt.yaml @@ -0,0 +1,178 @@ +GlobalParameters: + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + PrintSolutionRejectionReason: False + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + DataInitTypeAB: 1 + DataInitTypeC: 1 + ExitOnFails: 0 # Some solutions fail so just ensure we find one good solution + +BenchmarkProblems: + # Covers TN and TT cases to test combining both problem types in one file + + + 
############################################################################ + # TN + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + + - # Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - MacroTileShapeMax: [64] + ForkParameters: + - ThreadTile: + - [8, 8] + - [7, 4] + - [3, 5] + - [2, 6] + - [1, 1] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 2, 16 ] + - [ 16, 12, 1 ] + - DepthU: [ 2, 16, 64 ] + - GlobalSplitU: [1, 4] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 1, 1, 1 ] + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + + + ############################################################################ + # TT + ############################################################################ + - + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: True + UseBeta: False + +## disabled for now due to hanging +## +## - # Tile Sizes +## BenchmarkCommonParameters: +## - EdgeType: 
["ShiftPtr"] +## - LoopTail: [True] +## - MacroTileShapeMax: [64] +## ForkParameters: +## - ThreadTile: +## - [8, 8] +## - [7, 4] +## - [3, 5] +## - [2, 6] +## - [1, 1] +## - WorkGroup: +## - [ 16, 16, 1 ] +## - [ 8, 2, 16 ] +## - [ 16, 12, 1 ] +## - DepthU: [ 2, 16, 64 ] +## - GlobalSplitU: [1, 4] +## - VectorWidth: [1] +## BenchmarkFinalParameters: +## - ProblemSizes: +## - Exact: [ 1, 1, 1 ] +## - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + + - # Non-Tile Sizes + BenchmarkCommonParameters: + - EdgeType: ["ShiftPtr"] + - LoopTail: [True] + - WorkGroup: [ [8, 8, 2 ] ] + - ThreadTile: [ [4, 8] ] + - DepthU: [ 16 ] + ForkParameters: + - GlobalReadCoalesceGroupA: [False, True] + - GlobalReadCoalesceGroupB: [False, True] + - PrefetchGlobalRead: [False, True] + - PrefetchLocalRead: [False, True] + - VectorWidth: [1] + - GlobalSplitU: [1, 4] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ] + + - # Branches + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["Branch"] + - DepthU: [ 16 ] + ForkParameters: + - ThreadTile: + - [8, 8] + - [2, 8] + - WorkGroup: + - [ 8, 4, 4 ] + - [ 16, 16, 1 ] + - VectorWidth: [1] + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ] + +LibraryLogic: + ScheduleName: "vega10" + DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"] + ArchitectureName: "gfx900" + +# ScheduleName: "mi25" +# DeviceNames: ["Device 6860"] +# ArchitectureName: "gfx900" + +# ScheduleName: "r9nano" +# DeviceNames: ["Device 7300"] +# ArchitectureName: "gfx803" + +# ScheduleName: "hip" +# DeviceNames: ["Device 0000"] +# ArchitectureName: "fallback" + +LibraryClient: diff --git a/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml b/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml index 48b1576422..78f9e37c4d 
100644 --- a/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml +++ b/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml @@ -44,8 +44,6 @@ BenchmarkProblems: ForkParameters: - FractionalLoad: [1] - # Set to enable PBC and functional code-gen - else should get GLVW=1 path - - AssertSummationElementMultiple: [4] - PrefetchGlobalRead: [0] - PrefetchLocalRead: [1] - ThreadTile: diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_nn.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nn.yaml new file mode 100644 index 0000000000..c7b90883b0 --- /dev/null +++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nn.yaml @@ -0,0 +1,86 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm NN + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: False + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 32, 4, 1 ] + - [ 8, 8, 1 ] + - [ 4, 8, 4 ] + - DepthU: [-3] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - 
LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - [ 4, 4, 4 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] + + diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_nt.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nt.yaml new file mode 100644 index 0000000000..a355052a3e --- /dev/null +++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nt.yaml @@ -0,0 +1,85 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm NT + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: True + Batched: False + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 16, 1 ] + - [ 2, 8, 8 ] + - DepthU: [-3] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] 
+ + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 16, 8, 1 ] + - [ 16, 2, 8 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] + diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_tn.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tn.yaml new file mode 100644 index 0000000000..20542af0a5 --- /dev/null +++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tn.yaml @@ -0,0 +1,84 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm TN + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: False + UseBeta: True + Batched: False + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 32, 4, 1 ] + - DepthU: [-4] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + 
BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] + + diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_tt.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tt.yaml new file mode 100644 index 0000000000..31ceff3e59 --- /dev/null +++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tt.yaml @@ -0,0 +1,80 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm TT + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: True + Batched: False + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - DepthU: [-3] + - VectorWidth: 
[-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [63,1,65] ] diff --git a/Tensile/Tests/nightly/nonbatched/test_nonbatched.py b/Tensile/Tests/nightly/nonbatched/test_nonbatched.py new file mode 100644 index 0000000000..aa55f6c728 --- /dev/null +++ b/Tensile/Tests/nightly/nonbatched/test_nonbatched.py @@ -0,0 +1,14 @@ +import Tensile.Tensile as Tensile + +def test_sgemm_asm_nt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_nt.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_nn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_nn.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_tn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_tn.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_tt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_tt.yaml"), tmpdir.strpath]) + diff --git a/Tensile/Tests/nightly/vector_width/hgemm_nn_asm.yaml b/Tensile/Tests/nightly/vector_width/hgemm_nn_asm.yaml new file mode 100644 index 0000000000..408fde7660 --- /dev/null +++ b/Tensile/Tests/nightly/vector_width/hgemm_nn_asm.yaml @@ -0,0 +1,44 @@ +# Sweep across different vector widths and global vector widths +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + KernelTime: 
True + SleepPercent: 0 + +BenchmarkProblems: + + - # + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1,3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 8, 16 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 16, 1 ] + - DepthU: [32] + - VectorWidth: [1,2,4,8] + - GlobalReadVectorWidth: [1,2,4,8] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [4], [63,1,65] ] diff --git a/Tensile/Tests/nightly/vector_width/sgemm_nn_asm.yaml b/Tensile/Tests/nightly/vector_width/sgemm_nn_asm.yaml new file mode 100644 index 0000000000..db199c59bf --- /dev/null +++ b/Tensile/Tests/nightly/vector_width/sgemm_nn_asm.yaml @@ -0,0 +1,44 @@ +# Sweep vector width and global read vector width +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + KernelTime: True + SleepPercent: 0 + +BenchmarkProblems: + + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1,3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 8, 16 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 16, 1 ] + - DepthU: [32] + - VectorWidth: [1,2,4,8] + - GlobalReadVectorWidth: [1,2,4,8] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - 
Range: [ [127,1,129], 0, [4], [63,1,65] ] diff --git a/Tensile/Tests/nightly/vector_width/sgemm_nn_source.yaml b/Tensile/Tests/nightly/vector_width/sgemm_nn_source.yaml new file mode 100644 index 0000000000..4decd0996f --- /dev/null +++ b/Tensile/Tests/nightly/vector_width/sgemm_nn_source.yaml @@ -0,0 +1,44 @@ +# Sweep vector width and global read vector width +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + KernelTime: True + SleepPercent: 0 + +BenchmarkProblems: + + - + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Source + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Source"] + - GlobalSplitU: [1,3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 2, 2 ] + - [ 4, 4 ] + - [ 8, 16 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 16, 1 ] + - DepthU: [32] + - VectorWidth: [1,2,4,8] + - GlobalReadVectorWidth: [1,2,4,8] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [4], [63,1,65] ] diff --git a/Tensile/Tests/nightly/vector_width/test_vector_width.py b/Tensile/Tests/nightly/vector_width/test_vector_width.py new file mode 100644 index 0000000000..7a6f4d0966 --- /dev/null +++ b/Tensile/Tests/nightly/vector_width/test_vector_width.py @@ -0,0 +1,16 @@ +import Tensile.Tensile as Tensile + +def test_sgemm_nn_source(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/sgemm_nn_source.yaml"), tmpdir.strpath]) + +def test_sgemm_nn_asm(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/sgemm_nn_asm.yaml"), tmpdir.strpath]) + +def test_hgemm_nn_asm(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/hgemm_nn_asm.yaml"), tmpdir.strpath]) +
+#disabled for now due to hanging with ROCm 1.9 +# +#def test_hgemm_nn_source(tmpdir): +# Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/hgemm_nn_source.yaml"), tmpdir.strpath]) + diff --git a/Tensile/Tests/nightly_asm/README b/Tensile/Tests/nightly_asm/README new file mode 100644 index 0000000000..9c158062ce --- /dev/null +++ b/Tensile/Tests/nightly_asm/README @@ -0,0 +1,9 @@ +nightly_asm is a collection of selected asm tests from the nightly run. +These provide better coverage than pre_checkin and skip the source tests, so they complete more quickly than the full nightly. + +nightly_asm simply contains links into directories in nightly. +New directories in nightly need a link added in nightly_asm if desired. + +To run: + +$ PYTHONPATH=. py.test -v Tensile/Tests/nightly_asm diff --git a/Tensile/Tests/nightly_asm/assertions b/Tensile/Tests/nightly_asm/assertions new file mode 120000 index 0000000000..00d2b7e489 --- /dev/null +++ b/Tensile/Tests/nightly_asm/assertions @@ -0,0 +1 @@ +../nightly/assertions \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/big_tensor b/Tensile/Tests/nightly_asm/big_tensor new file mode 120000 index 0000000000..77cf397a71 --- /dev/null +++ b/Tensile/Tests/nightly_asm/big_tensor @@ -0,0 +1 @@ +../nightly/big_tensor \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/flat b/Tensile/Tests/nightly_asm/flat new file mode 120000 index 0000000000..b886566b68 --- /dev/null +++ b/Tensile/Tests/nightly_asm/flat @@ -0,0 +1 @@ +../nightly/flat \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/fractional b/Tensile/Tests/nightly_asm/fractional new file mode 120000 index 0000000000..40dfbec170 --- /dev/null +++ b/Tensile/Tests/nightly_asm/fractional @@ -0,0 +1 @@ +../nightly/fractional \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/global_split_u b/Tensile/Tests/nightly_asm/global_split_u new file mode 120000 index 0000000000..5551057206 --- /dev/null +++
b/Tensile/Tests/nightly_asm/global_split_u @@ -0,0 +1 @@ +../nightly/global_split_u \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/hpa_source b/Tensile/Tests/nightly_asm/hpa_source new file mode 120000 index 0000000000..2a9d54397f --- /dev/null +++ b/Tensile/Tests/nightly_asm/hpa_source @@ -0,0 +1 @@ +../nightly/hpa_source \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/local_split_u b/Tensile/Tests/nightly_asm/local_split_u new file mode 120000 index 0000000000..d455319439 --- /dev/null +++ b/Tensile/Tests/nightly_asm/local_split_u @@ -0,0 +1 @@ +../nightly/local_split_u \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/nonbatched b/Tensile/Tests/nightly_asm/nonbatched new file mode 120000 index 0000000000..41aa20fe9e --- /dev/null +++ b/Tensile/Tests/nightly_asm/nonbatched @@ -0,0 +1 @@ +../nightly/nonbatched \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/pre_checkin b/Tensile/Tests/nightly_asm/pre_checkin new file mode 120000 index 0000000000..4b921f09b7 --- /dev/null +++ b/Tensile/Tests/nightly_asm/pre_checkin @@ -0,0 +1 @@ +../nightly/pre_checkin \ No newline at end of file diff --git a/Tensile/Tests/nightly_asm/vector_width b/Tensile/Tests/nightly_asm/vector_width new file mode 120000 index 0000000000..57903ecc4f --- /dev/null +++ b/Tensile/Tests/nightly_asm/vector_width @@ -0,0 +1 @@ +../nightly/vector_width \ No newline at end of file diff --git a/Tensile/Tests/pre_checkin/test_dgemm_asm.yaml b/Tensile/Tests/pre_checkin/dgemm_asm.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_dgemm_asm.yaml rename to Tensile/Tests/pre_checkin/dgemm_asm.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_nn.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_asm_nn.yaml rename to Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_nt.yaml 
b/Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_asm_nt.yaml rename to Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml new file mode 100644 index 0000000000..ad533b6e6c --- /dev/null +++ b/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml @@ -0,0 +1,96 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + NumElementsToValidate: -1 + KernelTime: True + +BenchmarkProblems: + + - # hgemm TN + - # ProblemType + OperationType: GEMM + DataType: h + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 4, 2 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 32, 4, 1 ] + - DepthU: [8] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 8, 2 ] + - [ 2, 8 ] + - [ 16, 2 ] + - [ 2, 16 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - DepthU: [16] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + 
InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [True] + - WorkGroupMapping: [1] + ForkParameters: + - PrefetchGlobalRead: [False] + - ThreadTile: + - [4, 2] + - WorkGroup: + - [8, 16, 1] + - GlobalSplitU: [1] + - DepthU: [8] + - VectorWidth: [2] + - AssertSummationElementMultiple: [2] + - AssertFree0ElementMultiple: [2] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [ 512, 8, 1, 500000 ] + diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_tt.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_asm_tt.yaml rename to Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nn.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nt.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tn.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tt.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nn.yaml 
b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nn.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml similarity index 96% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nt.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml index ee811b0efc..afd7d53cd2 100644 --- a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nt.yaml +++ b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml @@ -5,13 +5,13 @@ GlobalParameters: KernelTime: True BenchmarkProblems: - - # hgemm TN + - # hgemm NT - # ProblemType OperationType: GEMM DataType: h HighPrecisionAccumulate: True - TransposeA: True - TransposeB: False + TransposeA: False + TransposeB: True UseBeta: True Batched: True diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tn.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml similarity index 100% rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tt.yaml rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml new file mode 100644 index 0000000000..883cfda371 --- /dev/null +++ b/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml @@ -0,0 +1,86 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + 
NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm NN + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 32, 4, 1 ] + - [ 8, 8, 1 ] + - [ 4, 8, 4 ] + - DepthU: [-3] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - [ 4, 4, 4 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + + diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml new file mode 100644 index 0000000000..dacf0b2a0a --- /dev/null +++ b/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml @@ -0,0 +1,85 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + 
ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm NT + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 16, 1 ] + - [ 2, 8, 8 ] + - DepthU: [-3] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - GlobalSplitU: [1, 3] + - PrefetchLocalRead: [True] + - PrefetchGlobalRead: [True] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 16, 8, 1 ] + - [ 16, 2, 8 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_tn.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml similarity index 76% rename from Tensile/Tests/pre_checkin/test_hgemm_asm_tn.yaml rename to Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml index 3689155c8c..284de54edb 100644 --- 
a/Tensile/Tests/pre_checkin/test_hgemm_asm_tn.yaml +++ b/Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml @@ -1,15 +1,28 @@ # benchmark assembly and source kernels GlobalParameters: MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 KernelTime: True BenchmarkProblems: - - # hgemm TN + - # sgemm TN - # ProblemType OperationType: GEMM - DataType: h + DataType: s TransposeA: True TransposeB: False UseBeta: True @@ -26,13 +39,13 @@ BenchmarkProblems: - PrefetchLocalRead: [True] - PrefetchGlobalRead: [False] - ThreadTile: - - [ 4, 2 ] + - [ 3, 5 ] - [ 4, 8 ] - [ 8, 8 ] - WorkGroup: - [ 16, 16, 1 ] - [ 32, 4, 1 ] - - DepthU: [8] + - DepthU: [-4] - VectorWidth: [-1] BenchmarkForkParameters: JoinParameters: @@ -52,14 +65,14 @@ BenchmarkProblems: - PrefetchLocalRead: [True] - PrefetchGlobalRead: [False] - ThreadTile: - - [ 8, 2 ] - - [ 2, 8 ] - - [ 16, 2 ] - - [ 2, 16 ] + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] - WorkGroup: - [ 16, 16, 1 ] - [ 8, 8, 1 ] - - DepthU: [16] + - DepthU: [-1] - VectorWidth: [-1] BenchmarkForkParameters: JoinParameters: @@ -68,3 +81,4 @@ BenchmarkProblems: - ProblemSizes: - Range: [ [127,1,129], 0, [2], [63,1,65] ] + diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml new file mode 100644 index 0000000000..dd984b94b4 --- /dev/null +++ b/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml @@ -0,0 +1,80 @@ +# benchmark assembly and source kernels +GlobalParameters: + MinimumRequiredVersion: 4.2.0 + CMakeBuildType: Release + PrintLevel: 1 + ForceRedoBenchmarkProblems: True + ForceRedoLibraryLogic: True + ForceRedoLibraryClient: True + EnqueuesPerSync: 1 + SyncsPerBenchmark: 1 + 
NumElementsToValidate: -1 + ValidationMaxToPrint: 4 + ValidationPrintValids: False + ShortNames: False + MergeFiles: True + DataInitTypeAB: 3 + DataInitTypeC: 3 + KernelTime: True + +BenchmarkProblems: + + - # sgemm TT + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + - KernelLanguage: ["Assembly"] + ForkParameters: + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 3, 5 ] + - [ 4, 8 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - DepthU: [-3] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] + + - # BenchmarkProblemSizeGroup - Assembly + InitialSolutionParameters: + BenchmarkCommonParameters: + - LoopTail: [True] + - EdgeType: ["ShiftPtr"] + ForkParameters: + - KernelLanguage: ["Assembly"] + - PrefetchLocalRead: [False] + - PrefetchGlobalRead: [False] + - ThreadTile: + - [ 3, 3 ] + - [ 4, 4 ] + - [ 5, 5 ] + - [ 8, 8 ] + - WorkGroup: + - [ 16, 16, 1 ] + - [ 8, 8, 1 ] + - DepthU: [-1] + - VectorWidth: [-1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Range: [ [127,1,129], 0, [2], [63,1,65] ] diff --git a/Tensile/Tests/pre_checkin/test_pre_checkin.py b/Tensile/Tests/pre_checkin/test_pre_checkin.py index 97108bf4ee..dd2de22ce7 100644 --- a/Tensile/Tests/pre_checkin/test_pre_checkin.py +++ b/Tensile/Tests/pre_checkin/test_pre_checkin.py @@ -1,31 +1,52 @@ import Tensile.Tensile as Tensile -def test_sgemm_asm(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_sgemm_asm.yaml"), tmpdir.strpath]) +def test_hgemm_hpa_asm_tn(tmpdir): + 
Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_tn.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_nt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_nt.yaml"), tmpdir.strpath]) + +def test_hgemm_hpa_iu2_asm_tt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_tt.yaml"), tmpdir.strpath]) + +def test_hgemm_hpa_iu2_asm_nt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_nt.yaml"), tmpdir.strpath]) def test_hgemm_asm_nn(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_nn.yaml"), tmpdir.strpath]) + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_nn.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_nn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_nn.yaml"), tmpdir.strpath]) def test_dgemm_asm(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_dgemm_asm.yaml"), tmpdir.strpath]) + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/dgemm_asm.yaml"), tmpdir.strpath]) -def test_hgemm_hpa_asm_tn(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_tn.yaml"), tmpdir.strpath]) +def test_hgemm_hpa_iu2_asm_tn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_tn.yaml"), tmpdir.strpath]) -def test_hgemm_hpa_asm_nn(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_nn.yaml"), tmpdir.strpath]) +def test_hgemm_asm_tn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_tn.yaml"), tmpdir.strpath]) -def test_hgemm_asm_tt(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_tt.yaml"), tmpdir.strpath]) +def test_hgemm_hpa_asm_nn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_nn.yaml"), tmpdir.strpath]) def test_hgemm_hpa_asm_tt(tmpdir): - 
Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_tt.yaml"), tmpdir.strpath]) + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_tt.yaml"), tmpdir.strpath]) -def test_hgemm_asm_nt(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_nt.yaml"), tmpdir.strpath]) +def test_hgemm_asm_tt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_tt.yaml"), tmpdir.strpath]) -def test_hgemm_asm_tn(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_tn.yaml"), tmpdir.strpath]) +def test_hgemm_hpa_iu2_asm_nn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_nn.yaml"), tmpdir.strpath]) def test_hgemm_hpa_asm_nt(tmpdir): - Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_nt.yaml"), tmpdir.strpath]) + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_nt.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_tn(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_tn.yaml"), tmpdir.strpath]) + +def test_sgemm_asm_tt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_tt.yaml"), tmpdir.strpath]) + +def test_hgemm_asm_nt(tmpdir): + Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_nt.yaml"), tmpdir.strpath]) diff --git a/Tensile/Tests/pre_checkin/test_sgemm_asm.yaml b/Tensile/Tests/pre_checkin/test_sgemm_asm.yaml deleted file mode 100644 index 21e3b0043a..0000000000 --- a/Tensile/Tests/pre_checkin/test_sgemm_asm.yaml +++ /dev/null @@ -1,270 +0,0 @@ -# benchmark assembly and source kernels -GlobalParameters: - MinimumRequiredVersion: 4.2.0 - CMakeBuildType: Release - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - NumElementsToValidate: -1 - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - 
DataInitTypeAB: 3 - DataInitTypeC: 3 - KernelTime: True - -BenchmarkProblems: - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: False - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 32, 4, 1 ] - - [ 8, 8, 1 ] - - [ 4, 8, 4 ] - - DepthU: [-3] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - [ 4, 4, 4 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: False - TransposeB: True - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 16, 1 ] - - [ 
2, 8, 8 ] - - DepthU: [-3] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 16, 8, 1 ] - - [ 16, 2, 8 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # sgemm TN - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: False - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 32, 4, 1 ] - - DepthU: [-4] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: 
[-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - TransposeA: True - TransposeB: True - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [-3] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] diff --git a/Tensile/Utilities/merge_rocblas_yaml_files.py b/Tensile/Utilities/merge_rocblas_yaml_files.py new file mode 100644 index 0000000000..a0942c5862 --- /dev/null +++ b/Tensile/Utilities/merge_rocblas_yaml_files.py @@ -0,0 +1,358 @@ + +#from copy import deepcopy +#from Common import print1, print2, printExit, HR, ensurePath + +#from SolutionStructs import Solution + +#from __init__ import __version__ + +import os +import sys +import argparse + +HR = 
HR = "################################################################################"

################################################################################
# Print Debug
################################################################################

def printWarning(message):
    """Print a non-fatal Tensile warning and flush stdout."""
    print("Tensile::WARNING: %s" % message)
    sys.stdout.flush()


def printExit(message):
    """Print a fatal Tensile error, flush stdout and exit with status -1."""
    print("Tensile::FATAL: %s" % message)
    sys.stdout.flush()
    sys.exit(-1)


# PyYAML is optional at import time so the in-memory merge can be used without
# it; every code path that actually reads or writes YAML calls _requireYaml().
try:
    import yaml
except ImportError:
    yaml = None


def _requireYaml():
    """Abort with the standard fatal message when PyYAML is not installed."""
    if yaml is None:
        printExit("You must install PyYAML to use Tensile (to parse config files). See http://pyyaml.org/wiki/PyYAML for installation instructions.")


def ensurePath( path ):
    """Create directory `path` if it does not exist yet and return `path`."""
    if not os.path.exists(path):
        os.makedirs(path)
    return path


################################################################################
# Library Logic Container
################################################################################
class LibraryLogic(object):
    """In-memory form of one library logic YAML file.

    The file is a top-level list read by position:
      0: mapping holding "MinimumRequiredVersion"  -> versionString
      1: scheduleName        2: architectureName   3: deviceNames
      4: problemType         5: solutionStates     6: indexOrder
      7: exactLogic          8: rangeLogic

    Constructed without a filename, every field is None and can be filled in
    by assigning the attributes directly.
    """

    def __init__(self, filename=None):
        """Load `filename` if given; otherwise start with all fields None.

        Exits via printExit when the file cannot be opened or PyYAML is
        missing.
        """
        self.versionString = None
        self.scheduleName = None
        self.architectureName = None
        self.deviceNames = None
        self.problemType = None
        self.solutionStates = None
        self.indexOrder = None
        self.exactLogic = None
        self.rangeLogic = None

        if filename is None:
            return

        _requireYaml()
        print("# Reading Library Logic: " + filename)
        try:
            stream = open(filename, "r")
        except IOError:
            printExit("Cannot open file: %s" % filename )
        # 'with' guarantees the handle is closed even if parsing raises.
        with stream:
            data = yaml.load(stream, yaml.SafeLoader)

        self.versionString = data[0]["MinimumRequiredVersion"]
        self.scheduleName = data[1]
        self.architectureName = data[2]
        self.deviceNames = data[3]
        self.problemType = data[4]
        self.solutionStates = data[5]
        self.indexOrder = data[6]
        self.exactLogic = data[7]
        self.rangeLogic = data[8]

    def writeLibraryLogic(self, filename):
        """Dump the non-None fields, in file order, to `filename` as YAML.

        Exits via printExit when there is nothing to write or the file cannot
        be written.
        """
        data = []
        if self.versionString is not None:
            data.append({"MinimumRequiredVersion": self.versionString})

        # NOTE(review): a field that is None is omitted rather than written as
        # null, which shifts the position of every later entry - confirm that
        # downstream readers of the merged file tolerate this.
        remainingFields = (
            self.scheduleName,
            self.architectureName,
            self.deviceNames,
            self.problemType,
            self.solutionStates,
            self.indexOrder,
            self.exactLogic,
            self.rangeLogic,
        )
        data.extend(value for value in remainingFields if value is not None)

        if not data:
            printExit("No data to output")

        _requireYaml()
        try:
            with open(filename, "w") as stream:
                yaml.safe_dump(data, stream)
        except IOError:
            printExit("Cannot open file: %s" % filename)


def MergeTensileLogicFiles(origionalLibraryLogic, exactLibraryLogic):
    """Merge the exact-size logic of `exactLibraryLogic` into the original.

    The merged solution list is the original solutions followed by those exact
    solutions that are not already present. Exact-logic entries look like
    [[123, 124, 1, 123], [5, 4312.3]], where the first field of the second
    element indexes the solution list; those indices are remapped to the
    merged list. For a problem size present in both inputs the exact entry
    replaces the original one. All other fields are taken from the original.

    Neither input is modified. Returns a new LibraryLogic.
    """
    solutionList = origionalLibraryLogic.solutionStates
    solutionListExact = exactLibraryLogic.solutionStates

    # Map each exact solution index to its index in the merged solution list.
    newSolutions = []
    replicationMapping = {}
    nextNewIndex = len(solutionList)
    for idx, solution in enumerate(solutionListExact):
        try:
            # Already known: reuse the pre-existing original configuration.
            replicationMapping[idx] = solutionList.index(solution)
        except ValueError:
            # New configuration: it is appended after the original solutions.
            newSolutions.append(solution)
            replicationMapping[idx] = nextNewIndex
            nextNewIndex += 1

    # Only the new solutions are appended; appending duplicates as well would
    # shift every index assigned above.
    mergedSolutionList = list(solutionList) + newSolutions

    # Remap the kernel index of every exact entry, working on copies so the
    # caller's data stays untouched.
    remappedExactLogic = []
    for exact in exactLibraryLogic.exactLogic or []:
        kernelIndex = exact[1][0]
        remapped = list(exact)
        remapped[1] = [replicationMapping.get(kernelIndex, kernelIndex)] + list(exact[1][1:])
        remappedExactLogic.append(remapped)

    # Original entries survive only for sizes the exact file does not cover.
    exactSizes = [entry[0] for entry in remappedExactLogic]
    mergedExactLogic = [
        entry for entry in origionalLibraryLogic.exactLogic or []
        if entry[0] not in exactSizes
    ]
    mergedExactLogic.extend(remappedExactLogic)

    mergedLibraryLogic = LibraryLogic()
    mergedLibraryLogic.versionString = origionalLibraryLogic.versionString
    mergedLibraryLogic.scheduleName = origionalLibraryLogic.scheduleName
    mergedLibraryLogic.architectureName = origionalLibraryLogic.architectureName
    mergedLibraryLogic.deviceNames = origionalLibraryLogic.deviceNames
    mergedLibraryLogic.problemType = origionalLibraryLogic.problemType
    mergedLibraryLogic.solutionStates = mergedSolutionList
    mergedLibraryLogic.indexOrder = origionalLibraryLogic.indexOrder
    mergedLibraryLogic.exactLogic = mergedExactLogic
    mergedLibraryLogic.rangeLogic = origionalLibraryLogic.rangeLogic

    return mergedLibraryLogic


def ProcessMergeLogicFile(exactFileName, origionalFileName, outputFileName):
    """Read both logic files, merge them and write the result."""
    _, fileName = os.path.split(exactFileName)

    print("processing file: " + fileName)

    libraryLogic = LibraryLogic(origionalFileName)
    libraryLogicExact = LibraryLogic(exactFileName)

    mergedLibraryLogic = MergeTensileLogicFiles(libraryLogic, libraryLogicExact)

    mergedLibraryLogic.writeLibraryLogic(outputFileName)


def RunMergeTensileLogicFiles():
    """Command-line entry point.

    For every .yaml file in the exact logic directory that has a same-named
    file in the original logic directory, write the merged file to the output
    directory. A failure on one file is reported and the remaining files are
    still processed.
    """
    # Fail before any other output when PyYAML is unavailable.
    _requireYaml()

    print("")
    print(HR)
    print("# Merge Library Logic")
    print(HR)
    print("")

    ##############################################################################
    # Parse Command Line Arguments
    ##############################################################################

    argParser = argparse.ArgumentParser()
    argParser.add_argument("OrigionalLogicPath", help="Path to the origional LibraryLogic.yaml input files.")
    argParser.add_argument("ExactLogicPath", help="Path to the exact LibraryLogic.yaml input files.")
    argParser.add_argument("OutputPath", help="Where to write library files?")

    args = argParser.parse_args()

    origionalLogicPath = args.OrigionalLogicPath
    exactLogicPath = args.ExactLogicPath
    outputPath = args.OutputPath
    print("Origional Logic Path: " + origionalLogicPath)
    print("Exact Logic Path: " + exactLogicPath)
    print("OutputPath: " + outputPath)

    print("")
    ensurePath(outputPath)
    if not os.path.exists(exactLogicPath):
        printExit("LogicPath %s doesn't exist" % exactLogicPath)

    exactLogicFiles = [
        os.path.join(exactLogicPath, f) for f in os.listdir(exactLogicPath)
        if os.path.isfile(os.path.join(exactLogicPath, f))
        and os.path.splitext(f)[1] == ".yaml"
    ]

    for exactLogicFilePath in exactLogicFiles:
        _, fileName = os.path.split(exactLogicFilePath)
        origionalLogicFilePath = os.path.join(origionalLogicPath, fileName)
        if not os.path.isfile(origionalLogicFilePath):
            print("# file does not exist in origional directory " + origionalLogicFilePath)
            continue

        outputLogicFilePath = os.path.join(outputPath, fileName)
        try:
            ProcessMergeLogicFile(exactLogicFilePath, origionalLogicFilePath, outputLogicFilePath)
        except Exception as ex:
            # Best effort per file: report and carry on with the next one.
            # printExit raises SystemExit, which is not caught here.
            print("Exception: {0}".format(ex))


################################################################################
# Main
+################################################################################ +if __name__ == "__main__": + RunMergeTensileLogicFiles() \ No newline at end of file diff --git a/Tensile/__init__.py b/Tensile/__init__.py index c938064729..15daea2823 100644 --- a/Tensile/__init__.py +++ b/Tensile/__init__.py @@ -20,4 +20,4 @@ ################################################################################ # hardcoded tensile version; also in Tensile/Source/TensileConfigVersion.cmake -__version__ = "4.5.0" +__version__ = "4.6.0" diff --git a/bump-version.sh b/bump-version.sh index fc26ea5f59..840d777eae 100755 --- a/bump-version.sh +++ b/bump-version.sh @@ -3,11 +3,11 @@ # This script needs to be edited to bump version for new release. # Version will be bumped in Tensile/__init__.py and in .yaml files -OLD_VERSION="4.4.0" -NEW_VERSION="4.5.0" +OLD_VERSION="4.5.0" +NEW_VERSION="4.6.0" -OLD_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.4.0" -NEW_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.5.0" +OLD_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.5.0" +NEW_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.6.0" sed -i "s/${OLD_VERSION}/${NEW_VERSION}/g" Tensile/__init__.py