diff --git a/Tensile/Common.py b/Tensile/Common.py
index 6f8f0ec42b..39cde127c2 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -267,9 +267,21 @@
     # If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class)
     "AssertFree0ElementMultiple" : [1,2,4,8],
 
+    # When creating the kernel, assume that the 'second' free index size is some
+    # multiple of the element size.
+    # "second" free index is FreeIndex[1] and usually letter "J"
+    # 1 indicates no restriction (since all sizes are multiples of 1)
+    # If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class)
+    #"AssertFree1ElementMultiple" : [1,2,4,8],
+    "AssertFree1ElementMultiple" : [1],  # TODO, support broader range here
+
     # Generate code inside kernel to check assertions above on Tensor dimensions
     "CheckTensorDimAsserts":               [False, True],
 
+    # Generate code inside kernel to check several dimension overflow cases, in particular around use of 32-bit calculations
+    # 0 = no check, 1 = checks for cases that should be avoided through assertions and kernel selection, 2 = checks for cases that should never happen
+    "CheckDimOverflow":               [0,1,2],
+
     # For Block Mapping type:
     # 0   : Use hardware-assigned wg number with no remapping.
     # N   : WG block width.  "Wrap" to a new wg1 "row" assignment after N WGs assigned in that row.
@@ -429,11 +441,13 @@
     {"BufferLoad":                [ True ] },
     {"BufferStore":               [ True ] },
     {"DirectToLds":               [ True ] },
-    {"PreciseBoundsCheck":        [ False ] },
+    {"PreciseBoundsCheck":        [ True ] },
     {"UseSgprForGRO":             [ -1 ] },
     {"AssertSummationElementMultiple": [ 1 ] },
     {"AssertFree0ElementMultiple": [ 1 ] },
+    {"AssertFree1ElementMultiple": [ 1 ] },
     {"CheckTensorDimAsserts"      : [ False ] },
+    {"CheckDimOverflow"           : [ 0 ] },
 
     {"GlobalSplitU":              [ 1 ] },
     {"GlobalSplitUSummationAssignmentRoundRobin": [ True ] },
@@ -675,9 +689,11 @@ def tryAssembler(isaVersion, asmString):
     if result != "":
       return 0 # stdout and stderr must be empty
   except subprocess.CalledProcessError, e:
+    if globalParameters["PrintLevel"] >=2:
+        print "CalledProcessError", e
     return 0 # error, not supported
 
-  return 1 # syntax works for
+  return 1 # syntax works
 
 
 ################################################################################
@@ -737,13 +753,11 @@ def assignGlobalParameters( config ):
   for (v) in globalParameters["SupportedISA"]:
     globalParameters["AsmCaps"][v] = {}
     isaVersion = "gfx" + "".join(map(str,v))
-    asmCmd = "%s -x assembler -target amdgcn-amdhsa -mcpu=%s -" \
-               % (globalParameters["AssemblerPath"], isaVersion)
-    # This doesn't work since assembler politely falls back to default with an unsupported mcpu argument:
     globalParameters["AsmCaps"][v]["SupportedIsa"] = tryAssembler(isaVersion, "")
     globalParameters["AsmCaps"][v]["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,v0")
     globalParameters["AsmCaps"][v]["HasDirectToLds"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds")
     globalParameters["AsmCaps"][v]["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2")
+    globalParameters["AsmCaps"][v]["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34")
     caps = ""
     for k in globalParameters["AsmCaps"][v]:
       caps += " %s=%u" % (k, globalParameters["AsmCaps"][v][k])
diff --git a/Tensile/Configs/miopen/Makefile b/Tensile/Configs/miopen/Makefile
index d20aacc2ce..14425d04cf 100644
--- a/Tensile/Configs/miopen/Makefile
+++ b/Tensile/Configs/miopen/Makefile
@@ -2,9 +2,12 @@ P=problems
 DEEPBENCH_CONV_1x1=$P/nn/deepbench_conv_1x1_batchN.yml $P/nn/deepbench_conv_1x1_batch1.yml
 RESNET=$P/nn/resnet_batch64_B.yml
 
+# Override SCHED as vega10 or vega20 (selects the library_logic_$(SCHED)_only.yml footer)
+SCHED=vega10
+
 # commonly-used headers and footers:
 HEADER=boiler/header.yml
-FOOTER=boiler/library_logic_vega10_only.yml
+FOOTER=boiler/library_logic_$(SCHED)_only.yml
 
 # Override TYPE as sgemm, hgemm (hgemm_hpa, dgemm, etc in future)
 TYPE=sgemm
@@ -25,6 +28,9 @@ SOLUTION_SKINNY=solutions/$(TYPE)_skinny_explore_$(EXPLORE_LEVEL).yml
 
 all: \
 	$(TYPE)_resnet.yaml \
+	$(TYPE)_resnet50_nn.yaml \
+	$(TYPE)_resnet50_nt.yaml \
+	$(TYPE)_resnet50_tn.yaml \
 	$(TYPE)_deepbench_conv1x1.yaml \
 	$(TYPE)_deepbench_gemm_nn.yaml \
 	$(TYPE)_deepbench_gemm_nt.yaml \
@@ -35,6 +41,17 @@ $(TYPE)_resnet.yaml: $(HEADER) types/$(TYPE)_nn.yml \
 	$(SOLUTION_SKINNY) $(RESNET) \
 	$(FOOTER)
 
+# Resnet50
+$(TYPE)_resnet50_nn.yaml: $(HEADER) types/$(TYPE)_nn.yml \
+	$(SOLUTION_SKINNY) $P/nn/resnet50_all.yml \
+	$(FOOTER)
+$(TYPE)_resnet50_nt.yaml: $(HEADER) types/$(TYPE)_nt.yml \
+	$(SOLUTION_SKINNY) $P/nt/resnet50_all.yml \
+	$(FOOTER)
+$(TYPE)_resnet50_tn.yaml: $(HEADER) types/$(TYPE)_tn.yml \
+	$(SOLUTION_SKINNY) $P/tn/resnet50_all.yml \
+	$(FOOTER)
+
 # DeepBench Convolution:
 $(TYPE)_deepbench_conv1x1.yaml: $(HEADER) types/$(TYPE)_nn.yml \
 	$(SOLUTION_SKINNY) $(DEEPBENCH_CONV_1x1) \
diff --git a/Tensile/Configs/miopen/archives/resnet50/README.md b/Tensile/Configs/miopen/archives/resnet50/README.md
new file mode 100644
index 0000000000..ac1eeeb3f9
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/README.md
@@ -0,0 +1,33 @@
+Starting with the 6 asm_full logic files
+
+  - vega20_Cijk_Ailk_Bjlk_HB.yaml
+  - vega20_Cijk_Ailk_Bljk_HB.yaml
+  - vega20_Cijk_Alik_Bljk_HB.yaml
+  - vega20_Cijk_Ailk_Bjlk_SB.yaml
+  - vega20_Cijk_Ailk_Bljk_SB.yaml
+  - vega20_Cijk_Alik_Bljk_SB.yaml
+
+from
+
+  - rocBLAS commit a85df88648587a0d2880a74c6c57964366ab02a1 for HGEMM
+  - rocBLAS commit 0ceb1ad64c8bda5473a1e1c3a74ab9ff204acbf8 for SGEMM
+
+we merge the 6 Resnet50-specific logic files archived in the "logic" directory
+into the corresponding asm_full logic files of the same name, resulting in the
+6 combined asm_full logic files in
+
+  - rocBLAS commit ea27b3aba339b4fd48795153995d24dd96cd6457 for HGEMM+SGEMM
+
+The 6 YAML configuration files used to generate the Resnet50-specific logic
+files are archived in the "config" directory under the following names
+
+  - hgemm_resnet50_nt.yaml
+  - hgemm_resnet50_nn.yaml
+  - hgemm_resnet50_tn.yaml
+  - sgemm_resnet50_nt.yaml
+  - sgemm_resnet50_nn.yaml
+  - sgemm_resnet50_tn.yaml
+
+Note that we explicitly purged the 6 sizes with either n=49 or k=49 from
+the Resnet50-specific logic files for HGEMM because they won't be using
+the assembly kernels.
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml
new file mode 100644
index 0000000000..e3d2c7c8af
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml
@@ -0,0 +1,115 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # NN - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore large number of ~10K half solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+          - [ 16, 8 ]
+          - [ 8, 16 ]
+          - [ 16, 16 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,3,5]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,24,32 ]
+        - VectorWidth: [4,8]
+        - GlobalReadVectorWidth: [1,8]
+        - LdsPadB: [0, -1 ]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 NN
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,   512 ,   1,  2048 ]    # beta= 0
+          - Exact: [  3136 ,  2048 ,   1,   512 ]    # beta= 0
+          - Exact: [ 12544 ,   256 ,   1,  1024 ]    # beta= 0
+          - Exact: [ 12544 ,  1024 ,   1,   256 ]    # beta= 0
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml
new file mode 100644
index 0000000000..6ec161cf0c
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml
@@ -0,0 +1,119 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # NT - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore large number of ~10K half solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+          - [ 16, 8 ]
+          - [ 8, 16 ]
+          - [ 16, 16 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,3,5]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,24,32 ]
+        - VectorWidth: [4,8]
+        - GlobalReadVectorWidth: [1,8]
+        - LdsPadB: [0, -1 ]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 NT
+          - Exact: [    49 ,   512 ,  64,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 ,  64,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 ,  64,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 ,  64,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [    49 ,   512 , 128,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 , 128,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 , 128,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 , 128,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_tn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_tn.yaml
new file mode 100644
index 0000000000..d77badde25
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_tn.yaml
@@ -0,0 +1,110 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # TN - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore large number of ~10K half solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+          - [ 16, 8 ]
+          - [ 8, 16 ]
+          - [ 16, 16 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,3,5]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,24,32 ]
+        - VectorWidth: [4,8]
+        - GlobalReadVectorWidth: [1,8]
+        - LdsPadB: [0, -1 ]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 TN
+          - Exact: [    64 ,    64 ,   1,  3136 ]    # beta= 1
+          - Exact: [    64 ,   256 ,   1,  3136 ]    # beta= 1
+          - Exact: [   128 ,   512 ,   1,   784 ]    # beta= 1
+          - Exact: [   256 ,    64 ,   1,  3136 ]    # beta= 1
+          - Exact: [   256 ,  1024 ,   1,   196 ]    # beta= 1
+          - Exact: [   512 ,   128 ,   1,   784 ]    # beta= 1
+          - Exact: [   512 ,  2048 ,   1,    49 ]    # beta= 1
+          - Exact: [  1024 ,   256 ,   1,   196 ]    # beta= 1
+          - Exact: [  2048 ,   512 ,   1,    49 ]    # beta= 1
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nn.yaml
new file mode 100644
index 0000000000..9c3bb3f71a
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nn.yaml
@@ -0,0 +1,111 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # NN - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore large number of ~10K solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 8, 4 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,4,8]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,32 ]
+        - VectorWidth: [1,2,4]
+        - GlobalReadVectorWidth: [1,-1]
+        - LdsPadA: [0]
+        - LdsPadB: [0]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 NN
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,   512 ,   1,  2048 ]    # beta= 0
+          - Exact: [  3136 ,  2048 ,   1,   512 ]    # beta= 0
+          - Exact: [ 12544 ,   256 ,   1,  1024 ]    # beta= 0
+          - Exact: [ 12544 ,  1024 ,   1,   256 ]    # beta= 0
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nt.yaml b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nt.yaml
new file mode 100644
index 0000000000..b1ab3043a1
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_nt.yaml
@@ -0,0 +1,115 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # NT - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore large number of ~10K solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 8, 4 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,3,5]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,32 ]
+        - VectorWidth: [1,2,4]
+        - GlobalReadVectorWidth: [1,4]
+        - LdsPadA: [0, -1 ]
+        - LdsPadB: [0, -1 ]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 NT
+          - Exact: [    49 ,   512 ,  64,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 ,  64,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 ,  64,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 ,  64,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [    49 ,   512 , 128,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 , 128,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 , 128,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 , 128,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_tn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_tn.yaml
new file mode 100644
index 0000000000..1da57d40d9
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/config/sgemm_resnet50_tn.yaml
@@ -0,0 +1,106 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 0
+
+BenchmarkProblems:
+  ########################################
+  # TN - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore large number of ~10K solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 8, 4 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,4,8]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,32 ]
+        - VectorWidth: [1,2,4]
+        - GlobalReadVectorWidth: [1,-1]
+        - LdsPadA: [-1]
+        - LdsPadB: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 TN
+          - Exact: [    64 ,    64 ,   1,  3136 ]    # beta= 1
+          - Exact: [    64 ,   256 ,   1,  3136 ]    # beta= 1
+          - Exact: [   128 ,   512 ,   1,   784 ]    # beta= 1
+          - Exact: [   256 ,    64 ,   1,  3136 ]    # beta= 1
+          - Exact: [   256 ,  1024 ,   1,   196 ]    # beta= 1
+          - Exact: [   512 ,   128 ,   1,   784 ]    # beta= 1
+          - Exact: [   512 ,  2048 ,   1,    49 ]    # beta= 1
+          - Exact: [  1024 ,   256 ,   1,   196 ]    # beta= 1
+          - Exact: [  2048 ,   512 ,   1,    49 ]    # beta= 1
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml
new file mode 100644
index 0000000000..5e394efecf
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_HB.yaml
@@ -0,0 +1,1312 @@
+- {MinimumRequiredVersion: 4.3.0}
+- vega20
+- gfx906
+- [Device 66a0, Device 66a7]
+- AssignedDerivedParameters: true
+  Batched: true
+  ComplexConjugateA: false
+  ComplexConjugateB: false
+  DataType: 4
+  HighPrecisionAccumulate: false
+  Index0: 0
+  Index01A: 0
+  Index01B: 1
+  Index1: 1
+  IndexAssignmentsA: [0, 3, 2]
+  IndexAssignmentsB: [1, 3, 2]
+  IndexUnroll: 3
+  IndexUnrollA: 1
+  IndexUnrollB: 1
+  IndicesBatch: [2]
+  IndicesFree: [0, 1]
+  IndicesSummation: [3]
+  NumIndicesBatch: 1
+  NumIndicesC: 3
+  NumIndicesFree: 2
+  NumIndicesSummation: 1
+  OperationType: GEMM
+  SilentHighPrecisionAccumulate: false
+  TLUA: true
+  TLUB: true
+  Tensor0: 0
+  Tensor1: 1
+  TileA: 0
+  TileB: 1
+  TotalIndices: 4
+  TransposeA: false
+  TransposeB: true
+  UseBeta: true
+  UseInitialStrides: false
+- - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 5
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 256
+    LSCB: 256
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 1
+    LVPB: 1
+    LdsNumElements: 16384
+    LdsOffsetA: 0
+    LdsOffsetB: 8192
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 32
+    MacroTile0: 256
+    MacroTile1: 256
+    MacroTileA: 256
+    MacroTileB: 256
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 256
+    NumGlobalWriteVectorsPerThread: 64
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 4
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 0
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT256x256x32_DTL0_GRVW08_GSU05_PGR0_PLR1_TT16_16_VW04_WG16_16_01_WGM08
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [16, 16]
+    ThreadTile0: 16
+    ThreadTile1: 16
+    ThreadTileA: 16
+    ThreadTileB: 16
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 128
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 8
+    LVPB: 2
+    LdsNumElements: 2560
+    LdsOffsetA: 0
+    LdsOffsetB: 512
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 32
+    MacroTile1: 128
+    MacroTileA: 32
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 1
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT032x128x16_DTL0_GRVW08_GSU01_PGR0_PLR1_TT04_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 8
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 256
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 4
+    LVPB: 1
+    LdsNumElements: 6656
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 2048
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 4608
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 256
+    MacroTileA: 64
+    MacroTileB: 256
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 64
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 2
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT064x256x08_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_08_VW08_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [8, 8]
+    ThreadTile0: 8
+    ThreadTile1: 8
+    ThreadTileA: 8
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 8
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 8
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 64
+    LSPA: 8
+    LSPB: 8
+    LVCA: 16
+    LVCB: 16
+    LVPA: 1
+    LVPB: 2
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 64
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 128
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 3
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x064x08_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_08_VW08_WG16_08_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: [8, 8]
+    ThreadTile0: 8
+    ThreadTile1: 8
+    ThreadTileA: 8
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 8
+    WorkGroup: [16, 8, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 128
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 8
+    LVPB: 2
+    LdsNumElements: 6656
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 2048
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 4608
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 32
+    MacroTile1: 128
+    MacroTileA: 32
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 4
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT032x128x16_DTL0_GRVW08_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 64
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 2
+    LVPB: 4
+    LdsNumElements: 3072
+    LdsOffsetA: 0
+    LdsOffsetB: 2048
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 5
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x064x16_DTL0_GRVW08_GSU01_PGR0_PLR1_TT08_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 128
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 4
+    LVPB: 2
+    LdsNumElements: 7168
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 2048
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 5120
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 6
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT064x128x16_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 64
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 2
+    LVPB: 4
+    LdsNumElements: 7168
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 7
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x064x16_DTL0_GRVW08_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 128
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 8
+    LVPB: 2
+    LdsNumElements: 2560
+    LdsOffsetA: 0
+    LdsOffsetB: 512
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 32
+    MacroTile1: 128
+    MacroTileA: 32
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 8
+    SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT032x128x16_DTL0_GRVW08_GSU01_PGR0_PLR1_TT04_04_VW04_WG08_32_01_WGM64
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+- [2, 3, 0, 1]
+- - - [3136, 256, 64, 64]
+    - [2, 16074.9]
+  - - [784, 512, 64, 128]
+    - [2, 15833.4]
+  - - [49, 2048, 128, 512]
+    - [0, 0.0]
+  - - [784, 128, 64, 512]
+    - [6, 16500.8]
+  - - [196, 1024, 64, 256]
+    - [1, 13742.4]
+  - - [3136, 64, 128, 64]
+    - [3, 14617.2]
+  - - [784, 512, 128, 128]
+    - [2, 16298.0]
+  - - [196, 1024, 128, 256]
+    - [8, 14030.9]
+  - - [196, 256, 64, 1024]
+    - [4, 14105.5]
+  - - [3136, 64, 64, 64]
+    - [5, 13485.4]
+  - - [3136, 64, 64, 256]
+    - [7, 16852.6]
+  - - [3136, 64, 128, 256]
+    - [5, 17346.9]
+  - - [784, 128, 128, 512]
+    - [6, 16988.5]
+  - - [49, 2048, 64, 512]
+    - [0, 0.0]
+  - - [196, 256, 128, 1024]
+    - [4, 14648.5]
+  - - [49, 512, 64, 2048]
+    - [0, 0.0]
+  - - [49, 512, 128, 2048]
+    - [0, 0.0]
+  - - [3136, 256, 128, 64]
+    - [2, 16431.6]
+- null
diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml
new file mode 100644
index 0000000000..fd8f6eefe3
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bjlk_SB.yaml
@@ -0,0 +1,1467 @@
+- {MinimumRequiredVersion: 4.3.0}
+- vega20
+- gfx906
+- [Device 66a0, Device 66a7]
+- AssignedDerivedParameters: true
+  Batched: true
+  ComplexConjugateA: false
+  ComplexConjugateB: false
+  DataType: 0
+  HighPrecisionAccumulate: false
+  Index0: 0
+  Index01A: 0
+  Index01B: 1
+  Index1: 1
+  IndexAssignmentsA: [0, 3, 2]
+  IndexAssignmentsB: [1, 3, 2]
+  IndexUnroll: 3
+  IndexUnrollA: 1
+  IndexUnrollB: 1
+  IndicesBatch: [2]
+  IndicesFree: [0, 1]
+  IndicesSummation: [3]
+  NumIndicesBatch: 1
+  NumIndicesC: 3
+  NumIndicesFree: 2
+  NumIndicesSummation: 1
+  OperationType: GEMM
+  SilentHighPrecisionAccumulate: false
+  TLUA: true
+  TLUB: true
+  Tensor0: 0
+  Tensor1: 1
+  TileA: 0
+  TileB: 1
+  TotalIndices: 4
+  TransposeA: false
+  TransposeB: true
+  UseBeta: true
+  UseInitialStrides: false
+- - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 4
+    LVPB: 2
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 2560
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 0
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 64
+    LSPA: 8
+    LSPB: 8
+    LVCA: 16
+    LVCB: 16
+    LVPA: 2
+    LVPB: 2
+    LdsNumElements: 2048
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 1024
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 1536
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 128
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 1
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_08_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 8, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 8
+    LVPB: 2
+    LdsNumElements: 3328
+    LdsNumElementsAlignedA: 256
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 256
+    LdsOffsetB_Blk: 2304
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 32
+    MacroTile1: 128
+    MacroTileA: 32
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 2
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 64
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 4
+    LVPB: 4
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 3
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 2
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 4
+    LVPB: 2
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 2560
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 16
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 4
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW02_WG16_16_01_WGM64
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 2
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 2
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 4
+    LVPB: 2
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 2560
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 16
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 5
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW02_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 2
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 1
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 1
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 64
+    LSPA: 4
+    LSPB: 4
+    LVCA: 64
+    LVCB: 64
+    LVPA: 4
+    LVPB: 4
+    LdsNumElements: 2048
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 1024
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 1536
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 2
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 6
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 4
+    LVPB: 2
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 2560
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 7
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_16_01_WGM64
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 8
+    LVPB: 2
+    LdsNumElements: 3328
+    LdsNumElementsAlignedA: 256
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 256
+    LdsOffsetB_Blk: 2304
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 32
+    MacroTile1: 128
+    MacroTileA: 32
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 8
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM64
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 128
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 8
+    LVPB: 2
+    LdsNumElements: 3328
+    LdsNumElementsAlignedA: 256
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 256
+    LdsOffsetB_Blk: 2304
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 32
+    MacroTile1: 128
+    MacroTileA: 32
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [1, 3, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 1
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: true
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: true
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 9
+    SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG08_32_01_WGM08
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+- [2, 3, 0, 1]
+- - - [3136, 256, 64, 64]
+    - [1, 8023.34]
+  - - [784, 512, 64, 128]
+    - [0, 8118.42]
+  - - [49, 2048, 128, 512]
+    - [5, 6709.69]
+  - - [784, 128, 64, 512]
+    - [7, 8457.53]
+  - - [196, 1024, 64, 256]
+    - [8, 7259.55]
+  - - [3136, 64, 128, 64]
+    - [6, 7772.99]
+  - - [784, 512, 128, 128]
+    - [0, 8225.65]
+  - - [196, 1024, 128, 256]
+    - [9, 7369.53]
+  - - [196, 256, 64, 1024]
+    - [2, 7306.01]
+  - - [3136, 64, 64, 64]
+    - [6, 7555.78]
+  - - [3136, 64, 64, 256]
+    - [1, 8713.92]
+  - - [3136, 64, 128, 256]
+    - [1, 8912.31]
+  - - [784, 128, 128, 512]
+    - [0, 8609.88]
+  - - [49, 2048, 64, 512]
+    - [4, 6575.52]
+  - - [196, 256, 128, 1024]
+    - [2, 7483.58]
+  - - [49, 512, 64, 2048]
+    - [3, 6562.92]
+  - - [49, 512, 128, 2048]
+    - [1, 6808.05]
+  - - [3136, 256, 128, 64]
+    - [1, 8120.83]
+- null
diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_HB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_HB.yaml
new file mode 100644
index 0000000000..30f0f3e969
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_HB.yaml
@@ -0,0 +1,1443 @@
+- {MinimumRequiredVersion: 4.3.0}
+- vega20
+- gfx906
+- [Device 66a0, Device 66a7]
+- AssignedDerivedParameters: true
+  Batched: true
+  ComplexConjugateA: false
+  ComplexConjugateB: false
+  DataType: 4
+  HighPrecisionAccumulate: false
+  Index0: 0
+  Index01A: 0
+  Index01B: 1
+  Index1: 1
+  IndexAssignmentsA: [0, 3, 2]
+  IndexAssignmentsB: [3, 1, 2]
+  IndexUnroll: 3
+  IndexUnrollA: 1
+  IndexUnrollB: 0
+  IndicesBatch: [2]
+  IndicesFree: [0, 1]
+  IndicesSummation: [3]
+  NumIndicesBatch: 1
+  NumIndicesC: 3
+  NumIndicesFree: 2
+  NumIndicesSummation: 1
+  OperationType: GEMM
+  SilentHighPrecisionAccumulate: false
+  TLUA: true
+  TLUB: false
+  Tensor0: 0
+  Tensor1: 1
+  TileA: 0
+  TileB: 1
+  TotalIndices: 4
+  TransposeA: false
+  TransposeB: false
+  UseBeta: true
+  UseInitialStrides: false
+- - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 8
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 16
+    LSPB: 128
+    LVCA: 16
+    LVCB: 2
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 8192
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 2048
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 128
+    MacroTileA: 128
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 64
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 0
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_DTL0_GRVW08_GSU01_LPB00_PGR1_PLR1_TT08_08_VW08_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 8]
+    ThreadTile0: 8
+    ThreadTile1: 8
+    ThreadTileA: 8
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 8
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 8
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 16
+    LSPB: 128
+    LVCA: 16
+    LVCB: 2
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 8192
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 2048
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 128
+    MacroTileA: 128
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 64
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 1
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_DTL0_GRVW08_GSU01_LPB00_PGR1_PLR1_TT08_08_VW08_WG16_16_01_WGM08
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 8]
+    ThreadTile0: 8
+    ThreadTile1: 8
+    ThreadTileA: 8
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 8
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 16
+    LSPB: 32
+    LVCA: 16
+    LVCB: 8
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 8192
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 2
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 128
+    MacroTile1: 32
+    MacroTileA: 128
+    MacroTileB: 32
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 2
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x032x16_DTL0_GRVW08_GSU01_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_08_02_WGM64
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 8, 2]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 16
+    LSPB: 64
+    LVCA: 16
+    LVCB: 4
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 7232
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 1152
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 3
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x16_DTL0_GRVW08_GSU01_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 16
+    LSPB: 64
+    LVCA: 16
+    LVCB: 4
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 7232
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 1152
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 4
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x16_DTL0_GRVW08_GSU01_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM08
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 5
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 32
+    LSPA: 16
+    LSPB: 64
+    LVCA: 16
+    LVCB: 4
+    LVPA: 2
+    LVPB: 8
+    LdsNumElements: 6272
+    LdsOffsetA: 0
+    LdsOffsetB: 4096
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 32
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 5
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x32_DTL0_GRVW08_GSU05_LPB04_PGR0_PLR1_TT08_04_VW04_WG16_16_01_WGM08
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 16
+    LSPA: 16
+    LSPB: 128
+    LVCA: 16
+    LVCB: 2
+    LVPA: 4
+    LVPB: 16
+    LdsNumElements: 3072
+    LdsOffsetA: 0
+    LdsOffsetB: 1024
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 6
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT064x128x16_DTL0_GRVW08_GSU01_LPB00_PGR0_PLR1_TT08_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 8
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 256
+    LSCB: 8
+    LSPA: 8
+    LSPB: 64
+    LVCA: 32
+    LVCB: 4
+    LVPA: 1
+    LVPB: 32
+    LdsNumElements: 6720
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 8
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 256
+    MacroTile1: 64
+    MacroTileA: 256
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 64
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 7
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x064x08_DTL0_GRVW08_GSU01_LPB08_PGR1_PLR1_TT08_08_VW08_WG32_08_01_WGM01
+    SubGroup0: 32
+    SubGroup1: 8
+    SubGroupA: 32
+    SubGroupB: 8
+    ThreadTile: [8, 8]
+    ThreadTile0: 8
+    ThreadTile1: 8
+    ThreadTileA: 8
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 8
+    WorkGroup: [32, 8, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 16
+    LSPA: 16
+    LSPB: 128
+    LVCA: 16
+    LVCB: 2
+    LVPA: 4
+    LVPB: 16
+    LdsNumElements: 3136
+    LdsOffsetA: 0
+    LdsOffsetB: 1024
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: false
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 8
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT064x128x16_DTL0_GRVW08_GSU01_LPB04_PGR0_PLR0_TT08_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 16
+    LSPB: 64
+    LVCA: 16
+    LVCB: 4
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 3136
+    LdsOffsetA: 0
+    LdsOffsetB: 2048
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 9
+    SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x064x16_DTL0_GRVW08_GSU01_LPB04_PGR0_PLR1_TT08_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+- [2, 3, 0, 1]
+- - - [3136, 64, 128, 64]
+    - [7, 14524.3]
+  - - [784, 512, 64, 128]
+    - [6, 15406.1]
+  - - [3136, 256, 64, 64]
+    - [0, 15900.8]
+  - - [784, 128, 128, 512]
+    - [0, 15815.2]
+  - - [784, 128, 64, 512]
+    - [1, 15050.8]
+  - - [3136, 512, 1, 2048]
+    - [5, 14833.5]
+  - - [12544, 256, 1, 1024]
+    - [2, 15072.8]
+  - - [3136, 64, 128, 256]
+    - [9, 17266.8]
+  - - [3136, 64, 64, 256]
+    - [3, 16825.1]
+  - - [3136, 2048, 1, 512]
+    - [4, 16285.1]
+  - - [784, 512, 128, 128]
+    - [8, 15812.1]
+  - - [3136, 64, 64, 64]
+    - [7, 13432.6]
+  - - [12544, 1024, 1, 256]
+    - [3, 16901.1]
+  - - [3136, 256, 128, 64]
+    - [0, 16310.9]
+- null
diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_SB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_SB.yaml
new file mode 100644
index 0000000000..3a7cd4990c
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Ailk_Bljk_SB.yaml
@@ -0,0 +1,1465 @@
+- {MinimumRequiredVersion: 4.4.0}
+- vega20
+- gfx906
+- [Device 66a0, Device 66a7]
+- AssignedDerivedParameters: true
+  Batched: true
+  ComplexConjugateA: false
+  ComplexConjugateB: false
+  DataType: 0
+  HighPrecisionAccumulate: false
+  Index0: 0
+  Index01A: 0
+  Index01B: 1
+  Index1: 1
+  IndexAssignmentsA: [0, 3, 2]
+  IndexAssignmentsB: [3, 1, 2]
+  IndexUnroll: 3
+  IndexUnrollA: 1
+  IndexUnrollB: 0
+  IndicesBatch: [2]
+  IndicesFree: [0, 1]
+  IndicesSummation: [3]
+  NumIndicesBatch: 1
+  NumIndicesC: 3
+  NumIndicesFree: 2
+  NumIndicesSummation: 1
+  OperationType: GEMM
+  SilentHighPrecisionAccumulate: false
+  TLUA: true
+  TLUB: false
+  Tensor0: 0
+  Tensor1: 1
+  TileA: 0
+  TileB: 1
+  TotalIndices: 4
+  TransposeA: false
+  TransposeB: false
+  UseBeta: true
+  UseInitialStrides: false
+- - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 2
+    GlobalSplitU: 4
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 2
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 16
+    LSPA: 8
+    LSPB: 32
+    LVCA: 32
+    LVCB: 8
+    LVPA: 4
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 2
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 0
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW02_GSU04_PGR1_PLR1_TT04_04_VW02_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: &id004 [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 2
+    WorkGroup: &id002 [16, 16, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 8
+    LSPA: 8
+    LSPB: 64
+    LVCA: 16
+    LVCB: 2
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 2048
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 1024
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 1536
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 128
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 1
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG16_08_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: &id003 [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 8, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 8
+    LSPA: 8
+    LSPB: 128
+    LVCA: 32
+    LVCB: 2
+    LVPA: 4
+    LVPB: 32
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 512
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 512
+    LdsOffsetB_Blk: 2560
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 128
+    MacroTileA: 64
+    MacroTileB: 128
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 2
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG08_32_01_WGM01
+    SubGroup0: 8
+    SubGroup1: 32
+    SubGroupA: 8
+    SubGroupB: 32
+    ThreadTile: &id001 [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [8, 32, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 8
+    LSPA: 8
+    LSPB: 64
+    LVCA: 32
+    LVCB: 4
+    LVPA: 2
+    LVPB: 32
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 3
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: *id001
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id002
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 8
+    LSPA: 8
+    LSPB: 64
+    LVCA: 32
+    LVCB: 4
+    LVPA: 2
+    LVPB: 32
+    LdsNumElements: 3584
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 4
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG32_08_01_WGM08
+    SubGroup0: 32
+    SubGroup1: 8
+    SubGroupA: 32
+    SubGroupB: 8
+    ThreadTile: *id003
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: &id005 [32, 8, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 16
+    LSPA: 16
+    LSPB: 64
+    LVCA: 16
+    LVCB: 4
+    LVPA: 4
+    LVPB: 16
+    LdsNumElements: 2048
+    LdsOffsetA: 0
+    LdsOffsetB: 1024
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: false
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 5
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR0_PLR0_TT04_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: *id004
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id002
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 64
+    LSCB: 16
+    LSPA: 16
+    LSPB: 64
+    LVCA: 16
+    LVCB: 4
+    LVPA: 4
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 64
+    MacroTile1: 64
+    MacroTileA: 64
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 16
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 6
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_VW04_WG16_16_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: *id004
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id002
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 8
+    LSPB: 64
+    LVCA: 32
+    LVCB: 4
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 7168
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 7
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_VW04_WG32_08_01_WGM01
+    SubGroup0: 32
+    SubGroup1: 8
+    SubGroupA: 32
+    SubGroupB: 8
+    ThreadTile: *id003
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id005
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 8
+    LSPB: 64
+    LVCA: 32
+    LVCB: 4
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 7168
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 8
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM08
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: *id001
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id002
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 4
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 128
+    LSCB: 16
+    LSPA: 8
+    LSPB: 64
+    LVCA: 32
+    LVCB: 4
+    LVPA: 2
+    LVPB: 16
+    LdsNumElements: 7168
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 1024
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 0
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 16
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [0, 3, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 1
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: true
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: false
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 9
+    SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_VW04_WG16_16_01_WGM64
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: *id001
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id002
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+- [2, 3, 0, 1]
+- - - [3136, 64, 128, 64]
+    - [5, 7734.96]
+  - - [784, 512, 64, 128]
+    - [2, 7939.64]
+  - - [3136, 256, 64, 64]
+    - [5, 7970.44]
+  - - [784, 128, 128, 512]
+    - [9, 7502.7]
+  - - [784, 128, 64, 512]
+    - [8, 7371.52]
+  - - [3136, 512, 1, 2048]
+    - [0, 6769.36]
+  - - [12544, 256, 1, 1024]
+    - [8, 7289.15]
+  - - [3136, 64, 128, 256]
+    - [3, 8635.2]
+  - - [3136, 64, 64, 256]
+    - [3, 8461.01]
+  - - [3136, 2048, 1, 512]
+    - [7, 7924.34]
+  - - [784, 512, 128, 128]
+    - [1, 8065.06]
+  - - [3136, 64, 64, 64]
+    - [6, 7408.73]
+  - - [12544, 1024, 1, 256]
+    - [4, 8637.02]
+  - - [3136, 256, 128, 64]
+    - [5, 8065.85]
+- null
diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_HB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_HB.yaml
new file mode 100644
index 0000000000..7758863d15
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_HB.yaml
@@ -0,0 +1,1028 @@
+- {MinimumRequiredVersion: 4.3.0}
+- vega20
+- gfx906
+- [Device 66a0, Device 66a7]
+- AssignedDerivedParameters: true
+  Batched: true
+  ComplexConjugateA: false
+  ComplexConjugateB: false
+  DataType: 4
+  HighPrecisionAccumulate: false
+  Index0: 0
+  Index01A: 0
+  Index01B: 1
+  Index1: 1
+  IndexAssignmentsA: [3, 0, 2]
+  IndexAssignmentsB: [3, 1, 2]
+  IndexUnroll: 3
+  IndexUnrollA: 0
+  IndexUnrollB: 0
+  IndicesBatch: [2]
+  IndicesFree: [0, 1]
+  IndicesSummation: [3]
+  NumIndicesBatch: 1
+  NumIndicesC: 3
+  NumIndicesFree: 2
+  NumIndicesSummation: 1
+  OperationType: GEMM
+  SilentHighPrecisionAccumulate: false
+  TLUA: false
+  TLUB: false
+  Tensor0: 0
+  Tensor1: 1
+  TileA: 0
+  TileB: 1
+  TotalIndices: 4
+  TransposeA: true
+  TransposeB: false
+  UseBeta: true
+  UseInitialStrides: false
+- - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 8
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 5
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 64
+    LSPB: 64
+    LVCA: 4
+    LVCB: 4
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 16512
+    LdsOffsetA: 0
+    LdsOffsetB: 8192
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 32
+    MacroTile0: 256
+    MacroTile1: 256
+    MacroTileA: 256
+    MacroTileB: 256
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 256
+    NumGlobalWriteVectorsPerThread: 64
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 4
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: false
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 0
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x32_GRVW08_GSU05_LPB04_PGR0_PLR1_TT16_16_VW04_WG16_16_01_WGM08
+    SubGroup0: 16
+    SubGroup1: 16
+    SubGroupA: 16
+    SubGroupB: 16
+    ThreadTile: [16, 16]
+    ThreadTile0: 16
+    ThreadTile1: 16
+    ThreadTileA: 16
+    ThreadTileB: 16
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 16, 1]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 24
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 5
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 24
+    LSCB: 24
+    LSPA: 64
+    LSPB: 16
+    LVCA: 3
+    LVCB: 12
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1536
+    LdsNumElementsAlignedB: 512
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1536
+    LdsOffsetB_Blk: 3584
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 6
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 1
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 1
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x24_GRVW08_GSU05_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 4, 4]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 16
+    LSCB: 16
+    LSPA: 64
+    LSPB: 32
+    LVCA: 4
+    LVCB: 8
+    LVPA: 16
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 2
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 32
+    MacroTileA: 64
+    MacroTileB: 32
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 8
+    NumGlobalWriteVectorsPerThread: 2
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 2
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x032x16_GRVW08_GSU01_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM64
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 8, 2]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 5
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 64
+    LSPB: 16
+    LVCA: 4
+    LVCB: 16
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 6784
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 1
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 3
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x32_GRVW08_GSU05_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM08
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 4, 4]
+    WorkGroupMapping: 8
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 5
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 64
+    LSPB: 16
+    LVCA: 4
+    LVCB: 16
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 6784
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 1
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 4
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x32_GRVW08_GSU05_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 4, 4]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 16
+    LSCB: 16
+    LSPA: 64
+    LSPB: 32
+    LVCA: 4
+    LVCB: 8
+    LVPA: 16
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1024
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1024
+    LdsOffsetB_Blk: 3072
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 2
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 32
+    MacroTileA: 64
+    MacroTileB: 32
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 8
+    NumGlobalWriteVectorsPerThread: 2
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 5
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x032x16_GRVW08_GSU01_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM01
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 8, 2]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 2
+    AssertSummationElementMultiple: 2
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 8
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 8
+    GlobalSplitU: 3
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 64
+    LSPB: 16
+    LVCA: 4
+    LVCB: 16
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 6784
+    LdsNumElementsAlignedA: 2048
+    LdsNumElementsAlignedB: 640
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2048
+    LdsOffsetB_Blk: 6144
+    LdsPadA: 0
+    LdsPadB: 4
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 2
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 1
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 1
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 4
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 6
+    SolutionNameMin: Cijk_Alik_Bljk_HB_MT064x016x32_GRVW08_GSU03_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 2
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 4, 4]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+- [2, 3, 0, 1]
+- - - [512, 2048, 1, 49]
+    - [0, 0.0]
+  - - [512, 128, 1, 784]
+    - [6, 4014.08]
+  - - [2048, 512, 1, 49]
+    - [0, 0.0]
+  - - [1024, 256, 1, 196]
+    - [2, 5179.46]
+  - - [256, 64, 1, 3136]
+    - [1, 3243.7]
+  - - [256, 1024, 1, 196]
+    - [5, 5264.37]
+  - - [64, 256, 1, 3136]
+    - [4, 3227.3]
+  - - [128, 512, 1, 784]
+    - [6, 4170.47]
+  - - [64, 64, 1, 3136]
+    - [3, 810.925]
+- null
diff --git a/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_SB.yaml b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_SB.yaml
new file mode 100644
index 0000000000..fb602a76e4
--- /dev/null
+++ b/Tensile/Configs/miopen/archives/resnet50/logic/vega20_Cijk_Alik_Bljk_SB.yaml
@@ -0,0 +1,1179 @@
+- {MinimumRequiredVersion: 4.4.0}
+- vega20
+- gfx906
+- [Device 66a0, Device 66a7]
+- AssignedDerivedParameters: true
+  Batched: true
+  ComplexConjugateA: false
+  ComplexConjugateB: false
+  DataType: 0
+  HighPrecisionAccumulate: false
+  Index0: 0
+  Index01A: 0
+  Index01B: 1
+  Index1: 1
+  IndexAssignmentsA: [3, 0, 2]
+  IndexAssignmentsB: [3, 1, 2]
+  IndexUnroll: 3
+  IndexUnrollA: 0
+  IndexUnrollB: 0
+  IndicesBatch: [2]
+  IndicesFree: [0, 1]
+  IndicesSummation: [3]
+  NumIndicesBatch: 1
+  NumIndicesC: 3
+  NumIndicesFree: 2
+  NumIndicesSummation: 1
+  OperationType: GEMM
+  SilentHighPrecisionAccumulate: false
+  TLUA: false
+  TLUB: false
+  Tensor0: 0
+  Tensor1: 1
+  TileA: 0
+  TileB: 1
+  TotalIndices: 4
+  TransposeA: true
+  TransposeB: false
+  UseBeta: true
+  UseInitialStrides: false
+- - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 1
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 1
+    GlobalSplitU: 4
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 1
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 16
+    LSCB: 16
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 16
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1088
+    LdsNumElementsAlignedB: 320
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1088
+    LdsOffsetB_Blk: 3136
+    LdsPadA: 1
+    LdsPadB: 1
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 4
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 0
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_VW01_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: &id001 [4, 4]
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 1
+    WorkGroup: &id002 [16, 4, 4]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 1
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 1
+    GlobalSplitU: 8
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 1
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 6752
+    LdsNumElementsAlignedA: 2112
+    LdsNumElementsAlignedB: 576
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2112
+    LdsOffsetB_Blk: 6208
+    LdsPadA: 1
+    LdsPadB: 1
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 4
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 8
+    NumLoadsPerpendicularB: 2
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 1
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_VW01_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: *id001
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 1
+    WorkGroup: *id002
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 1
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 1
+    GlobalSplitU: 8
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 1
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 8
+    LSPB: 8
+    LVCA: 32
+    LVCB: 32
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 3456
+    LdsNumElementsAlignedA: 1088
+    LdsNumElementsAlignedB: 320
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1088
+    LdsOffsetB_Blk: 3136
+    LdsPadA: 2
+    LdsPadB: 2
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 32
+    MacroTile1: 8
+    MacroTileA: 32
+    MacroTileB: 8
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 1
+    NumGlobalWriteVectorsPerThread: 1
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 2
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_VW02_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: [2, 2]
+    ThreadTile0: 2
+    ThreadTile1: 2
+    ThreadTileA: 2
+    ThreadTileB: 2
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 2
+    WorkGroup: *id002
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 1
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 1
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 16
+    LSCB: 16
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 16
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1088
+    LdsNumElementsAlignedB: 576
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1088
+    LdsOffsetB_Blk: 3136
+    LdsPadA: 4
+    LdsPadB: 4
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 2
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 32
+    MacroTileA: 64
+    MacroTileB: 32
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 8
+    NumGlobalWriteVectorsPerThread: 2
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 2
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 3
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM01
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: *id001
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: &id003 [16, 8, 2]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 16
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 1
+    GlobalLoadVectorWidthB: 1
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 1
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 16
+    LSCB: 16
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 16
+    LVPB: 16
+    LdsNumElements: 4096
+    LdsNumElementsAlignedA: 1088
+    LdsNumElementsAlignedB: 576
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1088
+    LdsOffsetB_Blk: 3136
+    LdsPadA: 4
+    LdsPadB: 4
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 2
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 32
+    MacroTileA: 64
+    MacroTileB: 32
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 8
+    NumGlobalWriteVectorsPerThread: 2
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 2
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 4
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_VW04_WG16_08_02_WGM64
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: *id001
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: *id003
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 2
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 2
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 8
+    LSCB: 8
+    LSPA: 64
+    LSPB: 64
+    LVCA: 4
+    LVCB: 4
+    LVPA: 32
+    LVPB: 32
+    LdsNumElements: 3664
+    LdsNumElementsAlignedA: 1088
+    LdsNumElementsAlignedB: 576
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1088
+    LdsOffsetB_Blk: 3136
+    LdsPadA: 2
+    LdsPadB: 2
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 128
+    MacroTile1: 64
+    MacroTileA: 128
+    MacroTileB: 64
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 16
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 5
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x08_GRVW02_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_VW02_WG32_08_01_WGM64
+    SubGroup0: 32
+    SubGroup1: 8
+    SubGroupA: 32
+    SubGroupB: 8
+    ThreadTile: [4, 8]
+    ThreadTile0: 4
+    ThreadTile1: 8
+    ThreadTileA: 4
+    ThreadTileB: 8
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 2
+    WorkGroup: [32, 8, 1]
+    WorkGroupMapping: 64
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 32
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 2
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 2
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 2
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 32
+    LSCB: 32
+    LSPA: 16
+    LSPB: 16
+    LVCA: 16
+    LVCB: 16
+    LVPA: 8
+    LVPB: 8
+    LdsNumElements: 6784
+    LdsNumElementsAlignedA: 2112
+    LdsNumElementsAlignedB: 576
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 4096
+    LdsOffsetB: 2112
+    LdsOffsetB_Blk: 6208
+    LdsPadA: 2
+    LdsPadB: 2
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 4
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 64
+    MacroTile1: 16
+    MacroTileA: 64
+    MacroTileB: 16
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 4
+    NumGlobalWriteVectorsPerThread: 2
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 4
+    NumLoadsPerpendicularB: 1
+    NumThreads: 256
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 6
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW02_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_04_VW02_WG16_04_04_WGM01
+    SubGroup0: 16
+    SubGroup1: 4
+    SubGroupA: 16
+    SubGroupB: 4
+    ThreadTile: *id001
+    ThreadTile0: 4
+    ThreadTile1: 4
+    ThreadTileA: 4
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 2
+    WorkGroup: *id002
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+  - AssertFree0ElementMultiple: 1
+    AssertSummationElementMultiple: 1
+    AssignedDerivedParameters: true
+    AssignedProblemIndependentDerivedParameters: true
+    BufferLoad: true
+    BufferStore: true
+    CheckTensorDimAsserts: false
+    DepthU: 8
+    DirectToLds: false
+    DirectToLdsA: false
+    DirectToLdsB: false
+    DisableKernelPieces: 0
+    EdgeType: ShiftPtr
+    FractionalLoad: 1
+    GlobalLoadVectorWidthA: 4
+    GlobalLoadVectorWidthB: 2
+    GlobalRead2A: true
+    GlobalRead2B: true
+    GlobalReadCoalesceGroupA: true
+    GlobalReadCoalesceGroupB: true
+    GlobalReadCoalesceVectorA: true
+    GlobalReadCoalesceVectorB: true
+    GlobalReadVectorWidth: 4
+    GlobalSplitU: 1
+    GlobalSplitUSummationAssignmentRoundRobin: true
+    GlobalSplitUWorkGroupMappingRoundRobin: false
+    GlobalWriteVectorWidth: 4
+    InnerUnroll: 1
+    KernelLanguage: Assembly
+    LSCA: 8
+    LSCB: 8
+    LSPA: 64
+    LSPB: 32
+    LVCA: 2
+    LVCB: 4
+    LVPA: 16
+    LVPB: 16
+    LdsNumElements: 3424
+    LdsNumElementsAlignedA: 1088
+    LdsNumElementsAlignedB: 320
+    LdsOffsetA: 0
+    LdsOffsetA_Blk: 2048
+    LdsOffsetB: 1088
+    LdsOffsetB_Blk: 3136
+    LdsPadA: 4
+    LdsPadB: 4
+    LocalDotLayout: 1
+    LocalRead2A: true
+    LocalRead2B: true
+    LocalSplitU: 1
+    LocalWrite2A: true
+    LocalWrite2B: true
+    LocalWriteUseSgprA: false
+    LocalWriteUseSgprB: false
+    LoopDoWhile: false
+    LoopTail: true
+    LoopUnroll: 8
+    MacroTile0: 128
+    MacroTile1: 32
+    MacroTileA: 128
+    MacroTileB: 32
+    MacroTileShapeMax: 64
+    MacroTileShapeMin: 1
+    MaxOccupancy: 40
+    MinGlobalWriteVectorWidth: 1
+    NonTemporalA: 0
+    NonTemporalB: 0
+    NonTemporalC: 0
+    NumElementsPerThread: 32
+    NumGlobalWriteVectorsPerThread: 8
+    NumLoadsCoalescedA: 1
+    NumLoadsCoalescedB: 1
+    NumLoadsPerpendicularA: 2
+    NumLoadsPerpendicularB: 1
+    NumThreads: 128
+    PerformanceSyncLocation: -1
+    PerformanceWaitCount: -1
+    PerformanceWaitLocation: -1
+    PersistentKernel: 0
+    PreciseBoundsCheck: false
+    PrefetchGlobalRead: true
+    PrefetchLocalRead: true
+    ProblemType:
+      AssignedDerivedParameters: true
+      Batched: true
+      ComplexConjugateA: false
+      ComplexConjugateB: false
+      DataType: 0
+      HighPrecisionAccumulate: false
+      Index0: 0
+      Index01A: 0
+      Index01B: 1
+      Index1: 1
+      IndexAssignmentsA: [3, 0, 2]
+      IndexAssignmentsB: [3, 1, 2]
+      IndexUnroll: 3
+      IndexUnrollA: 0
+      IndexUnrollB: 0
+      IndicesBatch: [2]
+      IndicesFree: [0, 1]
+      IndicesSummation: [3]
+      NumIndicesBatch: 1
+      NumIndicesC: 3
+      NumIndicesFree: 2
+      NumIndicesSummation: 1
+      OperationType: GEMM
+      SilentHighPrecisionAccumulate: false
+      TLUA: false
+      TLUB: false
+      Tensor0: 0
+      Tensor1: 1
+      TileA: 0
+      TileB: 1
+      TotalIndices: 4
+      TransposeA: true
+      TransposeB: false
+      UseBeta: true
+      UseInitialStrides: false
+    SolutionIndex: 7
+    SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x08_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT08_04_VW04_WG16_08_01_WGM01
+    SubGroup0: 16
+    SubGroup1: 8
+    SubGroupA: 16
+    SubGroupB: 8
+    ThreadTile: [8, 4]
+    ThreadTile0: 8
+    ThreadTile1: 4
+    ThreadTileA: 8
+    ThreadTileB: 4
+    UnrollMemFence: false
+    UseSgprForGRO: 0
+    Valid: true
+    VectorAtomicWidth: 1
+    VectorStore: true
+    VectorWidth: 4
+    WorkGroup: [16, 8, 1]
+    WorkGroupMapping: 1
+    WorkGroupMappingType: B
+    fractionalPerpOverhangA: 0
+    fractionalPerpOverhangB: 0
+- [2, 3, 0, 1]
+- - - [512, 2048, 1, 49]
+    - [7, 3733.89]
+  - - [512, 128, 1, 784]
+    - [6, 2906.12]
+  - - [2048, 512, 1, 49]
+    - [5, 3528.86]
+  - - [1024, 256, 1, 196]
+    - [4, 3691.11]
+  - - [256, 64, 1, 3136]
+    - [1, 2643.02]
+  - - [256, 1024, 1, 196]
+    - [3, 3964.52]
+  - - [64, 256, 1, 3136]
+    - [1, 2732.99]
+  - - [128, 512, 1, 784]
+    - [0, 3132.94]
+  - - [64, 64, 1, 3136]
+    - [2, 1016.18]
+- null
diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml b/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml
index 5e59af509b..c0f6ebcf0f 100644
--- a/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml
+++ b/Tensile/Configs/miopen/boiler/library_logic_vega10_only.yml
@@ -1,7 +1,7 @@
 
 LibraryLogic:
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml b/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml
new file mode 100644
index 0000000000..9d77fd954c
--- /dev/null
+++ b/Tensile/Configs/miopen/boiler/library_logic_vega20_only.yml
@@ -0,0 +1,23 @@
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/make_all.sh b/Tensile/Configs/miopen/make_all.sh
index a431303696..c50511caba 100755
--- a/Tensile/Configs/miopen/make_all.sh
+++ b/Tensile/Configs/miopen/make_all.sh
@@ -2,3 +2,6 @@ make TYPE=hgemm
 make TYPE=sgemm
 #make TYPE=hgemm SOLUTION_SKINNY=solutions/hgemm_quick.yml
 #make TYPE=sgemm =solutions/hgemm_quick.yml
+##make SCHED=vega20 TYPE=hgemm
+##make SCHED=vega20 TYPE=sgemm
+##make SCHED=vega20 TYPE=sgemm EXPLORE_LEVEL=7
diff --git a/Tensile/Configs/miopen/problems/nn/resnet50_all.yml b/Tensile/Configs/miopen/problems/nn/resnet50_all.yml
new file mode 100644
index 0000000000..0682508008
--- /dev/null
+++ b/Tensile/Configs/miopen/problems/nn/resnet50_all.yml
@@ -0,0 +1,15 @@
+# Resnet50 NN
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,   512 ,   1,  2048 ]    # beta= 0
+          - Exact: [  3136 ,  2048 ,   1,   512 ]    # beta= 0
+          - Exact: [ 12544 ,   256 ,   1,  1024 ]    # beta= 0
+          - Exact: [ 12544 ,  1024 ,   1,   256 ]    # beta= 0
diff --git a/Tensile/Configs/miopen/problems/nt/resnet50_all.yml b/Tensile/Configs/miopen/problems/nt/resnet50_all.yml
new file mode 100644
index 0000000000..42d15bc1af
--- /dev/null
+++ b/Tensile/Configs/miopen/problems/nt/resnet50_all.yml
@@ -0,0 +1,19 @@
+# Resnet50 NT
+          - Exact: [    49 ,   512 ,  64,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 ,  64,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 ,  64,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 ,  64,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [    49 ,   512 , 128,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 , 128,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 , 128,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 , 128,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
diff --git a/Tensile/Configs/miopen/problems/tn/resnet50_all.yml b/Tensile/Configs/miopen/problems/tn/resnet50_all.yml
new file mode 100644
index 0000000000..0dc47be699
--- /dev/null
+++ b/Tensile/Configs/miopen/problems/tn/resnet50_all.yml
@@ -0,0 +1,10 @@
+# Resnet50 TN
+          - Exact: [    64 ,    64 ,   1,  3136 ]    # beta= 1
+          - Exact: [    64 ,   256 ,   1,  3136 ]    # beta= 1
+          - Exact: [   128 ,   512 ,   1,   784 ]    # beta= 1
+          - Exact: [   256 ,    64 ,   1,  3136 ]    # beta= 1
+          - Exact: [   256 ,  1024 ,   1,   196 ]    # beta= 1
+          - Exact: [   512 ,   128 ,   1,   784 ]    # beta= 1
+          - Exact: [   512 ,  2048 ,   1,    49 ]    # beta= 1
+          - Exact: [  1024 ,   256 ,   1,   196 ]    # beta= 1
+          - Exact: [  2048 ,   512 ,   1,    49 ]    # beta= 1
diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml b/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml
index 343e6c3c01..efae0606c4 100644
--- a/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml
+++ b/Tensile/Configs/miopen/solutions/hgemm_large_explore_3.yml
@@ -20,9 +20,10 @@
         - DepthU: [ 16, 24, 32 ]
         - VectorWidth: [8]
         - GlobalReadVectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
       BenchmarkForkParameters:
       JoinParameters:
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml b/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml
index 343e6c3c01..efae0606c4 100644
--- a/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml
+++ b/Tensile/Configs/miopen/solutions/hgemm_large_explore_5.yml
@@ -20,9 +20,10 @@
         - DepthU: [ 16, 24, 32 ]
         - VectorWidth: [8]
         - GlobalReadVectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
       BenchmarkForkParameters:
       JoinParameters:
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/hgemm_quick.yml b/Tensile/Configs/miopen/solutions/hgemm_quick.yml
index 1885450c33..d5ff1be3da 100644
--- a/Tensile/Configs/miopen/solutions/hgemm_quick.yml
+++ b/Tensile/Configs/miopen/solutions/hgemm_quick.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore small number of half solns
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -23,9 +23,10 @@
         - WorkGroupMapping: [8]
         - DepthU: [ 16 ]
         - VectorWidth: [2,8]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
       BenchmarkForkParameters:
       JoinParameters:
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml
index 5a4086f5bd..b1bfb16316 100644
--- a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml
+++ b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_3.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore large number of ~10K half solutions
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -18,21 +18,21 @@
           - [ 8, 16 ]
           - [ 16, 16 ]
         - WorkGroup:
-          #- [ 16, 8, 2 ]  LSU broken for Half?
-          #- [ 16, 4, 4 ]
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
           - [ 16,  8, 1 ]
           - [ 8, 32, 1 ]
           - [ 16, 16, 1 ]
           - [ 32,  8, 1 ]
-          #- GlobalSplitU: [1,3,5]
-        - GlobalSplitU: [1]
+        - GlobalSplitU: [1,3,5]
         - WorkGroupMapping: [1,8]
         - DepthU: [ 16,32 ]
         - VectorWidth: [8]
         - GlobalReadVectorWidth: [1,8]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
       BenchmarkForkParameters:
       JoinParameters:
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml
index 263fd76470..9f5c49dc5a 100644
--- a/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml
+++ b/Tensile/Configs/miopen/solutions/hgemm_skinny_explore_5.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore large number of ~10K half solutions
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -19,22 +19,22 @@
           - [ 8, 16 ]
           - [ 16, 16 ]
         - WorkGroup:
-          #- [ 16, 8, 2 ]  LSU broken for Half?
-          #- [ 16, 4, 4 ]
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
           - [ 16,  8, 1 ]
           - [ 8, 32, 1 ]
           - [ 16, 16, 1 ]
           - [ 32,  8, 1 ]
-          #- GlobalSplitU: [1,3,5]
-        - GlobalSplitU: [1]
+        - GlobalSplitU: [1,3,5]
         - WorkGroupMapping: [1,8,64]
-        - DepthU: [ 8,16,32 ]
+        - DepthU: [ 8,16,24,32 ]
         - VectorWidth: [4,8]
         - GlobalReadVectorWidth: [1,8]
         - LdsPadB: [0, -1 ]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
       BenchmarkForkParameters:
       JoinParameters:
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml b/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml
index 74cf6bd9f4..ae53569327 100644
--- a/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml
+++ b/Tensile/Configs/miopen/solutions/sgemm_large_explore_3.yml
@@ -25,4 +25,3 @@
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml b/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml
index 4914674343..5c84c18c20 100644
--- a/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml
+++ b/Tensile/Configs/miopen/solutions/sgemm_large_explore_5.yml
@@ -27,4 +27,3 @@
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml b/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml
new file mode 100644
index 0000000000..5c84c18c20
--- /dev/null
+++ b/Tensile/Configs/miopen/solutions/sgemm_large_explore_7.yml
@@ -0,0 +1,29 @@
+    # Explore a set of parameters appropriate for large matrices with large tiles:
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+        - GlobalSplitU: [1]
+      ForkParameters:
+        - PrefetchGlobalRead: [False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 6, 8 ]
+          - [ 8, 4 ]
+          - [ 8, 6 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16, 1 ]
+        - WorkGroupMapping: [1, 8, 64]
+        - DepthU: [ 16, 24, 32 ]
+        - VectorWidth: [-1]
+        - GlobalReadVectorWidth: [1,4]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
diff --git a/Tensile/Configs/miopen/solutions/sgemm_quick.yml b/Tensile/Configs/miopen/solutions/sgemm_quick.yml
index 91be1807ce..d269364b36 100644
--- a/Tensile/Configs/miopen/solutions/sgemm_quick.yml
+++ b/Tensile/Configs/miopen/solutions/sgemm_quick.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore small number of half solns
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -25,4 +25,3 @@
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml
index 4bad8e3cea..ca3b73da28 100644
--- a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml
+++ b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_3.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore large number of ~10K solutions
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -32,4 +32,3 @@
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml
index c8bd8eb4b5..b3e3c533e0 100644
--- a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml
+++ b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_5.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore large number of ~10K solutions
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -35,4 +35,3 @@
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml
index fba6e29558..c1c43f6eca 100644
--- a/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml
+++ b/Tensile/Configs/miopen/solutions/sgemm_skinny_explore_7.yml
@@ -1,7 +1,7 @@
   ########################################
   # Explore large number of ~10K solutions
   ########################################
-    - # NN workloads
+    - # Benchmark Group
       InitialSolutionParameters:
       BenchmarkCommonParameters:
         - EdgeType: ["ShiftPtr"]
@@ -34,4 +34,3 @@
       BenchmarkJoinParameters:
       BenchmarkFinalParameters:
         - ProblemSizes:
-
diff --git a/Tensile/Configs/miopen/types/hgemm_nt.yml b/Tensile/Configs/miopen/types/hgemm_nt.yml
index 96920277b9..2289690eca 100644
--- a/Tensile/Configs/miopen/types/hgemm_nt.yml
+++ b/Tensile/Configs/miopen/types/hgemm_nt.yml
@@ -1,6 +1,6 @@
 BenchmarkProblems:
   ########################################
-  # NN - standard
+  # NT - standard
   ########################################
   -
     - # ProblemType
diff --git a/Tensile/Configs/miopen/types/hgemm_tn.yml b/Tensile/Configs/miopen/types/hgemm_tn.yml
index dfaa22f225..b1fa3a944e 100644
--- a/Tensile/Configs/miopen/types/hgemm_tn.yml
+++ b/Tensile/Configs/miopen/types/hgemm_tn.yml
@@ -1,6 +1,6 @@
 BenchmarkProblems:
   ########################################
-  # NN - standard
+  # TN - standard
   ########################################
   -
     - # ProblemType
diff --git a/Tensile/Configs/miopen/types/hgemm_tt.yml b/Tensile/Configs/miopen/types/hgemm_tt.yml
index 7581d52f39..fc655d9ed6 100644
--- a/Tensile/Configs/miopen/types/hgemm_tt.yml
+++ b/Tensile/Configs/miopen/types/hgemm_tt.yml
@@ -1,6 +1,6 @@
 BenchmarkProblems:
   ########################################
-  # NN - standard
+  # TT - standard
   ########################################
   -
     - # ProblemType
diff --git a/Tensile/Configs/miopen/types/sgemm_nt.yml b/Tensile/Configs/miopen/types/sgemm_nt.yml
index 08931341ed..1dd119538e 100644
--- a/Tensile/Configs/miopen/types/sgemm_nt.yml
+++ b/Tensile/Configs/miopen/types/sgemm_nt.yml
@@ -1,6 +1,6 @@
 BenchmarkProblems:
   ########################################
-  # NN - standard
+  # NT - standard
   ########################################
   -
     - # ProblemType
diff --git a/Tensile/Configs/miopen/types/sgemm_tn.yml b/Tensile/Configs/miopen/types/sgemm_tn.yml
index dccba30c73..59c783fa09 100644
--- a/Tensile/Configs/miopen/types/sgemm_tn.yml
+++ b/Tensile/Configs/miopen/types/sgemm_tn.yml
@@ -1,6 +1,6 @@
 BenchmarkProblems:
   ########################################
-  # NN - standard
+  # TN - standard
   ########################################
   -
     - # ProblemType
diff --git a/Tensile/Configs/miopen/types/sgemm_tt.yml b/Tensile/Configs/miopen/types/sgemm_tt.yml
index 403fffc475..fbcaf50a2d 100644
--- a/Tensile/Configs/miopen/types/sgemm_tt.yml
+++ b/Tensile/Configs/miopen/types/sgemm_tt.yml
@@ -1,6 +1,6 @@
 BenchmarkProblems:
   ########################################
-  # NN - standard
+  # TT - standard
   ########################################
   -
     - # ProblemType
diff --git a/Tensile/Configs/rocblas_cgemm.yaml b/Tensile/Configs/rocblas_cgemm.yaml
index a3f1f9e583..c0a6f4511c 100644
--- a/Tensile/Configs/rocblas_cgemm.yaml
+++ b/Tensile/Configs/rocblas_cgemm.yaml
@@ -178,7 +178,7 @@ BenchmarkProblems:
 
 LibraryLogic:
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_dgemm_asm_full.yaml b/Tensile/Configs/rocblas_dgemm_asm_full.yaml
index b6edf01851..45c1544710 100644
--- a/Tensile/Configs/rocblas_dgemm_asm_full.yaml
+++ b/Tensile/Configs/rocblas_dgemm_asm_full.yaml
@@ -45,7 +45,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -71,7 +70,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -99,7 +97,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -156,7 +153,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -180,7 +176,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - PrefetchGlobalRead: [True]
         - ThreadTile:
@@ -212,7 +207,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -240,7 +234,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -297,7 +290,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -323,7 +315,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -363,7 +354,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -389,7 +379,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -408,11 +397,11 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega20"
-#   DeviceNames: ["Device 66a0", "Device 66a7"]
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
 #   ArchitectureName: "gfx906"
 
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_dgemm_asm_lite.yaml b/Tensile/Configs/rocblas_dgemm_asm_lite.yaml
index c24fd9e820..bdf4ae1e66 100644
--- a/Tensile/Configs/rocblas_dgemm_asm_lite.yaml
+++ b/Tensile/Configs/rocblas_dgemm_asm_lite.yaml
@@ -66,7 +66,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -115,7 +114,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -176,7 +174,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -225,7 +222,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -287,7 +283,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -336,7 +331,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -397,7 +391,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -446,7 +439,6 @@ BenchmarkProblems:
         - WorkGroupMapping: [8]
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [True]
-        - PreciseBoundsCheck: [False]
         - VectorWidth: [-1]
       ForkParameters:
         - ThreadTile:
@@ -466,11 +458,11 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega20"
-#   DeviceNames: ["Device 66a0", "Device 66a7"]
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
 #   ArchitectureName: "gfx906"
 
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_dgemm_hip_lite.yaml b/Tensile/Configs/rocblas_dgemm_hip_lite.yaml
index c818484ad7..080601a7f1 100644
--- a/Tensile/Configs/rocblas_dgemm_hip_lite.yaml
+++ b/Tensile/Configs/rocblas_dgemm_hip_lite.yaml
@@ -164,7 +164,7 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega10"
-#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
 #   ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_hgemm_asm_full.yaml b/Tensile/Configs/rocblas_hgemm_asm_full.yaml
index 3887d18331..81c6d43548 100644
--- a/Tensile/Configs/rocblas_hgemm_asm_full.yaml
+++ b/Tensile/Configs/rocblas_hgemm_asm_full.yaml
@@ -47,7 +47,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchLocalRead: [True]
         - WorkGroupMapping: [1]
       ForkParameters:
@@ -134,7 +133,6 @@ BenchmarkProblems:
 #       - EdgeType: ["ShiftPtr"]
 #       - LoopTail: [True]
 #       - KernelLanguage: ["Assembly"]
-#       - PreciseBoundsCheck: [False]
 #       - PrefetchGlobalRead: [True]
 #       - PrefetchLocalRead: [True]
 #       - WorkGroupMapping: [1]
@@ -186,7 +184,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
       ForkParameters:
@@ -298,7 +295,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchLocalRead: [True]
         - GlobalSplitU: [1]
       ForkParameters:
@@ -422,7 +418,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
         - WorkGroupMapping: [1]
@@ -456,7 +451,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [ True ]
         - PrefetchLocalRead: [ True ]
       ForkParameters:
@@ -505,7 +499,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - PrefetchGlobalRead: [ False, True ]
         - PrefetchLocalRead: [ False]
@@ -632,7 +625,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchLocalRead: [True]
         - WorkGroupMapping: [1]
       ForkParameters:
@@ -684,7 +676,6 @@ BenchmarkProblems:
 #       - EdgeType: ["ShiftPtr"]
 #       - LoopTail: [True]
 #       - KernelLanguage: ["Assembly"]
-#       - PreciseBoundsCheck: [False]
 #       - PrefetchGlobalRead: [True]
 #       - PrefetchLocalRead: [True]
 #       - WorkGroupMapping: [1]
@@ -730,7 +721,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
       ForkParameters:
@@ -804,7 +794,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
         - GlobalSplitU: [1]
@@ -923,7 +912,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
         - WorkGroupMapping: [1]
@@ -1040,7 +1028,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
       ForkParameters:
@@ -1084,7 +1071,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
         - GlobalSplitU: [1]
@@ -1128,7 +1114,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
         - WorkGroupMapping: [1]
@@ -1245,7 +1230,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
       ForkParameters:
@@ -1287,7 +1271,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchLocalRead: [True]
         - GlobalSplitU: [1]
       ForkParameters:
@@ -1324,7 +1307,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
         - WorkGroupMapping: [-1]
@@ -1420,11 +1402,11 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega20"
-#   DeviceNames: ["Device 66a0", "Device 66a7"]
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
 #   ArchitectureName: "gfx906"
 
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_hgemm_asm_lite.yaml b/Tensile/Configs/rocblas_hgemm_asm_lite.yaml
index fa2de7c5e4..d878511989 100644
--- a/Tensile/Configs/rocblas_hgemm_asm_lite.yaml
+++ b/Tensile/Configs/rocblas_hgemm_asm_lite.yaml
@@ -305,11 +305,11 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega20"
-#   DeviceNames: ["Device 66a0", "Device 66a7"]
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
 #   ArchitectureName: "gfx906"
 
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_hgemm_hip_lite.yaml b/Tensile/Configs/rocblas_hgemm_hip_lite.yaml
index bfcd3192a6..9daba38670 100644
--- a/Tensile/Configs/rocblas_hgemm_hip_lite.yaml
+++ b/Tensile/Configs/rocblas_hgemm_hip_lite.yaml
@@ -275,7 +275,7 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega10"
-#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
 #   ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_asm_full.yaml
deleted file mode 100644
index 4450bbe3b9..0000000000
--- a/Tensile/Configs/rocblas_hpa_hgemm_asm_full.yaml
+++ /dev/null
@@ -1,304 +0,0 @@
-GlobalParameters:
-  MinimumRequiredVersion: 4.4.0
-  PrintLevel: 1
-  ForceRedoBenchmarkProblems: True
-  ForceRedoLibraryLogic: True
-  ForceRedoLibraryClient: True
-  CMakeBuildType: Release
-  EnqueuesPerSync: 1
-  SyncsPerBenchmark: 1
-  LibraryPrintDebug: False
-  NumElementsToValidate: 0
-  ValidationMaxToPrint: 4
-  ValidationPrintValids: False
-  ShortNames: False
-  MergeFiles: True
-  Platform: 0
-  Device: 0
-  KernelTime: True
-  PinClocks: True
-  SleepPercent: 200
-  DataInitTypeBeta : 0
-
-BenchmarkProblems:
-
-  ########################################
-  # NN
-  ########################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      HighPrecisionAccumulate: True
-      TransposeA: False
-      TransposeB: False
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
-        - ThreadTile:
-          - [ 2, 4 ]
-          - [ 4, 8 ]
-          - [ 16, 8 ]
-        - WorkGroup:
-          - [ 32,  4,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [8]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
-        - ThreadTile:
-          - [ 4, 2 ]
-          - [ 4, 8 ]
-          - [ 16, 16 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [16]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-  ########################################
-  # NT
-  ########################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      HighPrecisionAccumulate: True
-      TransposeA: False
-      TransposeB: True
-      UseBeta: True
-      Batched: True
- 
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 4, 2 ]
-          - [ 4, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [ 32,  4,  1 ]
-        - DepthU: [8]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 8, 2 ]
-          - [ 2, 8 ]
-          - [ 16, 2 ]
-          - [ 2, 16 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [16]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-# ########################################
-# # TN
-# ########################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      HighPrecisionAccumulate: True
-      TransposeA: True
-      TransposeB: False
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 4, 2 ]
-          - [ 4, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [ 32,  4,  1 ]
-        - DepthU: [8]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 8, 2 ]
-          - [ 2, 8 ]
-          - [ 16, 2 ]
-          - [ 2, 16 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [16]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-# ########################################
-# # TT - standard
-# ########################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      HighPrecisionAccumulate: True
-      TransposeA: True
-      TransposeB: True
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 16, 4 ]
-          - [ 16, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [32]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 8, 2 ]
-          - [ 2, 2 ]
-          - [ 4, 2 ]
-          - [ 8, 4 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [16]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-LibraryLogic:
-    ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
-    ArchitectureName: "gfx900"
-
-#   ScheduleName: "mi25"
-#   DeviceNames: ["Device 6860"]
-#   ArchitectureName: "gfx900"
-
-#   ScheduleName: "r9nano"
-#   DeviceNames: ["Device 7300"]
-#   ArchitectureName: "gfx803"
-
-#   ScheduleName: "hip"
-#   DeviceNames: ["Device 0000"]
-#   ArchitectureName: "fallback"
-
-LibraryClient:
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml b/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml
index 4450bbe3b9..8edb7d64dd 100644
--- a/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml
+++ b/Tensile/Configs/rocblas_hpa_hgemm_asm_lite.yaml
@@ -286,7 +286,7 @@ BenchmarkProblems:
 
 LibraryLogic:
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml b/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml
index 619a1c63ee..cf75d26477 100644
--- a/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml
+++ b/Tensile/Configs/rocblas_hpa_hgemm_hip_lite.yaml
@@ -43,15 +43,15 @@ BenchmarkProblems:
         - KernelLanguage: ["Source"]
       ForkParameters:
         - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
         - ThreadTile:
-          - [ 2, 4 ]
+          - [ 4, 2 ]
           - [ 4, 8 ]
-          - [ 16, 8 ]
+          - [ 8, 8 ]
         - WorkGroup:
+          - [ 16, 16,  1 ]
           - [ 32,  4,  1 ]
-          - [  8,  8,  1 ]
         - DepthU: [8]
         - VectorWidth: [-1]
       BenchmarkForkParameters:
@@ -69,13 +69,13 @@ BenchmarkProblems:
       ForkParameters:
         - KernelLanguage: ["Source"]
         - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
         - ThreadTile:
-          - [ 4, 2 ]
-          - [ 4, 8 ]
-          - [ 16, 16 ]
-          - [ 8, 8 ]
+          - [ 8, 2 ]
+          - [ 2, 8 ]
+          - [ 16, 2 ]
+          - [ 2, 16 ]
         - WorkGroup:
           - [ 16, 16,  1 ]
           - [  8,  8,  1 ]
@@ -96,13 +96,13 @@ BenchmarkProblems:
       ForkParameters:
         - KernelLanguage: ["Source"]
         - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
         - ThreadTile:
-          - [ 4, 2 ]
-          - [ 4, 8 ]
-          - [ 16, 16 ]
-          - [ 8, 8 ]
+          - [ 8, 2 ]
+          - [ 2, 8 ]
+          - [ 16, 2 ]
+          - [ 2, 16 ]
         - WorkGroup:
           - [ 16, 16,  1 ]
           - [  8,  8,  1 ]
@@ -405,7 +405,7 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega10"
-#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
 #   ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml
new file mode 100644
index 0000000000..97df90c3e9
--- /dev/null
+++ b/Tensile/Configs/rocblas_hpa_hgemm_nn_asm_full.yaml
@@ -0,0 +1,628 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.4.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 256
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  ExitOnFails: 0
+
+BenchmarkProblems:
+
+  ########################################
+  ########################################
+  ###
+  ###   NN
+  ###
+  ########################################
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+  ########################################
+  # NN - Super Skinny
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [1]
+      ForkParameters:
+        - PrefetchGlobalRead: [False, True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 2 ]
+          - [ 8, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 8, 16, 1 ]
+          - [ 16, 4, 1 ]
+          - [ 16, 8, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32, 4, 1 ]
+          - [ 32, 8, 1 ]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [ 512, 1, 1, 500000 ]
+          - Exact: [ 512, 2, 1, 500000 ]
+          - Exact: [ 512, 4, 1, 500000 ]
+          - Exact: [ 512, 8, 1, 500000 ]
+          - Exact: [ 512, 16, 1, 500000 ]
+          - Exact: [ 1024, 1, 1, 500000 ]
+          - Exact: [ 1024, 2, 1, 500000 ]
+          - Exact: [ 1024, 4, 1, 500000 ]
+          - Exact: [ 1024, 8, 1, 500000 ]
+          - Exact: [ 1024, 16, 1, 500000 ]
+          - Exact: [ 64, 1, 1, 1216 ]
+          - Exact: [ 128, 1, 1, 1024 ]
+          - Exact: [ 128, 1, 1, 1408 ]
+          - Exact: [ 512, 1, 1, 512 ]
+          - Exact: [ 3072, 1, 1, 128 ]
+          - Exact: [ 512, 2, 1, 512 ]
+          - Exact: [ 1024, 1, 1, 512 ]
+          - Exact: [ 4224, 1, 1, 128 ]
+          - Exact: [ 512, 4, 1, 512 ]
+          - Exact: [ 1024, 2, 1, 512 ]
+          - Exact: [ 1024, 4, 1, 512 ]
+          - Exact: [ 3072, 1, 1, 1024 ]
+          - Exact: [ 3072, 1, 1, 1024 ]
+          - Exact: [ 512, 16, 1, 512 ]
+          - Exact: [ 3072, 2, 1, 1024 ]
+          - Exact: [ 4608, 1, 1, 1536 ]
+          - Exact: [ 1024, 16, 1, 512 ]
+          - Exact: [ 3072, 4, 1, 1024 ]
+          - Exact: [ 4608, 2, 1, 1536 ]
+          - Exact: [ 6144, 1, 1, 2560 ]
+          - Exact: [ 7680, 1, 1, 2560 ]
+          - Exact: [ 8448, 1, 1, 2816 ]
+          - Exact: [ 4608, 4, 1, 1536 ]
+          - Exact: [ 6144, 2, 1, 2560 ]
+          - Exact: [ 7680, 2, 1, 2560 ]
+          - Exact: [ 8448, 2, 1, 2816 ]
+          - Exact: [ 1760, 16, 1, 1760 ]
+          - Exact: [ 3072, 16, 1, 1024 ]
+          - Exact: [ 6144, 4, 1, 2560 ]
+          - Exact: [ 2048, 16, 1, 2048 ]
+          - Exact: [ 7680, 4, 1, 2560 ]
+          - Exact: [ 8448, 4, 1, 2816 ]
+          - Exact: [ 2560, 16, 1, 2560 ]
+          - Exact: [ 4608, 16, 1, 1536 ]
+          - Exact: [ 6144, 16, 1, 2560 ]
+          - Exact: [ 4096, 16, 1, 4096 ]
+          - Exact: [ 7680, 16, 1, 2560 ]
+          - Exact: [ 8448, 16, 1, 2816 ]
+
+# ########################################
+# # NN - 3 Waves/WG
+# ########################################
+#   - # Benchmark Group
+#     InitialSolutionParameters:
+#     BenchmarkCommonParameters:
+#       - EdgeType: ["ShiftPtr"]
+#       - LoopTail: [True]
+#       - KernelLanguage: ["Assembly"]
+#       - PrefetchGlobalRead: [True]
+#       - PrefetchLocalRead: [True]
+#       - WorkGroupMapping: [1]
+#     ForkParameters:
+#       - ThreadTile:
+#         - [ 3, 3 ]
+#         - [ 3, 4 ]
+#         - [ 3, 6 ]
+#         - [ 4, 3 ]
+#         - [ 4, 4 ]
+#         - [ 4, 6 ]
+#         - [ 6, 3 ]
+#         - [ 6, 4 ]
+#         - [ 6, 6 ]
+#         - [ 8, 8 ]
+#       - WorkGroup:
+#         - [ 24, 8, 1 ]
+#         - [ 12, 16, 1 ]
+#         - [ 6, 32, 1 ]
+#         - [ 8, 8, 1 ]
+#         - [ 8, 24, 1 ]
+#         - [ 16, 16, 1 ]
+#       - GlobalSplitU: [1]
+#       - DepthU: [ 8, 16, 24, 32, 64 ]
+#       - VectorWidth: [ 2, 4, 8 ]
+#       - AssertSummationElementMultiple: [2]
+#       - AssertFree0ElementMultiple: [2]
+#     BenchmarkForkParameters:
+#     JoinParameters:
+#     BenchmarkJoinParameters:
+#     BenchmarkFinalParameters:
+#       - ProblemSizes:
+#         - Exact: [ 35, 700, 1, 2048 ]
+#         - Exact: [ 35, 700, 1, 2048 ]
+#         - Exact: [ 35, 700, 1, 2560 ]
+#         - Exact: [ 35, 1500, 1, 2048 ]
+#         - Exact: [ 35, 1500, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 1760 ]
+#         - Exact: [ 35, 8457, 1, 2048 ]
+#         - Exact: [ 35, 8457, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 4096 ]
+
+  ########################################
+  # NN - Small or Skinny
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 8, 1 ]
+          - [ 32, 8, 1 ]
+          - [ 16, 4, 1 ]
+          - [ 32, 4, 1 ]
+        - WorkGroupMapping: [1, 8]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [          [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],          [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1
+          - Range: [ [64, 64, 64,  700], [64, 64, 64,  700], [1], [256, 1024, 1024, 4096] ] # small
+          - Exact: [ 64, 1, 1, 1216 ]
+          - Exact: [ 128, 1, 1, 1024 ]
+          - Exact: [ 128, 1, 1, 1408 ]
+          - Exact: [ 512, 1, 1, 512 ]
+          - Exact: [ 3072, 1, 1, 128 ]
+          - Exact: [ 512, 2, 1, 512 ]
+          - Exact: [ 1024, 1, 1, 512 ]
+          - Exact: [ 4224, 1, 1, 128 ]
+          - Exact: [ 512, 4, 1, 512 ]
+          - Exact: [ 1024, 2, 1, 512 ]
+          - Exact: [ 1024, 4, 1, 512 ]
+          - Exact: [ 3072, 1, 1, 1024 ]
+          - Exact: [ 3072, 1, 1, 1024 ]
+          - Exact: [ 512, 16, 1, 512 ]
+          - Exact: [ 3072, 2, 1, 1024 ]
+          - Exact: [ 4608, 1, 1, 1536 ]
+          - Exact: [ 512, 32, 1, 512 ]
+          - Exact: [ 1024, 16, 1, 512 ]
+          - Exact: [ 3072, 4, 1, 1024 ]
+          - Exact: [ 4608, 2, 1, 1536 ]
+          - Exact: [ 6144, 1, 1, 2560 ]
+          - Exact: [ 1024, 32, 1, 512 ]
+          - Exact: [ 7680, 1, 1, 2560 ]
+          - Exact: [ 8448, 1, 1, 2816 ]
+          - Exact: [ 4608, 4, 1, 1536 ]
+          - Exact: [ 6144, 2, 1, 2560 ]
+          - Exact: [ 7680, 2, 1, 2560 ]
+          - Exact: [ 8448, 2, 1, 2816 ]
+          - Exact: [ 1760, 16, 1, 1760 ]
+#         - Exact: [ 35, 700, 1, 2048 ]
+#         - Exact: [ 35, 700, 1, 2048 ]
+          - Exact: [ 3072, 16, 1, 1024 ]
+#         - Exact: [ 35, 700, 1, 2560 ]
+          - Exact: [ 6144, 4, 1, 2560 ]
+          - Exact: [ 2048, 16, 1, 2048 ]
+          - Exact: [ 7680, 4, 1, 2560 ]
+          - Exact: [ 8448, 4, 1, 2816 ]
+          - Exact: [ 1760, 32, 1, 1760 ]
+          - Exact: [ 3072, 32, 1, 1024 ]
+          - Exact: [ 2560, 16, 1, 2560 ]
+#         - Exact: [ 35, 1500, 1, 2048 ]
+          - Exact: [ 4608, 16, 1, 1536 ]
+          - Exact: [ 2048, 32, 1, 2048 ]
+#         - Exact: [ 35, 1500, 1, 2560 ]
+          - Exact: [ 1760, 64, 1, 1760 ]
+          - Exact: [ 3072, 64, 1, 1024 ]
+          - Exact: [ 2560, 32, 1, 2560 ]
+          - Exact: [ 4608, 32, 1, 1536 ]
+          - Exact: [ 128, 1500, 1, 1280 ]
+          - Exact: [ 6144, 16, 1, 2560 ]
+          - Exact: [ 2048, 64, 1, 2048 ]
+          - Exact: [ 4096, 16, 1, 4096 ]
+          - Exact: [ 7680, 16, 1, 2560 ]
+          - Exact: [ 176, 1500, 1, 1408 ]
+          - Exact: [ 8448, 16, 1, 2816 ]
+          - Exact: [ 1760, 128, 1, 1760 ]
+          - Exact: [ 3072, 128, 1, 1024 ]
+          - Exact: [ 2560, 64, 1, 2560 ]
+          - Exact: [ 6144, 32, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 1760 ]
+          - Exact: [ 2048, 128, 1, 2048 ]
+          - Exact: [ 4096, 32, 1, 4096 ]
+#         - Exact: [ 35, 8457, 1, 2048 ]
+          - Exact: [ 7680, 32, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 2560 ]
+          - Exact: [ 8448, 32, 1, 2816 ]
+          - Exact: [ 2560, 128, 1, 2560 ]
+          - Exact: [ 4096, 64, 1, 4096 ]
+#         - Exact: [ 35, 8457, 1, 4096 ]
+          - Exact: [ 7680, 64, 1, 2560 ]
+          - Exact: [ 4096, 128, 1, 4096 ]
+          - Exact: [ 7680, 128, 1, 2560 ]
+
+  ########################################
+  # NN - Large
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+        - GlobalSplitU: [1]
+      ForkParameters:
+        - PrefetchGlobalRead: [False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 6, 8 ]
+          - [ 8, 4 ]
+          - [ 8, 6 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - WorkGroupMapping: [8]            # 1 removed for training performance
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large
+          - Exact: [ 1024, 1024, 1, 1024 ]
+          - Exact: [ 1024, 700, 1, 512 ]
+          - Exact: [ 1024, 700, 1, 512 ]
+          - Exact: [ 3072, 128, 1, 1024 ]
+          - Exact: [ 3072, 1500, 1, 128 ]
+          - Exact: [ 2560, 128, 1, 2560 ]
+          - Exact: [ 4224, 1500, 1, 176 ]
+          - Exact: [ 512, 1500, 1, 1536 ]
+          - Exact: [ 512, 1500, 1, 2048 ]
+          - Exact: [ 512, 1500, 1, 2560 ]
+          - Exact: [ 4096, 128, 1, 4096 ]
+          - Exact: [ 512, 1500, 1, 2816 ]
+          - Exact: [ 512, 3000, 1, 1536 ]
+          - Exact: [ 1024, 1500, 1, 1536 ]
+          - Exact: [ 7680, 128, 1, 2560 ]
+          - Exact: [ 512, 3000, 1, 2048 ]
+          - Exact: [ 1024, 1500, 1, 2048 ]
+          - Exact: [ 512, 3000, 1, 2560 ]
+          - Exact: [ 1024, 1500, 1, 2560 ]
+          - Exact: [ 512, 3000, 1, 2816 ]
+          - Exact: [ 1024, 1500, 1, 2816 ]
+          - Exact: [ 512, 6000, 1, 1536 ]
+          - Exact: [ 1024, 3000, 1, 1536 ]
+          - Exact: [ 3072, 1500, 1, 1024 ]
+          - Exact: [ 3072, 1500, 1, 1024 ]
+          - Exact: [ 512, 6000, 1, 2048 ]
+          - Exact: [ 1024, 3000, 1, 2048 ]
+          - Exact: [ 5124, 700, 1, 2048 ]
+          - Exact: [ 5124, 700, 1, 2048 ]
+          - Exact: [ 512, 6000, 1, 2560 ]
+          - Exact: [ 1024, 3000, 1, 2560 ]
+          - Exact: [ 512, 6000, 1, 2816 ]
+          - Exact: [ 1024, 3000, 1, 2816 ]
+          - Exact: [ 5124, 700, 1, 2560 ]
+          - Exact: [ 1024, 6000, 1, 1536 ]
+          - Exact: [ 3072, 3000, 1, 1024 ]
+          - Exact: [ 4608, 1500, 1, 1536 ]
+          - Exact: [ 1024, 6000, 1, 2048 ]
+          - Exact: [ 1024, 6000, 1, 2560 ]
+          - Exact: [ 5124, 1500, 1, 2048 ]
+          - Exact: [ 1024, 6000, 1, 2816 ]
+          - Exact: [ 512, 24000, 1, 1536 ]
+          - Exact: [ 3072, 6000, 1, 1024 ]
+          - Exact: [ 5124, 1500, 1, 2560 ]
+          - Exact: [ 4608, 3000, 1, 1536 ]
+          - Exact: [ 1760, 7000, 1, 1760 ]
+          - Exact: [ 6144, 1500, 1, 2560 ]
+          - Exact: [ 512, 24000, 1, 2048 ]
+          - Exact: [ 2048, 7000, 1, 2048 ]
+          - Exact: [ 7680, 1500, 1, 2560 ]
+          - Exact: [ 512, 24000, 1, 2560 ]
+          - Exact: [ 512, 24000, 1, 2816 ]
+          - Exact: [ 8448, 1500, 1, 2816 ]
+          - Exact: [ 512, 48000, 1, 1536 ]
+          - Exact: [ 1024, 24000, 1, 1536 ]
+          - Exact: [ 4608, 6000, 1, 1536 ]
+          - Exact: [ 2560, 7000, 1, 2560 ]
+          - Exact: [ 6144, 3000, 1, 2560 ]
+          - Exact: [ 512, 48000, 1, 2048 ]
+          - Exact: [ 1024, 24000, 1, 2048 ]
+          - Exact: [ 7680, 3000, 1, 2560 ]
+          - Exact: [ 512, 48000, 1, 2560 ]
+          - Exact: [ 1024, 24000, 1, 2560 ]
+          - Exact: [ 512, 48000, 1, 2816 ]
+          - Exact: [ 1024, 24000, 1, 2816 ]
+          - Exact: [ 8448, 3000, 1, 2816 ]
+          - Exact: [ 1024, 48000, 1, 1536 ]
+          - Exact: [ 3072, 24000, 1, 1024 ]
+          - Exact: [ 5124, 9124, 1, 1760 ]
+          - Exact: [ 6144, 6000, 1, 2560 ]
+          - Exact: [ 5124, 9124, 1, 2048 ]
+          - Exact: [ 1024, 48000, 1, 2048 ]
+          - Exact: [ 4096, 7000, 1, 4096 ]
+          - Exact: [ 7680, 6000, 1, 2560 ]
+          - Exact: [ 5124, 9124, 1, 2560 ]
+          - Exact: [ 1024, 48000, 1, 2560 ]
+          - Exact: [ 1024, 48000, 1, 2816 ]
+          - Exact: [ 8448, 6000, 1, 2816 ]
+          - Exact: [ 3072, 48000, 1, 1024 ]
+          - Exact: [ 4608, 24000, 1, 1536 ]
+          - Exact: [ 5124, 9124, 1, 4096 ]
+          - Exact: [ 4608, 48000, 1, 1536 ]
+          - Exact: [ 6144, 24000, 1, 2560 ]
+          - Exact: [ 7680, 24000, 1, 2560 ]
+          - Exact: [ 8448, 24000, 1, 2816 ]
+          - Exact: [ 6144, 48000, 1, 2560 ]
+          - Exact: [ 7680, 48000, 1, 2560 ]
+          - Exact: [ 8448, 48000, 1, 2816 ]
+
+  ########################################
+  # NN - VectorWidth Correctness
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [1]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [64],               [64], [1], [256, 1024, 1024, 4096] ] # corner
+          - Range: [               [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],               [64], [1], [256, 1024, 1024, 4096] ] # skinny-1
+
+  ########################################
+  # NN - VGPR refactor
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [ True ]
+        - PrefetchLocalRead: [ True ]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 2, 4 ]
+          - [ 4, 2 ]
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [  8,  8, 1 ]
+          - [ 16,  8, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+          - [ 32, 16, 1 ]
+        - WorkGroupMapping: [1, 8]
+        - DepthU: [ 8, 16, 24, 32, 64, 128 ]
+        - VectorWidth: [2, 4, 8]
+#       - GlobalReadVectorWidth: [4]
+        - LdsPadB: [0,1,2,4]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [ 256, 193600, 1, 64 ]
+          - Exact: [ 64, 193600, 1, 64 ]
+          - Exact: [ 64, 193600, 1, 256 ]
+          - Exact: [ 512, 50176, 1, 128 ]
+          - Exact: [ 128, 50176, 1, 512 ]
+          - Exact: [ 256, 12544, 1, 1024 ]
+          - Exact: [ 1024, 12544, 1, 256 ]
+          - Exact: [ 2048, 3136, 1, 512 ]
+          - Exact: [ 512, 3136, 1, 2048 ]
+
+  ########################################
+  # NN - Batch
+  ########################################
+    - # Benchmark Group - ResNet 1x1:
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 8, 4 ]
+        - WorkGroup:
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - WorkGroupMapping: [8]
+        - DepthU: [ 16 ]
+        - VectorWidth: [2, 4, 8]
+#       - GlobalReadVectorWidth: [1, 4]
+        - LdsPadB: [0,1,2,4]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [  196,  256, 64, 1024 ]
+          - Exact: [  784,  512, 64,  128 ]
+#         - Exact: [   49,  512, 64, 2048 ]
+          - Exact: [ 3136,  512,  1, 2048 ]
+          - Exact: [  196, 1024, 64,  256 ]
+#         - Exact: [   49, 2048, 64,  512 ]
+          - Exact: [ 3136, 2048,  1,  512 ]
+#         - Exact: [ 3025,  256, 64,   64 ]
+#         - Exact: [ 3025,   64, 64,   64 ]
+
+  ########################################
+  # NN - Source kernels
+  ########################################
+    - # BenchmarkProblemSizeGroup - VW=2 when at least one of m,n,k is 32
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [2]
+        - WorkGroupMapping: [8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [32],               [32], [1],                    [32] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [               [32],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1],                    [32] ]
+
+    - # BenchmarkProblemSizeGroup - VW=1 when at least one of m,n,k is 1
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [1]
+        - WorkGroupMapping: [8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [                [1],                [1], [1],                     [1] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [                [1],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1],                     [1] ]
+
+LibraryLogic:
+#   ScheduleName: "vega20"
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
+#   ArchitectureName: "gfx906"
+
+    ScheduleName: "vega10"
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
+    ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml
new file mode 100644
index 0000000000..ca2455c4b0
--- /dev/null
+++ b/Tensile/Configs/rocblas_hpa_hgemm_nt_asm_full.yaml
@@ -0,0 +1,252 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.4.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 256
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  ExitOnFails: 0
+
+BenchmarkProblems:
+
+  ########################################
+  ########################################
+  ###
+  ###   NT
+  ###
+  ########################################
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      HighPrecisionAccumulate: True
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+  ########################################
+  # NT - Small or Skinny
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 8, 1 ]
+          - [ 32, 8, 1 ]
+          - [ 16, 4, 1 ]
+          - [ 32, 4, 1 ]
+        - WorkGroupMapping: [1, 8]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [          [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],          [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1
+          - Range: [ [64, 64, 64,  700], [64, 64, 64,  700], [1], [256, 1024, 1024, 4096] ] # small
+          - Exact: [ 512, 16, 1, 512 ]
+          - Exact: [ 512, 32, 1, 512 ]
+          - Exact: [ 1024, 16, 1, 512 ]
+          - Exact: [ 1024, 32, 1, 512 ]
+
+  ########################################
+  # NT - Large
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - GlobalSplitU: [1]
+      ForkParameters:
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 6, 8 ]
+          - [ 8, 4 ]
+          - [ 8, 6 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - WorkGroupMapping: [8]
+#       - WorkGroupMapping: [1, 8]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large
+          - Exact: [ 1024, 1024, 1, 1024 ]
+          - Exact: [ 1760, 7133, 1, 1760 ]
+          - Exact: [ 2048, 7133, 1, 2048 ]
+          - Exact: [ 2560, 7133, 1, 2560 ]
+          - Exact: [ 3072, 7435, 1, 1024 ]
+          - Exact: [ 4096, 7133, 1, 4096 ]
+          - Exact: [ 7680, 5481, 1, 2560 ]
+
+  ########################################
+  # NT - VectorWidth Correctness
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [1]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [64],               [64], [1], [256, 1024, 1024, 4096] ] # corner
+          - Range: [               [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],               [64], [1], [256, 1024, 1024, 4096] ] # skinny-1
+
+  ########################################
+  # NT - Source kernels
+  ########################################
+    - # BenchmarkProblemSizeGroup - VW=2 when at least one of m,n,k is 32
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [2]
+        - WorkGroupMapping: [8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [32],               [32], [1],                    [32] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [               [32],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1],                    [32] ]
+
+    - # BenchmarkProblemSizeGroup - VW=1 when at least one of m,n,k is 1
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [1]
+        - WorkGroupMapping: [8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [                [1],                [1], [1],                     [1] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [                [1],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1],                     [1] ]
+
+LibraryLogic:
+#   ScheduleName: "vega20"
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
+#   ArchitectureName: "gfx906"
+
+    ScheduleName: "vega10"
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
+    ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml
new file mode 100644
index 0000000000..cc9457c969
--- /dev/null
+++ b/Tensile/Configs/rocblas_hpa_hgemm_tn_asm_full.yaml
@@ -0,0 +1,453 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.4.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 256
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  ExitOnFails: 0
+
+BenchmarkProblems:
+
+  ########################################
+  ########################################
+  ###
+  ###   TN
+  ###
+  ########################################
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+  ########################################
+  # TN - Super Skinny
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [1]
+      ForkParameters:
+        - PrefetchGlobalRead: [False, True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 2 ]
+          - [ 8, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 8, 16, 1 ]
+          - [ 16, 4, 1 ]
+          - [ 16, 8, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32, 4, 1 ]
+          - [ 32, 8, 1 ]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [ 512, 8, 1, 500000 ]
+          - Exact: [ 512, 16, 1, 500000 ]
+          - Exact: [ 1024, 8, 1, 500000 ]
+          - Exact: [ 1024, 16, 1, 500000 ]
+          - Exact: [ 1760, 16, 1, 1760 ]
+          - Exact: [ 3072, 16, 1, 1024 ]
+          - Exact: [ 2048, 16, 1, 2048 ]
+          - Exact: [ 2560, 16, 1, 2560 ]
+          - Exact: [ 4608, 16, 1, 1536 ]
+          - Exact: [ 6144, 16, 1, 2560 ]
+          - Exact: [ 4096, 16, 1, 4096 ]
+          - Exact: [ 7680, 16, 1, 2560 ]
+          - Exact: [ 8448, 16, 1, 2816 ]
+
+# ########################################
+# # TN - 3 Waves/WG
+# ########################################
+#   - # Benchmark Group
+#     InitialSolutionParameters:
+#     BenchmarkCommonParameters:
+#       - EdgeType: ["ShiftPtr"]
+#       - LoopTail: [True]
+#       - KernelLanguage: ["Assembly"]
+#       - PrefetchGlobalRead: [True]
+#       - PrefetchLocalRead: [True]
+#       - WorkGroupMapping: [1]
+#     ForkParameters:
+#       - ThreadTile:
+#         - [ 3, 3 ]
+#         - [ 3, 4 ]
+#         - [ 3, 6 ]
+#         - [ 4, 3 ]
+#         - [ 4, 4 ]
+#         - [ 4, 6 ]
+#         - [ 6, 3 ]
+#         - [ 6, 4 ]
+#         - [ 6, 6 ]
+#       - WorkGroup:
+#         - [ 24, 8, 1 ]
+#         - [ 12, 16, 1 ]
+#         - [ 6, 32, 1 ]
+#         - [ 8, 8, 1 ]
+#         - [ 8, 24, 1 ]
+#         - [ 16, 16, 1 ]
+#       - GlobalSplitU: [1]
+#       - DepthU: [ 8, 16, 24, 32, 64 ]
+#       - VectorWidth: [2, 4, 8]
+#       - AssertSummationElementMultiple: [2]
+#       - AssertFree0ElementMultiple: [2]
+#     BenchmarkForkParameters:
+#     JoinParameters:
+#     BenchmarkJoinParameters:
+#     BenchmarkFinalParameters:
+#       - ProblemSizes:
+#         - Exact: [ 35, 8457, 1, 1760 ]
+#         - Exact: [ 35, 8457, 1, 2048 ]
+#         - Exact: [ 35, 8457, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 4096 ]
+
+  ########################################
+  # TN - Small or Skinny
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 8, 1 ]
+          - [ 32, 8, 1 ]
+          - [ 16, 4, 1 ]
+          - [ 32, 4, 1 ]
+        - WorkGroupMapping: [1, 8]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [          [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],          [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1
+          - Range: [ [64, 64, 64,  700], [64, 64, 64,  700], [1], [256, 1024, 1024, 4096] ] # small
+          - Exact: [ 1760, 16, 1, 1760 ]
+          - Exact: [ 3072, 16, 1, 1024 ]
+          - Exact: [ 2048, 16, 1, 2048 ]
+          - Exact: [ 1760, 32, 1, 1760 ]
+          - Exact: [ 3072, 32, 1, 1024 ]
+          - Exact: [ 2560, 16, 1, 2560 ]
+          - Exact: [ 4608, 16, 1, 1536 ]
+          - Exact: [ 2048, 32, 1, 2048 ]
+          - Exact: [ 1760, 64, 1, 1760 ]
+          - Exact: [ 3072, 64, 1, 1024 ]
+          - Exact: [ 2560, 32, 1, 2560 ]
+          - Exact: [ 4608, 32, 1, 1536 ]
+          - Exact: [ 6144, 16, 1, 2560 ]
+          - Exact: [ 2048, 64, 1, 2048 ]
+          - Exact: [ 4096, 16, 1, 4096 ]
+          - Exact: [ 7680, 16, 1, 2560 ]
+          - Exact: [ 8448, 16, 1, 2816 ]
+          - Exact: [ 1760, 128, 1, 1760 ]
+          - Exact: [ 3072, 128, 1, 1024 ]
+          - Exact: [ 2560, 64, 1, 2560 ]
+          - Exact: [ 6144, 32, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 1760 ]
+          - Exact: [ 2048, 128, 1, 2048 ]
+          - Exact: [ 4096, 32, 1, 4096 ]
+#         - Exact: [ 35, 8457, 1, 2048 ]
+          - Exact: [ 7680, 32, 1, 2560 ]
+#         - Exact: [ 35, 8457, 1, 2560 ]
+          - Exact: [ 8448, 32, 1, 2816 ]
+          - Exact: [ 2560, 128, 1, 2560 ]
+          - Exact: [ 4096, 64, 1, 4096 ]
+#         - Exact: [ 35, 8457, 1, 4096 ]
+          - Exact: [ 7680, 64, 1, 2560 ]
+          - Exact: [ 4096, 128, 1, 4096 ]
+          - Exact: [ 7680, 128, 1, 2560 ]
+
+  ########################################
+  # TN - Large
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - GlobalSplitU: [1]
+      ForkParameters:
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 6, 8 ]
+          - [ 8, 4 ]
+          - [ 8, 6 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - WorkGroupMapping: [8]
+#       - WorkGroupMapping: [1, 8]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large
+          - Exact: [ 1024, 1024, 1, 1024 ]
+# MIOpen sizes
+          - Exact: [  1760,   800, 1, 1760 ]
+          - Exact: [  1760,  1600, 1, 1760 ]
+          - Exact: [  1760,  3200, 1, 1760 ]
+          - Exact: [  1760,  6400, 1, 1760 ]
+          - Exact: [  2048,   800, 1, 2048 ]
+          - Exact: [  2048,  1600, 1, 2048 ]
+          - Exact: [  2048,  3200, 1, 2048 ]
+          - Exact: [  2048,  6400, 1, 2048 ]
+          - Exact: [  2560,   800, 1, 2560 ]
+          - Exact: [  2560,  1600, 1, 2560 ]
+          - Exact: [  2560,  3200, 1, 2560 ]
+          - Exact: [  2560,  6400, 1, 2560 ]
+          - Exact: [  2048,   400, 1,  512 ]
+          - Exact: [  2048,   800, 1,  512 ]
+          - Exact: [  2048,  1600, 1,  512 ]
+          - Exact: [  2048,  3200, 1,  512 ]
+          - Exact: [  4096,   400, 1, 1024 ]
+          - Exact: [  4096,   800, 1, 1024 ]
+          - Exact: [  4096,  1600, 1, 1024 ]
+          - Exact: [  4096,  3200, 1, 1024 ]
+          - Exact: [  8192,   400, 1, 2048 ]
+          - Exact: [  8192,   800, 1, 2048 ]
+          - Exact: [  8192,  1600, 1, 2048 ]
+          - Exact: [  8192,  3200, 1, 2048 ]
+          - Exact: [ 16384,   400, 1, 4096 ]
+          - Exact: [ 16384,   800, 1, 4096 ]
+          - Exact: [ 16384,  1600, 1, 4096 ]
+          - Exact: [ 16384,  3200, 1, 4096 ]
+          - Exact: [  8448, 48000, 1, 2816 ]
+          - Exact: [  8448, 24000, 1, 2816 ]
+          - Exact: [  8448, 12000, 1, 2816 ]
+          - Exact: [  8448,  5984, 1, 2816 ]
+          - Exact: [  6144, 48000, 1, 2048 ]
+          - Exact: [  6144, 24000, 1, 2048 ]
+          - Exact: [  6144, 12000, 1, 2048 ]
+          - Exact: [  6144,  5984, 1, 2048 ]
+          - Exact: [  4608, 48000, 1, 1536 ]
+          - Exact: [  4608, 24000, 1, 1536 ]
+          - Exact: [  4608, 12000, 1, 1536 ]
+          - Exact: [  4608,  5984, 1, 1536 ]
+          - Exact: [  7680, 48000, 1, 2560 ]
+          - Exact: [  7680, 24000, 1, 2560 ]
+          - Exact: [  7680, 12000, 1, 2560 ]
+          - Exact: [  7680,  5984, 1, 2560 ]
+# Deepbench sizes
+          - Exact: [ 1024, 700, 1, 512 ]
+          - Exact: [ 512, 24000, 1, 1536 ]
+          - Exact: [ 1760, 7000, 1, 1760 ]
+          - Exact: [ 512, 24000, 1, 2048 ]
+          - Exact: [ 2048, 7000, 1, 2048 ]
+          - Exact: [ 512, 24000, 1, 2560 ]
+          - Exact: [ 512, 24000, 1, 2816 ]
+          - Exact: [ 512, 48000, 1, 1536 ]
+          - Exact: [ 1024, 24000, 1, 1536 ]
+          - Exact: [ 2560, 7000, 1, 2560 ]
+          - Exact: [ 512, 48000, 1, 2048 ]
+          - Exact: [ 1024, 24000, 1, 2048 ]
+          - Exact: [ 512, 48000, 1, 2560 ]
+          - Exact: [ 1024, 24000, 1, 2560 ]
+          - Exact: [ 512, 48000, 1, 2816 ]
+          - Exact: [ 1024, 24000, 1, 2816 ]
+          - Exact: [ 1024, 48000, 1, 1536 ]
+          - Exact: [ 3072, 24000, 1, 1024 ]
+          - Exact: [ 5124, 9124, 1, 1760 ]
+          - Exact: [ 5124, 9124, 1, 2048 ]
+          - Exact: [ 1024, 48000, 1, 2048 ]
+          - Exact: [ 4096, 7000, 1, 4096 ]
+          - Exact: [ 5124, 9124, 1, 2560 ]
+          - Exact: [ 1024, 48000, 1, 2560 ]
+          - Exact: [ 1024, 48000, 1, 2816 ]
+          - Exact: [ 3072, 48000, 1, 1024 ]
+          - Exact: [ 4608, 24000, 1, 1536 ]
+          - Exact: [ 5124, 9124, 1, 4096 ]
+          - Exact: [ 4608, 48000, 1, 1536 ]
+          - Exact: [ 6144, 24000, 1, 2560 ]
+          - Exact: [ 7680, 24000, 1, 2560 ]
+          - Exact: [ 8448, 24000, 1, 2816 ]
+          - Exact: [ 6144, 48000, 1, 2560 ]
+          - Exact: [ 7680, 48000, 1, 2560 ]
+          - Exact: [ 8448, 48000, 1, 2816 ]
+
+  ########################################
+  # TN - VectorWidth Correctness
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [1]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [64],               [64], [1], [256, 1024, 1024, 4096] ] # corner
+          - Range: [               [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],               [64], [1], [256, 1024, 1024, 4096] ] # skinny-1
+
+  ########################################
+  # TN - Source kernels
+  ########################################
+    - # BenchmarkProblemSizeGroup - VW=2 for m,n,k<=4
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [2]
+        - WorkGroupMapping: [8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [32],               [32], [1],                    [32] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [               [32],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1],                    [32] ]
+
+    - # BenchmarkProblemSizeGroup - VW=1 for m,n,k==1
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [1]
+        - WorkGroupMapping: [8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [                [1],                [1], [1],                     [1] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [                [1],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1],                     [1] ]
+
+LibraryLogic:
+#   ScheduleName: "vega20"
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
+#   ArchitectureName: "gfx906"
+
+    ScheduleName: "vega10"
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
+    ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml b/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml
new file mode 100644
index 0000000000..2e313a427a
--- /dev/null
+++ b/Tensile/Configs/rocblas_hpa_hgemm_tt_asm_full.yaml
@@ -0,0 +1,245 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.4.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 256
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  ExitOnFails: 0
+
+BenchmarkProblems:
+
+  ########################################
+  ########################################
+  ###
+  ###   TT
+  ###
+  ########################################
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      HighPrecisionAccumulate: True
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+  ########################################
+  # TT - Small or Skinny
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 2, 4 ]
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 4, 32, 1 ]
+          - [ 8, 8, 1 ]
+          - [ 8, 16, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32, 4, 1 ]
+          - [ 32, 8, 1 ]
+        - WorkGroupMapping: [-1, -4]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [          [64, 128], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],          [64, 128], [1], [256, 1024, 1024, 4096] ] # skinny-1
+          - Range: [ [64, 64, 64,  700], [64, 64, 64,  700], [1], [256, 1024, 1024, 4096] ] # small
+
+  ########################################
+  # TT - Large
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+        - GlobalSplitU: [1]
+      ForkParameters:
+        - PrefetchGlobalRead: [False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 4, 8 ]
+          - [ 6, 8 ]
+          - [ 8, 4 ]
+          - [ 8, 6 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - WorkGroupMapping: [-1, -4]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # large
+          - Exact: [ 1024, 1024, 1, 1024 ]
+
+  ########################################
+  # TT - VectorWidth Correctness
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchGlobalRead: [True]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [-1]
+      ForkParameters:
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 8, 8, 1 ]
+          - [ 16, 16, 1 ]
+        - GlobalSplitU: [1]
+        - DepthU: [ 8, 16, 24, 32, 64 ]
+        - VectorWidth: [-1]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [64],               [64], [1], [256, 1024, 1024, 4096] ] # corner
+          - Range: [               [64], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ] # skinny-0
+          - Range: [ [64, 64, 64, 7000],               [64], [1], [256, 1024, 1024, 4096] ] # skinny-1
+
+  ########################################
+  # TT - Source kernels
+  ########################################
+    - # BenchmarkProblemSizeGroup - VW=2 for m,n,k<=4
+      InitialSolutionParameters:
+        - WorkGroupMapping: [-1]
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [2]
+        - WorkGroupMapping: [-8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [               [32],               [32], [1],                    [32] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [               [32],               [32], [1], [256, 1024, 1024, 4096] ]
+          - Range: [               [32], [64, 64, 64, 7000], [1],                    [32] ]
+          - Range: [ [64, 64, 64, 7000],               [32], [1],                    [32] ]
+
+    - # BenchmarkProblemSizeGroup - VW=1 for m,n,k==1
+      InitialSolutionParameters:
+        - WorkGroupMapping: [-1]
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Source"]
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - VectorWidth: [1]
+        - WorkGroupMapping: [-8]
+      ForkParameters:
+        - ThreadTile:
+          - [ 8, 8 ]
+          - [ 4, 8 ]
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+#       - DepthU: [ 4, 8, 16, 32 ]
+        - DepthU: [ 4, 8, 16 ]
+        - VectorWidth: [1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [                [1],                [1], [1],                     [1] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [ [64, 64, 64, 7000], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [                [1],                [1], [1], [256, 1024, 1024, 4096] ]
+          - Range: [                [1], [64, 64, 64, 7000], [1],                     [1] ]
+          - Range: [ [64, 64, 64, 7000],                [1], [1],                     [1] ]
+
+LibraryLogic:
+#   ScheduleName: "vega20"
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
+#   ArchitectureName: "gfx906"
+
+    ScheduleName: "vega10"
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
+    ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/rocblas_sgemm_asm_full.yaml b/Tensile/Configs/rocblas_sgemm_asm_full.yaml
index b90762ed2d..523d7ca02a 100644
--- a/Tensile/Configs/rocblas_sgemm_asm_full.yaml
+++ b/Tensile/Configs/rocblas_sgemm_asm_full.yaml
@@ -45,7 +45,6 @@ BenchmarkProblems:
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - PrefetchGlobalRead: [False, True]
@@ -129,7 +128,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - ThreadTile:
@@ -210,7 +208,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - ThreadTile:
           - [ 2, 2 ]
@@ -348,7 +345,6 @@ BenchmarkProblems:
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - GlobalSplitU: [1]
       ForkParameters:
         - PrefetchGlobalRead: [False, True]
@@ -497,7 +493,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - ThreadTile:
@@ -531,7 +526,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [ True ]
         - PrefetchLocalRead: [ True ]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - ThreadTile:
           - [ 2, 2 ]
@@ -576,7 +570,6 @@ BenchmarkProblems:
         - EdgeType: ["ShiftPtr"]
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - PrefetchGlobalRead: [ False, True ]
         - PrefetchLocalRead: [ False]
@@ -635,7 +628,6 @@ BenchmarkProblems:
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - PrefetchGlobalRead: [False, True]
@@ -683,7 +675,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - ThreadTile:
@@ -760,7 +751,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - ThreadTile:
           - [ 2, 2 ]
@@ -861,7 +851,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - GlobalSplitU: [1]
       ForkParameters:
         - ThreadTile:
@@ -1003,7 +992,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - ThreadTile:
@@ -1086,7 +1074,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - ThreadTile:
           - [ 2, 2 ]
@@ -1157,7 +1144,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - GlobalSplitU: [1]
       ForkParameters:
         - ThreadTile:
@@ -1224,7 +1210,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [1]
       ForkParameters:
         - ThreadTile:
@@ -1308,7 +1293,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
       ForkParameters:
         - ThreadTile:
           - [ 2, 2 ]
@@ -1374,7 +1358,6 @@ BenchmarkProblems:
         - LoopTail: [True]
         - KernelLanguage: ["Assembly"]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - GlobalSplitU: [1]
       ForkParameters:
         - PrefetchGlobalRead: [False, True]
@@ -1436,7 +1419,6 @@ BenchmarkProblems:
         - KernelLanguage: ["Assembly"]
         - PrefetchGlobalRead: [True]
         - PrefetchLocalRead: [True]
-        - PreciseBoundsCheck: [False]
         - WorkGroupMapping: [-1]
       ForkParameters:
         - ThreadTile:
@@ -1461,11 +1443,11 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega20"
-#   DeviceNames: ["Device 66a0", "Device 66a7"]
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
 #   ArchitectureName: "gfx906"
 
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_sgemm_asm_lite.yaml b/Tensile/Configs/rocblas_sgemm_asm_lite.yaml
index 3272e85653..3aa2ec8b04 100644
--- a/Tensile/Configs/rocblas_sgemm_asm_lite.yaml
+++ b/Tensile/Configs/rocblas_sgemm_asm_lite.yaml
@@ -552,11 +552,11 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega20"
-#   DeviceNames: ["Device 66a0", "Device 66a7"]
+#   DeviceNames: ["Device 66a0", "Device 66a1", "Device 66a7"]
 #   ArchitectureName: "gfx906"
 
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_sgemm_hip_lite.yaml b/Tensile/Configs/rocblas_sgemm_hip_lite.yaml
index 3eb1cfc46f..aef828980b 100644
--- a/Tensile/Configs/rocblas_sgemm_hip_lite.yaml
+++ b/Tensile/Configs/rocblas_sgemm_hip_lite.yaml
@@ -183,7 +183,7 @@ BenchmarkProblems:
 
 LibraryLogic:
 #   ScheduleName: "vega10"
-#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
 #   ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/Configs/rocblas_zgemm.yaml b/Tensile/Configs/rocblas_zgemm.yaml
index 1014a00d2a..6ea62ca075 100644
--- a/Tensile/Configs/rocblas_zgemm.yaml
+++ b/Tensile/Configs/rocblas_zgemm.yaml
@@ -178,7 +178,7 @@ BenchmarkProblems:
 
 LibraryLogic:
     ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
     ArchitectureName: "gfx900"
 
 #   ScheduleName: "mi25"
diff --git a/Tensile/KernelWriter.py b/Tensile/KernelWriter.py
index c8eb10819a..f187887dea 100644
--- a/Tensile/KernelWriter.py
+++ b/Tensile/KernelWriter.py
@@ -20,7 +20,7 @@
 ################################################################################
 
 from SolutionStructs import Solution
-from Common import globalParameters, printExit, CHeader
+from Common import globalParameters, CHeader
 import abc
 import os
 from os import path, chmod
@@ -72,6 +72,20 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ):
     kStr += self.comment3("Allocate Resources")
     kStr += self.allocateResources(kernel)
 
+    if kernel["ProblemType"]["TLUA"]:
+      # TODO - enable more aggressive path
+      #guaranteeeNoPartialA = kernel["AssertFree0ElementMultiple"]%kernel["GlobalLoadVectorWidthA"]==0
+      guaranteeeNoPartialA = kernel["GlobalLoadVectorWidthA"]==1
+    else:
+      guaranteeeNoPartialA = True
+
+    if kernel["ProblemType"]["TLUB"]:
+      # TODO - enable more aggressive path
+      #guaranteeeNoPartialB = kernel["AssertFree1ElementMultiple"]%kernel["GlobalLoadVectorWidthB"]==0
+      guaranteeeNoPartialB = kernel["GlobalLoadVectorWidthB"]==1
+    else:
+      guaranteeeNoPartialB = True
+
     if self.enable["PreLoop"]:
       ####################################
       # Global Read Addresses
@@ -668,12 +682,19 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ):
       # Shift Vector Components
       ####################################
       if kernel["EdgeType"] == "ShiftPtr":
+
+        # noPartial means every component of each vector load is always valid.  In that case
+        # the awkward unshift code is not needed.
+        # TODO: the unshift code is complex and currently appears broken.  Long-term we want to
+        # use the Assert*ElementMultiple>glvw code as often as possible, or use buffer-load-x1
+        # in cases where it can't be used.  Then this path can be removed.
+
         # shift vector components d0
-        if self.readTileDimVectorA and kernel["GlobalLoadVectorWidthA"] > 1:
+        if not guaranteeeNoPartialA and self.readTileDimVectorA:
           kStr += self.comment("shift vector components d0")
           kStr += self.shiftVectorComponents(kernel, tensorParametersA)
         # shift vector components d1
-        if self.readTileDimVectorB and kernel["GlobalLoadVectorWidthB"] > 1:
+        if not guaranteeeNoPartialB and self.readTileDimVectorB:
           kStr += self.comment("shift vector components d1")
           kStr += self.shiftVectorComponents(kernel, tensorParametersB)
 
@@ -938,7 +959,8 @@ def initKernel(self, kernel, tensorParametersA, tensorParametersB ):
         self.writeUnrollDimComponentsA = False # Scalar
         if kernel["LocalDotLayout"]>1:
           self.writeTileDimComponentsA = kernel["GlobalReadVectorWidth"] > 1 # Components
-          writeCoal = True
+          # LDS writes with LDL>1 will never be coalesced
+          writeCoal = False
         else:
           self.writeTileDimComponentsA = kernel["GlobalReadVectorWidth"] > 1 # Components
           writeCoal = False
@@ -1037,7 +1059,8 @@ def initKernel(self, kernel, tensorParametersA, tensorParametersB ):
         self.writeUnrollDimComponentsB = False
         if kernel["LocalDotLayout"]>1:
           self.writeTileDimComponentsB = kernel["GlobalReadVectorWidth"] > 1 # Components
-          writeCoal = True
+          # LDS writes with LDL>1 will never be coalesced
+          writeCoal = False
         else:
           self.writeTileDimComponentsB = kernel["GlobalReadVectorWidth"] > 1 # Components
           writeCoal = False
@@ -1819,33 +1842,33 @@ def getSourceFileString(self, kernel):
       assemblerProcess = Popen(assemblerCommand, \
           cwd=asmPath )
       assemblerProcess.communicate()
-      if assemblerProcess.returncode:
-        printExit("Assembler process returned with code %u" \
-            % assemblerProcess.returncode)
 
-      # read code object file
       fileString = ""
-      if not globalParameters["CodeFromFiles"]:
-        codeObjectFile = open(codeObjectFileName, "r")
-        codeObjectByteArray = bytearray(codeObjectFile.read())
-        codeObjectFile.close()
-  
-        # write code object byte array
-        fileString += self.comment("code object byte array")
-        fileString += "const unsigned char %s_coba[%u] = {\n" % (kernelName, len(codeObjectByteArray))
-        for byteIdx in range(0, len(codeObjectByteArray)):
-          byte = codeObjectByteArray[byteIdx]
-          fileString += "0x%02x" % byte
-          if byteIdx < len(codeObjectByteArray)-1:
-            fileString += ","
-          else:
-            fileString += "};\n"
-          if byteIdx % 16 == 15:
-            fileString += "\n"
-
-
-      # read code-object file and convert to c++ representable uchar*
-      # return string of code-object byte array
+      if assemblerProcess.returncode:
+        error = -1
+      else:
+        # read code object file
+        if not globalParameters["CodeFromFiles"]:
+          codeObjectFile = open(codeObjectFileName, "r")
+          codeObjectByteArray = bytearray(codeObjectFile.read())
+          codeObjectFile.close()
+
+          # write code object byte array
+          fileString += self.comment("code object byte array")
+          fileString += "const unsigned char %s_coba[%u] = {\n" % (kernelName, len(codeObjectByteArray))
+          for byteIdx in range(0, len(codeObjectByteArray)):
+            byte = codeObjectByteArray[byteIdx]
+            fileString += "0x%02x" % byte
+            if byteIdx < len(codeObjectByteArray)-1:
+              fileString += ","
+            else:
+              fileString += "};\n"
+            if byteIdx % 16 == 15:
+              fileString += "\n"
+
+
+    # read code-object file and convert to c++ representable uchar*
+    # return string of code-object byte array
     return (error, fileString)
 
 
diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py
index 2604cfba38..e980e41766 100644
--- a/Tensile/KernelWriterAssembly.py
+++ b/Tensile/KernelWriterAssembly.py
@@ -339,7 +339,6 @@ def __init__( self, kernelMinNaming, kernelSerialNaming ):
     self.printedAssertCnt  = 0
     self.initLdsValue     = 0xFFFFFFFF  # Value to use for LDS Init, if enabled
 
-    self.db["CheckDimOverflow"] = False # check for tensor dims that exceed assumptions in code, in particular around use of 32-bit calcs
 
     # Check A and B values loaded from memory to ensure they are 1
     # Requires DataInitTypeAB=1.
@@ -357,6 +356,7 @@ def __init__( self, kernelMinNaming, kernelSerialNaming ):
     self.localReadDoCnt   = 0
     self.localWriteDoCnt  = 0
 
+
     self.maxVgprs = 256
     self.maxSgprs = 99
 
@@ -541,6 +541,21 @@ def initKernel(self, kernel, tPA, tPB ):
     # which parts of the code were changed to support the new mode.
     self.globalReadIncsUseVgpr = False if kernel["BufferLoad"] else True
 
+    # If True, GRO are expressed as offsets from the beginning of the macro-tile, and the SRD
+    # is set to the beginning of the macro-tile.
+    # If False, GRO are expressed as offsets from the beginning of the lowest 2 dimensions
+    # in the tensor.
+    # True can allow Buffer-Based logic to have significantly higher range and handle larger tensors
+    # But does not work with the PointerShift logic.
+    # Can be enabled with PBC (does not use branch logic) or if assertions guarantee no shift needed
+    # groOffsetInMacroTile doesn't work with pointer-shift because it sets the SRD to point to the
+    # start of the macro-tile - if we overhang by small number of elements (<GRVW) then can't shift
+    # back to get all the data.
+    self.groOffsetInMacroTile = kernel["PreciseBoundsCheck"]
+
+    # use 64-bit buffer limit shadow register, only works with PBC
+    self.use64bPbcLimit = 1 and kernel["PreciseBoundsCheck"]
+
     self.checkGRO = False
     # checkGRO requires useSgprForGRO=0 so that code allocates and uses
     # the VGPRs that are used for the GRO offset checking
@@ -679,6 +694,12 @@ def initKernel(self, kernel, tPA, tPB ):
           } # 906
         }
 
+    if self.version == (9,0,0):
+      self.mixinst = "v_mad_mix_f32"
+    elif self.version == (9,0,6):
+      self.mixinst = "v_fma_mix_f32"
+    else:
+      self.mixinst = "NOT_SUPPORTED"
 
     self.overflowedResources = False # if true, comment out whole kernel
 
@@ -1150,10 +1171,14 @@ def initKernel(self, kernel, tPA, tPB ):
     if kernel["BufferStore"]:
       self.defineSgpr("SrdC", 4, 4)
 
+    self.defineSgpr("Tensor2dSizeC", 2,2)
+    self.defineSgpr("Tensor2dSizeA", 2,2)
+    self.defineSgpr("Tensor2dSizeB", 2,2)
+
     # To avoid corrupting tmp sgprs that may be used around the assert,
     # reserve some sgprs to save/restore the execmask
     if self.db["EnableAsserts"]:
-      self.defineSgpr("SaveExecMask", 2)
+      self.defineSgpr("SaveExecMask", 2, 2)
 
     self.defineSgpr("GSUSumIdx", 2 if kernel["GlobalSplitU"] > 1 else 0)
     self.defineSgpr("AddressC", numSgprAddressC)
@@ -1178,10 +1203,14 @@ def initKernel(self, kernel, tPA, tPB ):
         self.defineSgpr("PerpOverhangVccA", 2, 2)
       if kernel["fractionalPerpOverhangB"]:
         self.defineSgpr("PerpOverhangVccB", 2, 2)
+    if self.use64bPbcLimit:
+      self.defineSgpr("SrdShadowLimitA", 2, 2)
+      self.defineSgpr("SrdShadowLimitB", 2, 2)
     if globalParameters["DebugKernel"]:
       self.defineSgpr("AddressDbg", self.numSgprAddressDbg)
       self.defineSgpr("DebugKernelItems", 1)
 
+
     #------------------------
     # Registers defined below this point are not available in the post-loop
     # (we reclaim them to use as temps, typically for execmasks)
@@ -1193,6 +1222,7 @@ def initKernel(self, kernel, tPA, tPB ):
     self.defineSgpr("OffsetA", numSgprOffsetA)
     self.defineSgpr("OffsetB", numSgprOffsetB)
 
+
     self.defineSgpr("GlobalReadIncsA", numSgprGlobalReadIncsA)
     self.defineSgpr("GlobalReadIncsB", numSgprGlobalReadIncsB)
 
@@ -1257,7 +1287,7 @@ def initKernel(self, kernel, tPA, tPB ):
     if self.db["CheckValue1B"] : print ("\n***WARNING: CheckValue1B enabled, may impact performance\n")
     if self.db["PrintRP"] : print ("\n***WARNING: PrintRP enabled, may generate verbose output\n")
     if kernel["CheckTensorDimAsserts"] : print ("\n***WARNING: CheckTensorDimAsserts enabled, may impact performance\n")
-    if self.db["CheckDimOverflow"] : print ("\n***WARNING: CheckDimOverflow enabled, may impact performance\n")
+    if kernel["CheckDimOverflow"] : print ("\n***WARNING: CheckDimOverflow enabled, may impact performance\n")
 
 
   ##############################################################################
@@ -1313,7 +1343,7 @@ def defineMACMacro(self, kernel, innerUnroll):
                 # we treat HighPrecisionAccumulate as expanded packed math
                 b = blockB*2
                 a = blockA*2
-                if kernel["LocalDotLayout"] > 1:    # Only supports LocalDotLayout == 2 for now
+                if kernel["LocalDotLayout"] > 1 and innerUnroll == 2:    # Only supports LocalDotLayout == 2 for now
                   lcldot = kernel["LocalDotLayout"]
                   iua = blockA / ((kernel["ThreadTileA"]/2) / lcldot)
                   iub = blockB / ((kernel["ThreadTileB"]/2) / lcldot)
@@ -1409,7 +1439,7 @@ def defineMACMacro(self, kernel, innerUnroll):
                 # we treat HighPrecisionAccumulate as expanded packed math
                 b = blockB*2
                 a = blockA*2
-                if kernel["LocalDotLayout"] > 1:    # Only supports LocalDotLayout == 2 for now
+                if kernel["LocalDotLayout"] > 1 and innerUnroll == 2:    # Only supports LocalDotLayout == 2 for now
                   lcldot = kernel["LocalDotLayout"]
                   iua = blockA / ((kernel["ThreadTileA"]/2) / lcldot)
                   iub = blockB / ((kernel["ThreadTileB"]/2) / lcldot)
@@ -1422,28 +1452,28 @@ def defineMACMacro(self, kernel, innerUnroll):
                       % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot)
                   bStr = "v[%s+%u]" \
                       % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot)
-                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s //ValuC[%u] iua=%u iub=%u%s" % (cStr, aStr, bStr, cStr, cidx, iua, iub, self.endLine)
+                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //ValuC[%u] iua=%u iub=%u%s" % (cStr, aStr, bStr, cStr, cidx, iua, iub, self.endLine)
                   cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + 1
                   aStr = "v[%s+%u]" \
                       % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot+1)
                   bStr = "v[%s+%u]" \
                       % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot)
                   cStr = "v[%s+%u*2+%u*%u*2+0*2+1]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"]) # *2 b/c of fp32
-                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine)
+                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine)
                   cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 0
                   aStr = "v[%s+%u]" \
                       % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot)
                   bStr = "v[%s+%u]" \
                       % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot+1)
                   cStr = "v[%s+%u*2+%u*%u*2+%u*2+0]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"], kernel["ThreadTile0"]/2)
-                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine)
+                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //ValuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine)
                   cidx = blockA*2 + blockB*kernel["ThreadTile0"]*2 + kernel["ThreadTile0"] + 1
                   aStr = "v[%s+%u]" \
                       % ("vgprValuA_X%u_I%u"%(m,iua), rema*lcldot+1)
                   bStr = "v[%s+%u]" \
                       % ("vgprValuB_X%u_I%u"%(m,iub), remb*lcldot+1)
                   cStr = "v[%s+%u*2+%u*%u*2+%u*2+1]" % ("vgprValuC", blockA, blockB, kernel["ThreadTile0"], kernel["ThreadTile0"]/2)
-                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s //valuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine)
+                  kStr += "v_dot2_f32_f16 %s, %s, %s, %s op_sel:[0,0] op_sel_hi:[1,1] //valuC[%u]%s" % (cStr, aStr, bStr, cStr, cidx, self.endLine)
                   #kStr += self.bomb(-13)
                   """
                   ignore this, not quite correct for mixed precision
@@ -1611,7 +1641,6 @@ def functionSignature(self, kernel ):
 
     # lds size
     #kStr += "  compute_pgm_rsrc2_lds_size = 1 // ?%s" % self.endLine # don't use, it eats up 512 bytes of LDS
-#jgolds which bpe should we use? assuming A
     kStr += "  workgroup_group_segment_byte_size = %u // lds bytes%s" \
         % ( kernel["LdsNumElements"] * self.bpeAB, self.endLine )
 
@@ -1679,8 +1708,6 @@ def functionSignature(self, kernel ):
         kStr += "    v_add_u32 \dst, vcc, \src0, \src1" + self.endLine
     kStr += ".endm" + self.endLine
 
-
-
     ########################################
     # VGPR Macros
     ########################################
@@ -1752,9 +1779,8 @@ def functionSignature(self, kernel ):
 
 
     if kernel["BufferLoad"] or kernel["BufferStore"]:
-      if not kernel["PreciseBoundsCheck"]:
-        kStr += self.comment3("2GB limit - set offsets to -1 to exceed this and clamp")
-        kStr += self.macroRegister("BufferLimit", "0x80000000")
+      kStr += self.comment3("2GB limit - set offsets to -1 to exceed this and clamp")
+      kStr += self.macroRegister("BufferLimit", "0x80000000")
       kStr += self.comment3("Bits 127:96 of SRD.  Set DataFormat = 32 bit")
       kStr += self.macroRegister("Srd127_96",   "0x0020000")
       #TODO-64 : This is max 32-bit negative value, the tail loop
@@ -1772,12 +1798,18 @@ def functionSignature(self, kernel ):
         ("C", range(0, kernel["ProblemType"]["NumIndicesC"]), kernel["BufferStore"]), \
         ("A", kernel["ProblemType"]["IndexAssignmentsA"], kernel["BufferLoad"]), \
         ("B", kernel["ProblemType"]["IndexAssignmentsB"], kernel["BufferLoad"]) ]:
+
+      # BufferStore does not use this macro so don't generate it:
+      if tensorChar == "C" and kernel["BufferStore"]:
+        continue
+
       kStr += self.comment("Global Offset %s"%tensorChar)
       numDim = len(indices)
       idxChars = []
       for i in indices:
         idxChars.append(self.indexChars[i])
 
+
       # macro declaration
       kStr += ".macro GLOBAL_OFFSET_%s vgprAddr"%tensorChar
       for i in range(0, numDim):
@@ -1787,7 +1819,7 @@ def functionSignature(self, kernel ):
             or indices[i] == kernel["ProblemType"]["IndexUnroll"]:
           kStr += " vgprOffset%s" % idxChars[i]
         # other c index sgpr
-        elif indices[i] < kernel["ProblemType"]["NumIndicesC"]:
+        elif indices[i] < kernel["ProblemType"]["NumIndicesC"] and not justOffset32:
           kStr += " sgprOffset%s" % idxChars[i]
         # other sum index
         else:
@@ -1829,7 +1861,7 @@ def functionSignature(self, kernel ):
                 "mul d%u upper"%i)
           needAdd = 1
         # other c index sgpr
-        elif indices[i] < kernel["ProblemType"]["NumIndicesC"]:
+        elif indices[i] < kernel["ProblemType"]["NumIndicesC"] and not justOffset32:
           kStr += inst("v_mov_b32", \
               "v[\\vgprTmp+2]", \
               "s[\\sgprOffset%s]"%idxChars[i], \
@@ -1839,7 +1871,7 @@ def functionSignature(self, kernel ):
               "v[\\vgprTmp+0]", \
               sgpr("Strides%s+%u"%(tensorChar,i-1)), \
               "v[\\vgprTmp+2]",  \
-              "mul d%u lower"%i)
+              "other stride mul d%u lower"%i)
           if not justOffset32:
             kStr += inst("v_mul_hi_u32", \
                 "v[\\vgprTmp+1]", \
@@ -1852,10 +1884,11 @@ def functionSignature(self, kernel ):
           # don't even need to add b/c offset=zero
           needAdd = 0
 
+        destLo = "v[\\vgprAddr+0]"
         if needAdd:
           # addr += offset * stride (lo)
           kStr += inst("_v_add_co_u32", \
-              "v[\\vgprAddr+0]", \
+              destLo, \
               "vcc", \
               "v[\\vgprTmp+0]", \
               offset, \
@@ -1870,15 +1903,15 @@ def functionSignature(self, kernel ):
                 "vcc", \
                 "accumulate d%u upper"%i)
         else:
-          kStr += inst("v_mov_b32", "v[\\vgprAddr+0]", offset, "d0 lower")
+          if destLo != offset:
+            kStr += inst("v_mov_b32", destLo, offset, "setup d0 lower")
           if not justOffset32:
             kStr += inst("v_mov_b32", "v[\\vgprAddr+1]", hex(0), "d0 upper")
 
         # Change offset for subsequent dims (if needed)
-        offset = "v[\\vgprAddr+0]"
+        offset = destLo
 
       # addr *= bytes/element
-#jgolds which bpe should we use? assuming A
       if justOffset32:
         kStr += inst("v_lshlrev_b32", \
             "v[\\vgprAddr+0]", \
@@ -1958,7 +1991,6 @@ def allocateResources(self, kernel):
 
     if self.do["PreLoop"]: 
       # set m0
-#jgolds which bpe here? Using A for now
       kStr += inst("s_mov_b32", "m0", hex(kernel["LdsNumElements"] \
           * self.bpeAB), "LDS clamp at %u bytes" \
           %(kernel["LdsNumElements"] * self.bpeAB) )
@@ -1977,6 +2009,22 @@ def allocateResources(self, kernel):
         kStr += inst("s_load_dword", sgpr("AddressDbg+1"), \
             sgpr("KernArgAddress",2), hex(kernArgOffset), "load addr debug + 1" )
         kernArgOffset += 1*4
+      kStr += inst("s_load_dword", sgpr("Tensor2dSizeC+0"), \
+            sgpr("KernArgAddress",2), hex(kernArgOffset+0), "load tensor size" )
+      kStr += inst("s_load_dword", sgpr("Tensor2dSizeC+1"), \
+            sgpr("KernArgAddress",2), hex(kernArgOffset+4), "load tensor size" )
+      kernArgOffset += 2*4
+      kStr += inst("s_load_dword", sgpr("Tensor2dSizeA+0"), \
+            sgpr("KernArgAddress",2), hex(kernArgOffset+0), "load tensor size" )
+      kStr += inst("s_load_dword", sgpr("Tensor2dSizeA+1"), \
+            sgpr("KernArgAddress",2), hex(kernArgOffset+4), "load tensor size" )
+      kernArgOffset += 2*4
+      kStr += inst("s_load_dword", sgpr("Tensor2dSizeB+0"), \
+            sgpr("KernArgAddress",2), hex(kernArgOffset+0), "load tensor size" )
+      kStr += inst("s_load_dword", sgpr("Tensor2dSizeB+1"), \
+            sgpr("KernArgAddress",2), hex(kernArgOffset+4), "load tensor size" )
+      kernArgOffset += 2*4
+
       kStr += inst("s_load_dword", sgpr("AddressC"), \
           sgpr("KernArgAddress",2), hex(kernArgOffset), "load addr c" )
       kernArgOffset += 1*4
@@ -2042,13 +2090,18 @@ def allocateResources(self, kernel):
         kernArgOffset += 1*4
       for i in range(0, self.numSgprSizesSum):
         kStr += inst("s_load_dword", sgpr("SizesSum+%u"%i), \
-            sgpr("KernArgAddress",2), hex(kernArgOffset), "load size free %u"%i )
+            sgpr("KernArgAddress",2), hex(kernArgOffset), "load size sum %u"%i )
         kernArgOffset += 1*4
+
+
+
       kStr += inst("s_waitcnt", "lgkmcnt(0)", \
           "wait for %u bytes of kern args" % kernArgOffset )
     else:
       kStr += ".if 0\n"
 
+    #kStr += self.bomb()
+
     ########################################
     # Apply User Offsets
     kStr += self.comment("User Offsets")
@@ -2158,6 +2211,8 @@ def allocateResources(self, kernel):
                 kernel["AssertSummationElementMultiple"], 0x1001)
       kStr += self.assert_multiple_b32(sgpr("SizesFree+0"),
                 kernel["AssertFree0ElementMultiple"], 0x1002)
+      kStr += self.assert_multiple_b32(sgpr("SizesFree+1"),
+                kernel["AssertFree1ElementMultiple"], 0x1003)
 
     return kStr
 
@@ -2357,7 +2412,7 @@ def graSubgroup(self, kernel):
 
   ##############################################################################
   # Global Read Addresses: Tile Assignment A/B
-  # stores to v1,2
+  # global read addresses: tile offset assignment (message from .s)
   ##############################################################################
   def graTileAssignment(self, kernel, tP):
     kStr = ""
@@ -2388,9 +2443,19 @@ def graTileAssignment(self, kernel, tP):
       uReg = rReg
       tOpStr = "/"
       uOpStr = "%"
-    tReg2 = self.vgprPool.checkOut(1)
-    kStr += self.comment1("%s = gro%s-tile = serial%s%s + (wg%s*MT%s)" \
-        % (vgpr(tReg2), tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) )
+    kStr += self.comment1("%s = %u" % (divisorName, kernel[divisorName]))
+    if self.groOffsetInMacroTile:
+      tReg2 = tReg
+      # tReg2 and tReg are the same register and value - we store the 'static'
+      # part of the address calculation in the SRD to maximize the
+      # range of the 32-bit GRO
+      kStr += self.comment1("%s = (local)gro%s-tile = serial%s%s (note (wg%s*MT%s) will be added to SRD)" \
+          % (vgpr(tReg2), tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) )
+    else:
+      tReg2 = self.vgprPool.checkOut(1)
+      kStr += self.comment1("%s = gro%s-tile = serial%s%s + (wg%s*MT%s)" \
+          % (vgpr(tReg2), tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) )
+
     kStr += self.comment1("%s = gro%s-unroll = serial%s%s" \
         % (vgpr(uReg), tP["tensorChar"], uOpStr, divisorName) )
     dividendReg = "Serial" # local serial
@@ -2406,10 +2471,13 @@ def graTileAssignment(self, kernel, tP):
       else:
         kStr += self.comment1("gro-unroll *= glvw")
         kStr += staticMultiply(vgpr(uReg), vgpr(uReg), tP["glvw"], sgpr(tmpSgpr))
-    kStr += staticMultiply(vgpr(tmpVgpr), sgpr(tP["wg"]), kernel[tP["mt"]])
-    kStr += inst("_v_add_co_u32", vgpr(tReg2), "vcc", vgpr(tmpVgpr), \
-        vgpr(tReg), "gro%s-tile = serial%s%s*VW + (wg%s*MT%s)" \
-        % (tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) )
+    if not self.groOffsetInMacroTile:
+      # Buffer Load will set the SRD to start of the MacroTile
+      # So don't add the static wg-related component here - save for later.
+      kStr += staticMultiply(vgpr(tmpVgpr), sgpr(tP["wg"]), kernel[tP["mt"]])  # workgroup
+      kStr += inst("_v_add_co_u32", vgpr(tReg2), "vcc", vgpr(tmpVgpr), \
+          vgpr(tReg), "gro%s-tile = serial%s%s*VW + (wg%s*MT%s)" \
+          % (tP["tensorChar"], tOpStr, divisorName, tP["tensorChar"], tP["tensorChar"]) )
 
     if kernel["GlobalSplitU"] > 1:
       uReg2 = self.vgprPool.checkOut(1)
@@ -2429,7 +2497,8 @@ def graTileAssignment(self, kernel, tP):
   ##############################################################################
   def graUnrollAssignment(self, kernel, tP):
     kStr = ""
-    if kernel["GlobalSplitU"] > 1:
+    # note groOffsetInMacroTile rolls these into SRD so don't change here:
+    if not self.groOffsetInMacroTile and kernel["GlobalSplitU"] > 1:
       gsuOffset = self.vgprPool.checkOut(1)
       kStr += inst("v_mov_b32", vgpr(gsuOffset), sgpr("GSUSumIdx"), "=gsuSumIdx")
       if kernel["GlobalSplitUSummationAssignmentRoundRobin"]:
@@ -2454,6 +2523,7 @@ def graUnrollAssignment(self, kernel, tP):
         kStr += inst("v_mul_lo_u32", vgpr(gsuOffset), vgpr(quotient), \
             vgpr(gsuOffset), "gsuOffset=gsuSumIdx*(SizeU/GSU)")
         self.vgprPool.checkIn(quotient)
+
       kStr += inst("_v_add_co_u32", vgpr(tP["gpr"]["uReg"]), "vcc", \
           vgpr(gsuOffset), vgpr(tP["gpr"]["uReg"]), \
           "graUnrollAssignment += gsuOffset")
@@ -2487,38 +2557,46 @@ def graOtherSummationAssignments(self, kernel):
   ##############################################################################
   def graTileOffsets(self, kernel, tP):
     kStr = ""
-    numTileOffsets = tP["nrt"]
-    if tP["rtc"]:
-      numTileOffsets *= tP["glvw"]
-    tP["vgprTileOffsets"] = self.vgprPool.checkOut(numTileOffsets)
-    v = tP["vgprTileOffsets"]
-    strideIdx = tP["lsc"] if tP["tlu"] else tP["lsp"]
-    stride = kernel[strideIdx]
-    if tP["rtc"]:
-      # l=0, s=0
-      kStr += inst("v_mov_b32", vgpr(v), \
-          vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, 0) )
-      # l=0, s>0
-      for s in range(1, tP["glvw"]):
-        kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \
-            vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, s) )
-      for l in range(1, tP["nrt"]):
-        # l>0, s=0
-        kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \
-            vgpr(v+(l-1)*tP["glvw"]), \
-            "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], tP["tileChar"], l, 0, strideIdx) )
-        # l>0, s>0
-        for s in range(1, tP["glvw"]):
-          kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \
-              1, vgpr(v+l*tP["glvw"]+(s-1)), \
-              "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], l, s) )
+    if kernel["UseSgprForGRO"]:
+      # Let the vgprTileOffsets checkin handle tReg, don't need to do it here
+      tP["vgprTileOffsets"] = tP["gpr"]["tReg"] 
     else:
-      kStr += inst("v_mov_b32", vgpr(v), \
-          vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u"%(tP["tensorChar"], tP["tileChar"], 0) )
-      for l in range(1, tP["nrt"]):
-        kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \
-            vgpr(v+l-1), "gro%s%s_%u += %s"%(tP["tensorChar"], tP["tileChar"], l, strideIdx) )
-    self.vgprPool.checkIn(tP["gpr"]["tReg"])
+      numTileOffsets = tP["nrt"]
+      if tP["rtc"]:
+        numTileOffsets *= tP["glvw"]
+      tP["vgprTileOffsets"] = self.vgprPool.checkOut(numTileOffsets)
+      v = tP["vgprTileOffsets"]
+      strideIdx = tP["lsc"] if tP["tlu"] else tP["lsp"]
+      stride = kernel[strideIdx]
+      if tP["rtc"]:
+        # l=0, s=0
+        kStr += inst("v_mov_b32", vgpr(v), \
+            vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, 0) )
+        # l=0, s>0
+        for s in range(1, tP["glvw"]):
+          kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \
+              vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], 0, s) )
+        for l in range(1, tP["nrt"]):
+          # l>0, s=0
+          kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \
+              vgpr(v+(l-1)*tP["glvw"]), \
+              "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], tP["tileChar"], l, 0, strideIdx) )
+          # l>0, s>0
+          for s in range(1, tP["glvw"]):
+            kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \
+                1, vgpr(v+l*tP["glvw"]+(s-1)), \
+                "gro%s%s_%u_s%u"%(tP["tensorChar"], tP["tileChar"], l, s) )
+      else:
+        kStr += inst("v_mov_b32", vgpr(v), \
+            vgpr(tP["gpr"]["tReg"]), "gro%s%s_%u"%(tP["tensorChar"], tP["tileChar"], 0) )
+        for l in range(1, tP["nrt"]):
+          kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \
+              vgpr(v+l-1), "gro%s%s_%u += %s"%(tP["tensorChar"], tP["tileChar"], l, strideIdx) )
+
+      # groOffsetInMacroTile uses same register for both of these, don't free it here:
+      if tP["gpr"]["lwoT"] != tP["gpr"]["tReg"] :
+        self.vgprPool.checkIn(tP["gpr"]["tReg"])
+        tP["gpr"]["tReg"] = None
     return kStr
 
 
@@ -2527,38 +2605,41 @@ def graTileOffsets(self, kernel, tP):
   ##############################################################################
   def graUnrollOffsets(self, kernel, tP):
     kStr = ""
-    numUnrollOffsets = tP["nru"]
-    if tP["ruc"]:
-      numUnrollOffsets *= tP["glvw"]
-    tP["gpr"]["unrollOffsets"] = self.vgprPool.checkOut(numUnrollOffsets)
-    v = tP["gpr"]["unrollOffsets"]
-    strideIdx = (tP["lsp"] if tP["tlu"] else tP["lsc"])
-    stride = kernel[strideIdx]
-    if tP["ruc"]:
-      # l=0, s=0
-      kStr += inst("v_mov_b32", vgpr(v), \
-          vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, 0) )
-      # l=0, s>0
-      for s in range(1, tP["glvw"]):
-        kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \
-            vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) )
-      for l in range(1, tP["nru"]):
-        # l>0, s=0
-        kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \
-            vgpr(v+(l-1)*tP["glvw"]), \
-            "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], self.unrollChar, l, 0, strideIdx) )
-        # l>0, s>0
-        for s in range(1, tP["glvw"]):
-          kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \
-              1, vgpr(v+l*tP["glvw"]+(s-1)), \
-              "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) )
+    if kernel["UseSgprForGRO"]:
+      tP["gpr"]["unrollOffsets"] = tP["gpr"]["uReg"]
     else:
-      kStr += inst("v_mov_b32", vgpr(v), \
-          vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u"%(tP["tensorChar"], self.unrollChar, 0) )
-      for l in range(1, tP["nru"]):
-        kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \
-            vgpr(v+l-1), "gro%s%s_%u + %s"%(tP["tensorChar"], self.unrollChar, l, strideIdx) )
-    #self.vgprPool.checkIn(tP["gpr"]["uReg"])
+      numUnrollOffsets = tP["nru"]
+      if tP["ruc"]:
+        numUnrollOffsets *= tP["glvw"]
+      tP["gpr"]["unrollOffsets"] = self.vgprPool.checkOut(numUnrollOffsets)
+      v = tP["gpr"]["unrollOffsets"]
+      strideIdx = (tP["lsp"] if tP["tlu"] else tP["lsc"])
+      stride = kernel[strideIdx]
+      if tP["ruc"]:
+        # l=0, s=0
+        kStr += inst("v_mov_b32", vgpr(v), \
+            vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, 0) )
+        # l=0, s>0
+        for s in range(1, tP["glvw"]):
+          kStr += inst("_v_add_co_u32", vgpr(v+s), "vcc", 1, \
+              vgpr(v+s-1), "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) )
+        for l in range(1, tP["nru"]):
+          # l>0, s=0
+          kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]), "vcc", stride, \
+              vgpr(v+(l-1)*tP["glvw"]), \
+              "gro%s%s_%u_s%u + %s"%(tP["tensorChar"], self.unrollChar, l, 0, strideIdx) )
+          # l>0, s>0
+          for s in range(1, tP["glvw"]):
+            kStr += inst("_v_add_co_u32", vgpr(v+l*tP["glvw"]+s), "vcc", \
+                1, vgpr(v+l*tP["glvw"]+(s-1)), \
+                "gro%s%s_%u_s%u"%(tP["tensorChar"], self.unrollChar, 0, s) )
+      else:
+        kStr += inst("v_mov_b32", vgpr(v), \
+            vgpr(tP["gpr"]["uReg"]), "gro%s%s_%u"%(tP["tensorChar"], self.unrollChar, 0) )
+        for l in range(1, tP["nru"]):
+          kStr += inst("_v_add_co_u32", vgpr(v+l), "vcc", stride, \
+              vgpr(v+l-1), "gro%s%s_%u + %s"%(tP["tensorChar"], self.unrollChar, l, strideIdx) )
+      #self.vgprPool.checkIn(tP["gpr"]["uReg"])
     return kStr
 
 
@@ -2570,43 +2651,54 @@ def graBranch(self, kernel, tP):
 
   ##############################################################################
   # Global Read Addresses: Shift A/B
+  # See if the load (including vw) will extend past the 'free' dim of the 
+  # tensor.  If so clip to the last legal value which is inside the array
   ##############################################################################
   def graShift(self, kernel, tP):
-    if kernel["PreciseBoundsCheck"]: return ""
+    # PBC doesn't shift pointers; it uses a different edge-detection mechanism
+    # FractionalLoad maps addresses in a different way?
+    if kernel["PreciseBoundsCheck"] : return ""
 
     kStr = ""
     # edge value
     margin = tP["glvw"] if tP["rtv"] else 1
     edge = self.vgprPool.checkOut(1)
 
-
-    if kernel["BufferLoad"] and kernel["PreciseBoundsCheck"]:
-      # Go to the edge. we can rely on preciseboundscheck to keep things inline
-      # Results in more loads of 0 which is better for power and debug
-      kStr += inst("v_mov_b32", vgpr(edge), sgpr("SizesFree+%u"%tP["idx"]), \
-                "edge = Size%s"%(tP["tileChar"]) )
+    if self.groOffsetInMacroTile:
+      # Subtract the static component from SizesFree:
+      # TODO - this code is dead since PreciseBoundsCheck returns above
+      tmpSgpr = self.getTmpSgpr(1)
+      kStr += inst("s_mul_i32", sgpr(tmpSgpr), sgpr(tP["wg"]), kernel[tP["mt"]], "WorkGroup[01] * MT")
+      kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr("SizesFree+%u"%tP["idx"]), sgpr(tmpSgpr), \
+                "edge = Size%s - WG*MT"%(tP["tileChar"]))
+      kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr(tmpSgpr), margin, "edge -= margin")
+      kStr += inst("v_mov_b32", vgpr(edge), sgpr(tmpSgpr), \
+          "edge vgpr = Size%s-%u"%(tP["tileChar"], margin) )
     else:
       tmpSgpr = self.getTmpSgpr(1)
-      kStr += inst("s_add_u32", sgpr(tmpSgpr), hex(-margin), sgpr("SizesFree+%u"%tP["idx"]), \
+      kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr("SizesFree+%u"%tP["idx"]), margin, \
           "edge = Size%s-%u"%(tP["tileChar"], margin) )
       kStr += inst("v_mov_b32", vgpr(edge), sgpr(tmpSgpr), \
-          "edge = Size%s-%u"%(tP["tileChar"], margin) )
+          "edge vgpr = Size%s-%u"%(tP["tileChar"], margin) )
+
+    if kernel["CheckDimOverflow"]:
+      # if the tensor is really skinny (SizesFree is less than glvw) then shifting fails -
+      # we can detect that here if the computed edge after subtracting margin is <0
+      kStr += self.assert_ge_i32(vgpr(edge), 0)
+    #kStr += self.assert_ne(sgpr("WorkGroup0"),1)
 
     # shift offsets
     v = tP["vgprTileOffsets"]
     tmpSgpr = self.getTmpSgpr(2)
     for l in range(0, tP["nrt"]):
       # compare
-      #kStr += dump(vgpr(v+l))
       kStr += inst("v_cmp_lt_u32", sgpr(tmpSgpr,2), vgpr(v+l), vgpr(edge), "offset < edge" )
       # shift
       kStr += inst("v_cndmask_b32", vgpr(v+l), vgpr(edge), vgpr(v+l), sgpr(tmpSgpr,2), "offset = (offset < edge) ? offset : edge" )
-      #kStr += dump(vgpr(v+l))
     self.vgprPool.checkIn(edge)
     #if tP["isB"]:
     #  kStr += "s_endpgm\n"
 
-
     return kStr
 
   ##############################################################################
@@ -2651,7 +2743,7 @@ def graFinalOffsets(self, kernel, tP):
                 if i < kernel["ProblemType"]["NumIndicesC"]:
                   if i == tP["tileIdx"]:
                     kStr += ", %2u" % vgprTile
-                  else: # just a group index
+                  elif not kernel["BufferLoad"]: # just a group index
                     kStr += ", sgprWorkGroup%u"%i
                 else: # summation index
                   if i == kernel["ProblemType"]["IndexUnroll"]:
@@ -2693,7 +2785,7 @@ def graFinalOffsets(self, kernel, tP):
               if tP["tlu"]:
                 tileStride   = kernel[tP["lsc"]] * (para*tVW + sPara*tVS)
                 unrollStride = kernel[tP["lsp"]] * (perp*uVW + sPerp*uVS)
-                kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s"%tc), unrollStride, \
+                kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s+0"%tc), unrollStride, \
                              "compute offset diff (scaled unrollDim)")
                 if tileStride:
                   kStr += inst("s_add_u32", sgpr(scalarGro), sgpr(scalarGro), tileStride, \
@@ -2701,7 +2793,7 @@ def graFinalOffsets(self, kernel, tP):
               else:
                 tileStride   = kernel[tP["lsp"]] * (perp*tVW + sPara*tVS)
                 unrollStride = kernel[tP["lsc"]] * (para*uVW + sPerp*uVS)
-                kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s"%tc), tileStride, \
+                kStr += inst("s_mul_i32", sgpr(scalarGro), sgpr("Strides%s+0"%tc), tileStride, \
                              "compute offset diff (scaled tileDim)")
                 if unrollStride:
                   kStr += inst("s_add_u32", sgpr(scalarGro), sgpr(scalarGro), unrollStride, \
@@ -2717,15 +2809,14 @@ def graFinalOffsets(self, kernel, tP):
 
               if self.checkGRO:
                 # Debug mode to verify that the computed offsets are offset by the expected scalar
-
                 print tc, "tileStride=", tileStride, "unrollStride=", unrollStride, \
                       "Strides%s="%tc
 
                 kStr += self.assert_vector_diff(vgpr("GlobalReadOffset%s+%u"%(tc,0)), \
                                                 vgpr("GlobalReadOffset%s+%u"%(tc,graIdx)), \
                                                 sgpr(scalarGro))
-              #-- End UseSgprForGRO
 
+              #-- End UseSgprForGRO
             # dump final offsets
             # BufferLoad flavor:
             #if tP["isA"]:
@@ -2734,9 +2825,18 @@ def graFinalOffsets(self, kernel, tP):
             #kStr += dump(vgpr("GlobalReadAddr%s+%u+0"%(tP["tensorChar"], graIdx)))
             #kStr += dump(vgpr("GlobalReadAddr%s+%u+1"%(tP["tensorChar"], graIdx)))
             graIdx += self.rpgo if kernel["BufferLoad"] else self.rpga
-    self.vgprPool.checkIn(tileOffsets)
-    self.vgprPool.checkIn(unrollOffsets)
+
+    if not kernel["UseSgprForGRO"]:
+      self.vgprPool.checkIn(tP["vgprTileOffsets"])
+      tP["vgprTileOffsets"] = None
+      # UseSgprForGRO uses same vgpr for ureg and unrollOffsets so
+      # let checkin(ureg) do the deallocating
+      # vgprTileOffsets is renamed version of treg/lwo so deallocate
+      # it here
+      self.vgprPool.checkIn(unrollOffsets)
     self.vgprPool.checkIn(tmp)
+    #if tP["isB"]:
+    #  kStr += self.bomb(0x100)
 
     if kernel["FractionalLoad"] and kernel["fractionalPerpOverhang%s"%tc]:
       overhang = kernel["fractionalPerpOverhang%s"%tc]
@@ -2749,7 +2849,6 @@ def graFinalOffsets(self, kernel, tP):
           sgpr("PerpOverhangVcc%s"%tc), \
           "fractional-overhang: some wi write to harmless LDS location")
 
-
     return kStr
 
   ##############################################################################
@@ -2760,8 +2859,155 @@ def graApplyUserOffsets(self, kernel):
     kStr += self.comment1("moved earlier")
     return kStr
 
+
   ##############################################################################
-  # Global Read Addresses: Addresses A/B
+  # Add the constant offsets to the specified srd.
+  # Srd is set to point to the base of the tile. All offsets except lowest-order
+  # 2d dims are computed into the SRD.
+  # GRO are offset from the tile SRD and the first GRO will be 0
+  # Only called for BufferLoad=1 (or eventually BufferStore=1)
+  ##############################################################################
+  def computeSrd(self, kernel, tP, tc, indices, bpe):
+    kStr = ""  # accumulates the generated instruction text; returned at the end
+
+    stmp = self.getTmpSgpr(6+2) # stmp+0..1 scratch, stmp+4..5 tileStart, stmp+6 only used by disabled debug code; bozo, trim the extra +2
+    tileStart = stmp+4  # 64-bit value: tileStart+0 is the low dword, tileStart+1 the high dword
+    wroteTileStart = False  # becomes True once tileStart holds a value that must be added to the SRD base
+
+    #---
+    # Compute tileStart #elements from the 2D array start
+    # Add tile (and unroll if GSU) component into SRD - SRD will point to beginning of the macro-tile:
+    if self.groOffsetInMacroTile:
+      wroteTileStart = True
+      startStride = 1 if kernel["ProblemType"]["UseInitialStrides"] else 0
+
+      # This is guaranteed to fit in 32-bit since the WG*MT is a number of elements in some unsigned direction:
+      kStr += self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr(tP["wg"]), kernel[tP["mt"]], "WorkGroup[01] * MT")
+      if kernel["CheckDimOverflow"] >=2:
+        kStr += self.assert_eq(sgpr(tileStart+1),0)
+      if not tP["tlu"]: # transpose case, tile is in perp dim and should be scaled by Stride
+        kStr += self.s_mul_u64_u32(sgpr(tileStart), sgpr(tileStart+1), sgpr(tileStart+0), \
+                  sgpr("Strides%s+%u"%(tc,startStride)), "tlu=0, scaled tile-offset by stride")
+
+      if kernel["GlobalSplitU"] > 1:
+        # Only GlobalSplitUSummationAssignmentRoundRobin supported for groOffsetInMacroTile - would need different math here for start:
+        assert(kernel["GlobalSplitUSummationAssignmentRoundRobin"])
+
+        kStr += self.s_mul_u64_u32(sgpr(stmp+0), sgpr(stmp+1), kernel["DepthU"], sgpr("GSUSumIdx"), "gsuOffset = DepthU*bpe*GSUSumIdx")
+        if kernel["CheckDimOverflow"] >=2:
+          kStr += self.assert_eq(sgpr(stmp+1),0)
+        if tP["tlu"]: # tlu=1 case, the unroll (GSU) offset is the one scaled by Stride
+          kStr += self.s_mul_u64_u32(sgpr(stmp), sgpr(stmp+1), sgpr(stmp+0), \
+                    sgpr("Strides%s+%u"%(tc,startStride)), "tlu=1, scaled unroll-offset by stride")
+
+        kStr += inst("s_add_u32",  sgpr(tileStart+0), sgpr(tileStart+0), sgpr(stmp+0), "accum GsuOffet term to tilestart")
+        kStr += inst("s_addc_u32", sgpr(tileStart+1), sgpr(tileStart+1), sgpr(stmp+1), "accum GsuOffet term to tilestart")
+
+
+    # Output : tileStart[0:1] have offset in elements from the 2D start of the tile.
+    # if groOffsetInMacroTile=1, 2DStart + tileStart gives the start of the macro-tile;
+    # This is used to compute the limit.
+    # Later we modify tileStart to include batch and higher-order dims and add this to SRD.
+
+    #---
+    # Compute BUFFER Limit:
+    if kernel["PreciseBoundsCheck"]:
+      if not wroteTileStart:
+        kStr += inst("s_mov_b32", sgpr(tileStart+0), 0, "set default tileStart")
+        kStr += inst("s_mov_b32", sgpr(tileStart+1), 0, "set default tileStart")
+
+      startStride = 1 if kernel["ProblemType"]["UseInitialStrides"] else 0  # NOTE(review): not referenced again in this method - looks removable, confirm
+      if self.use64bPbcLimit:
+        limitTmp0 = "SrdShadowLimit%s+0"%tc
+        limitTmp1 = "SrdShadowLimit%s+1"%tc
+      else:
+        limitTmp0 = stmp+0
+        limitTmp1 = stmp+1
+
+      kStr += inst("s_sub_u32",  sgpr(limitTmp0), sgpr("Tensor2dSize%s"%tc), sgpr(tileStart+0), "sub tileStart")
+      kStr += inst("s_subb_u32", sgpr(limitTmp1), sgpr("Tensor2dSize%s+1"%tc), sgpr(tileStart+1), "sub tileStart")
+
+      if self.use64bPbcLimit:
+        # Set initial buffer limit
+        # if the limit does not fit in 32 bits, Srd+2 gets BufferLimit; incrementSrd decrements the shadow as the SRD increments and copies it into Srd+2 once the upper dword is 0
+        # if the limit already fits in 32 bits, set it accurately here:
+        # Note for lshl_b64 the higher-numbered SGPR has the upper 32-bits
+        kStr += inst("s_lshl_b64", sgpr("SrdShadowLimit%s"%tc,2),  sgpr("SrdShadowLimit%s"%tc,2), hex(log2(tP["bpe"])), "Set limit to use bytes")
+        kStr += inst("s_cmp_eq_u32", sgpr("SrdShadowLimit%s+1"%tc), 0, "are we within 2^32?")
+        kStr += inst("s_cselect_b32", sgpr("Srd%s+2"%tc), sgpr("SrdShadowLimit%s+0"%tc), "BufferLimit", "Move shadow to real if we are within 2^32")
+
+      else:
+        # put limit directly into SRD:
+        kStr += inst("s_lshl_b32", sgpr("Srd%s+2"%tc),  sgpr(stmp+0), hex(log2(tP["bpe"])), "Set limit to use bytes")
+    else:
+      # PreciseBoundsCheck=0, just pick a large max - later conditionally set some offsets to -1 to force OOB
+      kStr += inst("s_mov_b32", sgpr("Srd%s+2"%tc), "BufferLimit", "")
+      kStr += "\n"
+
+
+    # Apply any high-order address components to the tileStart and eventually the SRD - these include batch idx for batched gemm, >4D tensors, etc
+    numDim = len(indices)
+    for i in range(1, numDim):
+      idx = indices[i]
+      if idx == kernel["ProblemType"]["Index0"] \
+          or idx == kernel["ProblemType"]["Index1"] \
+          or idx == kernel["ProblemType"]["IndexUnroll"]:
+            continue # these will be captured in GRO not the SRD
+      else:
+        if not wroteTileStart:
+          kStr += self.s_mul_u64_u32(sgpr(tileStart+0), sgpr(tileStart+1), sgpr("Strides%s+%u"%(tc,i-1)), sgpr("WorkGroup%u"%i), "Stride*WG")
+          wroteTileStart = True
+        else:
+          kStr += self.s_mul_u64_u32(sgpr(stmp+0), sgpr(stmp+1), sgpr("Strides%s+%u"%(tc,i-1)), sgpr("WorkGroup%u"%i), "Stride*WG")
+          kStr += inst("s_add_u32",  sgpr(tileStart+0), sgpr(tileStart+0), sgpr(stmp+0), "accum wg term to tilestart")
+          kStr += inst("s_addc_u32", sgpr(tileStart+1), sgpr(tileStart+1), sgpr(stmp+1), "accum wg term to tilestart")
+
+
+    # Add the tile start (converted from elements to bytes) to the SRD base, or copy Address unchanged if no tileStart was computed
+    if wroteTileStart:
+      kStr += inst("s_lshl_b64", sgpr(tileStart,2), sgpr(tileStart,2), log2(bpe), "tileStart *= BPE")
+      kStr += inst("s_add_u32",  sgpr("Srd%s+0"%tc), sgpr("Address%s+0"%tc), sgpr(tileStart+0), "SRD_base = Address+ tileStart0")
+      kStr += inst("s_addc_u32", sgpr("Srd%s+1"%tc), sgpr("Address%s+1"%tc), sgpr(tileStart+1), "SRD_base = Address+ tileStart1");
+    else:
+      kStr += inst("s_mov_b32", sgpr("Srd%s+0"%tc), sgpr("Address%s+0"%tc), "init SRD base address (lower )" )
+      kStr += inst("s_mov_b32", sgpr("Srd%s+1"%tc), sgpr("Address%s+1"%tc), "init SRD base address (upper) + other fields" )
+
+    kStr += inst("s_mov_b32", sgpr("Srd%s+3"%tc), "Srd127_96", "Set bits 127_96 in SRD")
+
+    #if tP["isB"]:
+    #  kStr += self.assert_ne(sgpr("WorkGroup2"), 1)
+
+
+    if kernel["PreciseBoundsCheck"] and kernel["CheckDimOverflow"]>=2:
+      # double-check to make sure the SRD limit is inside the allowed tensor:
+      # (only works in PBC mode since otherwise we set the limit to BufferLimit)
+      #   - compute the end of the tensor in bytes (Tensor2dSize scaled by bpe, plus Address)
+      #   - subtract the SRD base and SRD buffer limit
+      #   - Make sure the 64-bit result is >= 0 (upper dword must be 0)
+      kStr += inst("s_lshl_b64", sgpr(stmp,2), sgpr("Tensor2dSize%s"%tc,2), log2(bpe), "tensor size in bytes")
+      kStr += inst("s_add_u32",  sgpr(stmp+0), sgpr(stmp+0), sgpr("Address%s+0"%tc), "add start ptr to compute tensor%s bot-right"%tc)
+      kStr += inst("s_addc_u32", sgpr(stmp+1), sgpr(stmp+1), sgpr("Address%s+1"%tc), "add start ptr to compute tensor%s bot-right"%tc)
+      kStr += inst("s_sub_u32",  sgpr(stmp+0), sgpr(stmp+0), sgpr("Srd%s+0"%tc), "sub SRD base")
+      kStr += inst("s_subb_u32", sgpr(stmp+1), sgpr(stmp+1), sgpr("Srd%s+1"%tc), "sub SRD base")
+      if self.use64bPbcLimit:
+        kStr += inst("s_sub_u32", sgpr(stmp+0), sgpr(stmp+0), sgpr("SrdShadowLimit%s+0"%tc), "sub buffer size")
+        kStr += inst("s_subb_u32", sgpr(stmp+1), sgpr(stmp+1), sgpr("SrdShadowLimit%s+1"%tc), "sub buffer size")
+      else:
+        kStr += inst("s_sub_u32",  sgpr(stmp+0), sgpr(stmp+0), sgpr("Srd%s+2"%tc), "sub buffer limit")
+
+      kStr += self.assert_eq(sgpr(stmp+1), 0)  # must be 0 or we are way OOB
+      kStr += self.assert_ge_u32(sgpr(stmp+0), 0) # diff must be >= 0; NOTE(review): an unsigned >= 0 compare presumably always passes - confirm
+      if 0 and tP["isB"]:  # disabled debug code
+        t = self.vgprPool.checkOut(1)
+        kStr += inst("s_add_u32", sgpr(stmp+6), sgpr("WorkGroup1"), sgpr("WorkGroup2"), "bozo, debug")
+        kStr += inst("v_mov_b32", vgpr(t), 0x54, "")
+        kStr += self.assert_ne(sgpr(stmp+6), vgpr(t) )
+        self.vgprPool.checkIn(t)
+
+    return kStr
+
+  ##############################################################################
+  # Global Read Addresses: Addresses A/B
   ##############################################################################
   def graAddresses(self, kernel, tP):
     kStr = ""
@@ -2771,31 +3017,10 @@ def graAddresses(self, kernel, tP):
     if kernel["BufferLoad"]:
       # maxAddrSgpr = size[n] * stride[n-1]
       kStr += self.comment1("max read offset = size[n] * stride[n-1]")
-      dim = len(tP["ia"])-1 # dim
-      strideIdx = dim-1 # largest stride
-      sizeIdx = tP["ia"][dim]
 
-      sizeIdxIsSum = sizeIdx in kernel["ProblemType"]["IndicesSummation"]
-      if sizeIdxIsSum:
-        sizeIdx -= kernel["ProblemType"]["NumIndicesC"]
+      kStr += self.computeSrd(kernel, tP, tc, kernel["ProblemType"]["IndexAssignments%s"%tc], tP["bpe"])
 
-      # Buffer-load uses one base read pointer stored in the SRD - set it here:
-      kStr += inst("s_mov_b32", sgpr("Srd%s+0"%tc), sgpr("Address%s+0"%tc), "init SRD base address (lower)" )
-      kStr += inst("s_mov_b32", sgpr("Srd%s+1"%tc), sgpr("Address%s+1"%tc), "init SRD base address (upper) + other fields" )
-      if kernel["PreciseBoundsCheck"]:
-        kStr += inst("s_mul_i32", \
-            sgpr("Srd%s+2"%tc), \
-            sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)),  \
-            sgpr("Strides%s+%u"%(tc,strideIdx)), \
-            "set limit to bottom-right corner of array")
-        kStr += inst("s_lshl_b32",
-            sgpr("Srd%s+2"%tc), \
-            sgpr("Srd%s+2"%tc), \
-            hex(log2(tP["bpe"])), \
-            "Size in bytes") #TODO-64B
-      else:
-        kStr += inst("s_mov_b32", sgpr("Srd%s+2"%tc), "BufferLimit", "")
-      kStr += inst("s_mov_b32", sgpr("Srd%s+3"%tc), "Srd127_96", "Set bits 127_96 in SRD")
+      #kStr += self.bomb(0x13) # after addresses and SRD set
     else:
       tmp = self.vgprPool.checkOut(2)
       kStr += inst("v_mov_b32", vgpr(tmp+0), sgpr("Address%s+0"%tP["tensorChar"]), "" )
@@ -2845,7 +3070,6 @@ def graIncrements(self, kernel, loopIdx, tP):
       if tP["tlu"]:
         if self.globalReadIncsUseVgpr:
           tmpSgpr = self.getTmpSgpr(1)
-#jgolds which bpe here? assuming tP
           kStr += inst("s_mul_i32", sgpr(tmpSgpr+0), \
               hex(depthU*tP["bpe"]), sgpr("Strides%s"%tP["tensorChar"]), \
               "incr = stride*%u*bytes"%depthU )
@@ -2869,7 +3093,6 @@ def graIncrements(self, kernel, loopIdx, tP):
               sgpr(tmpSgpr+1), \
               "" )
         else: # not globalReadIncsUseVgpr, ie use SGPR
-#jgolds which bpe here? assuming tP
           kStr += inst("s_mul_i32", sgpr("GlobalReadIncs%s+0"%tP["tensorChar"]), \
               hex(depthU*tP["bpe"]), sgpr("Strides%s"%tP["tensorChar"]), \
               "incr = stride*%u*bytes"%depthU )
@@ -2886,7 +3109,6 @@ def graIncrements(self, kernel, loopIdx, tP):
               "(carry)")
 
       else: # transposed
-#jgolds which bpe here? assuming tP
         if self.globalReadIncsUseVgpr:
           kStr += inst("v_mov_b32", vgpr("GlobalReadIncs%s+0"%tP["tensorChar"]), \
               hex(depthU*tP["bpe"]), \
@@ -2905,6 +3127,8 @@ def graIncrements(self, kernel, loopIdx, tP):
 
     #kStr += dump(vgpr("GlobalReadIncs%s"%tP["tensorChar"]))
     #kStr += "s_endpgm\n"
+    #if tP["isB"]:
+    #  kStr += self.bomb(0x100)
     return kStr
 
   ##############################################################################
@@ -2961,7 +3185,7 @@ def lwaFirstOffset(self, kernel, tP):
           vgpr(uReg), \
           ~(kernel["LocalDotLayout"]-1), \
           vgpr(uReg), \
-          "uReg & LDL")
+          "uReg & ~LDL")
       kStr += inst("v_mul_u32_u24", \
           vgpr(uReg), \
           hex(kernel["MacroTile%s"%tP["tensorChar"]] + kernel["LdsPad%s"%tc]), \
@@ -2997,6 +3221,7 @@ def lwaFirstOffset(self, kernel, tP):
           "lwFOB = lwB%s + lwB%s*MT%s + LDS_OFFSET_B=%u*%u" % (tP["tileChar"], \
           self.unrollChar, tP["tileChar"], kernel["LdsOffsetB"], self.bpeAB) )
     self.vgprPool.checkIn(tP["gpr"]["lwoT"])
+    tP["gpr"]["lwoT"] = None
     self.vgprPool.checkIn(tP["gpr"]["uReg"])
     if kernel["GlobalSplitU"] > 1:
       self.vgprPool.checkIn(tP["gpr"]["uReg2"])
@@ -3142,7 +3367,6 @@ def lraFinalOffset(self, kernel, tP):
     #if tP["isA"]:
     #  kStr += self.bomb(113)
 
-
     # dump lra final offset
     #if tP["isA"]:
     #  kStr += dump(vgpr("LocalReadAddr%s"%tP["tensorChar"]))
@@ -3162,7 +3386,6 @@ def lraDeclareAddresses(self, kernel, tP):
     if tP["isA"]:
       return self.comment1("N/A")
     else:
-#jgolds which bpe here? Looks like tP, which is B
       return inst("_v_add_co_u32", \
           vgpr("LocalReadAddr%s+0"%tP["tensorChar"]), \
           "vcc", \
@@ -3459,13 +3682,26 @@ def incrementSrd(self, kernel, tP, incLower, incUpper):
 
     # also have to move the boundary since we change the base
     # so less buffers to the edge:
-    # TODO-64
     if kernel["PreciseBoundsCheck"]:
-      kStr += inst("s_sub_u32 ", \
-           sgpr("Srd%s+2"%(tc)), \
-           sgpr("Srd%s+2"%(tc)), \
-           incLower, \
-            "limit -= inc)" )
+      if self.use64bPbcLimit:
+        kStr += inst("s_sub_u32", \
+            sgpr("SrdShadowLimit%s+0"%tc), \
+            sgpr("SrdShadowLimit%s+0"%tc), \
+             incLower, \
+              "limit -= inc)")
+        kStr += inst("s_subb_u32", \
+            sgpr("SrdShadowLimit%s+1"%tc), \
+            sgpr("SrdShadowLimit%s+1"%tc), \
+             incUpper, \
+              "limit -= inc)" )
+        kStr += inst("s_cmp_eq_u32", sgpr("SrdShadowLimit%s+1"%tc), 0, "are we within 2^32?")
+        kStr += inst("s_cmov_b32", sgpr("Srd%s+2"%tc), sgpr("SrdShadowLimit%s+0"%tc), "Move shadow to real if we are within 2^32")
+      else:
+        kStr += inst("s_sub_u32", \
+             sgpr("Srd%s+2"%(tc)), \
+             sgpr("Srd%s+2"%(tc)), \
+             incLower, \
+              "limit -= inc)" )
 
     return kStr
 
@@ -3569,11 +3805,10 @@ def globalReadDo(self, kernel, guardK, tP):
       ########################################
       # Calculate Max Addr
       ########################################
-      maxAddrSgpr = self.getTmpSgpr(2) # 3+6 = 9 sgprs available
-      tmpSgpr = maxAddrSgpr + 2 # 7 sgprs available
+      maxAddrSgpr = self.getTmpSgpr(4)
+      tmpSgpr = maxAddrSgpr + 2
       #dumpVgpr = self.vgprPool.checkOut(1)
 
-      # TODO-64B:
       # Assumes the product of the two sizes is <4GB here.
       # We would need to slide the SRD if this is not the case.
       kStr += self.comment1("max read address = size[n] * stride[n-1]")
@@ -3582,8 +3817,45 @@ def globalReadDo(self, kernel, guardK, tP):
       sizeIdx = tP["ia"][dim]
       sizeIdxIsSum = sizeIdx in kernel["ProblemType"]["IndicesSummation"]
       if sizeIdxIsSum:
-	sizeIdx -= kernel["ProblemType"]["NumIndicesC"]
-      if kernel["BufferLoad"] and not kernel["PreciseBoundsCheck"]:
+        sizeIdx -= kernel["ProblemType"]["NumIndicesC"]
+
+      if not kernel["PreciseBoundsCheck"]:
+        # PBC moves the limit as SRD moves forward so don't need to reset boundary
+        # Else find the edge of the matrix and compute bounds
+
+        if 1:
+          kStr += self.s_mul_u64_u32(sgpr(maxAddrSgpr+0), sgpr(maxAddrSgpr+1),  \
+                      sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)),  \
+                      sgpr("Strides%s+%u"%(tP["tensorChar"],strideIdx)), \
+                      "64b tensor%s size in elements"%tc)
+          kStr += inst("s_lshl_b64", \
+            sgpr(maxAddrSgpr,2), \
+            sgpr(maxAddrSgpr,2), \
+            hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc)
+        else:
+          if kernel["ProblemType"]["NumIndicesC"] == 2:
+            kStr += inst("s_lshl_b64", \
+              sgpr(maxAddrSgpr,2), \
+              sgpr("Tensor2dSize%s"%tc,2), \
+              hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc)
+          elif kernel["ProblemType"]["NumIndicesC"] == 3:
+            # TODO - hardcoded for two batches, remove when PBC code goes
+            kStr += self.s_mul_u64_u32(sgpr(maxAddrSgpr+0), sgpr(maxAddrSgpr+1),  \
+                        sgpr("Tensor2dSize%s")%tc, \
+                        sgpr("SizesFree+2"), "scale Tensor2D by numBatches")
+            kStr += inst("s_lshl_b64", \
+              sgpr(maxAddrSgpr,2), \
+              sgpr(maxAddrSgpr,2), \
+              hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc)
+          else:
+            assert(0) # unsupported number of Free dims, should use PBC=1 instead
+            kStr += inst("s_lshl_b64", \
+              sgpr(maxAddrSgpr,2), \
+              sgpr(maxAddrSgpr,2), \
+              hex(log2(tP["bpe"])), "<- tensor%s size in bytes"%tc)
+
+
+        if kernel["BufferLoad"]:
           # Set maxAddrSgpr to max allowed byte offset
           # maxAddrSgpr = size[n] * stride[n-1] * bpe
           # SRD has moved ahead for each tile so subtract original A to see if we are OOB:
@@ -3594,17 +3866,11 @@ def globalReadDo(self, kernel, guardK, tP):
               sgpr("Address%s+0"%tc), \
               "Compute distance of SRD from original array in bytes")
 
-          kStr += inst("s_mul_i32", \
-              sgpr(maxAddrSgpr+0), \
-              sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)),  \
-              sgpr("Strides%s+%u"%(tP["tensorChar"],strideIdx)), \
-              "Array size")
-
-          kStr += inst("s_lshl_b32",
-              sgpr(maxAddrSgpr+0), \
-              sgpr(maxAddrSgpr+0), \
-              hex(log2(tP["bpe"])), \
-              "Array size in bytes")
+          kStr += inst("s_subb_u32", \
+              sgpr(tmpSgpr+1), \
+              sgpr("Srd%s++1"%tc), \
+              sgpr("Address%s+1"%tc), \
+              "Compute distance of SRD from original array in bytes")
 
           kStr += inst("s_sub_u32", \
               sgpr(maxAddrSgpr), \
@@ -3612,56 +3878,53 @@ def globalReadDo(self, kernel, guardK, tP):
               sgpr(tmpSgpr), \
               "Max byte offset =  MaxSize - SRD_Distance")
 
-      if not kernel["BufferLoad"]:
-	kStr += inst("s_mul_i32", \
-	    sgpr(maxAddrSgpr+0), \
-	    sgpr("Sizes%s+%u"%("Sum" if sizeIdxIsSum else "Free", sizeIdx)),  \
-	    sgpr("Strides%s+%u"%(tP["tensorChar"],strideIdx)), \
-	    "mul d%u lower"%dim)
+          kStr += inst("s_subb_u32", \
+              sgpr(maxAddrSgpr+1), \
+              sgpr(maxAddrSgpr+1), \
+              sgpr(tmpSgpr+1), \
+              "Max byte offset =  MaxSize - SRD_Distance")
 
-        kStr += inst("s_mov_b32", sgpr(maxAddrSgpr+1), hex(0), "zero (upper)")
-        # maxAddrSgpr *= bytes/element
+          if kernel["CheckDimOverflow"]>=2:
+            kStr += self.assert_eq(sgpr(maxAddrSgpr+1), 0)
 
-        kStr += inst("s_lshl_b64", \
-            sgpr(maxAddrSgpr,2), \
-            sgpr(maxAddrSgpr,2), \
-            hex(log2(tP["bpe"])), "offset *= bytes/element")
-            # maxAddrSgpr += initial address
-        kStr += inst("s_add_u32", \
-            sgpr(maxAddrSgpr+0), \
-            sgpr(self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"]), \
-            sgpr(maxAddrSgpr+0), \
-            "prepend address lower")
-        kStr += inst("s_addc_u32", \
-            sgpr(maxAddrSgpr+1), \
-            sgpr((self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"])+1), \
-            sgpr(maxAddrSgpr+1), \
-            "prepend address upper")
-        # sgpr->vgpr
-        maxAddrVgpr = self.vgprPool.checkOut(2, "maxAddrVgpr")
-        kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+0), sgpr(maxAddrSgpr+0), "sgpr->vgpr")
-        kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+1), sgpr(maxAddrSgpr+1), "sgpr->vgpr")
-
-        # full exec mask
-        fullExec = tmpSgpr
-        kStr += inst("s_mov_b64", sgpr(fullExec,2), \
-            "0xFFFFFFFFFFFFFFFF", "to restore all threads active")
-        bpeVgpr = self.vgprPool.checkOut(1, "bpeVgpr")
-	kStr += inst("v_mov_b32", vgpr(bpeVgpr), hex(tP["bpe"]), "bpe")
-
-	# can remove this?
-        zeroVgpr = self.vgprPool.checkOut(1)
-        kStr += inst("v_mov_b32", vgpr(zeroVgpr), hex(0), "zero")
+        else: # not BufferLoad
+          kStr += inst("s_add_u32", \
+              sgpr(maxAddrSgpr+0), \
+              sgpr(self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"]), \
+              sgpr(maxAddrSgpr+0), \
+              "prepend address lower")
+          kStr += inst("s_addc_u32", \
+              sgpr(maxAddrSgpr+1), \
+              sgpr((self.sgprs["AddressA"] if tP["isA"] else self.sgprs["AddressB"])+1), \
+              sgpr(maxAddrSgpr+1), \
+              "prepend address upper")
+          # sgpr->vgpr
+          maxAddrVgpr = self.vgprPool.checkOut(2, "maxAddrVgpr")
+          kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+0), sgpr(maxAddrSgpr+0), "sgpr->vgpr")
+          kStr += inst("v_mov_b32", vgpr(maxAddrVgpr+1), sgpr(maxAddrSgpr+1), "sgpr->vgpr")
+
+          # full exec mask
+          fullExec = tmpSgpr
+          kStr += inst("s_mov_b64", sgpr(fullExec,2), \
+              "0xFFFFFFFFFFFFFFFF", "to restore all threads active")
+          bpeVgpr = self.vgprPool.checkOut(1, "bpeVgpr")
+          kStr += inst("v_mov_b32", vgpr(bpeVgpr), hex(tP["bpe"]), "bpe")
+
+          # can remove this?
+          zeroVgpr = self.vgprPool.checkOut(1)
+          kStr += inst("v_mov_b32", vgpr(zeroVgpr), hex(0), "zero")
 
       # End if guardK
 
     directToLdsLoads = 0
 
+    loopCnt = -1
     for perp in range(0, tP["nrp"]):
       for sPerp in range(0, tP["nrpv"]):
         for para in range(0, tP["nrc"]):
           for sPara in range(0, tP["nrcv"]/tP["nrcvpi"]):
             i = sPara + (tP["nrcv"]/tP["nrcvpi"]) * (para + tP["nrc"] * (sPerp + tP["nrpv"] * perp))
+            loopCnt += 1
             graIdx = i * self.rpgo if kernel["BufferLoad"] else i * self.rpga
             g2lIdx = i * loadWidth
             if guardK:
@@ -3837,7 +4100,8 @@ def globalReadDo(self, kernel, guardK, tP):
 
                   # Get offset (for checking, see comment below) and comment:
                   (checkOffset, iDummy, comment) = \
-                      self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP)
+                      self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP, 0)
+
                   # Direct to LDS always writes consecutive LDS locations at m0 + 4 * TidInWave
                   # Therefore we double-check here to ensure the desired LDS write offset
                   # is moving at NumThreads*4.  This should already be guaranteed since
@@ -3922,7 +4186,6 @@ def localWriteSwapOffsets(self, kernel, tP):
     if not self.do["LocalWrite"]: return ""
     kStr = ""
     tc = tP["tensorChar"]
-#jgolds which bpe here? assuming tP
 #fixme-iui  need to use wrapping increment for double or triple buffering:
     if kernel["LocalWriteUseSgpr%s"%tc]:
       kStr += inst("s_xor_b32", \
@@ -3947,7 +4210,6 @@ def localWriteResetOffsets(self, kernel, tP):
     kStr = ""
     resetMask = hex(kernel["LdsOffsetA_Blk"]*tP["bpe"]-1 | self.LdsOOB)
     tc = tP["tensorChar"]
-#jgolds which bpe here? assuming tP
     if kernel["LocalWriteUseSgpr%s"%tc]:
       kStr += inst("s_and_b32", \
           sgpr("LocalWriteAddr%s"%tP["tensorChar"]), \
@@ -3997,8 +4259,9 @@ def localWriteInitPointers(self, kernel, tP):
   #   i : ?
   #   comment : Comment with the text version of the formula
   #############################################################################
-  def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP):
+  def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP, localWriteCnt):
     tc = tP["tensorChar"]
+    ldl = kernel["LocalDotLayout"]
     lscaOffset = para * kernel[tP["lsc"]]
     lspaOffset = perp * kernel[tP["lsp"]]
 
@@ -4015,7 +4278,7 @@ def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP):
       i = sPara + (tP["nrcv"]/tP["nrcvpi"]) * (para * tP["glvw"] + tP["nrc"] * (sPerp + tP["glvw"] * tP["nrpv"] * perp ))
 
 
-    if kernel["LocalDotLayout"] > 1:
+    if ldl > 1:
       # apply interleave for LocalDot:
       # Else they complement the address calculation to place adjacent-in-u data
       # so adjacent-in-lds.
@@ -4023,7 +4286,7 @@ def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP):
             "wtc=", tP["wtc"], "wuc=", tP["wuc"], "grcv=", tP["grcv"], \
             "lscaOffset=", lscaOffset, "lspaOffset=", lspaOffset
       spacing = tP["glvw"]
-      lscaOffset += (lspaOffset % spacing) * kernel["LocalDotLayout"]
+      lscaOffset += (lspaOffset % spacing) * ldl
       lspaOffset /= spacing
       print "    After LDL: lscaOffset=", lscaOffset, "lspaOffset=", lspaOffset
 
@@ -4048,7 +4311,16 @@ def calculateLdsWriteOffset(self, perp, para, sPerp, sPara, kernel, tP):
     #print "2lscaOffset", lscaOffset
     offsetElements = (lspaOffset + lscaOffset)
     #print "offsetElements", offsetElements
-    offsetBytes = offsetElements*tP["bpe"]
+    if not tP["tlu"] and ldl > 1:
+#jgolds HACK
+#Need to clean this up. Does not follow usual paradigm, but works for cases we care about with dot2
+      rem = (localWriteCnt) % ldl
+      quo = (localWriteCnt) / ldl
+      #print "quo %u, rem %u, MT %u"%(quo, rem, kernel["MacroTile%u"%tP["tensorIdx"]])
+      offsetBytes = (quo * kernel["MacroTile%u"%tP["tensorIdx"]] * ldl + rem)*tP["bpe"]
+    else:
+      offsetBytes = offsetElements*tP["bpe"]
+
     #print "offsetBytes", offsetBytes
     #print "offset", offset
 
@@ -4123,7 +4395,8 @@ def localWriteDo(self, kernel, tP):
                         sgpr("PerpOverhangVcc%s"%tc,2), \
                         "Mask load so out-of-gr-tile bounds returns 0. Note 1.0f=0x3f80000 which is large non-neg int")
             lwa = tmpLocalWriteAddr
-
+#jgolds HACK
+        loopCnt = 0
         for para in range(0, tP["nrc"]):
           for s in range(0, max(tP["nwcv"],tP["nwpv"])/tP["nwcvpi"]):
 
@@ -4140,8 +4413,9 @@ def localWriteDo(self, kernel, tP):
               elif tP["wuc"] == tP["grcv"]:
                 sPerp = s
 
-            (offset, i, comment) = self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP)
+            (offset, i, comment) = self.calculateLdsWriteOffset(perp, para, sPerp, sPara, kernel, tP, loopCnt)
             g2lIdx = i*blockWidth
+            loopCnt+=1
 
 
             paramList = []
@@ -4176,7 +4450,7 @@ def localWriteDo(self, kernel, tP):
     if 0:
       kStr += inst("s_barrier", "temp debug wait to check sync issue" )
 
-    if 0 and tP["isA"]:
+    if 0 and tP["isB"]:
     #if 0 and self.localWriteDoCnt >= 0:
       kStr += "s_waitcnt lgkmcnt(0) & vmcnt(0)\n"
       kStr += inst("s_barrier", "dump LDS" )
@@ -4191,7 +4465,6 @@ def localReadSwapOffsets(self, kernel, tP):
     tc=tP["tensorChar"]
     if not self.do["LocalRead%s"%tc]: return ""
     kStr = ""
-#jgolds which bpe here? assuming tP
     kStr += inst("v_xor_b32", \
         vgpr("LocalReadAddr%s"%tP["tensorChar"]), \
         hex(kernel["LdsOffsetA_Blk"]*tP["bpe"]), \
@@ -4211,7 +4484,6 @@ def localReadResetOffsets(self, kernel, tP):
       tP["localReadOffset"] = 0
       tP["localReadElementOffset"] = 0
       kStr += self.comment1("handled internally")
-#jgolds which bpe here? assuming tP
     kStr += inst("v_and_b32", \
         vgpr("LocalReadAddr%s"%tP["tensorChar"]), \
         hex(kernel["LdsOffsetA_Blk"]*tP["bpe"]-1), \
@@ -4231,7 +4503,6 @@ def localReadInitPointers(self, kernel, tP):
       tP["localReadElementOffset"] = 0
       kStr += self.comment1("N/A")
     else:
-#jgolds which bpe here? assuming tP
       kStr += inst("v_and_b32", \
           vgpr("LocalReadAddr%s"%tP["tensorChar"]), \
           hex(kernel["LdsOffset%s_Blk"%tP["tensorChar"]]*tP["bpe"]-1), \
@@ -4247,9 +4518,17 @@ def localReadInc(self, kernel, iui, tP):
     if not self.do["LocalRead%s"%tc]: return ""
     kStr = ""
     tc = tP["tensorChar"]
+    ldl = kernel["LocalDotLayout"]
+    tt = tP["tt"]
+    partialInc = kernel[tt]
     if self.inTailLoop:
-#jgolds which bpe here? assuming tP
-      inc = kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]]+kernel["LdsPad%s"%tc])*tP["bpe"]
+      if ldl > 1:
+        if iui < (kernel["InnerUnroll"] - 1):
+          inc = partialInc*tP["bpe"]
+        else:
+          inc = (ldl * kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]] + kernel["LdsPad%s"%tc]) - partialInc * (ldl - 1))*tP["bpe"]
+      else:
+        inc = kernel["LocalSplitU"]*(kernel["MacroTile%u"%tP["tensorIdx"]]+kernel["LdsPad%s"%tc])*tP["bpe"]
       tmpSgpr = self.getTmpSgpr(1)
       kStr += inst("s_mov_b32", sgpr(tmpSgpr), hex(inc), "inc")
       kStr += inst("_v_add_co_u32", \
@@ -4260,11 +4539,7 @@ def localReadInc(self, kernel, iui, tP):
           "lr%s += %u (LSU*(MT+PAD)*bpe)"%(tP["tensorChar"], inc) )
     else:
       if tP["localReadInstruction"].numOffsets == 1:
-        ldl = kernel["LocalDotLayout"]
         if ldl > 1:
-          #jgolds
-          #HACK just hard coding to verify it works for the case I am testing
-          partialInc = 8    # in elements
           if iui < (kernel["InnerUnroll"] - 1):
             tP["localReadOffset"] += partialInc
           else:
@@ -4446,7 +4721,7 @@ def shiftVectorComponents(self, kernel, tP):
       kStr += inst("_v_add_co_u32", vgpr(vReg), "vcc", vgpr(mvReg), vgpr(vReg), "vId = 2 components")
       self.vgprPool.checkIn(mvReg)
       self.vgprPool.checkIn(vRegD)
-    
+
     kStr += inst("v_cmp_eq_u32", sgpr(tmpSgpr,2), vgpr(thread), \
         vgpr(eReg), "mask" )
     kStr += inst("v_mov_b32", vgpr(tmpVgpr+0), sgpr(tmpSgpr+0), "")
@@ -4485,7 +4760,7 @@ def shiftVectorComponents(self, kernel, tP):
       for vectorIdx in range(0, numVectors):
         kStr += self.comment("shift d%u r=%u v=%u"%(tP["idx"], r, vectorIdx))
         kStr += "label_%04u:%s" % (sviLabels[r-1][vectorIdx], self.endLine)
-        # mask if last thread in thread-tile column
+        # mask if last thread in thread#-tile column
         kStr += inst("v_cmpx_eq_u32", sgpr(tmpSgpr,2), vgpr(thread), \
           vgpr(eReg), "serial % SG == (wgMT/VECTOR_WIDTH)%SG" )
         tto = kernel["ThreadTile%u"%((tP["idx"]+1)%2)] # thread tile orthogonal
@@ -4754,7 +5029,6 @@ def localSplitULocalRead(self, kernel):
     kStr = ""
     tmpSgpr = self.getTmpSgpr(1)
     baseAddr = self.vgprPool.checkOut(1)
-#jgolds which bpe should we use?
     kStr += staticMultiply(vgpr(baseAddr), vgpr("Serial"), kernel["GlobalWriteVectorWidth"]*self.bpeAB, sgpr(tmpSgpr))
     (elementStep, useDwordX2) = self.getLocalSplitUElementStep(kernel, True)
     # Load values for each subgroup
@@ -5324,7 +5598,6 @@ def globalWriteElements(self, kernel, lsu, vectorWidths, elements):
         # Use bpeCexternal for all external values
 
         numVgprsPerAddr = self.rpgo if kernel["BufferStore"] else self.rpga
-#jgolds which bpe should we use?
         numVgprsPerDataPerVI = 0
 
 
@@ -5432,9 +5705,7 @@ def globalWriteElements(self, kernel, lsu, vectorWidths, elements):
           elementStartIdx = batchIdx * numElementsPerBatch
           elementStopIdx = min( elementStartIdx + numElementsPerBatch, len(elements[edgeI]) )
           elementsThisBatch = elements[edgeI][elementStartIdx:elementStopIdx]
-          numElementsThisBatch = len(elementsThisBatch)
-          numElementVgprs = int(numElementsThisBatch * ceil(numVgprsPerElement))
-          #print "BATCH[%u/%u]: elements[edgeI][%u:%u] VGPRs=%u" % (batchIdx, numBatches, elementStartIdx, elementStopIdx, numElementVgprs)
+          #print "BATCH[%u/%u]: elements[edgeI][%u:%u]" % (batchIdx, numBatches, elementStartIdx, elementStopIdx)
           # elementVgprs can be large and should be perfectly tuned to the number of available
           # VGPRS.  We do not want to accidentally overflow and grow the pool here:
           kStr += self.globalWriteBatch(kernel, beta, edge, lsu, atomic, gwvw, atomicW, \
@@ -5735,7 +6006,7 @@ def globalWriteBatch(self, kernel, beta, edge, lsu, atomic, gwvw, atomicW, \
         #kStr += inst("v_mov_b32", vgpr(addr), 0x0, "bozo")
         if edge:
           # Set address to -1 if OOB on either dimension
-          # TODO - for PreciseBoundsCheck we could set bounds on C to tile dim
+          # TODO - for PreciseBoundsCheckStore we could set bounds on C to tile dim
           # and only check the x/coord0 index here, save a couple inst
           kStr += inst("v_cmp_lt_u32",  sgpr(tmpS01,2), vgpr(     coordVgpr0), sgpr("SizesFree+0"), "coord0 < size0" )
           kStr += inst("v_cmp_lt_u32",  sgpr(tmpS23,2), vgpr(self.coordVgpr1), sgpr("SizesFree+1"), "coord1 < size1" )
@@ -6113,7 +6384,7 @@ def globalWriteBatch(self, kernel, beta, edge, lsu, atomic, gwvw, atomicW, \
                 # src2 = sumIdxV = f32 = opsel 00
                 dataCExternal = elementData[elementIdx] + vi/2
                 hi16 = sumIdxV%2
-                kStr += inst("v_mad_mix_f32", vgpr("ValuC+%u"%sumIdxV), sgpr("Beta"), \
+                kStr += inst(self.mixinst, vgpr("ValuC+%u"%sumIdxV), sgpr("Beta"), \
                     vgpr(dataCExternal), vgpr("ValuC+%u"%sumIdxV), \
                     "op_sel:[0,%u,0] op_sel_hi:[0,1,0]" % (hi16), \
                     "//C*=beta")
@@ -6327,7 +6598,6 @@ def dumpLds(self, kernel, startU, numU):
       kStr += inst("s_barrier", "dump LDS" )
       tmp = self.vgprPool.checkOut(1)
       tmpAddr = self.vgprPool.checkOut(1)
-#jgolds which bpe should we use?
       kStr += inst("v_lshlrev_b32", \
           vgpr(tmpAddr), \
           hex(log2(self.bpeAB)), \
@@ -6392,8 +6662,37 @@ def kernelBodyBetaOnly(self, kernel):
     return kStr
 
 
+  # Perform a 32-bit x 32-bit unsigned scalar multiply and save the u64 result in two SGPRs.
+  # src0 and src1 are 32-bit unsigned ints held in sgprs or small int constants (<64?)
+  # The result is returned in dst1:dst0 (lower 32 bits in dst0, upper 32 bits in dst1).
+  def s_mul_u64_u32 (self, dst0, dst1,  src0, src1, comment):
+    kStr = ""
+    assert(dst1 != src0) # no worky since dst1 overwritten by first mul operations
+    assert(dst1 != src1) # no worky since dst1 overwritten by first mul operations
+    # the else path below has less restrictions but prefer consistency
+    if globalParameters["AsmCaps"][self.version]["HasSMulHi"]:
+      kStr += inst("s_mul_hi_u32", dst1, src0, src1, comment)
+      kStr += inst("s_mul_i32", dst0, src0, src1, comment)
+    else:
+      # No s_mul_hi_u32: multiply in two temp vgprs and read back with v_readfirstlane.
+      # isinstance() is required here: comparing type(src1) with the string 'str'
+      # is always unequal, which made the operand swap below unconditional.
+      if not (isinstance(src1, str) and src1.startswith("s")):
+        src0, src1 = src1, src0 # need a scalar sgpr in src1 (not a constant)
+      vtmp0 = self.vgprPool.checkOut(2)
+      vtmp1 = vtmp0+1
+      kStr += inst("v_mov_b32", vgpr(vtmp0), src0, comment)
+      kStr += inst("v_mul_hi_u32", vgpr(vtmp1), vgpr(vtmp0), src1, comment)
+      kStr += inst("v_readfirstlane_b32", dst1, vgpr(vtmp1), comment)
+      kStr += inst("v_mul_lo_u32", vgpr(vtmp1), vgpr(vtmp0), src1, comment)
+      kStr += inst("v_readfirstlane_b32", dst0, vgpr(vtmp1), comment)
+      self.vgprPool.checkIn(vtmp0)
+    return kStr
+
+
+
   ##############################################################################
-  # Cause a GPUVM fault.  
+  # Cause a GPUVM fault.
   # Instruction after the bomb will write the cookie to SGPR0, so you can see the cookie in the 
   # backtrace. Useful for locating which spot in code generated the bomb
   # vgprAddr controls which vgpr to overwrite with the null pointer address
@@ -6437,12 +6736,13 @@ def assertCommon(self, cookie=-1):
   ##############################################################################
   # assertCmpCommon : Common routine for all assert comparison functions
   ##############################################################################
-  def assertCmpCommon(self, c, val0, val1, cookie=-1):
+  def assertCmpCommon(self, cond, val0, val1, cookie=-1):
     kStr = ""
     if self.db["EnableAsserts"]:
       kStr += inst("s_or_saveexec_b64", sgpr("SaveExecMask",2), 0, \
           "assert: saved execmask")
-      kStr += inst("v_cmpx_%s_u32"%c, "vcc", val0, val1, "v_cmp" )
+
+      kStr += inst("v_cmpx_%s"%cond, "vcc", val0, val1, "v_cmp" )
 
       kStr += self.assertCommon(cookie)
 
@@ -6457,22 +6757,48 @@ def assertCmpCommon(self, c, val0, val1, cookie=-1):
   # Asserts currently modify vcc
   ##############################################################################
   def assert_eq(self, val0, val1, cookie=-1):
-    return self.assertCmpCommon("ne", val0, val1, cookie)
+    return self.assertCmpCommon("ne_u32", val0, val1, cookie)
 
   def assert_ne(self, val0, val1, cookie=-1):
-    return self.assertCmpCommon("eq", val0, val1, cookie)
+    return self.assertCmpCommon("eq_u32", val0, val1, cookie)
 
-  def assert_lt(self, val0, val1, cookie=-1):
-    return self.assertCmpCommon("ge", val0, val1, cookie)
+  def assert_lt_u32(self, val0, val1, cookie=-1):
+    return self.assertCmpCommon("ge_u32", val0, val1, cookie)
 
-  def assert_gt(self, val0, val1, cookie=-1):
-    return self.assertCmpCommon("le", val0, val1, cookie)
+  def assert_gt_u32(self, val0, val1, cookie=-1):
+    return self.assertCmpCommon("le_u32", val0, val1, cookie)
+
+  def assert_le_u32(self, val0, val1, cookie=-1):
+    return self.assertCmpCommon("gt_u32", val0, val1, cookie)
+
+  def assert_ge_u32(self, val0, val1, cookie=-1):
+    return self.assertCmpCommon("lt_u32", val0, val1, cookie)
+
+  def assert_ge_i32(self, val0, val1, cookie=-1):
+    return self.assertCmpCommon("lt_i32", val0, val1, cookie)
+
+  # asserts val0 can be multiplied by 'shift' (log2(shift) is the bit count) w/o losing non-zero high bits:
+  def assert_no_shift_of(self, val0, shift, stmp, cookie=-1):
+    kStr = ""
+    # TODO - use BFE here:
+    kStr += inst ("s_mov_b32", stmp, hex((shift-1) << (32-log2(shift))), "assert_no_shift_of - compute mask")
+    kStr += inst ("s_and_b32", stmp, stmp, val0, "assert_no_shift_of")
+    kStr += self.assert_eq(stmp, 0, cookie)
+    return kStr
+
+  # WIP: presumably meant to bomb only in work-group (wg0,wg1,wg2); cookie is not used yet.
+  def bomb_at_wg3d(self, wg0, wg1, wg2, cookie=-1):
+    kStr = ""
+    tmp0 = sgpr("SaveExecMask")
+    tmp1 = sgpr("SaveExecMask+1") # offset goes inside the name; "SaveExecMask"+1 raised TypeError
+    kStr += inst("s_cmp_u32", tmp0, sgpr("WorkGroup0"), wg0)
+    kStr += inst("s_cmp_u32", tmp1, sgpr("WorkGroup1"), wg1)
+    kStr += inst("s_or_b32", tmp0, tmp0, tmp1, "")
+    kStr += inst("s_cmp_u32", tmp1, sgpr("WorkGroup2"), wg2)
+    kStr += inst("s_or_b32", tmp0, tmp0, tmp1, "")
+    return kStr + "WIP" # return the text (was falling off the end -> None); WIP marker kept
 
-  def assert_le(self, val0, val1, cookie=-1):
-    return self.assertCmpCommon("gt", val0, val1, cookie)
 
-  def assert_ge(self, val0, val1, cookie=-1):
-    return self.assertCmpCommon("lt", val0, val1, cookie)
 
   # asserts if val0 is not an integer multiple of multiple2
   # multiple2 must be a constant and power of 2
diff --git a/Tensile/KernelWriterSource.py b/Tensile/KernelWriterSource.py
index 9b8e7c081c..e0b836bd3f 100644
--- a/Tensile/KernelWriterSource.py
+++ b/Tensile/KernelWriterSource.py
@@ -367,7 +367,10 @@ def functionPrefix(self, kernel):
     if self.language == "OCL":
       kStr += "#define MAC(A,B,DST) mad(A,B,DST)"
     else:
-      kStr += "#define MAC(A,B,DST) DST += A*B"
+      if kernel["ProblemType"]["HighPrecisionAccumulate"] and kernel["ProblemType"]["DataType"].isHalf():
+        kStr += "#define MAC(A,B,DST) DST += static_cast<float>(A) * static_cast<float>(B)" 
+      else:
+        kStr += "#define MAC(A,B,DST) DST += A*B" 
     kStr += self.endLine
 
     if self.language == "HIP" and kernel["ProblemType"]["DataType"].isComplex():
diff --git a/Tensile/SolutionStructs.py b/Tensile/SolutionStructs.py
index faf48df197..eaac031802 100644
--- a/Tensile/SolutionStructs.py
+++ b/Tensile/SolutionStructs.py
@@ -1396,6 +1396,15 @@ def assignDerivedParameters(state):
       reject(state, "InnerUnroll only supported on assembly")
     state["LoopUnroll"] /= state["InnerUnroll"]
 
+    # HACK!
+    # For now, LocalDotLayout > 1 only works if the thread tile is a square and VectorWidth is equal to the 
+    # thread tile size
+    ldl = state["LocalDotLayout"]
+    if ldl > 1 and \
+      (state["ThreadTile0"] != state["VectorWidth"] or state["ThreadTile1"] != state["VectorWidth"] or state["AssertSummationElementMultiple"] % ldl != 0):
+      reject(state, "LocalDotLayout > 1 only supports square thread tiles and VectorWidth equal to ThreadTile0/1 size and ASEM a multiple of LDL")
+      return
+
     if 0:
       print "info: ", pvar(state, "LoopUnroll"), " LDS Stats:", pvar(state, "LdsOffsetA"), pvar(state, "LdsOffsetB")
       print "info: ", pvar(state["ProblemType"], "TLUA"), \
@@ -1475,14 +1484,26 @@ def assignDerivedParameters(state):
     # check is used since this is faster and also for computation we only
     # need to ensure that none of the loads fault.  threads which are
     # computing bogus sections of the C tile will later be ignored.
-    # precise checking only works for vectorloads<=AssertSummationElementMultiple
-    # else if the vload crosses boundary we ignore all components not just the
-    # ones that are OOB.
+    # precise checking only works when all elements of the load are in-bounds
+    # since if the vload crosses boundary we ignore all components not just the
+    # ones that are OOB. See comments for groOffsetInMacroTile
+    # So check for the cases where the unroll loop can
+    # generate partial loads here and reject PBC solutions:
+    # For non-TLU the free dim is in perp dim so loads can't be partially OOB
+    # so those always guaranteeeNoPartial*=True
+    if state["ProblemType"]["TLUA"]:
+      guaranteeeNoPartialA = state["AssertFree0ElementMultiple"]%state["GlobalLoadVectorWidthA"]==0
+    else:
+      guaranteeeNoPartialA = state["AssertSummationElementMultiple"]%state["GlobalLoadVectorWidthA"]==0
+
+    if state["ProblemType"]["TLUB"]:
+      guaranteeNoPartialB = state["AssertFree1ElementMultiple"]%state["GlobalLoadVectorWidthB"]==0
+    else:
+      guaranteeNoPartialB = state["AssertSummationElementMultiple"]%state["GlobalLoadVectorWidthB"]==0
+
+    #--
     if state["PreciseBoundsCheck"]:
-      if  state["GlobalLoadVectorWidthA"] > \
-          state["AssertSummationElementMultiple"] \
-          or state["GlobalLoadVectorWidthB"] > \
-          state["AssertSummationElementMultiple"]:
+      if not guaranteeeNoPartialA or not guaranteeNoPartialB:
         state["PreciseBoundsCheck"] = False
 
     # Use SGPR to store an offset from GlobalReadOffsetA+0.
@@ -1491,13 +1512,15 @@ def assignDerivedParameters(state):
     # individual vector registers doing bounds compares.
     if not state["PreciseBoundsCheck"]:
       state["UseSgprForGRO"] = 0
+      if state["FractionalLoad"]:
+        reject(state, "Fractional currently requires PreciseBoundsCheck") # Move to PBC always
 
     if state["UseSgprForGRO"] == -1:
-      # Don't use SGPR if it looks like we might not have enough:
+      # Don't use SGPR if it looks like we might not have enough - better to leave PBC enabled even if we have to use VGPR
       # 40 is based on current SGPR usage, this may need to be tuned in the future:
       numLoadsA = state["NumLoadsCoalescedA"]*state["NumLoadsPerpendicularA"]
       numLoadsB = state["NumLoadsCoalescedB"]*state["NumLoadsPerpendicularB"]
-      if numLoadsA + numLoadsB > 40:
+      if numLoadsA + numLoadsB > 35:
         #print "info: Disabling UseSgprForGRO since predicting too many SGPR will be used"
         state["UseSgprForGRO"] = 0
       else:
diff --git a/Tensile/SolutionWriter.py b/Tensile/SolutionWriter.py
index 3565e5c666..8cf32e253e 100644
--- a/Tensile/SolutionWriter.py
+++ b/Tensile/SolutionWriter.py
@@ -85,7 +85,6 @@ def getSourceString(self, solution, kernelsWithBuildErrs):
     if not globalParameters["MergeFiles"]:
       solutionName = self.getSolutionName(solution)
       s += "#include \"%s.h\"\n" % solutionName
-      #s += "#include \"MathTemplates.h\"\n"
       s += "\n"
 
     # solution function signature
@@ -107,6 +106,12 @@ def getSourceString(self, solution, kernelsWithBuildErrs):
       t += "  "
       if globalParameters["DebugKernel"]:
         s += "%sunsigned int *debugBuffer;\n" % t
+      # Tensor sizes in bytes, excluding batch dims and accounting for zero strides
+      # Do these first since they are 64-bits and want to avoid any unneeded padding:
+      s += "%s// Size of lowest Tensor's lowest 2 dims, in bytes.  Does not include bath dim or higher (>2) order dimensions\n" % t
+      s += "%suint64_t tensor2dSizeC;\n" % t
+      s += "%suint64_t tensor2dSizeA;\n" % t
+      s += "%suint64_t tensor2dSizeB;\n" % t
       solutionArgs = self.getArgList(solution["ProblemType"], True, False, False)
       for arg in solutionArgs:
         if arg[0] == "TensileHalf":
@@ -133,7 +138,6 @@ def getSourceString(self, solution, kernelsWithBuildErrs):
 
     # NOTE: host compiler aligns size of structs to 64-bits (at least) and aligns the offset of pointers to 64-bits, therefore, having pointers which are not at the beginning of the struct may get padded/shifted by the host compiler and, therefore, not coppied correctly to gpu
 
-
     # kernels
     s += "\n%s/* kernels */\n" % (t)
     s += "%sconst unsigned int numKernels = %u; // 1 or 4\n" % (t, len(kernels))
@@ -271,6 +275,82 @@ def getSourceString(self, solution, kernelsWithBuildErrs):
         s += "%ssizes[%u][0][%u] = size%s;\n" \
             % (t, kernelIdx, i, self.indexChars[i])
 
+      # Tensor2DSizes - size excluding the batch dimension, accounts for cases where one of strides is 0
+      problemType = solution["ProblemType"]
+      #print "IndexAssignmentsA=", problemType["IndexAssignmentsA"], "Batch=", problemType["IndicesBatch"]
+      firstStride = 0 if problemType["UseInitialStrides"] else 1
+      del i
+
+      numIdx = problemType["NumIndicesC"]
+      printMe = printedFree = 0
+      s += "%suint64_t tensor2dSizeC = %s" % \
+          (t, "1" if firstStride==1 else "strideC%u%s"% (0,self.indexChars[0]))
+      for idx in range(0,numIdx):
+        # Multiply only by the first two free indices
+        if idx in problemType["IndicesFree"] and printedFree<2:
+          printedFree += 1
+          printMe = True
+        else:
+          printMe = False
+
+        if printMe:
+          if idx < firstStride:
+            strideIdx = idx+1 # use C's own stride (same strideC<i><char> naming as above), not A's
+            s += " * std::max(size%s, strideC%u%s)" % \
+                (self.indexChars[idx], idx+1, self.indexChars[strideIdx])
+          else:
+            s += " * size%s" % (self.indexChars[idx])
+      s += ";\n"
+
+      numIdx = len(problemType["IndexAssignmentsA"])
+      printMe = printedStride = printedFree = printedSum = False
+      s += "%suint64_t tensor2dSizeA = %s" % (t, "1" if firstStride==1 else "strideA%u%s"% (0,self.indexChars[0]))
+      for i in range(0,numIdx):
+        idx = problemType["IndexAssignmentsA"][i]
+
+        # Multiply only by first free and first summation
+        if idx in problemType["IndicesFree"] and not printedFree:
+          printMe = printedFree = True
+        elif idx in problemType["IndicesSummation"] and not printedSum:
+          printMe = printedSum = True
+        else:
+          printMe = False
+
+        if printMe:
+          if not printedStride:
+            printedStride = True
+            strideIdx = problemType["IndexAssignmentsA"][i+1]
+            s += " * std::max(size%s, strideA%u%s)" % \
+                (self.indexChars[idx], i+1, self.indexChars[strideIdx])
+          else:
+            s += " * size%s" % (self.indexChars[idx])
+      s += ";\n"
+
+      numIdx = len(problemType["IndexAssignmentsB"])
+      printMe = printedStride = printedFree = printedSum = False
+      s += "%suint64_t tensor2dSizeB = %s" % (t, "1" if firstStride==1 else "strideB%u%s"% (0,self.indexChars[0]))
+      for i in range(0,numIdx):
+        idx = problemType["IndexAssignmentsB"][i]
+
+        # Multiply only by first free and first summation
+        if idx in problemType["IndicesFree"] and not printedFree:
+          printMe = printedFree = True
+        elif idx in problemType["IndicesSummation"] and not printedSum:
+          printMe = printedSum = True
+        else:
+          printMe = False
+
+        if printMe:
+          if not printedStride:
+            printedStride = True
+            strideIdx = problemType["IndexAssignmentsB"][i+1]
+            s += " * std::max(size%s, strideB%u%s)" % \
+                (self.indexChars[idx], i+1, self.indexChars[strideIdx])
+          else:
+            s += " * size%s" % (self.indexChars[idx])
+      s += ";\n"
+
+
     #s += "printf(\"Launching with grid=%zu_%zu problemGrid=%u_%u mt=%u_%u\\n\", globalWorkSize[0][0], globalWorkSize[0][1], totalWorkGroups0, totalWorkGroups1, macroTile0, macroTile1);\n"
     s += "\n"
 
@@ -451,6 +531,9 @@ def getSourceString(self, solution, kernelsWithBuildErrs):
         # sizes
         for i in range(0, solution["ProblemType"]["TotalIndices"]):
           s += "%sprintf(\"  sizes[kernelIdx][enqueueIdx][%u] = %%u\\n\", sizes[kernelIdx][enqueueIdx][%u] );\n" % (t, i, i )
+        s += "%sprintf(\"  tensor2dSizeC== %%lu\\n\", tensor2dSizeC );\n" % (t)
+        s += "%sprintf(\"  tensor2dSizeA== %%lu\\n\", tensor2dSizeA );\n" % (t)
+        s += "%sprintf(\"  tensor2dSizeB== %%lu\\n\", tensor2dSizeB );\n" % (t)
 
       ########################################
       # OpenCL Runtime
@@ -581,6 +664,10 @@ def getSourceString(self, solution, kernelsWithBuildErrs):
             s += "%shipFunctionArgs.size%s = sizes[kernelIdx][enqueueIdx][%u];\n" \
                 % (t, globalParameters["IndexChars"][i], i )
 
+          s += "%shipFunctionArgs.tensor2dSizeC = tensor2dSizeC;\n" % (t)
+          s += "%shipFunctionArgs.tensor2dSizeA = tensor2dSizeA;\n" % (t)
+          s += "%shipFunctionArgs.tensor2dSizeB = tensor2dSizeB;\n" % (t)
+
           if solution["PersistentKernel"]:
             # pass in the number of groups since not available in WG
             s += "%shipFunctionArgs.numGroupTiles0 = totalWorkGroups0;\n" % (t)
diff --git a/Tensile/Source/Client.h b/Tensile/Source/Client.h
index 7e30a0885a..aaa535bacd 100644
--- a/Tensile/Source/Client.h
+++ b/Tensile/Source/Client.h
@@ -210,7 +210,7 @@ void specializeData(
 
   const unsigned int numIndicesSummation = totalIndices - numIndicesC;
 
-  const unsigned int db = 1; // 0x1=header, 0x2=offset/value on each store, 0x4=loop debug
+  const unsigned int db = 0; // 0x1=header, 0x2=offset/value on each store, 0x4=loop debug
   TensorDims td("specialize_matrix", numIndicesAB, numIndicesC, allSizes, indexAssignments);
 
   if (db & 0x1) {
diff --git a/Tensile/Source/SolutionHelper.h b/Tensile/Source/SolutionHelper.h
index aab9902f0c..0f93740c51 100644
--- a/Tensile/Source/SolutionHelper.h
+++ b/Tensile/Source/SolutionHelper.h
@@ -28,6 +28,7 @@
 #include <string>
 #include <tuple>
 #include <mutex>
+#include <atomic>
 
 /*******************************************************************************
  * Kernel Cache
diff --git a/Tensile/Source/TensileTypes.h b/Tensile/Source/TensileTypes.h
index cdd83a34e7..978be1c232 100644
--- a/Tensile/Source/TensileTypes.h
+++ b/Tensile/Source/TensileTypes.h
@@ -70,8 +70,9 @@ TensileStatus tensileTeardown();
 #define tensileStatusCheck(RET) { \
   TensileStatus tensileCheckStatusTmp = RET; \
   if(tensileCheckStatusTmp != tensileStatusSuccess) { \
-    printf("TensileStatusFailure %i on line %u of %s\n", \
+    fprintf(stderr, "ERROR:  TensileStatusFailure %i on line %u of %s\n", \
         tensileCheckStatusTmp, __LINE__, __FILE__); \
+    abort();\
   } }
 
 template <int NumSizes, int LastSummationIdx, int Free0Idx>
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
index cd0b4c409e..670f4898da 100644
--- a/Tensile/TensileCreateLibrary.py
+++ b/Tensile/TensileCreateLibrary.py
@@ -26,7 +26,7 @@
 from SolutionWriter import SolutionWriter
 from KernelWriterSource import KernelWriterSource
 from KernelWriterAssembly import KernelWriterAssembly
-import multiprocessing, copy
+import multiprocessing
 
 import os
 import sys
@@ -159,6 +159,7 @@ def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
             kiStart, kiStop, child)
       t = multiprocessing.Process(target=processKernelSourceChunk, args=args)
       t.start()
+      child.close() # close child pipe in the parent process
       threads.append([t,kiStart,kiStop, parentConn])
       if processLaunchProgressBar:
         processLaunchProgressBar.increment(kiStop-kiStart)
@@ -174,7 +175,11 @@ def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
 
   someError = 0
   for (t,kiStart,kiStop,parentConn) in threads:
-    results = parentConn.recv()
+    try:
+      results = parentConn.recv()
+    except EOFError as pipeErr:
+      print  "*** warning: process", t, "returned pipe EOF",t,pipeErr
+
     t.join()
     e = t.exitcode
     if e != 0 :
@@ -262,6 +267,7 @@ def writeSolutionsAndKernels(outputPath, solutions, kernels, kernelsBetaOnly, \
       solutionSourceFile.write(CHeader)
       solutionHeaderFile.write(CHeader)
     solutionSourceFile.write("#include \"Solutions.h\"\n")
+    solutionSourceFile.write("#include <algorithm>\n")
     solutionHeaderFile.write("#include \"TensileTypes.h\"\n")
     solutionHeaderFile.write("#include \"Kernels.h\"\n")
     solutionHeaderFile.write("#include \"SolutionHelper.h\"\n")
@@ -563,6 +569,23 @@ def writeLogic(outputPath, logicData, solutionWriter ):
           s += "    hipGetDeviceProperties(&deviceProperties, deviceId);\n"
           s += "    std::string name = deviceProperties.name;\n"
 
+        if problemType["DataType"].isDouble() :
+          s += "\n"
+          s += "//  intercept schedule selection and call HIP (source) kernel\n"
+          s += "    if((strideA2K == 0) || (strideB2K == 0))\n"
+          s += "    {\n"
+          numSchedules = len(schedules)
+          schedule = reordered_schedules[numSchedules-1]
+          scheduleName  = schedule[0]
+          s += "        return tensileGetSolution%s_%s_%s(" \
+                % ( returnType, scheduleName, problemType)
+          for i in range(0, len(argListSizes)):
+            s += "%s%s" \
+                % (argListSizes[i][1],
+                    ", " if i < len(argListSizes)-1 else ");\n")
+          s += "    }\n"
+          s += "\n"
+
         if problemType["DataType"].isHalf() :
           # "first" free index, usually the letter "I"
           free0Index = problemType["IndicesFree"][0]
diff --git a/Tensile/Tests/bugs/fractional_plus_pbc.yaml b/Tensile/Tests/bugs/fractional_plus_pbc.yaml
new file mode 100644
index 0000000000..e9fb949047
--- /dev/null
+++ b/Tensile/Tests/bugs/fractional_plus_pbc.yaml
@@ -0,0 +1,68 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.0.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 1000
+  ValidationMaxToPrint: 10
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeAlpha : 1
+  DataInitTypeA : 3
+  DataInitTypeB : 3
+  PrintSolutionRejectionReason : 0
+
+BenchmarkProblems:
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+  ########################################
+  # TT - Batch
+  ########################################
+    - # Benchmark Group - ResNet 1x1:
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [0]
+        - PrefetchLocalRead: [1]
+        - ThreadTile:
+          - [ 8, 6 ]
+        - WorkGroup:
+          - [ 32, 8, 1 ]
+            #- [ 16, 16, 1 ]
+        - WorkGroupMapping: [64]
+          #  - DepthU: [3,5,7,9,16] # some bugs with odd unroll dims
+        - DepthU: [16]
+        - VectorWidth: [2]
+        - GlobalReadVectorWidth: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+            #- Range: [ [127,1,127], 0, [1], [64,1,64] ]
+
+# to repro, disable this in SolutionStructs.py
+# 1506         reject(state, "Fractional currently requires PreciseBoundsCheck") # debug this later
+# Symptom is GPUVM fault, 
diff --git a/Tensile/Tests/create_tests.py b/Tensile/Tests/create_tests.py
index 9438fc6c67..7e2a715940 100755
--- a/Tensile/Tests/create_tests.py
+++ b/Tensile/Tests/create_tests.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python 
 
-# Create a test_py script for all test*yaml files in specified directory
+# Create a test_py script for all *.yaml files in specified directory
 # usage: create_tests.py TEST_DIR
 # Run from the Tensile/Tests directory, output script goes in the TEST_DIR/test_TEST_DIR.py
 
@@ -14,7 +14,7 @@
 print "info: writing test script to %s" % targetFile
 outfile = open(targetFile, "w" )
 outfile.write("import Tensile.Tensile as Tensile\n\n")
-for f in glob.glob("%s/*yaml"%targetDir):
+for f in glob.glob("%s/*.yaml"%targetDir):
     baseName = os.path.basename(f)
     testName = os.path.splitext(baseName)[0]
     if not testName.startswith("test_"):
diff --git a/Tensile/Tests/disabled/hgemm_nn_source.yaml b/Tensile/Tests/disabled/hgemm_nn_source.yaml
new file mode 100644
index 0000000000..0eafc0eb1e
--- /dev/null
+++ b/Tensile/Tests/disabled/hgemm_nn_source.yaml
@@ -0,0 +1,44 @@
+# Sweep across different vector widths and global vector widths
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  KernelTime: True
+  SleepPercent: 0
+
+BenchmarkProblems:
+
+  - #
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Source
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Source"]
+        - GlobalSplitU: [1,3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 16 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+        - DepthU: [32]
+        - VectorWidth: [1,2,4,8]
+        - GlobalReadVectorWidth: [1,2,4,8]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [4], [63,1,65] ]
diff --git a/Tensile/Tests/disabled/test_disabled.py b/Tensile/Tests/disabled/test_disabled.py
index 100f0c00fc..108ec8ae02 100644
--- a/Tensile/Tests/disabled/test_disabled.py
+++ b/Tensile/Tests/disabled/test_disabled.py
@@ -6,3 +6,5 @@ def test_create_library(tmpdir):
 def test_assertion_selection(tmpdir):
  Tensile.Tensile([Tensile.TensileTestPath("disabled/test_assertion_selection.yaml"), tmpdir.strpath])
 
+def test_hgemm_nn_source(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("disabled/hgemm_nn_source.yaml"), tmpdir.strpath])
diff --git a/Tensile/Tests/nightly/big_tensor/bigskinny_nt.yaml b/Tensile/Tests/nightly/big_tensor/bigskinny_nt.yaml
new file mode 100644
index 0000000000..e9d2a8897f
--- /dev/null
+++ b/Tensile/Tests/nightly/big_tensor/bigskinny_nt.yaml
@@ -0,0 +1,63 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.0.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: True
+  NumElementsToValidate: 1000
+  ValidationMaxToPrint: 100
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 0
+  PrintSolutionRejectionReason : 1
+
+BenchmarkProblems:
+  ########################################
+  # NN - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: d
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: False
+
+    - # BenchmarkProblemSizeGroup - Standard
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - CheckDimOverflow: [0, 1, 2]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 4, 4 ]
+        - WorkGroup:
+          - [ 16, 16, 1 ]
+        - WorkGroupMapping: [64]
+        - GlobalSplitU: [1]
+        - DepthU: [ 4 ]
+        - VectorWidth: [2]
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - MinStride: [1296, 24296, 31296]
+          - Exact: [804, 20732, 184]
+
+  ########################################
+
+# Cijk_Ailk_Bljk_DB_MT064x064x04_AF0EM01_ASEM01_BL1_GRVW02_GSU01_ISA900_K1_KLA_LPB00_MGWVW01_NLCA01_NLCB01_PBC1_PGR1_PLR1_TT04_04_USFGRO00_VAW01_VW02_WG16_16_01_WGM08
+# ./rocblas-bench -f gemm -r d -m 11804 -n 25732 -k 384 --lda 31296 --ldb 31296 --ldc 31296    --transposeB N --transposeA N  -v 1
+
diff --git a/Tensile/Tests/nightly/big_tensor/test_big_tensor.py b/Tensile/Tests/nightly/big_tensor/test_big_tensor.py
new file mode 100644
index 0000000000..03e1ff64b6
--- /dev/null
+++ b/Tensile/Tests/nightly/big_tensor/test_big_tensor.py
@@ -0,0 +1,5 @@
+import Tensile.Tensile as Tensile
+
+def test_bigskinny_nt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/big_tensor/bigskinny_nt.yaml"), tmpdir.strpath])
+
diff --git a/Tensile/Tests/nightly/classic_source/test_classic_source.py b/Tensile/Tests/nightly/classic_source/test_classic_source.py
index 8fd9fb7da8..2b819e9439 100644
--- a/Tensile/Tests/nightly/classic_source/test_classic_source.py
+++ b/Tensile/Tests/nightly/classic_source/test_classic_source.py
@@ -1,7 +1,10 @@
 import Tensile.Tensile as Tensile
 
-def test_hgemm(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm.yaml"), tmpdir.strpath])
+def test_hgemm_nn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm_nn.yaml"), tmpdir.strpath])
+
+def test_hgemm_tn_tt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm_tn_tt.yaml"), tmpdir.strpath])
 
 def test_sgemm_vectors(tmpdir):
  Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_sgemm_vectors.yaml"), tmpdir.strpath])
@@ -12,6 +15,9 @@ def test_hgemm_vectors(tmpdir):
 def test_dgemm(tmpdir):
  Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_dgemm.yaml"), tmpdir.strpath])
 
+def test_hgemm_nt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_hgemm_nt.yaml"), tmpdir.strpath])
+
 def test_sgemm(tmpdir):
  Tensile.Tensile([Tensile.TensileTestPath("nightly/classic_source/test_sgemm.yaml"), tmpdir.strpath])
 
diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm.yaml
deleted file mode 100644
index 188dbab557..0000000000
--- a/Tensile/Tests/nightly/classic_source/test_hgemm.yaml
+++ /dev/null
@@ -1,319 +0,0 @@
-GlobalParameters:
-  PrintLevel: 1
-  ForceRedoBenchmarkProblems: True
-  PrintSolutionRejectionReason: False
-  MinimumRequiredVersion: 4.2.0
-  NumElementsToValidate: -1
-  ValidationMaxToPrint: 4
-  DataInitTypeAB: 1
-  DataInitTypeC: 1
-  ExitOnFails: 0  # Some solutions fail so just ensure we find one good solution
-
-BenchmarkProblems:
-
-    ############################################################################
-    # NN
-    ############################################################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      TransposeA: False
-      TransposeB: False
-      UseBeta: False
-      Batched: True
-
-    - # Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - MacroTileShapeMax: [64]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [7, 4]
-          - [3, 5]
-          - [2, 6]
-          - [1, 1]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  2, 16 ]
-          - [ 16, 12,  1 ]
-        - DepthU: [ 2, 16, 64 ]
-        - GlobalSplitU: [1, 4]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Exact: [   1, 1, 1, 1 ]
-          - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ]
-
-
-    - # Non-Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - WorkGroup: [ [8, 8, 2 ] ]
-        - ThreadTile: [ [4, 8] ]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - GlobalReadCoalesceGroupA: [False, True]
-        - GlobalReadCoalesceGroupB: [False, True]
-        - PrefetchGlobalRead: [False, True]
-        - PrefetchLocalRead: [False, True]
-        - VectorWidth: [1]
-        - GlobalSplitU: [1, 4]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ]
-
-    - # Branches
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["Branch"]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [2, 8]
-        - WorkGroup:
-          - [  8,  4, 4 ]
-          - [ 16, 16, 1 ]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ]
-
-
-    ############################################################################
-    # NT
-    ############################################################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      TransposeA: False
-      TransposeB: True
-      UseBeta: False
-
-    - # Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - MacroTileShapeMax: [64]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [7, 4]
-          - [3, 5]
-          - [2, 6]
-          - [1, 1]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  2, 16 ]
-          - [ 16, 12,  1 ]
-        - DepthU: [ 2, 16, 64 ]
-        - GlobalSplitU: [1, 4]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Exact: [   1,   1,   1 ]
-          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
-
-
-    - # Non-Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - WorkGroup: [ [8, 8, 2 ] ]
-        - ThreadTile: [ [4, 8] ]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - GlobalReadCoalesceGroupA: [False, True]
-        - GlobalReadCoalesceGroupB: [False, True]
-        - PrefetchGlobalRead: [False, True]
-        - PrefetchLocalRead: [False, True]
-        - VectorWidth: [1]
-        - GlobalSplitU: [1, 4]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
-
-    - # Branches
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["Branch"]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [2, 8]
-        - WorkGroup:
-          - [  8,  4, 4 ]
-          - [ 16, 16, 1 ]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ]
-
-
-    ############################################################################
-    # TN
-    ############################################################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      TransposeA: True
-      TransposeB: False
-      UseBeta: True
-
-    - # Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - MacroTileShapeMax: [64]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [7, 4]
-          - [3, 5]
-          - [2, 6]
-          - [1, 1]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  2, 16 ]
-          - [ 16, 12,  1 ]
-        - DepthU: [ 2, 16, 64 ]
-        - GlobalSplitU: [1, 4]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Exact: [   1,   1,   1 ]
-          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
-
-
-    - # Non-Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - WorkGroup: [ [8, 8, 2 ] ]
-        - ThreadTile: [ [4, 8] ]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - GlobalReadCoalesceGroupA: [False, True]
-        - GlobalReadCoalesceGroupB: [False, True]
-        - PrefetchGlobalRead: [False, True]
-        - PrefetchLocalRead: [False, True]
-        - VectorWidth: [1]
-        - GlobalSplitU: [1, 4]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
-
-    - # Branches
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["Branch"]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [2, 8]
-        - WorkGroup:
-          - [  8,  4, 4 ]
-          - [ 16, 16, 1 ]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ]
-
-
-    ############################################################################
-    # TT
-    ############################################################################
-  -
-    - # ProblemType
-      OperationType: GEMM
-      DataType: h
-      TransposeA: True
-      TransposeB: True
-      UseBeta: False
-
-    - # Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - MacroTileShapeMax: [64]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [7, 4]
-          - [3, 5]
-          - [2, 6]
-          - [1, 1]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  2, 16 ]
-          - [ 16, 12,  1 ]
-        - DepthU: [ 2, 16, 64 ]
-        - GlobalSplitU: [1, 4]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Exact: [   1,   1,   1 ]
-          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
-
-
-    - # Non-Tile Sizes
-      BenchmarkCommonParameters:
-        - EdgeType: ["ShiftPtr"]
-        - LoopTail: [True]
-        - WorkGroup: [ [8, 8, 2 ] ]
-        - ThreadTile: [ [4, 8] ]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - GlobalReadCoalesceGroupA: [False, True]
-        - GlobalReadCoalesceGroupB: [False, True]
-        - PrefetchGlobalRead: [False, True]
-        - PrefetchLocalRead: [False, True]
-        - VectorWidth: [1]
-        - GlobalSplitU: [1, 4]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
-
-    - # Branches
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["Branch"]
-        - DepthU: [ 16 ]
-      ForkParameters:
-        - ThreadTile:
-          - [8, 8]
-          - [2, 8]
-        - WorkGroup:
-          - [  8,  4, 4 ]
-          - [ 16, 16, 1 ]
-        - VectorWidth: [1]
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ]
-
-LibraryLogic:
-    ScheduleName: "vega10"
-    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861"]
-    ArchitectureName: "gfx900"
-
-#   ScheduleName: "mi25"
-#   DeviceNames: ["Device 6860"]
-#   ArchitectureName: "gfx900"
-
-#   ScheduleName: "r9nano"
-#   DeviceNames: ["Device 7300"]
-#   ArchitectureName: "gfx803"
-
-#   ScheduleName: "hip"
-#   DeviceNames: ["Device 0000"]
-#   ArchitectureName: "fallback"
-
-LibraryClient:
diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm_nn.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm_nn.yaml
new file mode 100644
index 0000000000..c1f4baef13
--- /dev/null
+++ b/Tensile/Tests/nightly/classic_source/test_hgemm_nn.yaml
@@ -0,0 +1,85 @@
+GlobalParameters:
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  PrintSolutionRejectionReason: False
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  DataInitTypeAB: 1
+  DataInitTypeC: 1
+  ExitOnFails: 0  # Some solutions fail so just ensure we find one good solution
+
+BenchmarkProblems:
+
+    ############################################################################
+    # NN
+    ############################################################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: False
+      UseBeta: False
+      Batched: True
+
+    - # Tile Sizes
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - MacroTileShapeMax: [64]
+      ForkParameters:
+        - ThreadTile:
+          - [8, 8]
+          - [7, 4]
+          - [3, 5]
+          - [2, 6]
+          - [1, 1]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  2, 16 ]
+          - [ 16, 12,  1 ]
+        - DepthU: [ 2, 16, 64 ]
+        - GlobalSplitU: [1, 4]
+        - VectorWidth: [1]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [   1, 1, 1, 1 ]
+          - Range: [ [127, 1, 129], [127, 1, 129], [1, 2], [63, 1, 65] ]
+
+
+    - # Non-Tile Sizes
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - WorkGroup: [ [8, 8, 2 ] ]
+        - ThreadTile: [ [4, 8] ]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - GlobalReadCoalesceGroupA: [False, True]
+        - GlobalReadCoalesceGroupB: [False, True]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+        - VectorWidth: [1]
+        - GlobalSplitU: [1, 4]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127, 1, 129], [127, 1, 129], [1], [63, 1, 65] ]
+
+    - # Branches
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["Branch"]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - ThreadTile:
+          - [8, 8]
+          - [2, 8]
+        - WorkGroup:
+          - [  8,  4, 4 ]
+          - [ 16, 16, 1 ]
+        - VectorWidth: [1]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [3], [63, 1, 64] ]
+
diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm_nt.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm_nt.yaml
new file mode 100644
index 0000000000..2a048da245
--- /dev/null
+++ b/Tensile/Tests/nightly/classic_source/test_hgemm_nt.yaml
@@ -0,0 +1,87 @@
+GlobalParameters:
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  PrintSolutionRejectionReason: False
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  DataInitTypeAB: 1
+  DataInitTypeC: 1
+  ExitOnFails: 0  # Some solutions fail so just ensure we find one good solution
+
+BenchmarkProblems:
+
+    ############################################################################
+    # NT
+    ############################################################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: True
+      UseBeta: False
+
+##   disabled for now due to hanging
+##
+##    - # Tile Sizes
+##      BenchmarkCommonParameters:
+##        - EdgeType: ["ShiftPtr"]
+##        - LoopTail: [True]
+##        - MacroTileShapeMax: [64]
+##      ForkParameters:
+##        - ThreadTile:
+##          - [8, 8]
+##          - [7, 4]
+##          - [3, 5]
+##          - [2, 6]
+##          - [1, 1]
+##        - WorkGroup:
+##          - [ 16, 16,  1 ]
+##          - [  8,  2, 16 ]
+##          - [ 16, 12,  1 ]
+##        - DepthU: [ 2, 16, 64 ]
+##        - GlobalSplitU: [1, 4]
+##        - VectorWidth: [1]
+##      BenchmarkFinalParameters:
+##        - ProblemSizes:
+##          - Exact: [   1,   1,   1 ]
+##          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
+
+
+    - # Non-Tile Sizes
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - WorkGroup: [ [8, 8, 2 ] ]
+        - ThreadTile: [ [4, 8] ]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - GlobalReadCoalesceGroupA: [False, True]
+        - GlobalReadCoalesceGroupB: [False, True]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+        - VectorWidth: [1]
+        - GlobalSplitU: [1, 4]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
+
+    - # Branches
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["Branch"]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - ThreadTile:
+          - [8, 8]
+          - [2, 8]
+        - WorkGroup:
+          - [  8,  4, 4 ]
+          - [ 16, 16, 1 ]
+        - VectorWidth: [1]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ]
+
+
diff --git a/Tensile/Tests/nightly/classic_source/test_hgemm_tn_tt.yaml b/Tensile/Tests/nightly/classic_source/test_hgemm_tn_tt.yaml
new file mode 100644
index 0000000000..41bd3c4626
--- /dev/null
+++ b/Tensile/Tests/nightly/classic_source/test_hgemm_tn_tt.yaml
@@ -0,0 +1,178 @@
+GlobalParameters:
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  PrintSolutionRejectionReason: False
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  DataInitTypeAB: 1
+  DataInitTypeC: 1
+  ExitOnFails: 0  # Some solutions fail so just ensure we find one good solution
+
+BenchmarkProblems:
+    # Covers the TN and TT cases together, to test combining both problem types in one run
+
+
+    ############################################################################
+    # TN
+    ############################################################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+
+    - # Tile Sizes
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - MacroTileShapeMax: [64]
+      ForkParameters:
+        - ThreadTile:
+          - [8, 8]
+          - [7, 4]
+          - [3, 5]
+          - [2, 6]
+          - [1, 1]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  2, 16 ]
+          - [ 16, 12,  1 ]
+        - DepthU: [ 2, 16, 64 ]
+        - GlobalSplitU: [1, 4]
+        - VectorWidth: [1]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [   1,   1,   1 ]
+          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
+
+
+    - # Non-Tile Sizes
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - WorkGroup: [ [8, 8, 2 ] ]
+        - ThreadTile: [ [4, 8] ]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - GlobalReadCoalesceGroupA: [False, True]
+        - GlobalReadCoalesceGroupB: [False, True]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+        - VectorWidth: [1]
+        - GlobalSplitU: [1, 4]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
+
+    - # Branches
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["Branch"]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - ThreadTile:
+          - [8, 8]
+          - [2, 8]
+        - WorkGroup:
+          - [  8,  4, 4 ]
+          - [ 16, 16, 1 ]
+        - VectorWidth: [1]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ]
+
+
+    ############################################################################
+    # TT
+    ############################################################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: True
+      TransposeB: True
+      UseBeta: False
+
+##   disabled for now due to hanging
+##
+##    - # Tile Sizes
+##      BenchmarkCommonParameters:
+##        - EdgeType: ["ShiftPtr"]
+##        - LoopTail: [True]
+##        - MacroTileShapeMax: [64]
+##      ForkParameters:
+##        - ThreadTile:
+##          - [8, 8]
+##          - [7, 4]
+##          - [3, 5]
+##          - [2, 6]
+##          - [1, 1]
+##        - WorkGroup:
+##          - [ 16, 16,  1 ]
+##          - [  8,  2, 16 ]
+##          - [ 16, 12,  1 ]
+##        - DepthU: [ 2, 16, 64 ]
+##        - GlobalSplitU: [1, 4]
+##        - VectorWidth: [1]
+##      BenchmarkFinalParameters:
+##        - ProblemSizes:
+##          - Exact: [   1,   1,   1 ]
+##          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
+
+
+    - # Non-Tile Sizes
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - WorkGroup: [ [8, 8, 2 ] ]
+        - ThreadTile: [ [4, 8] ]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - GlobalReadCoalesceGroupA: [False, True]
+        - GlobalReadCoalesceGroupB: [False, True]
+        - PrefetchGlobalRead: [False, True]
+        - PrefetchLocalRead: [False, True]
+        - VectorWidth: [1]
+        - GlobalSplitU: [1, 4]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127, 1, 129], [127, 1, 129], [63, 1, 65] ]
+
+    - # Branches
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["Branch"]
+        - DepthU: [ 16 ]
+      ForkParameters:
+        - ThreadTile:
+          - [8, 8]
+          - [2, 8]
+        - WorkGroup:
+          - [  8,  4, 4 ]
+          - [ 16, 16, 1 ]
+        - VectorWidth: [1]
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [8, 32, 0, 75], [8, 32, 0, 75], [63, 1, 64] ]
+
+LibraryLogic:
+    ScheduleName: "vega10"
+    DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]", "Vega [Radeon RX Vega]"]
+    ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml b/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml
index 48b1576422..78f9e37c4d 100644
--- a/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml
+++ b/Tensile/Tests/nightly/fractional/test_sgemm_fractional_edge.yaml
@@ -44,8 +44,6 @@ BenchmarkProblems:
 
       ForkParameters:
         - FractionalLoad: [1]
-          # Set to enable PBC and functional code-gen - else should get GLVW=1 path
-        - AssertSummationElementMultiple: [4]
         - PrefetchGlobalRead: [0]
         - PrefetchLocalRead: [1]
         - ThreadTile:
diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_nn.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nn.yaml
new file mode 100644
index 0000000000..c7b90883b0
--- /dev/null
+++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nn.yaml
@@ -0,0 +1,86 @@
+# benchmark assembly kernels (every size group below sets KernelLanguage: ["Assembly"])
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: False
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 32,  4,  1 ]
+          - [  8,  8,  1 ]
+          - [  4,  8,  4 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+          - [  4,  4,  4 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
+
diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_nt.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nt.yaml
new file mode 100644
index 0000000000..a355052a3e
--- /dev/null
+++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_nt.yaml
@@ -0,0 +1,85 @@
+# benchmark assembly kernels (every size group below sets KernelLanguage: ["Assembly"])
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: False
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+          - [  2,  8,  8 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [ 16,  8,  1 ]
+          - [ 16,  2,  8 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_tn.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tn.yaml
new file mode 100644
index 0000000000..20542af0a5
--- /dev/null
+++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tn.yaml
@@ -0,0 +1,84 @@
+# benchmark assembly kernels (every size group below sets KernelLanguage: ["Assembly"])
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: False
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [ 32,  4,  1 ]
+        - DepthU: [-4]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
+
diff --git a/Tensile/Tests/nightly/nonbatched/sgemm_asm_tt.yaml b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tt.yaml
new file mode 100644
index 0000000000..31ceff3e59
--- /dev/null
+++ b/Tensile/Tests/nightly/nonbatched/sgemm_asm_tt.yaml
@@ -0,0 +1,80 @@
+# benchmark assembly kernels (every size group below sets KernelLanguage: ["Assembly"])
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: False
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [63,1,65] ]
diff --git a/Tensile/Tests/nightly/nonbatched/test_nonbatched.py b/Tensile/Tests/nightly/nonbatched/test_nonbatched.py
new file mode 100644
index 0000000000..aa55f6c728
--- /dev/null
+++ b/Tensile/Tests/nightly/nonbatched/test_nonbatched.py
@@ -0,0 +1,14 @@
+import Tensile.Tensile as Tensile
+
+def test_sgemm_asm_nt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_nt.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_nn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_nn.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_tn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_tn.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_tt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/nonbatched/sgemm_asm_tt.yaml"), tmpdir.strpath])
+
diff --git a/Tensile/Tests/nightly/vector_width/hgemm_nn_asm.yaml b/Tensile/Tests/nightly/vector_width/hgemm_nn_asm.yaml
new file mode 100644
index 0000000000..408fde7660
--- /dev/null
+++ b/Tensile/Tests/nightly/vector_width/hgemm_nn_asm.yaml
@@ -0,0 +1,44 @@
+# Sweep across different vector widths and global vector widths
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  KernelTime: True
+  SleepPercent: 0
+
+BenchmarkProblems:
+
+  - #
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1,3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 16 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+        - DepthU: [32]
+        - VectorWidth: [1,2,4,8]
+        - GlobalReadVectorWidth: [1,2,4,8]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [4], [63,1,65] ]
diff --git a/Tensile/Tests/nightly/vector_width/sgemm_nn_asm.yaml b/Tensile/Tests/nightly/vector_width/sgemm_nn_asm.yaml
new file mode 100644
index 0000000000..db199c59bf
--- /dev/null
+++ b/Tensile/Tests/nightly/vector_width/sgemm_nn_asm.yaml
@@ -0,0 +1,44 @@
+# Sweep vector width and global read vector width
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  KernelTime: True
+  SleepPercent: 0
+
+BenchmarkProblems:
+
+  - 
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1,3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 16 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+        - DepthU: [32]
+        - VectorWidth: [1,2,4,8]
+        - GlobalReadVectorWidth: [1,2,4,8]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [4], [63,1,65] ]
diff --git a/Tensile/Tests/nightly/vector_width/sgemm_nn_source.yaml b/Tensile/Tests/nightly/vector_width/sgemm_nn_source.yaml
new file mode 100644
index 0000000000..4decd0996f
--- /dev/null
+++ b/Tensile/Tests/nightly/vector_width/sgemm_nn_source.yaml
@@ -0,0 +1,44 @@
+# Sweep vector width and global read vector width
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  KernelTime: True
+  SleepPercent: 0
+
+BenchmarkProblems:
+
+  - 
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Source
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Source"]
+        - GlobalSplitU: [1,3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 2, 2 ]
+          - [ 4, 4 ]
+          - [ 8, 16 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+        - DepthU: [32]
+        - VectorWidth: [1,2,4,8]
+        - GlobalReadVectorWidth: [1,2,4,8]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [4], [63,1,65] ]
diff --git a/Tensile/Tests/nightly/vector_width/test_vector_width.py b/Tensile/Tests/nightly/vector_width/test_vector_width.py
new file mode 100644
index 0000000000..7a6f4d0966
--- /dev/null
+++ b/Tensile/Tests/nightly/vector_width/test_vector_width.py
@@ -0,0 +1,16 @@
+import Tensile.Tensile as Tensile
+
+def test_sgemm_nn_source(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/sgemm_nn_source.yaml"), tmpdir.strpath])
+
+def test_sgemm_nn_asm(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/sgemm_nn_asm.yaml"), tmpdir.strpath])
+
+def test_hgemm_nn_asm(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/hgemm_nn_asm.yaml"), tmpdir.strpath])
+
+#disabled for now due to hanging with ROCm 1.9
+#
+#def test_hgemm_nn_source(tmpdir):
+# Tensile.Tensile([Tensile.TensileTestPath("nightly/vector_width/hgemm_nn_source.yaml"), tmpdir.strpath])
+
diff --git a/Tensile/Tests/nightly_asm/README b/Tensile/Tests/nightly_asm/README
new file mode 100644
index 0000000000..9c158062ce
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/README
@@ -0,0 +1,9 @@
+nightly_asm is a collection of selected asm tests from the nightly run.
+These provide better coverage than pre_checkin and skip the source tests, so they complete more quickly than the full nightly run.
+
+nightly_asm simply consists of links into directories in nightly.
+When a new directory is added to nightly, add a matching link in nightly_asm if it should run here too.
+
+To run:
+
+$  PYTHONPATH=. py.test -v Tensile/Tests/nightly_asm
diff --git a/Tensile/Tests/nightly_asm/assertions b/Tensile/Tests/nightly_asm/assertions
new file mode 120000
index 0000000000..00d2b7e489
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/assertions
@@ -0,0 +1 @@
+../nightly/assertions
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/big_tensor b/Tensile/Tests/nightly_asm/big_tensor
new file mode 120000
index 0000000000..77cf397a71
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/big_tensor
@@ -0,0 +1 @@
+../nightly/big_tensor
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/flat b/Tensile/Tests/nightly_asm/flat
new file mode 120000
index 0000000000..b886566b68
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/flat
@@ -0,0 +1 @@
+../nightly/flat
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/fractional b/Tensile/Tests/nightly_asm/fractional
new file mode 120000
index 0000000000..40dfbec170
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/fractional
@@ -0,0 +1 @@
+../nightly/fractional
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/global_split_u b/Tensile/Tests/nightly_asm/global_split_u
new file mode 120000
index 0000000000..5551057206
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/global_split_u
@@ -0,0 +1 @@
+../nightly/global_split_u
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/hpa_source b/Tensile/Tests/nightly_asm/hpa_source
new file mode 120000
index 0000000000..2a9d54397f
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/hpa_source
@@ -0,0 +1 @@
+../nightly/hpa_source
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/local_split_u b/Tensile/Tests/nightly_asm/local_split_u
new file mode 120000
index 0000000000..d455319439
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/local_split_u
@@ -0,0 +1 @@
+../nightly/local_split_u
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/nonbatched b/Tensile/Tests/nightly_asm/nonbatched
new file mode 120000
index 0000000000..41aa20fe9e
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/nonbatched
@@ -0,0 +1 @@
+../nightly/nonbatched
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/pre_checkin b/Tensile/Tests/nightly_asm/pre_checkin
new file mode 120000
index 0000000000..4b921f09b7
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/pre_checkin
@@ -0,0 +1 @@
+../nightly/pre_checkin
\ No newline at end of file
diff --git a/Tensile/Tests/nightly_asm/vector_width b/Tensile/Tests/nightly_asm/vector_width
new file mode 120000
index 0000000000..57903ecc4f
--- /dev/null
+++ b/Tensile/Tests/nightly_asm/vector_width
@@ -0,0 +1 @@
+../nightly/vector_width
\ No newline at end of file
diff --git a/Tensile/Tests/pre_checkin/test_dgemm_asm.yaml b/Tensile/Tests/pre_checkin/dgemm_asm.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_dgemm_asm.yaml
rename to Tensile/Tests/pre_checkin/dgemm_asm.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_nn.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_asm_nn.yaml
rename to Tensile/Tests/pre_checkin/hgemm_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_nt.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_asm_nt.yaml
rename to Tensile/Tests/pre_checkin/hgemm_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml
new file mode 100644
index 0000000000..ad533b6e6c
--- /dev/null
+++ b/Tensile/Tests/pre_checkin/hgemm_asm_tn.yaml
@@ -0,0 +1,96 @@
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  NumElementsToValidate: -1
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # hgemm TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 4, 2 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [ 32,  4,  1 ]
+        - DepthU: [8]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 8, 2 ]
+          - [ 2, 8 ]
+          - [ 16, 2 ]
+          - [ 2, 16 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [16]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [True]
+        - WorkGroupMapping: [1]
+      ForkParameters:
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [4, 2]
+        - WorkGroup:
+          - [8, 16, 1]
+        - GlobalSplitU: [1]
+        - DepthU: [8]
+        - VectorWidth: [2]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Exact: [ 512, 8, 1, 500000 ]
+
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_tt.yaml b/Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_asm_tt.yaml
rename to Tensile/Tests/pre_checkin/hgemm_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nn.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_nt.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_nt.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tn.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_asm_tt.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nn.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nn.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml
similarity index 96%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nt.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml
index ee811b0efc..afd7d53cd2 100644
--- a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_nt.yaml
+++ b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_nt.yaml
@@ -5,13 +5,13 @@ GlobalParameters:
   KernelTime: True
 
 BenchmarkProblems:
-  - # hgemm TN
+  - # hgemm NT
     - # ProblemType
       OperationType: GEMM
       DataType: h
       HighPrecisionAccumulate: True
-      TransposeA: True
-      TransposeB: False
+      TransposeA: False
+      TransposeB: True
       UseBeta: True
       Batched: True
 
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tn.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tn.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tn.yaml
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tt.yaml b/Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml
similarity index 100%
rename from Tensile/Tests/pre_checkin/test_hgemm_hpa_iu2_asm_tt.yaml
rename to Tensile/Tests/pre_checkin/hgemm_hpa_iu2_asm_tt.yaml
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml
new file mode 100644
index 0000000000..883cfda371
--- /dev/null
+++ b/Tensile/Tests/pre_checkin/sgemm_asm_nn.yaml
@@ -0,0 +1,86 @@
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 32,  4,  1 ]
+          - [  8,  8,  1 ]
+          - [  4,  8,  4 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+          - [  4,  4,  4 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml
new file mode 100644
index 0000000000..dacf0b2a0a
--- /dev/null
+++ b/Tensile/Tests/pre_checkin/sgemm_asm_nt.yaml
@@ -0,0 +1,85 @@
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+          - [  2,  8,  8 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [ 16,  8,  1 ]
+          - [ 16,  2,  8 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
diff --git a/Tensile/Tests/pre_checkin/test_hgemm_asm_tn.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml
similarity index 76%
rename from Tensile/Tests/pre_checkin/test_hgemm_asm_tn.yaml
rename to Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml
index 3689155c8c..284de54edb 100644
--- a/Tensile/Tests/pre_checkin/test_hgemm_asm_tn.yaml
+++ b/Tensile/Tests/pre_checkin/sgemm_asm_tn.yaml
@@ -1,15 +1,28 @@
 # benchmark assembly and source kernels
 GlobalParameters:
   MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
   NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
   KernelTime: True
 
 BenchmarkProblems:
 
-  - # hgemm TN
+  - # sgemm TN
     - # ProblemType
       OperationType: GEMM
-      DataType: h
+      DataType: s
       TransposeA: True
       TransposeB: False
       UseBeta: True
@@ -26,13 +39,13 @@ BenchmarkProblems:
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [False]
         - ThreadTile:
-          - [ 4, 2 ]
+          - [ 3, 5 ]
           - [ 4, 8 ]
           - [ 8, 8 ]
         - WorkGroup:
           - [ 16, 16,  1 ]
           - [ 32,  4,  1 ]
-        - DepthU: [8]
+        - DepthU: [-4]
         - VectorWidth: [-1]
       BenchmarkForkParameters:
       JoinParameters:
@@ -52,14 +65,14 @@ BenchmarkProblems:
         - PrefetchLocalRead: [True]
         - PrefetchGlobalRead: [False]
         - ThreadTile:
-          - [ 8, 2 ]
-          - [ 2, 8 ]
-          - [ 16, 2 ]
-          - [ 2, 16 ]
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
         - WorkGroup:
           - [ 16, 16,  1 ]
           - [  8,  8,  1 ]
-        - DepthU: [16]
+        - DepthU: [-1]
         - VectorWidth: [-1]
       BenchmarkForkParameters:
       JoinParameters:
@@ -68,3 +81,4 @@ BenchmarkProblems:
         - ProblemSizes:
           - Range: [ [127,1,129], 0, [2], [63,1,65] ]
 
+
diff --git a/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml b/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml
new file mode 100644
index 0000000000..dd984b94b4
--- /dev/null
+++ b/Tensile/Tests/pre_checkin/sgemm_asm_tt.yaml
@@ -0,0 +1,80 @@
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm TT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      TransposeA: True
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
diff --git a/Tensile/Tests/pre_checkin/test_pre_checkin.py b/Tensile/Tests/pre_checkin/test_pre_checkin.py
index 97108bf4ee..dd2de22ce7 100644
--- a/Tensile/Tests/pre_checkin/test_pre_checkin.py
+++ b/Tensile/Tests/pre_checkin/test_pre_checkin.py
@@ -1,31 +1,52 @@
 import Tensile.Tensile as Tensile
 
-def test_sgemm_asm(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_sgemm_asm.yaml"), tmpdir.strpath])
+def test_hgemm_hpa_asm_tn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_tn.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_nt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_nt.yaml"), tmpdir.strpath])
+
+def test_hgemm_hpa_iu2_asm_tt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_tt.yaml"), tmpdir.strpath])
+
+def test_hgemm_hpa_iu2_asm_nt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_nt.yaml"), tmpdir.strpath])
 
 def test_hgemm_asm_nn(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_nn.yaml"), tmpdir.strpath])
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_nn.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_nn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_nn.yaml"), tmpdir.strpath])
 
 def test_dgemm_asm(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_dgemm_asm.yaml"), tmpdir.strpath])
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/dgemm_asm.yaml"), tmpdir.strpath])
 
-def test_hgemm_hpa_asm_tn(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_tn.yaml"), tmpdir.strpath])
+def test_hgemm_hpa_iu2_asm_tn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_tn.yaml"), tmpdir.strpath])
 
-def test_hgemm_hpa_asm_nn(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_nn.yaml"), tmpdir.strpath])
+def test_hgemm_asm_tn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_tn.yaml"), tmpdir.strpath])
 
-def test_hgemm_asm_tt(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_tt.yaml"), tmpdir.strpath])
+def test_hgemm_hpa_asm_nn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_nn.yaml"), tmpdir.strpath])
 
 def test_hgemm_hpa_asm_tt(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_tt.yaml"), tmpdir.strpath])
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_tt.yaml"), tmpdir.strpath])
 
-def test_hgemm_asm_nt(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_nt.yaml"), tmpdir.strpath])
+def test_hgemm_asm_tt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_tt.yaml"), tmpdir.strpath])
 
-def test_hgemm_asm_tn(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_asm_tn.yaml"), tmpdir.strpath])
+def test_hgemm_hpa_iu2_asm_nn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_iu2_asm_nn.yaml"), tmpdir.strpath])
 
 def test_hgemm_hpa_asm_nt(tmpdir):
- Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/test_hgemm_hpa_asm_nt.yaml"), tmpdir.strpath])
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_hpa_asm_nt.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_tn(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_tn.yaml"), tmpdir.strpath])
+
+def test_sgemm_asm_tt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/sgemm_asm_tt.yaml"), tmpdir.strpath])
+
+def test_hgemm_asm_nt(tmpdir):
+ Tensile.Tensile([Tensile.TensileTestPath("pre_checkin/hgemm_asm_nt.yaml"), tmpdir.strpath])
diff --git a/Tensile/Tests/pre_checkin/test_sgemm_asm.yaml b/Tensile/Tests/pre_checkin/test_sgemm_asm.yaml
deleted file mode 100644
index 21e3b0043a..0000000000
--- a/Tensile/Tests/pre_checkin/test_sgemm_asm.yaml
+++ /dev/null
@@ -1,270 +0,0 @@
-# benchmark assembly and source kernels
-GlobalParameters:
-  MinimumRequiredVersion: 4.2.0
-  CMakeBuildType: Release
-  PrintLevel: 1
-  ForceRedoBenchmarkProblems: True
-  ForceRedoLibraryLogic: True
-  ForceRedoLibraryClient: True
-  EnqueuesPerSync: 1
-  SyncsPerBenchmark: 1
-  NumElementsToValidate: -1
-  ValidationMaxToPrint: 4
-  ValidationPrintValids: False
-  ShortNames: False
-  MergeFiles: True
-  DataInitTypeAB: 3
-  DataInitTypeC: 3
-  KernelTime: True
-
-BenchmarkProblems:
-
-  - # sgemm NN
-    - # ProblemType
-      OperationType: GEMM
-      DataType: s
-      TransposeA: False
-      TransposeB: False
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
-        - ThreadTile:
-          - [ 3, 5 ]
-          - [ 4, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 32,  4,  1 ]
-          - [  8,  8,  1 ]
-          - [  4,  8,  4 ]
-        - DepthU: [-3]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [True]
-        - ThreadTile:
-          - [ 3, 3 ]
-          - [ 4, 4 ]
-          - [ 5, 5 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-          - [  4,  4,  4 ]
-        - DepthU: [-1]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-  - # sgemm NT
-    - # ProblemType
-      OperationType: GEMM
-      DataType: s
-      TransposeA: False
-      TransposeB: True
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [True]
-        - ThreadTile:
-          - [ 3, 5 ]
-          - [ 4, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8, 16,  1 ]
-          - [  2,  8,  8 ]
-        - DepthU: [-3]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [True]
-        - ThreadTile:
-          - [ 3, 3 ]
-          - [ 4, 4 ]
-          - [ 5, 5 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [ 16,  8,  1 ]
-          - [ 16,  2,  8 ]
-        - DepthU: [-1]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-  - # sgemm TN
-    - # ProblemType
-      OperationType: GEMM
-      DataType: s
-      TransposeA: True
-      TransposeB: False
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 3, 5 ]
-          - [ 4, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [ 32,  4,  1 ]
-        - DepthU: [-4]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - GlobalSplitU: [1, 3]
-        - PrefetchLocalRead: [True]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 3, 3 ]
-          - [ 4, 4 ]
-          - [ 5, 5 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [-1]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-  - # sgemm TT
-    - # ProblemType
-      OperationType: GEMM
-      DataType: s
-      TransposeA: True
-      TransposeB: True
-      UseBeta: True
-      Batched: True
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-        - KernelLanguage: ["Assembly"]
-      ForkParameters:
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 3, 5 ]
-          - [ 4, 8 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [-3]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
-
-    - # BenchmarkProblemSizeGroup - Assembly
-      InitialSolutionParameters:
-      BenchmarkCommonParameters:
-        - LoopTail: [True]
-        - EdgeType: ["ShiftPtr"]
-      ForkParameters:
-        - KernelLanguage: ["Assembly"]
-        - PrefetchLocalRead: [False]
-        - PrefetchGlobalRead: [False]
-        - ThreadTile:
-          - [ 3, 3 ]
-          - [ 4, 4 ]
-          - [ 5, 5 ]
-          - [ 8, 8 ]
-        - WorkGroup:
-          - [ 16, 16,  1 ]
-          - [  8,  8,  1 ]
-        - DepthU: [-1]
-        - VectorWidth: [-1]
-      BenchmarkForkParameters:
-      JoinParameters:
-      BenchmarkJoinParameters:
-      BenchmarkFinalParameters:
-        - ProblemSizes:
-          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
diff --git a/Tensile/Utilities/merge_rocblas_yaml_files.py b/Tensile/Utilities/merge_rocblas_yaml_files.py
new file mode 100644
index 0000000000..a0942c5862
--- /dev/null
+++ b/Tensile/Utilities/merge_rocblas_yaml_files.py
@@ -0,0 +1,358 @@
+
+#from copy import deepcopy
+#from Common import print1, print2, printExit, HR, ensurePath
+
+#from SolutionStructs import Solution
+
+#from __init__ import __version__
+
+import os
+import sys
+import argparse
+
+HR = "################################################################################"
+
+################################################################################
+# Print Debug
+################################################################################
+
+def printWarning(message):
+  print("Tensile::WARNING: %s" % message)  # non-fatal: report and carry on
+  sys.stdout.flush()
+
+def printExit(message):
+  print("Tensile::FATAL: %s" % message)
+  sys.stdout.flush()  # push the message out before the interpreter stops
+  sys.exit(-1)
+
+try:
+  import yaml
+except ImportError:
+  printExit("You must install PyYAML to use Tensile (to parse config files). See http://pyyaml.org/wiki/PyYAML for installation instructions.")
+
+#import YAMLIO
+
+def ensurePath( path ):  # make sure directory `path` exists, then hand `path` back
+  if not os.path.exists(path):
+    os.makedirs(path)  # also creates any missing parent directories
+  return path
+
+################################################################################
+# Library Logic Container
+################################################################################
+class LibraryLogic(object):
+  """One LibraryLogic YAML file held in memory: a positional list of nine sections."""
+  def __init__(self,filename=None):
+    """Read the nine sections from `filename`; without a filename every section is None."""
+    if filename is not None:
+      print("# Reading Library Logic: " + filename)
+      try:
+        stream = open(filename, "r")
+      except IOError:
+        printExit("Cannot open file: %s" % filename )
+      with stream:  # released even when the YAML cannot be parsed
+        data = yaml.load(stream, yaml.SafeLoader)
+
+      # the sections are identified purely by their position in the top-level list
+      self.__set_versionString(data[0]["MinimumRequiredVersion"])
+      self.__set_scheduleName(data[1])
+      self.__set_architectureName(data[2])
+      self.__set_deviceNames(data[3])
+      self.__set_problemType(data[4])
+      self.__set_solutionStates(data[5])
+      self.__set_indexOrder(data[6])
+      self.__set_exactLogic(data[7])
+      self.__set_rangeLogic(data[8])
+
+    else:
+      self.__set_versionString(None)
+      self.__set_scheduleName(None)
+      self.__set_architectureName(None)
+      self.__set_deviceNames(None)
+      self.__set_problemType(None)
+      self.__set_solutionStates(None)
+      self.__set_indexOrder(None)
+      self.__set_exactLogic(None)
+      self.__set_rangeLogic(None)
+
+  # versionString: the "MinimumRequiredVersion" value of section 0
+  def __get_versionString(self):
+    return self.__versionString
+
+  def __set_versionString(self,value):
+    self.__versionString = value
+
+  versionString = property(__get_versionString,__set_versionString)
+
+  # scheduleName: section 1 of the file
+  def __get_scheduleName(self):
+    return self.__scheduleName
+
+  def __set_scheduleName(self, value):
+    self.__scheduleName = value
+
+  scheduleName = property(__get_scheduleName,__set_scheduleName)
+
+  # architectureName: section 2 of the file
+  def __get_architectureName(self):
+    return self.__architectureName
+
+  def __set_architectureName(self,value):
+    self.__architectureName = value
+
+  architectureName = property(__get_architectureName,__set_architectureName)
+
+  # deviceNames: section 3 of the file
+  def __get_deviceNames(self):
+    return self.__deviceNames
+
+  def __set_deviceNames(self,value):
+    self.__deviceNames = value
+
+  deviceNames = property(__get_deviceNames,__set_deviceNames)
+
+
+  # problemType: section 4 of the file
+  def __get_problemType(self):
+    return self.__problemType
+
+  def __set_problemType(self,value):
+    self.__problemType = value
+
+  problemType = property(__get_problemType,__set_problemType)
+
+  # solutionStates: section 5 of the file
+  def __get_solutionStates(self):
+    return self.__solutionStates
+
+  def __set_solutionStates(self,value):
+    self.__solutionStates = value
+
+  solutionStates = property(__get_solutionStates,__set_solutionStates)
+
+  # indexOrder: section 6 of the file
+  def __get_indexOrder(self):
+    return self.__indexOrder
+
+  def __set_indexOrder(self,value):
+    self.__indexOrder = value
+
+  indexOrder = property(__get_indexOrder,__set_indexOrder)
+
+
+  # exactLogic: section 7 of the file
+  def __get_exactLogic(self):
+    return self.__exactLogic
+
+  def __set_exactLogic(self,value):
+    self.__exactLogic = value
+
+  exactLogic = property(__get_exactLogic,__set_exactLogic)
+
+  # rangeLogic: section 8 of the file
+  def __get_rangeLogic(self):
+    return self.__rangeLogic
+
+  def __set_rangeLogic(self,value):
+    self.__rangeLogic = value
+
+  rangeLogic = property(__get_rangeLogic,__set_rangeLogic)
+
+  def writeLibraryLogic(self,filename):
+    """Write every section that is not None, in file order, to `filename` as YAML."""
+    data = []
+    # NOTE(review): a None section is skipped, so the sections after it move up one position
+    if self.versionString is not None:
+      data.append({"MinimumRequiredVersion":self.versionString})
+
+    if self.scheduleName is not None:
+      data.append(self.scheduleName)
+
+    if self.architectureName is not None:
+      data.append(self.architectureName)
+
+    if self.deviceNames is not None:
+      data.append(self.deviceNames)
+
+    if self.problemType is not None:
+      data.append(self.problemType)
+
+    if self.solutionStates is not None:
+      data.append(self.solutionStates)
+
+    if self.indexOrder is not None:
+      data.append(self.indexOrder)
+
+    if self.exactLogic is not None:
+      data.append(self.exactLogic)
+
+    if self.rangeLogic is not None:
+      data.append(self.rangeLogic)
+
+    if not data:
+      printExit("No data to output")
+    else:
+      try:
+        # the with-block closes the file even if the dump raises
+        with open(filename, "w") as stream:
+          yaml.safe_dump(data, stream)
+      except IOError:
+        printExit("Cannot open file: %s" % filename)
+
+
+def MergeTensileLogicFiles(origionalLibraryLogic, exactLibraryLogic):
+  """Merge an exact-size library logic into an original library logic.
+
+  The header sections (version, schedule, architecture, devices, problem
+  type, index order) and the range logic are taken from the original.
+  Solutions of the exact file that the original does not already contain are
+  appended to the solution list, and the exact-logic entries of the exact
+  file are renumbered to point into that merged list. For a problem size
+  listed in both files the entry of the exact file wins. Two solutions count
+  as the same one when they compare equal.
+
+  origionalLibraryLogic -- LibraryLogic providing the base content.
+  exactLibraryLogic -- LibraryLogic to merge in; its exactLogic entries are
+      renumbered in place.
+
+  Returns the merged logic as a new LibraryLogic.
+  """
+  solutionList = origionalLibraryLogic.solutionStates
+  solutionListExact = exactLibraryLogic.solutionStates
+
+  newSolutionOffset = len(solutionList)
+
+  # Map every solution index of the exact file to its index in the merged
+  # solution list.
+  filterdSolutionExactList = []
+  replicationMapping = {}
+  for idx, solution in enumerate(solutionListExact):
+    if solution in solutionList:
+      # already defined by the original file: reuse that definition
+      replicationMapping[idx] = solutionList.index(solution)
+    else:
+      # new solution: it lands behind all of the original solutions
+      replicationMapping[idx] = newSolutionOffset + len(filterdSolutionExactList)
+      filterdSolutionExactList.append(solution)
+
+  # Append only the solutions that are really new. Appending the complete
+  # exact list (duplicates included) would move the new solutions away from
+  # the indices computed above whenever both files share a solution.
+  mergedSolutionList = list(solutionList)
+  mergedSolutionList.extend(filterdSolutionExactList)
+
+  exactLogic = origionalLibraryLogic.exactLogic
+  exactLogicExact = exactLibraryLogic.exactLogic
+
+  # Use the mapping from above to renumber the exact logic.
+  # Example entry: [[123,124,1,123], [5, 4312.3]]; the first field of
+  # [5, 4312.3] is the index of the solution used for that size.
+  # NOTE: this rewrites the entries of exactLibraryLogic in place.
+  for exact in exactLogicExact:
+    kernelIndex = exact[1][0]
+    if kernelIndex in replicationMapping:
+      exact[1][0] = replicationMapping[kernelIndex]
+
+  # Problem sizes covered by the exact file. A comprehension is used instead
+  # of zip(*...) so that an empty exact logic does not fail on unpacking.
+  sizeList = [logicMapping[0] for logicMapping in exactLogicExact]
+
+  # an original entry survives only if the exact file does not redefine its size
+  mergedExactLogic = []
+  for logicMapping in exactLogic:
+    if logicMapping[0] not in sizeList:
+      mergedExactLogic.append(logicMapping)
+
+  for logicMapping in exactLogicExact:
+    mergedExactLogic.append(logicMapping)
+
+  # everything except the solutions and the exact logic comes from the original
+  mergedLibraryLogic = LibraryLogic()
+  mergedLibraryLogic.versionString = origionalLibraryLogic.versionString
+  mergedLibraryLogic.scheduleName = origionalLibraryLogic.scheduleName
+  mergedLibraryLogic.architectureName = origionalLibraryLogic.architectureName
+  mergedLibraryLogic.deviceNames = origionalLibraryLogic.deviceNames
+  mergedLibraryLogic.problemType = origionalLibraryLogic.problemType
+  mergedLibraryLogic.solutionStates = mergedSolutionList
+  mergedLibraryLogic.indexOrder = origionalLibraryLogic.indexOrder
+  mergedLibraryLogic.exactLogic = mergedExactLogic
+  mergedLibraryLogic.rangeLogic  = origionalLibraryLogic.rangeLogic
+
+  return mergedLibraryLogic
+
+
+def ProcessMergeLogicFile(exactFileName, origionalFileName, outputFileName):
+  """Merge one exact logic file into its original counterpart and write the result.
+
+  exactFileName and origionalFileName are read as LibraryLogic YAML files; the
+  merged logic is written to outputFileName.
+  """
+  print("processing file: " + os.path.basename(exactFileName))
+
+  # load order is kept: the original file first, then the exact file
+  baseLogic = LibraryLogic(origionalFileName)
+  overrideLogic = LibraryLogic(exactFileName)
+  MergeTensileLogicFiles(baseLogic, overrideLogic).writeLibraryLogic(outputFileName)
+
+def RunMergeTensileLogicFiles():
+  """Command-line driver: merge each exact logic file into its original counterpart.
+
+  Reads three positional arguments (OrigionalLogicPath, ExactLogicPath,
+  OutputPath). Every *.yaml file found directly in ExactLogicPath is merged
+  with the file of the same name in OrigionalLogicPath; the result is written
+  under that name to OutputPath, which is created when missing.
+  """
+
+  for bannerLine in ("", HR, "# Merge Library Logic", HR, ""):
+    print(bannerLine)
+
+  ##############################################################################
+  # Parse Command Line Arguments
+  ##############################################################################
+  argParser = argparse.ArgumentParser()
+  argParser.add_argument("OrigionalLogicPath", help="Path to the origional LibraryLogic.yaml input files.")
+  argParser.add_argument("ExactLogicPath", help="Path to the exact LibraryLogic.yaml input files.")
+  argParser.add_argument("OutputPath", help="Where to write library files?")
+  args = argParser.parse_args()
+
+  origionalLogicPath = args.OrigionalLogicPath
+  exactLogicPath = args.ExactLogicPath
+  outputPath = args.OutputPath
+  # echo the resolved arguments
+  print("Origional Logic Path: " + origionalLogicPath)
+  print("Exact Logic Path: " + exactLogicPath)
+  print("OutputPath: " + outputPath)
+  print("")
+
+  ensurePath(outputPath)
+  if not os.path.exists(exactLogicPath):
+    printExit("LogicPath %s doesn't exist" % exactLogicPath)
+
+  # regular files directly inside exactLogicPath whose extension is ".yaml"
+  exactLogicFiles = []
+  for entry in os.listdir(exactLogicPath):
+    candidate = os.path.join(exactLogicPath, entry)
+    if os.path.isfile(candidate) and os.path.splitext(entry)[1] == ".yaml":
+      exactLogicFiles.append(candidate)
+
+  for exactLogicFilePath in exactLogicFiles:
+    fileName = os.path.basename(exactLogicFilePath)
+    origionalLogicFilePath = os.path.join(origionalLogicPath, fileName)
+
+    if not os.path.isfile(origionalLogicFilePath):
+      # nothing to merge into: report it and go on with the next file
+      print("# file does not exist in origional directory " + origionalLogicFilePath)
+      continue
+
+    outputLogicFilePath = os.path.join(outputPath, fileName)
+    try:
+      ProcessMergeLogicFile(exactLogicFilePath, origionalLogicFilePath, outputLogicFilePath)
+    except Exception as ex:  # best effort: keep going with the remaining files
+      print("Exception: {0}".format(ex))
+    
+
+################################################################################
+# Main
+################################################################################
+if __name__ == "__main__":  # only when executed as a script, not on import
+    RunMergeTensileLogicFiles()
\ No newline at end of file
diff --git a/Tensile/__init__.py b/Tensile/__init__.py
index c938064729..15daea2823 100644
--- a/Tensile/__init__.py
+++ b/Tensile/__init__.py
@@ -20,4 +20,4 @@
 ################################################################################
 
 # hardcoded tensile version; also in Tensile/Source/TensileConfigVersion.cmake
-__version__ = "4.5.0"
+__version__ = "4.6.0"
diff --git a/bump-version.sh b/bump-version.sh
index fc26ea5f59..840d777eae 100755
--- a/bump-version.sh
+++ b/bump-version.sh
@@ -3,11 +3,11 @@
 # This script needs to be edited to bump version for new release.
 # Version will be bumped in Tensile/__init__.py and in .yaml files
 
-OLD_VERSION="4.4.0"
-NEW_VERSION="4.5.0"
+OLD_VERSION="4.5.0"
+NEW_VERSION="4.6.0"
 
-OLD_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.4.0"
-NEW_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.5.0"
+OLD_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.5.0"
+NEW_MINIMUM_REQUIRED_VERSION="MinimumRequiredVersion: 4.6.0"
 
 sed -i "s/${OLD_VERSION}/${NEW_VERSION}/g" Tensile/__init__.py