Merge branch 'develop'

ROCm · Oct 11, 2018 · 7556393 · 7556393
2 parents caacebd + 5cd8a4c
commit 7556393
Show file tree

Hide file tree

Showing 120 changed files with 12,995 additions and 1,356 deletions.
diff --git a/Tensile/Common.py b/Tensile/Common.py
@@ -267,9 +267,21 @@
     # If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class)
     "AssertFree0ElementMultiple" : [1,2,4,8],
 
+    # When creating the kernel, assume that the 'second' free index size is some
+    # multiple of the element size.
+    # "second" free index is FreeIndex[1] and usually letter "J"
+    # 1 indicates no restriction (since all sizes are multiples of 1)
+    # If changing this also change runtime writeSolutionAssertionCheck* functions in Common.py and in TensileTypes.py (AssertionProperties class)
+    #"AssertFree1ElementMultiple" : [1,2,4,8],
+    "AssertFree1ElementMultiple" : [1],  # TODO, support broader range here
+
     # Generate code inside kernel to check assertions above on Tensor dimensions
     "CheckTensorDimAsserts":               [False, True],
 
+    # Generate code inside kernel to check several dimension overflow cases, in particular around use of 32-bit calculations
+    # 0 = no check, 1 = checks for cases that should be avoided through assertions and kernel selection, 2 = checks for cases that should never happen
+    "CheckDimOverflow":               [0,1,2],
+
     # For Block Mapping type:
     # 0   : Use hardware-assigned wg number with no remapping.
     # N   : WG block width.  "Wrap" to a new wg1 "row" assignment after N WGs assigned in that row.
@@ -429,11 +441,13 @@
     {"BufferLoad":                [ True ] },
     {"BufferStore":               [ True ] },
     {"DirectToLds":               [ True ] },
-    {"PreciseBoundsCheck":        [ False ] },
+    {"PreciseBoundsCheck":        [ True ] },
     {"UseSgprForGRO":             [ -1 ] },
     {"AssertSummationElementMultiple": [ 1 ] },
     {"AssertFree0ElementMultiple": [ 1 ] },
+    {"AssertFree1ElementMultiple": [ 1 ] },
     {"CheckTensorDimAsserts"      : [ False ] },
+    {"CheckDimOverflow"           : [ 0 ] },
 
     {"GlobalSplitU":              [ 1 ] },
     {"GlobalSplitUSummationAssignmentRoundRobin": [ True ] },
@@ -675,9 +689,11 @@ def tryAssembler(isaVersion, asmString):
     if result != "":
       return 0 # stdout and stderr must be empty
   except subprocess.CalledProcessError, e:
+    if globalParameters["PrintLevel"] >=2:
+        print "CalledProcessError", e
     return 0 # error, not supported
 
-  return 1 # syntax works for
+  return 1 # syntax works
 
 
 ################################################################################
@@ -737,13 +753,11 @@ def assignGlobalParameters( config ):
   for (v) in globalParameters["SupportedISA"]:
     globalParameters["AsmCaps"][v] = {}
     isaVersion = "gfx" + "".join(map(str,v))
-    asmCmd = "%s -x assembler -target amdgcn-amdhsa -mcpu=%s -" \
-               % (globalParameters["AssemblerPath"], isaVersion)
-    # This doesn't work since assembler politely falls back to default with an unsupported mcpu argument:
     globalParameters["AsmCaps"][v]["SupportedIsa"] = tryAssembler(isaVersion, "")
     globalParameters["AsmCaps"][v]["HasExplicitCO"] = tryAssembler(isaVersion, "v_add_co_u32 v0,vcc,v0,v0")
     globalParameters["AsmCaps"][v]["HasDirectToLds"] = tryAssembler(isaVersion, "buffer_load_dword v40, v36, s[24:27], s28 offen offset:0 lds")
     globalParameters["AsmCaps"][v]["HasAddLshl"] = tryAssembler(isaVersion, "v_add_lshl_u32 v47, v36, v34, 0x2")
+    globalParameters["AsmCaps"][v]["HasSMulHi"] = tryAssembler(isaVersion, "s_mul_hi_u32 s47, s36, s34")
     caps = ""
     for k in globalParameters["AsmCaps"][v]:
       caps += " %s=%u" % (k, globalParameters["AsmCaps"][v][k])

diff --git a/Tensile/Configs/miopen/Makefile b/Tensile/Configs/miopen/Makefile
@@ -2,9 +2,12 @@ P=problems
 DEEPBENCH_CONV_1x1=$P/nn/deepbench_conv_1x1_batchN.yml $P/nn/deepbench_conv_1x1_batch1.yml
 RESNET=$P/nn/resnet_batch64_B.yml
 
+# Override SCHED as vega10 or vega20; it selects which library_logic_$(SCHED)_only.yml footer is used
+SCHED=vega10
+
 # commonly-used headers and footers:
 HEADER=boiler/header.yml
-FOOTER=boiler/library_logic_vega10_only.yml
+FOOTER=boiler/library_logic_$(SCHED)_only.yml
 
 # Override TYPE as sgemm, hgemm (hgemm_hpa, dgemm, etc in future)
 TYPE=sgemm
@@ -25,6 +28,9 @@ SOLUTION_SKINNY=solutions/$(TYPE)_skinny_explore_$(EXPLORE_LEVEL).yml
 
 all: \
 	$(TYPE)_resnet.yaml \
+	$(TYPE)_resnet50_nn.yaml \
+	$(TYPE)_resnet50_nt.yaml \
+	$(TYPE)_resnet50_tn.yaml \
 	$(TYPE)_deepbench_conv1x1.yaml \
 	$(TYPE)_deepbench_gemm_nn.yaml \
 	$(TYPE)_deepbench_gemm_nt.yaml \
@@ -35,6 +41,17 @@ $(TYPE)_resnet.yaml: $(HEADER) types/$(TYPE)_nn.yml \
 	$(SOLUTION_SKINNY) $(RESNET) \
 	$(FOOTER)
 
+# Resnet50
+$(TYPE)_resnet50_nn.yaml: $(HEADER) types/$(TYPE)_nn.yml \
+	$(SOLUTION_SKINNY) $P/nn/resnet50_all.yml \
+	$(FOOTER)
+$(TYPE)_resnet50_nt.yaml: $(HEADER) types/$(TYPE)_nt.yml \
+	$(SOLUTION_SKINNY) $P/nt/resnet50_all.yml \
+	$(FOOTER)
+$(TYPE)_resnet50_tn.yaml: $(HEADER) types/$(TYPE)_tn.yml \
+	$(SOLUTION_SKINNY) $P/tn/resnet50_all.yml \
+	$(FOOTER)
+
 # DeepBench Convolution:
 $(TYPE)_deepbench_conv1x1.yaml: $(HEADER) types/$(TYPE)_nn.yml \
 	$(SOLUTION_SKINNY) $(DEEPBENCH_CONV_1x1) \

diff --git a/Tensile/Configs/miopen/archives/resnet50/README.md b/Tensile/Configs/miopen/archives/resnet50/README.md
@@ -0,0 +1,33 @@
+Starting with the 6 asm_full logic files
+
+  - vega20_Cijk_Ailk_Bjlk_HB.yaml
+  - vega20_Cijk_Ailk_Bljk_HB.yaml
+  - vega20_Cijk_Alik_Bljk_HB.yaml
+  - vega20_Cijk_Ailk_Bjlk_SB.yaml
+  - vega20_Cijk_Ailk_Bljk_SB.yaml
+  - vega20_Cijk_Alik_Bljk_SB.yaml
+
+from
+
+  - rocBLAS commit a85df88648587a0d2880a74c6c57964366ab02a1 for HGEMM
+  - rocBLAS commit 0ceb1ad64c8bda5473a1e1c3a74ab9ff204acbf8 for SGEMM
+
+we merge the 6 Resnet50-specific logic files archived in the "logic" directory
+into the corresponding asm_full logic files of the same name, resulting in the
+6 combined asm_full logic files in
+
+  - rocBLAS commit ea27b3aba339b4fd48795153995d24dd96cd6457 for HGEMM+SGEMM
+
+The 6 YAML configuration files used to generate the Resnet50-specific logic
+files are archived in the "config" directory under the corresponding names
+
+  - hgemm_resnet50_nt.yaml
+  - hgemm_resnet50_nn.yaml
+  - hgemm_resnet50_tn.yaml
+  - sgemm_resnet50_nt.yaml
+  - sgemm_resnet50_nn.yaml
+  - sgemm_resnet50_tn.yaml
+
+Note that we explicitly purged the 6 sizes with either n=49 or k=49 from
+the Resnet50-specific logic files for HGEMM because those sizes won't be
+using the assembly kernels.
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nn.yaml
@@ -0,0 +1,115 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # NN - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore a large number (~10K) of half-precision solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+          - [ 16, 8 ]
+          - [ 8, 16 ]
+          - [ 16, 16 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,3,5]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,24,32 ]
+        - VectorWidth: [4,8]
+        - GlobalReadVectorWidth: [1,8]
+        - LdsPadB: [0, -1 ]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 NN
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,   512 ,   1,  2048 ]    # beta= 0
+          - Exact: [  3136 ,  2048 ,   1,   512 ]    # beta= 0
+          - Exact: [ 12544 ,   256 ,   1,  1024 ]    # beta= 0
+          - Exact: [ 12544 ,  1024 ,   1,   256 ]    # beta= 0
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient:
diff --git a/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml b/Tensile/Configs/miopen/archives/resnet50/config/hgemm_resnet50_nt.yaml
@@ -0,0 +1,119 @@
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  CMakeBuildType: Release
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  LibraryPrintDebug: False
+  NumElementsToValidate: 0
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  Platform: 0
+  Device: 0
+  KernelTime: True
+  PinClocks: True
+  SleepPercent: 200
+  DataInitTypeBeta : 0
+  CodeFromFiles: 1
+  SolutionSelectionAlg: 1
+  PrintWinnersOnly: 1
+
+BenchmarkProblems:
+  ########################################
+  # NT - standard
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+  ########################################
+  # Explore a large number (~10K) of half-precision solutions
+  ########################################
+    - # Benchmark Group
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - EdgeType: ["ShiftPtr"]
+        - LoopTail: [True]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - FractionalLoad: [1]
+        - PrefetchGlobalRead: [ False, True ]
+        - PrefetchLocalRead: [ False, True]
+        - ThreadTile:
+          - [ 4, 4 ]
+          - [ 8, 4 ]
+          - [ 8, 8 ]
+          - [ 16, 8 ]
+          - [ 8, 16 ]
+          - [ 16, 16 ]
+        - WorkGroup:
+          - [ 16, 8, 2 ]
+          - [ 16, 4, 4 ]
+          - [ 16,  8, 1 ]
+          - [ 8, 32, 1 ]
+          - [ 16, 16, 1 ]
+          - [ 32,  8, 1 ]
+        - GlobalSplitU: [1,3,5]
+        - WorkGroupMapping: [1,8,64]
+        - DepthU: [ 8,16,24,32 ]
+        - VectorWidth: [4,8]
+        - GlobalReadVectorWidth: [1,8]
+        - LdsPadB: [0, -1 ]
+        - AssertSummationElementMultiple: [2]
+        - AssertFree0ElementMultiple: [2]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+# Resnet50 NT
+          - Exact: [    49 ,   512 ,  64,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 ,  64,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 ,  64,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 ,  64,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 ,  64,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 ,  64,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,   256 ,  64,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 ,  64,   256 ]    # beta= 0
+          - Exact: [    49 ,   512 , 128,  2048 ]    # beta= 0
+          - Exact: [    49 ,  2048 , 128,   512 ]    # beta= 0
+          - Exact: [   196 ,   256 , 128,  1024 ]    # beta= 0
+          - Exact: [   196 ,  1024 , 128,   256 ]    # beta= 0
+          - Exact: [   784 ,   128 , 128,   512 ]    # beta= 0
+          - Exact: [   784 ,   512 , 128,   128 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,    64 ]    # beta= 0
+          - Exact: [  3136 ,    64 , 128,   256 ]    # beta= 0
+          - Exact: [  3136 ,   256 , 128,    64 ]    # beta= 0
+
+LibraryLogic:
+    ScheduleName: "vega20"
+    DeviceNames: ["Device 66a0", "Device 66a7"]
+    ArchitectureName: "gfx906"
+
+#   ScheduleName: "vega10"
+#   DeviceNames: ["Device 6863", "Device 6862", "Device 687f", "Device 6860", "Device 6861", "Vega 10 XTX [Radeon Vega Frontier Edition]"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "mi25"
+#   DeviceNames: ["Device 6860"]
+#   ArchitectureName: "gfx900"
+
+#   ScheduleName: "r9nano"
+#   DeviceNames: ["Device 7300"]
+#   ArchitectureName: "gfx803"
+
+#   ScheduleName: "hip"
+#   DeviceNames: ["Device 0000"]
+#   ArchitectureName: "fallback"
+
+LibraryClient: