From df015ec574103930c767db50128a82d179115427 Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Sat, 4 Mar 2023 09:22:58 -0700 Subject: [PATCH 1/8] Fix offset calculation to prevent overflow if offset is really large (#1684) --- Tensile/KernelWriterAssembly.py | 87 ++++++++++++------- .../pre_checkin/mfma/dgemm_large_offset.yaml | 63 ++++++++++++++ 2 files changed, 119 insertions(+), 31 deletions(-) create mode 100644 Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index bac58142fa..3a86feffa1 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -3097,22 +3097,34 @@ def allocateResources(self, kernel): kStr += ".if 0\n" # add offset to buffer - if not kernel["_GlobalAccumulation"]: - kStr += inst("s_lshl_b32", sgpr("OffsetD"), sgpr("OffsetD"), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr("OffsetD"), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), 0, "add offset to buffer address") - - kStr += inst("s_lshl_b32", sgpr("OffsetC"), sgpr("OffsetC"), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr("OffsetC"), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), 0, "add offset to buffer address") + tmpOffset = self.sgprPool.checkOutAligned(2, 2, preventOverflow=0) - kStr += inst("s_lshl_b32", sgpr("OffsetA"), sgpr("OffsetA"), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr("OffsetA"), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), 0, "add offset to buffer address") - - kStr += inst("s_lshl_b32", sgpr("OffsetB"), sgpr("OffsetB"), 
hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr("OffsetB"), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), 0, "add offset to buffer address") + if not kernel["_GlobalAccumulation"]: + kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetD"), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetC"), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetA"), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + kStr 
+= inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetB"), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + self.sgprPool.checkIn(tmpOffset) # self.groOffsetInMacroTile == 1 case, subtract pre-pad here if self.groOffsetInMacroTile: @@ -3356,7 +3368,8 @@ def graWorkGroup(self, kernel, isPap): kStr = "" if kernel["PersistentKernel"]: - stmp = self.getTmpSgpr(4, 4).idx() + stmpRef = self.getTmpSgpr(4, 4) + stmp = stmpRef.idx() # Always reset pointers to handle odd-exit case which moves LRO to the upper bank if not self.prefetchAcrossPersistent and kernel["PrefetchGlobalRead"]: kStr += self.localReadResetOffsets(kernel, self.tPA) @@ -3388,22 +3401,34 @@ def graWorkGroup(self, kernel, isPap): kStr += inst("_s_load_b128", sgpr(stmp, 4), sgpr("KernArgAddress",2), hex(self.argOffsetOffset), "reload DCAB Offset") kStr += inst("s_waitcnt", "lgkmcnt(0)", "wait global buffer adress ready") - if not kernel["_GlobalAccumulation"]: - kStr += inst("s_lshl_b32", sgpr(stmp+0), sgpr(stmp+0), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr(stmp+0), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), 0, "add offset to buffer address") - - kStr += inst("s_lshl_b32", sgpr(stmp+1), sgpr(stmp+1), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr(stmp+1), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressC+1"), 
sgpr("AddressC+1"), 0, "add offset to buffer address") + tmpOffset = self.sgprPool.checkOutAligned(2, 2, preventOverflow=0) - kStr += inst("s_lshl_b32", sgpr(stmp+2), sgpr(stmp+2), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr(stmp+2), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), 0, "add offset to buffer address") - - kStr += inst("s_lshl_b32", sgpr(stmp+3), sgpr(stmp+3), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr(stmp+3), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), 0, "add offset to buffer address") + if not kernel["_GlobalAccumulation"]: + kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+0), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+1), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + kStr += inst("s_mov_b32", sgpr(tmpOffset), 
sgpr(stmp+2), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+3), "copy to temp b64 to prevent overflow with large offset") + kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") + kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr(tmpOffset), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), sgpr(tmpOffset + 1), "add offset to buffer address") + + self.sgprPool.checkIn(tmpOffset) if self.groOffsetInMacroTile: prePad = self.srdShiftLeft["A"] * self.tPA["bpe"] # leave room in case we have to pointer shift diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml b/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml new file mode 100644 index 0000000000..28ca177ae2 --- /dev/null +++ b/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml @@ -0,0 +1,63 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: True + KernelTime: True + # PrintSolutionRejectionReason: True + BufferOffsetB: 536877696 + +BenchmarkProblems: + ######################################## + # NT - standard + ######################################## + - # sgemm TN + - # ProblemType + 
OperationType: GEMM + DataType: d + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - PrefetchLocalRead: [True] + ForkParameters: + - AssertMinApproxSize: [0] + - DepthU: [8] + - ExpandPointerSwap: [0] + - GroupLoadStore: [1] + - LocalReadVectorWidth: [1] + - MACInstruction: ["MAD"] + - NumElementsPerBatchStore: [4] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [3] + - ScheduleIterAlg: [3] + - SourceSwap: [1] + - StaggerU: [0] + - StaggerUStride: [0] + - StorePriorityOpt: [1] + - StoreSyncOpt: [6] + - WorkGroupMapping: [11] + - MatrixInstruction: + # - [16, 16, 4, 1, 1, 8,2, 1,4] # 128x128 + # - [16, 16, 4, 1, 1, 4,4, 2,2] # 128x128 + # - [16, 16, 4, 1, 1, 1,1, 4,1] # 128x128 + - [16, 16, 4, 1] + - ThreadTile: + - [ 2, 16 ] + - WorkGroup: + - [ 32, 4, 1 ] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # - Exact: [1024, 1024, 1, 1024, 1048576, 524288, 524288, 524288] + - Exact: [1024, 1024, 1, 1024] + From 051844d3271161d5f15baa181cd9e76b4deee10e Mon Sep 17 00:00:00 2001 From: cmingch Date: Thu, 2 Mar 2023 00:58:30 +0800 Subject: [PATCH 2/8] Revert "Add dual mac insttruction for gfx11." (#1683) This reverts commit 78a8c51496a7ac5b58d6976009e08bd42a14258f. 
--- Tensile/Common.py | 29 +++--- Tensile/Components/MAC_F32.py | 65 +------------ .../pre_checkin/wave32/sgemm_asm_nn_wv32.yaml | 95 ------------------- .../pre_checkin/wave32/sgemm_asm_nt_wv32.yaml | 94 ------------------ .../pre_checkin/wave32/sgemm_asm_tn_wv32.yaml | 92 ------------------ .../pre_checkin/wave32/sgemm_asm_tt_wv32.yaml | 90 ------------------ pytest.ini | 1 - 7 files changed, 19 insertions(+), 447 deletions(-) delete mode 100644 Tensile/Tests/pre_checkin/wave32/sgemm_asm_nn_wv32.yaml delete mode 100644 Tensile/Tests/pre_checkin/wave32/sgemm_asm_nt_wv32.yaml delete mode 100644 Tensile/Tests/pre_checkin/wave32/sgemm_asm_tn_wv32.yaml delete mode 100644 Tensile/Tests/pre_checkin/wave32/sgemm_asm_tt_wv32.yaml diff --git a/Tensile/Common.py b/Tensile/Common.py index 47a379ee7c..71fa5168e8 100644 --- a/Tensile/Common.py +++ b/Tensile/Common.py @@ -281,20 +281,20 @@ } CACHED_ASM_CAPS = { - (8, 0, 3): {'SupportedISA': True, 'HasExplicitCO': False, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': False, 'HasLshlOr': False, 'HasSMulHi': False, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': False, 'v_fmac_f16': False, 'v_pk_fma_f16': False, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': False, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': False, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 15, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (9, 0, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': True, 
'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': True, 'v_fma_mix_f32': False, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': False, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (9, 0, 6): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (9, 0, 8): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': True, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (9, 0, 10): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': 
False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': True, 'HasMFMA_f64': True, 'HasMFMA_bf16_1k': True, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (10, 1, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (10, 1, 1): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 
'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (10, 1, 2): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (10, 3, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (10, 3, 1): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': 
False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (11, 0, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': True, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (11, 0, 1): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': True, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (11, 0, 2): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 
'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': True, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'v_dual_fmac_f32': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, - (0, 0, 0): {'SupportedISA': False, 'HasExplicitCO': False, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': False, 'HasLshlOr': False, 'HasSMulHi': False, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': False, 'v_fmac_f16': False, 'v_pk_fma_f16': False, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': False, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': False, 'v_fmac_f32': False, 'v_fma_f64': False, 'v_dual_fmac_f32': False, 'HasAtomicAdd': False, 'MaxVmcnt': 0, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (8, 0, 3): {'SupportedISA': True, 'HasExplicitCO': False, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': False, 'HasLshlOr': False, 'HasSMulHi': False, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': False, 'v_fmac_f16': False, 'v_pk_fma_f16': False, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': False, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 
'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': False, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 15, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (9, 0, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': True, 'v_fma_mix_f32': False, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': False, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (9, 0, 6): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (9, 0, 8): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': True, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 
'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (9, 0, 10): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': True, 'HasMFMA_f64': True, 'HasMFMA_bf16_1k': True, 'v_mac_f16': True, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (10, 1, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (10, 1, 1): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 
'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (10, 1, 2): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': True, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (10, 3, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (10, 3, 1): {'SupportedISA': True, 'HasExplicitCO': 
True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': True, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': True, 'VOP3v_dot4_i32_i8': True, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': False, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (11, 0, 0): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': True, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (11, 0, 1): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': True, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': True, 
'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (11, 0, 2): {'SupportedISA': True, 'HasExplicitCO': True, 'HasExplicitNC': True, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': True, 'HasLshlOr': True, 'HasSMulHi': True, 'HasWMMA': True, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': True, 'v_fmac_f16': False, 'v_pk_fma_f16': True, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': True, 'v_dot2_f32_f16': True, 'v_dot2c_f32_f16': True, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': True, 'v_fmac_f32': True, 'v_fma_f64': True, 'HasAtomicAdd': True, 'MaxVmcnt': 63, 'MaxLgkmcnt': 15, 'SupportedSource': True}, + (0, 0, 0): {'SupportedISA': False, 'HasExplicitCO': False, 'HasExplicitNC': False, 'HasDirectToLdsDest': False, 'HasDirectToLdsNoDest': False, 'HasAddLshl': False, 'HasLshlOr': False, 'HasSMulHi': False, 'HasWMMA': False, 'HasMFMA': False, 'HasMFMA_f64': False, 'HasMFMA_bf16_1k': False, 'v_mac_f16': False, 'v_fma_f16': False, 'v_fmac_f16': False, 'v_pk_fma_f16': False, 'v_pk_fmac_f16': False, 'v_mad_mix_f32': False, 'v_fma_mix_f32': False, 'v_dot2_f32_f16': False, 'v_dot2c_f32_f16': False, 'v_dot4_i32_i8': False, 'v_dot4c_i32_i8': False, 'VOP3v_dot4_i32_i8': False, 'v_mac_f32': False, 'v_fma_f32': False, 'v_fmac_f32': False, 'v_fma_f64': False, 'HasAtomicAdd': False, 'MaxVmcnt': 0, 'MaxLgkmcnt': 15, 'SupportedSource': True}, } def getArchitectureName(gfxName): @@ -1784,7 +1784,6 @@ def GetAsmCaps(isaVersion): derivedAsmCaps["v_mac_f32"] = tryAssembler(isaVersion, "v_mac_f32 v20, v21, v22") derivedAsmCaps["v_fma_f32"] = tryAssembler(isaVersion, "v_fma_f32 v20, v21, v22, v23") derivedAsmCaps["v_fmac_f32"] = tryAssembler(isaVersion, "v_fmac_f32 v20, v21, v22") - derivedAsmCaps["v_dual_fmac_f32"] = tryAssembler(isaVersion, "v_dual_fmac_f32 
v20, v21, v22 :: v_dual_fmac_f32 v23, v24, v25") derivedAsmCaps["v_fma_f64"] = tryAssembler(isaVersion, "v_fma_f64 v[20:21], v[22:23], v[24:25], v[20:21]") diff --git a/Tensile/Components/MAC_F32.py b/Tensile/Components/MAC_F32.py index 771023eb7f..1165d4e886 100644 --- a/Tensile/Components/MAC_F32.py +++ b/Tensile/Components/MAC_F32.py @@ -24,7 +24,6 @@ from ..Component import Component, MAC from ..DataType import DataType -import queue class MAC_F32_Plain(MAC): """ @@ -48,10 +47,6 @@ def __call__(self, writer, m, innerUnroll): else: raise RuntimeError("FMA instruction specified but not supported on {}".format(kernel["ISA"])) - dualMacEnable = 0 - if writer.asmCaps["v_dual_fmac_f32"] and kernel["WavefrontSize"] == 32: - dualMacEnable = 1 - if not writer.asmCaps[instruction]: raise RuntimeError("{} instruction specified but not supported on {}".format(instruction, kernel["ISA"])) @@ -72,8 +67,6 @@ def __call__(self, writer, m, innerUnroll): priority = Component.Priority.find(writer) macIdx = 0 - instQ = queue.Queue() - for iui in range(0, innerUnroll): for idx1 in range(0, kernel["ThreadTile1"]): for idx0 in range(0, kernel["ThreadTile0"]): @@ -87,67 +80,19 @@ def __call__(self, writer, m, innerUnroll): vars["aStr"] = "v[vgprValuA_X{m}_I{iui} + {a}]".format_map(vars) vars["bStr"] = "v[vgprValuB_X{m}_I{iui} + {b}]".format_map(vars) - if dualMacEnable == 1: - instVars = {} - instVars["endLine"] = writer.endLine - instVars["cStr"] = vars["cStr"] - instVars["aStr"] = vars["aStr"] - instVars["bStr"] = vars["bStr"] - instVars["a"] = vars["a"] - instVars["b"] = vars["b"] - instVars["instruction"] = instruction - - if instQ.empty(): - instQ.put(instVars) - else: - # pop instruction - prevVars = instQ.queue[0] - - if self.isLegal(instVars, prevVars): - # make dual fmac - kStr += "v_dual_fmac_f32 {cStr}, {aStr}, {bStr}".format_map(prevVars) + " :: v_dual_fmac_f32 {cStr}, {aStr}, {bStr}{endLine}".format_map(vars) - kStr += priority(writer, 1, "Raise priority while processing 
macs") - instQ.get() - else: - # push instruction - instQ.put(instVars) - + if instruction == "v_fma_f32": + kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars) else: - if instruction == "v_fma_f32": - kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars) - else: - kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars) + kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars) + + kStr += priority(writer, 1, "Raise priority while processing macs") if macIdx == kernel["PerformanceWaitLocation"]: - kStr += self.popAllInstructions(instruction, instQ, priority, writer) kStr += "s_waitcnt lgkmcnt({PerformanceWaitCount}) // extra wait for performance{endLine}".format_map(vars) if macIdx == kernel["PerformanceSyncLocation"]: - kStr += self.popAllInstructions(instruction, instQ, priority, writer) kStr += "s_barrier // extra barrier for performance{endLine}".format_map(vars) macIdx += 1 - kStr += self.popAllInstructions(instruction, instQ, priority, writer) kStr += priority(writer, 0, "Reset priority after macs") return kStr - - def popAllInstructions(self, inst, instructionQueue, priority, writer): - # pop all instructions - kStr = "" - while instructionQueue.qsize() > 0: - prevVars = instructionQueue.get() - if inst == "v_fma_f32": - kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(prevVars) - else: - kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(prevVars) - kStr += priority(writer, 1, "Raise priority while processing macs") - return kStr - - def isLegal(self, instVars0, instVars1): - # VPOD has some restructions. - # For avoiding VGPR source-cache port limits, guarantee at least 1 duplicated SRC. 
- if instVars0["cStr"] == instVars1["cStr"]: - return False - if instVars0["a"] == instVars1["a"] or instVars0["b"] == instVars1["b"]: - return True - return False \ No newline at end of file diff --git a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_nn_wv32.yaml b/Tensile/Tests/pre_checkin/wave32/sgemm_asm_nn_wv32.yaml deleted file mode 100644 index 6a5d2f1fe4..0000000000 --- a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_nn_wv32.yaml +++ /dev/null @@ -1,95 +0,0 @@ -TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch - -# benchmark assembly and source kernels -GlobalParameters: - MinimumRequiredVersion: 4.2.0 - CMakeBuildType: Release - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - NumElementsToValidate: -1 - BoundsCheck: True - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - DataInitTypeAB: 3 - DataInitTypeC: 3 - KernelTime: True - -BenchmarkProblems: - - - # sgemm NN - - # ProblemType - OperationType: GEMM - DataType: s - DestDataType: s - TransposeA: False - TransposeB: False - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - WavefrontSize: [32] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 1, 11 ] - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - [ 13, 3 ] - - WorkGroup: - - [ 32, 4, 1 ] - - [ 8, 8, 1 ] - - [ 4, 8, 4 ] - - DepthU: [-3] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - 
InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - WavefrontSize: [32, 64] - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - [ 4, 4, 4 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - diff --git a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_nt_wv32.yaml b/Tensile/Tests/pre_checkin/wave32/sgemm_asm_nt_wv32.yaml deleted file mode 100644 index b6bf925801..0000000000 --- a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_nt_wv32.yaml +++ /dev/null @@ -1,94 +0,0 @@ -TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch - -# benchmark assembly and source kernels -GlobalParameters: - MinimumRequiredVersion: 4.2.0 - CMakeBuildType: Release - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - NumElementsToValidate: -1 - BoundsCheck: True - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - DataInitTypeAB: 3 - DataInitTypeC: 3 - KernelTime: True - -BenchmarkProblems: - - - # sgemm NT - - # ProblemType - OperationType: GEMM - DataType: s - DestDataType: s - TransposeA: False - TransposeB: True - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - WavefrontSize: [32] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - 
PrefetchGlobalRead: [True] - - ThreadTile: - - [ 1, 11 ] - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - [ 13, 3 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 16, 1 ] - - [ 2, 8, 8 ] - - DepthU: [-3] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - WavefrontSize: [32, 64] - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [True] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 16, 8, 1 ] - - [ 16, 2, 8 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - diff --git a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_tn_wv32.yaml b/Tensile/Tests/pre_checkin/wave32/sgemm_asm_tn_wv32.yaml deleted file mode 100644 index c645a3349f..0000000000 --- a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_tn_wv32.yaml +++ /dev/null @@ -1,92 +0,0 @@ -TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch - -# benchmark assembly and source kernels -GlobalParameters: - MinimumRequiredVersion: 4.2.0 - CMakeBuildType: Release - PrintLevel: 1 - ForceRedoBenchmarkProblems: True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - NumElementsToValidate: -1 - BoundsCheck: True - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - DataInitTypeAB: 3 - DataInitTypeC: 3 - KernelTime: True - -BenchmarkProblems: - - - # sgemm TN - - # 
ProblemType - OperationType: GEMM - DataType: s - DestDataType: s - TransposeA: True - TransposeB: False - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - WavefrontSize: [32] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 1, 11 ] - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - [ 13, 3 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 32, 4, 1 ] - - DepthU: [-4] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - KernelLanguage: ["Assembly"] - - GlobalSplitU: [1, 3] - - PrefetchLocalRead: [True] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - diff --git a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_tt_wv32.yaml b/Tensile/Tests/pre_checkin/wave32/sgemm_asm_tt_wv32.yaml deleted file mode 100644 index f0ad5f43b8..0000000000 --- a/Tensile/Tests/pre_checkin/wave32/sgemm_asm_tt_wv32.yaml +++ /dev/null @@ -1,90 +0,0 @@ -TestParameters: - marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch - -# benchmark assembly and source kernels -GlobalParameters: - MinimumRequiredVersion: 4.2.0 - CMakeBuildType: Release - PrintLevel: 1 - ForceRedoBenchmarkProblems: 
True - ForceRedoLibraryLogic: True - ForceRedoLibraryClient: True - EnqueuesPerSync: 1 - SyncsPerBenchmark: 1 - NumElementsToValidate: -1 - BoundsCheck: True - ValidationMaxToPrint: 4 - ValidationPrintValids: False - ShortNames: False - MergeFiles: True - DataInitTypeAB: 3 - DataInitTypeC: 3 - KernelTime: True - -BenchmarkProblems: - - - # sgemm TT - - # ProblemType - OperationType: GEMM - DataType: s - DestDataType: s - TransposeA: True - TransposeB: True - UseBeta: True - Batched: True - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - - KernelLanguage: ["Assembly"] - ForkParameters: - - WavefrontSize: [32] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 1, 11 ] - - [ 3, 5 ] - - [ 4, 8 ] - - [ 8, 8 ] - - [ 13, 3 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [-3] - - VectorWidth: [-1] - - WorkGroupMapping: [8,4] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] - - - # BenchmarkProblemSizeGroup - Assembly - InitialSolutionParameters: - BenchmarkCommonParameters: - - LoopTail: [True] - - EdgeType: ["ShiftPtr"] - ForkParameters: - - WavefrontSize: [32, 64] - - KernelLanguage: ["Assembly"] - - PrefetchLocalRead: [False] - - PrefetchGlobalRead: [False] - - ThreadTile: - - [ 3, 3 ] - - [ 4, 4 ] - - [ 5, 5 ] - - [ 8, 8 ] - - WorkGroup: - - [ 16, 16, 1 ] - - [ 8, 8, 1 ] - - DepthU: [-1] - - VectorWidth: [-1] - BenchmarkForkParameters: - JoinParameters: - BenchmarkJoinParameters: - BenchmarkFinalParameters: - - ProblemSizes: - - Range: [ [127,1,129], 0, [2], [63,1,65] ] diff --git a/pytest.ini b/pytest.ini index bbfe723eb3..badc63530c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -76,7 +76,6 @@ markers = syntax_error tensor_contraction vector_width - wave32 wip wmma zeropad From 
ccb928d8cc76aaa468b574f7d3e3644b7fee04a4 Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Thu, 9 Mar 2023 08:36:52 -0700 Subject: [PATCH 3/8] Update kernels to have 64-bit offset arguments (#1688) Update kernels to have 64-bit offset arguments Fix error in client code that allocated double the required GPU memory when using offset Add 64bit offset test (sgemm to reduce host memory requirements for CI) --- Tensile/Components/Signature.py | 10 +- Tensile/KernelWriterAssembly.py | 111 +++++++----------- Tensile/KernelWriterSource.py | 12 +- .../client/source/DataInitialization.cpp | 5 - .../Source/lib/source/ContractionSolution.cpp | 10 +- .../pre_checkin/mfma/sgemm_64bit_offset.yaml | 64 ++++++++++ 6 files changed, 124 insertions(+), 88 deletions(-) create mode 100644 Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml diff --git a/Tensile/Components/Signature.py b/Tensile/Components/Signature.py index cef9c3de5e..9cd658354a 100644 --- a/Tensile/Components/Signature.py +++ b/Tensile/Components/Signature.py @@ -174,6 +174,11 @@ def __call__(self, writer): kStr += self.addArgument( 'A', '8', offset, "global_buffer", srcValueType, "generic"); offset += 8 kStr += self.addArgument( 'B', '8', offset, "global_buffer", srcValueType, "generic"); offset += 8 + kStr += self.addArgument("OffsetD", '8', offset, "by_value", "u64"); offset += 8 + kStr += self.addArgument("OffsetC", '8', offset, "by_value", "u64"); offset += 8 + kStr += self.addArgument("OffsetA", '8', offset, "by_value", "u64"); offset += 8 + kStr += self.addArgument("OffsetB", '8', offset, "by_value", "u64"); offset += 8 + useSize = max(4, cptByte) kStr += self.addArgument( "alpha", useSize, offset, "by_value", cptValueType); offset += useSize if kernel["ProblemType"]["UseBeta"]: @@ -238,11 +243,6 @@ def __call__(self, writer): kStr += self.addArgument( "WgmRemainder1", '4', offset, "by_value", "u32"); offset += 4 kStr += self.addArgument( "MagicNumberWgmRemainder1", '4', offset, "by_value", "u32"); offset += 4 
- kStr += self.addArgument("OffsetD", '4', offset, "by_value", "u32"); offset += 4 - kStr += self.addArgument("OffsetC", '4', offset, "by_value", "u32"); offset += 4 - kStr += self.addArgument("OffsetA", '4', offset, "by_value", "u32"); offset += 4 - kStr += self.addArgument("OffsetB", '4', offset, "by_value", "u32"); offset += 4 - kStr += self.addArgument( "padding", '4', offset, "by_value", "u32"); offset += 4 kStr += " .group_segment_fixed_size: %u%s" % ( group_segment_size, writer.endLine ) #XXXXXX kStr += " .kernarg_segment_align: %u%s" % ( 8, writer.endLine ) diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index 3a86feffa1..97822b7ca9 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -1578,10 +1578,10 @@ def initKernel(self, kernel, tPA, tPB ): self.numSgprStridesB -= 1 self.numSgprSizesSum = kernel["ProblemType"]["NumIndicesSummation"] self.numSgprSizesFree = kernel["ProblemType"]["NumIndicesC"] - self.numSgprOffsetD = 1 - self.numSgprOffsetC = 1 - self.numSgprOffsetA = 1 - self.numSgprOffsetB = 1 + self.numSgprOffsetD = 2 + self.numSgprOffsetC = 2 + self.numSgprOffsetA = 2 + self.numSgprOffsetB = 2 self.numSgprAddressDbg = self.rpga if globalParameters["DebugKernel"] else 0 #################################### @@ -1703,6 +1703,14 @@ def initKernel(self, kernel, tPA, tPB ): self.defineSgpr("AddressC", numSgprAddressC) self.defineSgpr("AddressA", numSgprAddressA) self.defineSgpr("AddressB", numSgprAddressB) + + self.argOffsetOffset = self.argAddressOffset + (numSgprAddressD + numSgprAddressC + numSgprAddressA + numSgprAddressB) * 4 + + self.defineSgpr("OffsetD", self.numSgprOffsetD) + self.defineSgpr("OffsetC", self.numSgprOffsetC) + self.defineSgpr("OffsetA", self.numSgprOffsetA) + self.defineSgpr("OffsetB", self.numSgprOffsetB) + self.defineSgpr("Alpha", numSgprAlpha, numSgprAlpha) if kernel["ProblemType"]["UseBeta"]: self.defineSgpr("Beta", numSgprBeta, numSgprBeta) @@ -1767,11 +1775,6 @@ 
def initKernel(self, kernel, tPA, tPB ): self.defineSgpr("WgmRemainder1", 1) # Magic number to use for div by (NumWorkGroups1 % WGM) self.defineSgpr("MagicNumberWgmRemainder1", 1) # Magic number to use for div by (NumWorkGroups1 % WGM) - self.defineSgpr("OffsetD", self.numSgprOffsetD) - self.defineSgpr("OffsetC", self.numSgprOffsetC) - self.defineSgpr("OffsetA", self.numSgprOffsetA) - self.defineSgpr("OffsetB", self.numSgprOffsetB) - # dedicated sgpr(S) for storeC VGPR indexing # sgpr semaphore for message synchronization between different part of code section if kernel["StoreCInUnroll"]: @@ -1813,8 +1816,6 @@ def initKernel(self, kernel, tPA, tPB ): 3 + \ self.numSgprOffsetD + self.numSgprOffsetC + self.numSgprOffsetA + self.numSgprOffsetB - self.argOffsetOffset = (self.numSgprToLoad + 2 - (self.numSgprOffsetD + self.numSgprOffsetC + self.numSgprOffsetA + self.numSgprOffsetB)) * 4 - # Get kernel argument end here ################################### @@ -3097,34 +3098,22 @@ def allocateResources(self, kernel): kStr += ".if 0\n" # add offset to buffer - tmpOffset = self.sgprPool.checkOutAligned(2, 2, preventOverflow=0) - if not kernel["_GlobalAccumulation"]: - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetD"), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetC"), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), 
hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetA"), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr("OffsetB"), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - self.sgprPool.checkIn(tmpOffset) + kStr += inst("s_lshl_b64", sgpr("OffsetD", 2), sgpr("OffsetD", 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr("OffsetD"), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), sgpr("OffsetD+1"), "add offset to buffer address") + + kStr += inst("s_lshl_b64", sgpr("OffsetC", 2), sgpr("OffsetC", 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", 
sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr("OffsetC"), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), sgpr("OffsetC+1"), "add offset to buffer address") + + kStr += inst("s_lshl_b64", sgpr("OffsetA", 2), sgpr("OffsetA", 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr("OffsetA"), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), sgpr("OffsetA+1"), "add offset to buffer address") + + kStr += inst("s_lshl_b64", sgpr("OffsetB", 2), sgpr("OffsetB", 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr("OffsetB"), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), sgpr("OffsetB+1"), "add offset to buffer address") # self.groOffsetInMacroTile == 1 case, subtract pre-pad here if self.groOffsetInMacroTile: @@ -3368,7 +3357,7 @@ def graWorkGroup(self, kernel, isPap): kStr = "" if kernel["PersistentKernel"]: - stmpRef = self.getTmpSgpr(4, 4) + stmpRef = self.getTmpSgpr(8, 4) stmp = stmpRef.idx() # Always reset pointers to handle odd-exit case which moves LRO to the upper bank if not self.prefetchAcrossPersistent and kernel["PrefetchGlobalRead"]: @@ -3398,37 +3387,25 @@ def graWorkGroup(self, kernel, isPap): kStr += inst("_s_load_b256", sgpr("AddressD", 8), sgpr("KernArgAddress",2), hex(self.argAddressOffset), "reload DCAB address") kStr += inst("s_waitcnt", "lgkmcnt(0)", "wait for reload DCAB address") kStr += self.loadBatchedAddress(kernel, "WGKSerial", stmp) - kStr += inst("_s_load_b128", sgpr(stmp, 4), sgpr("KernArgAddress",2), hex(self.argOffsetOffset), "reload DCAB Offset") + kStr += inst("_s_load_b256", sgpr(stmp, 8), sgpr("KernArgAddress",2), hex(self.argOffsetOffset), "reload DCAB Offset") kStr += inst("s_waitcnt", "lgkmcnt(0)", "wait global buffer 
adress ready") - tmpOffset = self.sgprPool.checkOutAligned(2, 2, preventOverflow=0) - if not kernel["_GlobalAccumulation"]: - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+0), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+1), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+2), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - kStr += inst("s_mov_b32", sgpr(tmpOffset), sgpr(stmp+3), "copy to temp b64 to prevent overflow with large offset") - kStr += inst("s_mov_b32", sgpr(tmpOffset + 1), 0, "init to 0") - kStr += inst("s_lshl_b64", 
sgpr(tmpOffset, 2), sgpr(tmpOffset, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") - kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr(tmpOffset), "add offset to buffer address") - kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), sgpr(tmpOffset + 1), "add offset to buffer address") - - self.sgprPool.checkIn(tmpOffset) + kStr += inst("s_lshl_b64", sgpr(stmp+0, 2), sgpr(stmp+0, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressD+0"), sgpr("AddressD+0"), sgpr(stmp + 0), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressD+1"), sgpr("AddressD+1"), sgpr(stmp + 1), "add offset to buffer address") + + kStr += inst("s_lshl_b64", sgpr(stmp+2, 2), sgpr(stmp+2, 2), hex(log2(self.bpeCexternal)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressC+0"), sgpr("AddressC+0"), sgpr(stmp + 2), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressC+1"), sgpr("AddressC+1"), sgpr(stmp + 3), "add offset to buffer address") + + kStr += inst("s_lshl_b64", sgpr(stmp+4, 2), sgpr(stmp+4, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressA+0"), sgpr("AddressA+0"), sgpr(stmp + 4), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressA+1"), sgpr("AddressA+1"), sgpr(stmp + 5), "add offset to buffer address") + + kStr += inst("s_lshl_b64", sgpr(stmp+6, 2), sgpr(stmp+6, 2), hex(log2(self.bpeAB)), "elements offset to bytes offset") + kStr += inst("s_add_u32", sgpr("AddressB+0"), sgpr("AddressB+0"), sgpr(stmp + 6), "add offset to buffer address") + kStr += inst("s_addc_u32", sgpr("AddressB+1"), sgpr("AddressB+1"), sgpr(stmp + 7), "add offset to buffer address") if self.groOffsetInMacroTile: prePad = self.srdShiftLeft["A"] * self.tPA["bpe"] # leave room in case we have to pointer shift diff --git a/Tensile/KernelWriterSource.py 
b/Tensile/KernelWriterSource.py index b4957f893c..dc2070a7b9 100644 --- a/Tensile/KernelWriterSource.py +++ b/Tensile/KernelWriterSource.py @@ -861,6 +861,12 @@ def functionSignature(self, kernel ): s += self.endLine s += " " + globalStr + ptrStr + " const * " + batchStr + "B" + # offset + s += "," + self.endLine + " uint64_t offsetD" + s += "," + self.endLine + " uint64_t offsetC" + s += "," + self.endLine + " uint64_t offsetA" + s += "," + self.endLine + " uint64_t offsetB" + # alpha & beta s += "," + self.endLine + " " \ + kernel["ProblemType"]["ComputeDataType"].toDevice(self.language) + " const alpha" @@ -920,12 +926,6 @@ def functionSignature(self, kernel ): s += "," + self.endLine + " unsigned int problemNumGroupTiles0" s += "," + self.endLine + " unsigned int problemNumGroupTiles1" - # offset - s += "," + self.endLine + " unsigned int offsetD" - s += "," + self.endLine + " unsigned int offsetC" - s += "," + self.endLine + " unsigned int offsetA" - s += "," + self.endLine + " unsigned int offsetB" - s += " )" return s diff --git a/Tensile/Source/client/source/DataInitialization.cpp b/Tensile/Source/client/source/DataInitialization.cpp index e423a4239d..ed16a76adb 100644 --- a/Tensile/Source/client/source/DataInitialization.cpp +++ b/Tensile/Source/client/source/DataInitialization.cpp @@ -379,11 +379,6 @@ namespace Tensile m_maxBatch = std::max(m_maxBatch, numOfBatch); } - m_aMaxElements += m_aBufferOffset; - m_bMaxElements += m_bBufferOffset; - m_cMaxElements += m_cBufferOffset; - m_dMaxElements += m_dBufferOffset; - if(m_curBoundsCheck == BoundsCheckMode::NaN) { m_aMaxElements += 1024; diff --git a/Tensile/Source/lib/source/ContractionSolution.cpp b/Tensile/Source/lib/source/ContractionSolution.cpp index 26993439de..0d40b03f49 100644 --- a/Tensile/Source/lib/source/ContractionSolution.cpp +++ b/Tensile/Source/lib/source/ContractionSolution.cpp @@ -392,6 +392,11 @@ namespace Tensile rv.args.append("batchB", inputs.batchB); } + rv.args.append("offsetD", 
d.offset()); + rv.args.append("offsetC", c.offset()); + rv.args.append("offsetA", a.offset()); + rv.args.append("offsetB", b.offset()); + rv.args.append("alpha", inputs.alpha); if(std::is_same::value && !isSourceKernel()) rv.args.append("alpha_2", inputs.alpha); @@ -579,11 +584,6 @@ namespace Tensile rv.args.append("magicNumberWgmRemainder1", magicNumberWgmRemainder1); } - rv.args.append("offsetD", d.offset()); - rv.args.append("offsetC", c.offset()); - rv.args.append("offsetA", a.offset()); - rv.args.append("offsetB", b.offset()); - if(!isSourceKernel()) { rv.args.append("pad", 0); diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml b/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml new file mode 100644 index 0000000000..7c801d7c8b --- /dev/null +++ b/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml @@ -0,0 +1,64 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: True + KernelTime: True + # PrintSolutionRejectionReason: True + BufferOffsetB: 4294967296 + PristineOnGPU: False + +BenchmarkProblems: + ######################################## + # NN - standard + ######################################## + - # sgemm NN + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - PrefetchLocalRead: [True] + ForkParameters: + - AssertMinApproxSize: [0] + - DepthU: [8] + - ExpandPointerSwap: [0] + - GroupLoadStore: [1] + - LocalReadVectorWidth: [1] + # - MACInstruction: ["MAD"] + - NumElementsPerBatchStore: [4] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [3] + - ScheduleIterAlg: [3] + - SourceSwap: [1] + - 
StaggerU: [0] + - StaggerUStride: [0] + - StorePriorityOpt: [1] + - StoreSyncOpt: [6] + - WorkGroupMapping: [11] + - MatrixInstruction: + # - [16, 16, 4, 1, 1, 8,2, 1,4] # 128x128 + # - [16, 16, 4, 1, 1, 4,4, 2,2] # 128x128 + # - [16, 16, 4, 1, 1, 1,1, 4,1] # 128x128 + - [16, 16, 4, 1] + - ThreadTile: + - [ 2, 16 ] + - WorkGroup: + - [ 32, 4, 1 ] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # - Exact: [1024, 1024, 1, 1024, 1048576, 524288, 524288, 524288] + - Exact: [1024, 1024, 1, 1024] + From 32f8a5e20d36c9f0f16077449b7640d3735b553e Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Tue, 14 Mar 2023 10:32:06 -0600 Subject: [PATCH 4/8] Reduce max sgpr usage by improving temporary register re-use (#1691) --- Tensile/KernelWriterAssembly.py | 34 ++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index 97822b7ca9..7b5ccec731 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -4333,9 +4333,12 @@ def graFinalOffsetsSingleLoop(self, kernel, tP, tc, tmp, graIdx, perp, sPerp, pa def computeLoadSrd(self, kernel, tP, tc, indices, bpe, isPap): kStr = "" - stmp = self.getTmpSgpr(2+2+1).idx() - tileStart = stmp+2 - prePadSgpr = stmp+4 + stmpRef = self.getTmpSgpr(2) + stmp = stmpRef.idx() + tileStartRef = self.getTmpSgpr(2) + tileStart = tileStartRef.idx() + prePadSgprRef = self.getTmpSgpr(1) + prePadSgpr = prePadSgprRef.idx() wroteTileStart = False #--- # Compute tileStart #elements from the 2D array start @@ -9253,18 +9256,23 @@ def computeStoreSrdStart(self, kernel): if kernel["_GlobalAccumulation"] == 'MultipleBuffer': # GSU algorithm 2: adjust output buffer address to per GSU buffer - tmpSgpr = self.getTmpSgpr(5).idx() + tmpFree0Ref = self.getTmpSgpr(2) + tmpFree0 = tmpFree0Ref.idx() + tmpFreeNRef = self.getTmpSgpr(2) + tmpFreeN = tmpFreeNRef.idx() + 
tmpSgprRef = self.getTmpSgpr(1) + tmpSgpr = tmpSgprRef.idx() kStr += "// GSU Output Buffer offset: Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK * GSUIdx * bpe%s" % self.endLine - kStr += self.s_mul_u64_u32(sgpr(tmpSgpr+0), sgpr(tmpSgpr+1), sgpr("SizesFree+0"), sgpr("GSUSumIdx"), "Free0") + kStr += self.s_mul_u64_u32(sgpr(tmpFree0+0), sgpr(tmpFree0+1), sgpr("SizesFree+0"), sgpr("GSUSumIdx"), "Free0") for i in range(1, numDim): - kStr += inst("s_sub_u32", sgpr(tmpSgpr+4), sgpr("SizesFree+%u"%i), 1, "Free%u" % i) - kStr += inst("s_mul_i32", sgpr(tmpSgpr+4), sgpr(tmpSgpr+4), sgpr("GSUSumIdx"), "Free%u" % i) - kStr += self.s_mul_u64_u32(sgpr(tmpSgpr+2), sgpr(tmpSgpr+3), sgpr(tmpSgpr+4), sgpr("StrideC%s"%self.indexChars[i]), "Free%u" % i) - kStr += inst("s_add_u32", sgpr(tmpSgpr+0), sgpr(tmpSgpr+0), sgpr(tmpSgpr+2), "Free%u" % i) - kStr += inst("s_addc_u32", sgpr(tmpSgpr+1), sgpr(tmpSgpr+1), sgpr(tmpSgpr+3), "Free%u" % i) - kStr += inst("s_lshl_b64", sgpr(tmpSgpr+0,2), sgpr(tmpSgpr+0,2), log2(self.bpeCexternal), "scale by bpe") - kStr += inst("s_add_u32", sgpr("SrdD+0"), sgpr("SrdD+0"), sgpr(tmpSgpr+0), "add lo GSU offset to SRD") - kStr += inst("s_addc_u32", sgpr("SrdD+1"), sgpr("SrdD+1"), sgpr(tmpSgpr+1), "add hi GSU offset to SRD") + kStr += inst("s_sub_u32", sgpr(tmpSgpr), sgpr("SizesFree+%u"%i), 1, "Free%u" % i) + kStr += inst("s_mul_i32", sgpr(tmpSgpr), sgpr(tmpSgpr), sgpr("GSUSumIdx"), "Free%u" % i) + kStr += self.s_mul_u64_u32(sgpr(tmpFreeN+0), sgpr(tmpFreeN+1), sgpr(tmpSgpr), sgpr("StrideC%s"%self.indexChars[i]), "Free%u" % i) + kStr += inst("s_add_u32", sgpr(tmpFree0+0), sgpr(tmpFree0+0), sgpr(tmpFreeN+0), "Free%u" % i) + kStr += inst("s_addc_u32", sgpr(tmpFree0+1), sgpr(tmpFree0+1), sgpr(tmpFreeN+1), "Free%u" % i) + kStr += inst("s_lshl_b64", sgpr(tmpFree0+0,2), sgpr(tmpFree0+0,2), log2(self.bpeCexternal), "scale by bpe") + kStr += inst("s_add_u32", sgpr("SrdD+0"), sgpr("SrdD+0"), sgpr(tmpFree0+0), "add lo GSU offset to SRD") + kStr += inst("s_addc_u32", 
sgpr("SrdD+1"), sgpr("SrdD+1"), sgpr(tmpFree0+1), "add hi GSU offset to SRD") for cdir in (0,1): indices = kernel["PackedC%uIndicesX"%cdir] From 9ef81616d17104869349d547493c64132fe4baa2 Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Thu, 16 Mar 2023 08:57:32 -0600 Subject: [PATCH 5/8] 64-bit offset parameters for post kernels (#1693) --- .../hip/HipSolutionAdapter_test.cpp | 8 +-- .../ocl/OclSolutionAdapter_test.cpp | 4 +- Tensile/KernelWriterBetaOnly.py | 8 +-- Tensile/KernelWriterConversion.py | 8 +-- .../Source/lib/source/ContractionSolution.cpp | 12 ++-- .../pre_checkin/mfma/dgemm_large_offset.yaml | 5 +- .../pre_checkin/mfma/sgemm_64bit_offset.yaml | 1 - .../mfma/sgemm_64bit_offset_post.yaml | 65 +++++++++++++++++++ 8 files changed, 87 insertions(+), 24 deletions(-) create mode 100644 Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml diff --git a/HostLibraryTests/hip/HipSolutionAdapter_test.cpp b/HostLibraryTests/hip/HipSolutionAdapter_test.cpp index 9ad26698f2..a510d0a9b3 100644 --- a/HostLibraryTests/hip/HipSolutionAdapter_test.cpp +++ b/HostLibraryTests/hip/HipSolutionAdapter_test.cpp @@ -74,6 +74,8 @@ TEST(HipSolutionAdapterTest, BetaOnlyKernel_Zero) k.args.append("D", d_d); k.args.append("C", c_d); + k.args.append("offsetD", desc.offset()); + k.args.append("offsetC", desc.offset()); k.args.append("strideD1", desc.strides()[1]); k.args.append("strideD2", desc.strides()[2]); k.args.append("strideC1", desc.strides()[1]); @@ -81,8 +83,6 @@ TEST(HipSolutionAdapterTest, BetaOnlyKernel_Zero) k.args.append("size0", desc.sizes()[0]); k.args.append("size1", desc.sizes()[1]); k.args.append("size2", desc.sizes()[2]); - k.args.append("offsetD", desc.offset()); - k.args.append("offsetC", desc.offset()); k.args.append("beta", 0.0f); hip::SolutionAdapter adapter(false); @@ -149,6 +149,8 @@ TEST(HipSolutionAdapterTest, BetaOnlyKernel_Nonzero) k.args.append("D", d_d); k.args.append("C", c_d); + k.args.append("offsetD", desc.offset()); + k.args.append("offsetC", 
desc.offset()); k.args.append("strideD1", desc.strides()[1]); k.args.append("strideD2", desc.strides()[2]); k.args.append("strideC1", desc.strides()[1]); @@ -156,8 +158,6 @@ TEST(HipSolutionAdapterTest, BetaOnlyKernel_Nonzero) k.args.append("size0", desc.sizes()[0]); k.args.append("size1", desc.sizes()[1]); k.args.append("size2", desc.sizes()[2]); - k.args.append("offsetD", desc.offset()); - k.args.append("offsetC", desc.offset()); k.args.append("beta", beta); hip::SolutionAdapter adapter(false); diff --git a/HostLibraryTests/ocl/OclSolutionAdapter_test.cpp b/HostLibraryTests/ocl/OclSolutionAdapter_test.cpp index e942af7ff4..88120d2ef2 100644 --- a/HostLibraryTests/ocl/OclSolutionAdapter_test.cpp +++ b/HostLibraryTests/ocl/OclSolutionAdapter_test.cpp @@ -82,6 +82,8 @@ KernelInvocation initKernelParams(Tensile::TensorDescriptor const& desc, // k.args.append("C", buffer_C); k.args.append("D", device_d); k.args.append("C", device_c); + k.args.append("offsetD", desc.offset()); + k.args.append("offsetC", desc.offset()); k.args.append("strideD1", desc.strides()[1]); k.args.append("strideD2", desc.strides()[2]); k.args.append("strideC1", desc.strides()[1]); @@ -89,8 +91,6 @@ KernelInvocation initKernelParams(Tensile::TensorDescriptor const& desc, k.args.append("size0", desc.sizes()[0]); k.args.append("size1", desc.sizes()[1]); k.args.append("size2", desc.sizes()[2]); - k.args.append("offsetD", desc.offset()); - k.args.append("offsetC", desc.offset()); k.args.append("beta", beta); return k; diff --git a/Tensile/KernelWriterBetaOnly.py b/Tensile/KernelWriterBetaOnly.py index 50b38dd2ef..926e8562ae 100644 --- a/Tensile/KernelWriterBetaOnly.py +++ b/Tensile/KernelWriterBetaOnly.py @@ -76,6 +76,10 @@ def functionSignature(self): batch = "" if isStridedBuffer else "Batch" kStr += " " + ptrStr + " const * " + batch + "C," + self.endLine + # offset + kStr += " uint64_t offsetD,%s" % self.endLine + kStr += " uint64_t offsetC,%s" % self.endLine + # strides firstStrideCD = 1 if 
self.state["ProblemType"]["UseInitialStridesCD"]: @@ -90,10 +94,6 @@ def functionSignature(self): for i in range(0, self.state["ProblemType"]["NumIndicesC"]): kStr += " unsigned int const size%s,%s" % (self.indexChars[i], self.endLine) - # offset - kStr += " unsigned int offsetD,%s" % self.endLine - kStr += " unsigned int offsetC,%s" % self.endLine - # beta kStr += " %s const beta)%s" % (self.state["ProblemType"]["ComputeDataType"].toDevice(self.language), self.endLine ) diff --git a/Tensile/KernelWriterConversion.py b/Tensile/KernelWriterConversion.py index c6b6d3490d..58963dbbd3 100644 --- a/Tensile/KernelWriterConversion.py +++ b/Tensile/KernelWriterConversion.py @@ -69,6 +69,10 @@ def functionSignature(self): kStr += " " + self.datatype + " * W," + self.endLine kStr += " " + ptrStr + " const * " + bStr + "C," + self.endLine + # offset + kStr += " uint64_t offsetD,%s" % self.endLine + kStr += " uint64_t offsetC,%s" % self.endLine + # alpha & beta kStr += " %s const alpha,%s" % (self.state["ProblemType"]["ComputeDataType"].toDevice(self.language), self.endLine) kStr += " %s const beta,%s" % (self.state["ProblemType"]["ComputeDataType"].toDevice(self.language), self.endLine) @@ -89,10 +93,6 @@ def functionSignature(self): for i in range(0, self.state["ProblemType"]["NumIndicesC"]): kStr += " unsigned int const size%s,%s" % (self.indexChars[i], self.endLine) - # offset - kStr += " unsigned int offsetD,%s" % self.endLine - kStr += " unsigned int offsetC,%s" % self.endLine - # gsu kStr += " unsigned int const gsu)%s" % self.endLine diff --git a/Tensile/Source/lib/source/ContractionSolution.cpp b/Tensile/Source/lib/source/ContractionSolution.cpp index 0d40b03f49..584ec5720f 100644 --- a/Tensile/Source/lib/source/ContractionSolution.cpp +++ b/Tensile/Source/lib/source/ContractionSolution.cpp @@ -649,6 +649,9 @@ namespace Tensile else rv.args.append("batchC", inputs.batchC); + rv.args.append("offsetD", d.offset()); + rv.args.append("offsetC", c.offset()); + 
if(sizeMapping.globalAccumulation) { size_t stride = d.sizes()[0]; @@ -677,9 +680,6 @@ namespace Tensile idx++; } - rv.args.append("offsetD", d.offset()); - rv.args.append("offsetC", c.offset()); - rv.args.append("beta", inputs.beta); //Pass along code object dependency @@ -758,6 +758,9 @@ namespace Tensile else rv.args.append("batchC", inputs.batchC); + rv.args.append("offsetD", d.offset()); + rv.args.append("offsetC", c.offset()); + if(sizeMapping.globalAccumulation == 2) rv.args.append("alpha", inputs.alpha); else @@ -788,9 +791,6 @@ namespace Tensile idx++; } - rv.args.append("offsetD", d.offset()); - rv.args.append("offsetC", c.offset()); - if(sizeMapping.globalAccumulation == 1) rv.args.append("gsu", 1); else diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml b/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml index 28ca177ae2..77cfbcc6f9 100644 --- a/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml +++ b/Tensile/Tests/pre_checkin/mfma/dgemm_large_offset.yaml @@ -10,9 +10,9 @@ GlobalParameters: BenchmarkProblems: ######################################## - # NT - standard + # NN - standard ######################################## - - # sgemm TN + - # dgemm NN - # ProblemType OperationType: GEMM DataType: d @@ -60,4 +60,3 @@ BenchmarkProblems: - ProblemSizes: # - Exact: [1024, 1024, 1, 1024, 1048576, 524288, 524288, 524288] - Exact: [1024, 1024, 1, 1024] - diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml b/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml index 7c801d7c8b..ffe00f1c8f 100644 --- a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml +++ b/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset.yaml @@ -61,4 +61,3 @@ BenchmarkProblems: - ProblemSizes: # - Exact: [1024, 1024, 1, 1024, 1048576, 524288, 524288, 524288] - Exact: [1024, 1024, 1, 1024] - diff --git a/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml b/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml new file mode 
100644 index 0000000000..a719dab850 --- /dev/null +++ b/Tensile/Tests/pre_checkin/mfma/sgemm_64bit_offset_post.yaml @@ -0,0 +1,65 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: True + KernelTime: True + # PrintSolutionRejectionReason: True + BufferOffsetC: 4294967296 + PristineOnGPU: False + +BenchmarkProblems: + ######################################## + # NN - standard + ######################################## + - # sgemm NN + - # ProblemType + OperationType: GEMM + DataType: s + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # GSU post kernel test case + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - PrefetchLocalRead: [True] + ForkParameters: + - AssertMinApproxSize: [0] + - DepthU: [8] + - ExpandPointerSwap: [0] + - GroupLoadStore: [1] + - GlobalSplitU: [8] + - GlobalSplitUAlgorithm: ["MultipleBuffer"] + - LocalReadVectorWidth: [1] + # - MACInstruction: ["MAD"] + - NumElementsPerBatchStore: [4] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [3] + - ScheduleIterAlg: [3] + - SourceSwap: [1] + - StaggerU: [0] + - StaggerUStride: [0] + - StorePriorityOpt: [1] + - StoreSyncOpt: [6] + - WorkGroupMapping: [11] + - MatrixInstruction: + # - [16, 16, 4, 1, 1, 8,2, 1,4] # 128x128 + # - [16, 16, 4, 1, 1, 4,4, 2,2] # 128x128 + # - [16, 16, 4, 1, 1, 1,1, 4,1] # 128x128 + - [16, 16, 4, 1] + - ThreadTile: + - [ 2, 16 ] + - WorkGroup: + - [ 32, 4, 1 ] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # - Exact: [1024, 1024, 1, 1024, 1048576, 524288, 524288, 524288] + - Exact: [512, 512, 1, 2048] From 073c24f85bd11a4235fe7f92c2a03966b6bd4201 Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Tue, 21 Mar 
2023 09:11:16 -0600 Subject: [PATCH 6/8] Fix tmp sgpr allocation to avoid over-writing values (alpha) (#1695) Add test case demonstrating changes --- Tensile/KernelWriterAssembly.py | 2 +- .../mfma/dgemm_alpha1_beta0_sgpr.yaml | 74 +++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index 7b5ccec731..483c32940a 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -6772,7 +6772,7 @@ def openSumAtLeastUnroll(self, kernel, prefetch, isOptNLL, isPap): return "" skipOptNLL = self.getNamedLabel("OptNLL_End") - tmpSgpr = self.getTmpSgpr(2).idx() + tmpSgpr = self.getTmpSgpr(4).idx() # skip beta check for StoreCInUnroll in OptNLL case if not kernel["StoreCInUnroll"]: diff --git a/Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml b/Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml new file mode 100644 index 0000000000..30f5e18f84 --- /dev/null +++ b/Tensile/Tests/pre_checkin/mfma/dgemm_alpha1_beta0_sgpr.yaml @@ -0,0 +1,74 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + NumElementsToValidate: -1 + BoundsCheck: False + # KernelTime: True + # PrintSolutionRejectionReason: True + # BufferOffsetB: 536877696 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 + +BenchmarkProblems: + ######################################## + # NN - standard + ######################################## + - # dgemm NN + - # ProblemType + OperationType: GEMM + DataType: d + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + - EdgeType: ["ShiftPtr"] + - 
PrefetchLocalRead: [True] + ForkParameters: + - AssertMinApproxSize: [3] + - DepthU: [16] + - EdgeType: ["ShiftPtr"] + - ExpandPointerSwap: [0] + - GlobalReadVectorWidth: [2] + - GroupLoadStore: [1] + - InnerUnroll: [2] + - LocalReadVectorWidth: [1] + # - MACInstruction: ["MAD"] + - NumElementsPerBatchStore: [4] + - OptPreLoopVmcnt: [False] + - PrefetchGlobalRead: [2] + - PrefetchLocalRead: [3] + - ScheduleIterAlg: [3] + - SourceSwap: [1] + - StaggerU: [0] + - StaggerUStride: [0] + - StorePriorityOpt: [1] + - StoreSyncOpt: [6] + - StoreVectorWidth: [2] + - TransposeLDS: [0] + - VectorAtomicWidth: [1] + - VectorWidth: [2] + - WorkGroupMapping: [5] + - MatrixInstruction: + # - [16, 16, 4, 1, 1, 8,2, 1,4] # 128x128 + # - [16, 16, 4, 1, 1, 4,4, 2,2] # 128x128 + # - [16, 16, 4, 1, 1, 1,1, 4,1] # 128x128 + - [16, 16, 4, 1] + - ThreadTile: + - [ 2, 96 ] + - WorkGroup: + - [ 64, 4, 1 ] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + # - Exact: [1024, 1024, 1, 1024, 1048576, 524288, 524288, 524288] + # - Exact: [1024, 1024, 1, 1024] + - Exact: [7105, 504, 1, 7105] + From 5d2dd2460d139b967555ceefd370dde39136f62f Mon Sep 17 00:00:00 2001 From: Alex Brown Date: Tue, 21 Mar 2023 09:12:00 -0600 Subject: [PATCH 7/8] Update custom kernels with 64-bit offsets (#1696) Implement offset calculation in PK custom kernels Add test cases for custom kernels --- ...128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s | 2942 +++--- .../DGEMM_Aldebaran_PKFixedAtomic512Latest.s | 8249 ++++++++++++----- .../DGEMM_Aldebaran_PKFixedAtomic512_104.s | 128 +- .../custom_kernel/ck_dgemm_90a_nn.yaml | 57 + .../ck_dgemm_90a_nn_large_offset.yaml | 52 + .../custom_kernel/ck_dgemm_90a_pk.yaml | 59 + .../custom_kernel/ck_dgemm_90a_pk104.yaml | 60 + .../ck_dgemm_90a_pk104_offset.yaml | 61 + .../custom_kernel/ck_dgemm_90a_pk_offset.yaml | 60 + pytest.ini | 2 + 10 files changed, 8058 insertions(+), 3612 deletions(-) create mode 100644 
Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml create mode 100644 Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml create mode 100644 Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk.yaml create mode 100644 Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104.yaml create mode 100644 Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104_offset.yaml create mode 100644 Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk_offset.yaml diff --git a/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s b/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s index aad80abd01..b2c37f3469 100644 --- a/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s +++ b/Tensile/CustomKernels/DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4.s @@ -137,134 +137,134 @@ amdhsa.kernels: .value_kind: global_buffer .value_type: f64 .address_space: generic - - .name: alpha + - .name: OffsetD .size: 8 .offset: 56 .value_kind: by_value + .value_type: u64 + - .name: OffsetC + .size: 8 + .offset: 64 + .value_kind: by_value + .value_type: u64 + - .name: OffsetA + .size: 8 + .offset: 72 + .value_kind: by_value + .value_type: u64 + - .name: OffsetB + .size: 8 + .offset: 80 + .value_kind: by_value + .value_type: u64 + - .name: alpha + .size: 8 + .offset: 88 + .value_kind: by_value .value_type: f64 - .name: beta .size: 8 - .offset: 64 + .offset: 96 .value_kind: by_value .value_type: f64 - .name: strideD0 .size: 4 - .offset: 72 + .offset: 104 .value_kind: by_value .value_type: u32 - .name: strideD1 .size: 4 - .offset: 76 + .offset: 108 .value_kind: by_value .value_type: u32 - .name: strideC0 .size: 4 - .offset: 80 + .offset: 112 .value_kind: by_value .value_type: u32 - .name: strideC1 .size: 4 - .offset: 84 + .offset: 116 .value_kind: by_value .value_type: u32 - .name: strideA0 .size: 4 - .offset: 88 + .offset: 120 .value_kind: by_value 
.value_type: u32 - .name: strideA1 .size: 4 - .offset: 92 + .offset: 124 .value_kind: by_value .value_type: u32 - .name: strideB0 .size: 4 - .offset: 96 + .offset: 128 .value_kind: by_value .value_type: u32 - .name: strideB1 .size: 4 - .offset: 100 + .offset: 132 .value_kind: by_value .value_type: u32 - .name: SizesFree0 .size: 4 - .offset: 104 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: SizesFree1 .size: 4 - .offset: 108 + .offset: 140 .value_kind: by_value .value_type: u32 - .name: SizesFree2 .size: 4 - .offset: 112 + .offset: 144 .value_kind: by_value .value_type: u32 - .name: SizesSum0 .size: 4 - .offset: 116 + .offset: 148 .value_kind: by_value .value_type: u32 - .name: OrigStaggerUIter .size: 4 - .offset: 120 + .offset: 152 .value_kind: by_value .value_type: i32 - .name: NumWorkGroups0 .size: 4 - .offset: 124 + .offset: 156 .value_kind: by_value .value_type: u32 - .name: NumWorkGroups1 .size: 4 - .offset: 128 + .offset: 160 .value_kind: by_value .value_type: u32 - .name: NumFullBlocks .size: 4 - .offset: 132 + .offset: 164 .value_kind: by_value .value_type: u32 - .name: WgmRemainder1 .size: 4 - .offset: 136 + .offset: 168 .value_kind: by_value .value_type: u32 - .name: MagicNumberWgmRemainder1 .size: 4 - .offset: 140 - .value_kind: by_value - .value_type: u32 - - .name: OffsetD - .size: 4 - .offset: 144 - .value_kind: by_value - .value_type: u32 - - .name: OffsetC - .size: 4 - .offset: 148 - .value_kind: by_value - .value_type: u32 - - .name: OffsetA - .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: OffsetB - .size: 4 - .offset: 156 + .offset: 172 .value_kind: by_value .value_type: u32 - .name: padding .size: 4 - .offset: 160 + .offset: 176 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 32768 .kernarg_segment_align: 8 - .kernarg_segment_size: 168 + .kernarg_segment_size: 184 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 73 @@ -597,29 +597,29 @@ 
DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4: .set sgprAddressC, 30 .set sgprAddressA, 32 .set sgprAddressB, 34 -.set sgprAlpha, 36 -.set sgprBeta, 38 -.set sgprStridesD, 40 -.set sgprStridesC, 42 -.set sgprStridesA, 44 -.set sgprStridesB, 46 -.set sgprSizesFree, 48 -.set sgprSizesSum, 51 -.set sgprOrigStaggerUIter, 52 -.set sgprNumWorkGroups0, 53 -.set sgprNumWorkGroups1, 54 -.set sgprNumFullBlocks, 55 -.set sgprWgmRemainder1, 56 -.set sgprMagicNumberWgmRemainder1, 57 -.set sgprOffsetD, 58 -.set sgprOffsetC, 59 -.set sgprOffsetA, 60 -.set sgprOffsetB, 61 -.set sgprShadowLimitA, 58 -.set sgprShadowLimitB, 60 +.set sgprOffsetD, 36 +.set sgprOffsetC, 38 +.set sgprOffsetA, 40 +.set sgprOffsetB, 42 +.set sgprAlpha, 44 +.set sgprBeta, 46 +.set sgprStridesD, 48 +.set sgprStridesC, 50 +.set sgprStridesA, 52 +.set sgprStridesB, 54 +.set sgprSizesFree, 56 +.set sgprSizesSum, 59 +.set sgprOrigStaggerUIter, 60 +.set sgprNumWorkGroups0, 61 +.set sgprNumWorkGroups1, 62 +.set sgprNumFullBlocks, 63 +.set sgprWgmRemainder1, 64 +.set sgprMagicNumberWgmRemainder1, 65 +.set sgprShadowLimitA, 36 +.set sgprShadowLimitB, 38 .set sgprStaggerUIter, 7 -.set sgprWrapUA, 62 -.set sgprWrapUB, 64 +.set sgprWrapUA, 40 +.set sgprWrapUB, 42 .set sgprGlobalReadIncsA, 66 .set sgprGlobalReadIncsB, 67 /* max SGPR=73 */ @@ -741,21 +741,21 @@ v_mov_b32 v[vgprSerial], v0 // thread serial id /* Load Kernel Args */ s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8 // s_load_dwordx16 s[40:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x48 // -s_load_dwordx4 s[56:59], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x88 // -s_load_dwordx2 s[60:61], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x98 // +s_load_dwordx8 s[56:63], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x88 // +s_load_dwordx2 s[64:65], s[sgprKernArgAddress:sgprKernArgAddress+1], 0xA8 // s_waitcnt lgkmcnt(0) // wait for 160 bytes of kern args -s_lshl_b32 s[sgprOffsetD], s[sgprOffsetD], 0x3 // 
elements offset to bytes offset +s_lshl_b64 s[sgprOffsetD:sgprOffsetD+1], s[sgprOffsetD:sgprOffsetD+1], 0x3 // elements offset to bytes offset s_add_u32 s[sgprAddressD+0], s[sgprAddressD+0], s[sgprOffsetD] // add offset to buffer address -s_addc_u32 s[sgprAddressD+1], s[sgprAddressD+1], 0 // add offset to buffer address -s_lshl_b32 s[sgprOffsetC], s[sgprOffsetC], 0x3 // elements offset to bytes offset +s_addc_u32 s[sgprAddressD+1], s[sgprAddressD+1], s[sgprOffsetD+1] // add offset to buffer address +s_lshl_b64 s[sgprOffsetC:sgprOffsetC+1], s[sgprOffsetC:sgprOffsetC+1], 0x3 // elements offset to bytes offset s_add_u32 s[sgprAddressC+0], s[sgprAddressC+0], s[sgprOffsetC] // add offset to buffer address -s_addc_u32 s[sgprAddressC+1], s[sgprAddressC+1], 0 // add offset to buffer address -s_lshl_b32 s[sgprOffsetA], s[sgprOffsetA], 0x3 // elements offset to bytes offset +s_addc_u32 s[sgprAddressC+1], s[sgprAddressC+1], s[sgprOffsetC+1] // add offset to buffer address +s_lshl_b64 s[sgprOffsetA:sgprOffsetA+1], s[sgprOffsetA:sgprOffsetA+1], 0x3 // elements offset to bytes offset s_add_u32 s[sgprAddressA+0], s[sgprAddressA+0], s[sgprOffsetA] // add offset to buffer address -s_addc_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // add offset to buffer address -s_lshl_b32 s[sgprOffsetB], s[sgprOffsetB], 0x3 // elements offset to bytes offset +s_addc_u32 s[sgprAddressA+1], s[sgprAddressA+1], s[sgprOffsetA+1] // add offset to buffer address +s_lshl_b64 s[sgprOffsetB:sgprOffsetB+1], s[sgprOffsetB:sgprOffsetB+1], 0x3 // elements offset to bytes offset s_add_u32 s[sgprAddressB+0], s[sgprAddressB+0], s[sgprOffsetB] // add offset to buffer address -s_addc_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // add offset to buffer address +s_addc_u32 s[sgprAddressB+1], s[sgprAddressB+1], s[sgprOffsetB+1] // add offset to buffer address .set OffsetD, UNDEF .set OffsetC, UNDEF @@ -2511,10 +2511,10 @@ v_and_b32 v128, 3, v144 // v128 = v144 % 4 v_mul_lo_u32 v128, 0x10, v128 // wave coordination 
offset 0 v_and_b32 v145, 15, v[vgprSerial] // v145 = v[vgprSerial] % 16 _v_add_lshl_u32 v128, v145, v128, 1 // coordination 0 = wave_id0 + tid0 -s_mul_i32 s55, 128, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v128, s55, v128 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 -s_mul_i32 s55, 128, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v129, s55, v129 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 +s_mul_i32 s63, 128, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v128, s63, v128 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s63, 128, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v129, s63, v129 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 GW_B0_E0_20: /* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=1 */ @@ -2555,8 +2555,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+10] // copy MI out reg to vreg[6] v_mov_b32 v[vgprValuC+151], v[vgprValuC+11] // copy MI out reg to vreg[7] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2577,8 +2577,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+12] // copy MI out reg to vreg[10] v_mov_b32 v[vgprValuC+151], v[vgprValuC+13] // copy MI out reg to vreg[11] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, 
s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2599,8 +2599,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+14] // copy MI out reg to vreg[14] v_mov_b32 v[vgprValuC+151], v[vgprValuC+15] // copy MI out reg to vreg[15] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2621,8 +2621,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+24] // copy MI out reg to vreg[18] v_mov_b32 v[vgprValuC+151], v[vgprValuC+25] // copy MI out reg to vreg[19] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next 
inst writes vgprs held by previous dwordx4 store inst @@ -2643,8 +2643,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+26] // copy MI out reg to vreg[22] v_mov_b32 v[vgprValuC+151], v[vgprValuC+27] // copy MI out reg to vreg[23] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2665,8 +2665,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+28] // copy MI out reg to vreg[26] v_mov_b32 v[vgprValuC+151], v[vgprValuC+29] // copy MI out reg to vreg[27] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2687,8 +2687,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+30] // copy MI out reg to vreg[30] v_mov_b32 v[vgprValuC+151], v[vgprValuC+31] // copy MI out reg to vreg[31] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2709,8 +2709,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+40] // copy MI out reg to vreg[34] v_mov_b32 v[vgprValuC+151], v[vgprValuC+41] // copy MI out reg to vreg[35] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2731,8 +2731,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+42] // copy MI out reg to vreg[38] v_mov_b32 v[vgprValuC+151], v[vgprValuC+43] // copy MI out reg to vreg[39] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, 
s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2753,8 +2753,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+44] // copy MI out reg to vreg[42] v_mov_b32 v[vgprValuC+151], v[vgprValuC+45] // copy MI out reg to vreg[43] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2775,8 +2775,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+46] // copy MI out reg to vreg[46] v_mov_b32 v[vgprValuC+151], v[vgprValuC+47] // copy MI out reg to vreg[47] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2797,8 +2797,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+56] // copy MI out reg to vreg[50] v_mov_b32 v[vgprValuC+151], v[vgprValuC+57] // copy MI out reg to vreg[51] /* apply mask, calc new C and issue 
writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2819,8 +2819,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+58] // copy MI out reg to vreg[54] v_mov_b32 v[vgprValuC+151], v[vgprValuC+59] // copy MI out reg to vreg[55] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2841,8 +2841,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+60] // copy MI out reg to vreg[58] v_mov_b32 v[vgprValuC+151], v[vgprValuC+61] // copy MI out reg to vreg[59] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 
// incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2863,8 +2863,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+62] // copy MI out reg to vreg[62] v_mov_b32 v[vgprValuC+151], v[vgprValuC+63] // copy MI out reg to vreg[63] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2885,8 +2885,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+72] // copy MI out reg to vreg[66] v_mov_b32 v[vgprValuC+151], v[vgprValuC+73] // copy MI out reg to vreg[67] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2907,8 +2907,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+74] // copy MI out reg to vreg[70] v_mov_b32 v[vgprValuC+151], 
v[vgprValuC+75] // copy MI out reg to vreg[71] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2929,8 +2929,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+76] // copy MI out reg to vreg[74] v_mov_b32 v[vgprValuC+151], v[vgprValuC+77] // copy MI out reg to vreg[75] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2951,8 +2951,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+78] // copy MI out reg to vreg[78] v_mov_b32 v[vgprValuC+151], v[vgprValuC+79] // copy MI out reg to vreg[79] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 
// incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2973,8 +2973,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+88] // copy MI out reg to vreg[82] v_mov_b32 v[vgprValuC+151], v[vgprValuC+89] // copy MI out reg to vreg[83] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -2995,8 +2995,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+90] // copy MI out reg to vreg[86] v_mov_b32 v[vgprValuC+151], v[vgprValuC+91] // copy MI out reg to vreg[87] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3017,8 +3017,8 @@ v_mov_b32 v[vgprValuC+150], 
v[vgprValuC+92] // copy MI out reg to vreg[90] v_mov_b32 v[vgprValuC+151], v[vgprValuC+93] // copy MI out reg to vreg[91] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3039,8 +3039,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+94] // copy MI out reg to vreg[94] v_mov_b32 v[vgprValuC+151], v[vgprValuC+95] // copy MI out reg to vreg[95] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3061,8 +3061,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+104] // copy MI out reg to vreg[98] v_mov_b32 v[vgprValuC+151], v[vgprValuC+105] // copy MI out reg to vreg[99] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // 
scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3083,8 +3083,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+106] // copy MI out reg to vreg[102] v_mov_b32 v[vgprValuC+151], v[vgprValuC+107] // copy MI out reg to vreg[103] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3105,8 +3105,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+108] // copy MI out reg to vreg[106] v_mov_b32 v[vgprValuC+151], v[vgprValuC+109] // copy MI out reg to vreg[107] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes 
vgprs held by previous dwordx4 store inst @@ -3127,8 +3127,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+110] // copy MI out reg to vreg[110] v_mov_b32 v[vgprValuC+151], v[vgprValuC+111] // copy MI out reg to vreg[111] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3149,8 +3149,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+120] // copy MI out reg to vreg[114] v_mov_b32 v[vgprValuC+151], v[vgprValuC+121] // copy MI out reg to vreg[115] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3171,8 +3171,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+122] // copy MI out reg to vreg[118] v_mov_b32 v[vgprValuC+151], v[vgprValuC+123] // copy MI out reg to vreg[119] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3193,8 +3193,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+124] // copy MI out reg to vreg[122] v_mov_b32 v[vgprValuC+151], v[vgprValuC+125] // copy MI out reg to vreg[123] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3215,8 +3215,8 @@ v_mov_b32 v[vgprValuC+150], v[vgprValuC+126] // copy MI out reg to vreg[126] v_mov_b32 v[vgprValuC+151], v[vgprValuC+127] // copy MI out reg to vreg[127] /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[148:151], v146, 
s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -3752,7 +3752,7 @@ s_setprio 0 // optimization store // TODO in Generator // skip shift vector if M % 2 == 0 -s_and_b32 s55, 0x1, s[sgprSizeI] +s_and_b32 s63, 0x1, s[sgprSizeI] s_cbranch_scc0 label_0029 // done shifting /* shift vector components d0 */ @@ -3761,14 +3761,14 @@ v_mov_b32 v131, s[sgprWorkGroup0] // v_mul_i32_i24 v131, -0x80, v131 // wg*MT _v_add_co_u32 v131, vcc, s[sgprSizesFree+0], v131 // wgMT = Size - wg*MT v_mov_b32 v132, 0x80 // MT -v_cmp_lt_u32 s[56:57], v131, v132 // wgMT < MT -v_cndmask_b32 v131, v132, v131, s[56:57] // wgMT = (wgMT < MT) ? wgMT : MT +v_cmp_lt_u32 s[64:65], v131, v132 // wgMT < MT +v_cndmask_b32 v131, v132, v131, s[64:65] // wgMT = (wgMT < MT) ? wgMT : MT v_lshrrev_b32 v133, 6, v[vgprSerial] // v133 = v[vgprSerial] / 64 v_and_b32 v133, 3, v133 // v133 = v133 % 4 v_lshrrev_b32 v134, 5, v131 // v134 = v131 / 32 v_and_b32 v134, 3, v134 // v134 = v134 % 4 -v_cmp_eq_u32 s[56:57], v134, v133 // wave_id == block_belong_to_wave? -v_cndmask_b32 v131, v132, v131, s[56:57] // wgMT = (wgMT < MT) ? wgMT : MT +v_cmp_eq_u32 s[64:65], v134, v133 // wave_id == block_belong_to_wave? +v_cndmask_b32 v131, v132, v131, s[64:65] // wgMT = (wgMT < MT) ? 
wgMT : MT /* mbReg: which mb block need to shift, mb(matrixInstCoal(16) * VectorWidth(2)) */ v_lshrrev_b32 v132, 5, v131 // v132 = v131 / 32 @@ -3815,8 +3815,8 @@ s_cbranch_vccnz label_0028 // branch to shift d0 r1 mb0 /* shift d0 r=1 mb=0 vw0 */ /******************************************/ label_0028: // r1 mb0 vw0 -s_mov_b32 s56, 0 // -v_cmpx_eq_u32 s[56:57], v134, s56 // is thread in edge glvw region +s_mov_b32 s64, 0 // +v_cmpx_eq_u32 s[64:65], v134, s64 // is thread in edge glvw region v_and_b32 v128, 63, v[vgprSerial] // permute register between threads v_lshlrev_b32 v128, 2, v128 // permute register between threads v_mov_b32 v135, v8 // glvw 1 mb 0 tt1 0 r 0 @@ -3947,8 +3947,8 @@ v_mov_b32 v135, v126 // glvw 1 mb 0 tt1 31 r 0 v_mov_b32 v118, v135 // v_mov_b32 v135, v127 // glvw 1 mb 0 tt1 31 r 1 v_mov_b32 v119, v135 // -s_mov_b64 s[56:57], 0xFFFFFFFFFFFFFFFF // to restore all threads active -s_or_saveexec_b64 vcc, s[56:57] // all threads active +s_mov_b64 s[64:65], 0xFFFFFFFFFFFFFFFF // to restore all threads active +s_or_saveexec_b64 vcc, s[64:65] // all threads active s_branch label_0029 // done shifting label_0029: // end shift0 @@ -3971,30 +3971,30 @@ v_and_b32 v128, 3, v132 // v128 = v132 % 4 v_mul_lo_u32 v128, 0x10, v128 // wave coordination offset 0 v_and_b32 v133, 15, v[vgprSerial] // v133 = v[vgprSerial] % 16 _v_add_lshl_u32 v128, v133, v128, 1 // coordination 0 = wave_id0 + tid0 -s_mul_i32 s55, 128, s[sgprWorkGroup0] // wgp0 * MT0 -v_add_u32 v128, s55, v128 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 -s_mul_i32 s55, 128, s[sgprWorkGroup1] // wgp1 * MT1 -v_add_u32 v129, s55, v129 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 +s_mul_i32 s63, 128, s[sgprWorkGroup0] // wgp0 * MT0 +v_add_u32 v128, s63, v128 // coord 0 = (tid0/MI_m)*4 + waveG0*MIB_m + MT0*SG0 +s_mul_i32 s63, 128, s[sgprWorkGroup1] // wgp1 * MT1 +v_add_u32 v129, s63, v129 // coord 1 = (tid0%MI_m) + waveG1*MIB_n + MT1*SG1 /* not-LocalSplitU: global write */ -s_mov_b32 s56, 
s[sgprBeta+0] // tmp = Beta[0] -s_or_b32 s56, s[sgprBeta+1], s56 // tmp |= Beta[1] -s_cmpk_eq_u32 s56, 0x0 // Beta == 0 +s_mov_b32 s64, s[sgprBeta+0] // tmp = Beta[0] +s_or_b32 s64, s[sgprBeta+1], s64 // tmp |= Beta[1] +s_cmpk_eq_u32 s64, 0x0 // Beta == 0 s_cbranch_scc0 GW_Beta_46 // Branch if Beta is not zero -s_and_b32 s56, 127, s[sgprSizeI] // s56 = s[sgprSizeI] % 128 -s_add_u32 s57, -0x1, s[sgprNumWorkGroups0] // -s_cmp_ge_u32 s[sgprWorkGroup0], s57 // wg0 >= nwg0-1 ? -s_cselect_b32 s56, s56, 0 // set rMT0 -s_cmpk_gt_u32 s56, 0x0 // rMT0 > 0 +s_and_b32 s64, 127, s[sgprSizeI] // s64 = s[sgprSizeI] % 128 +s_add_u32 s65, -0x1, s[sgprNumWorkGroups0] // +s_cmp_ge_u32 s[sgprWorkGroup0], s65 // wg0 >= nwg0-1 ? +s_cselect_b32 s64, s64, 0 // set rMT0 +s_cmpk_gt_u32 s64, 0x0 // rMT0 > 0 s_cbranch_scc1 GW_B0_E1_37 // jump if edges required -s_and_b32 s56, 127, s[sgprSizeJ] // s56 = s[sgprSizeJ] % 128 -s_add_u32 s57, -0x1, s[sgprNumWorkGroups1] // -s_cmp_ge_u32 s[sgprWorkGroup1], s57 // wg1 >= nwg1-1 -s_cselect_b32 s56, s56, 0 // set rMT1 -s_cmpk_gt_u32 s56, 0x0 // rMT1 > 0 +s_and_b32 s64, 127, s[sgprSizeJ] // s64 = s[sgprSizeJ] % 128 +s_add_u32 s65, -0x1, s[sgprNumWorkGroups1] // +s_cmp_ge_u32 s[sgprWorkGroup1], s65 // wg1 >= nwg1-1 +s_cselect_b32 s64, s64, 0 // set rMT1 +s_cmpk_gt_u32 s64, 0x0 // rMT1 > 0 s_cbranch_scc1 GW_B0_E1_37 // jump if edges required GW_B0_E0_34: @@ -4036,8 +4036,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4058,8 +4058,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4080,8 +4080,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs 
held by previous dwordx4 store inst @@ -4102,8 +4102,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4124,8 +4124,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4146,8 +4146,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], 
v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4168,8 +4168,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4190,8 +4190,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // 
incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4212,8 +4212,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4234,8 +4234,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 
// incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4256,8 +4256,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4278,8 +4278,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst 
@@ -4300,8 +4300,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4322,8 +4322,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4344,8 +4344,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha 
/* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4366,8 +4366,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4388,8 +4388,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 
32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4410,8 +4410,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4432,8 +4432,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 
v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4454,8 +4454,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4476,8 +4476,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4498,8 +4498,8 @@ v_mul_f64 
v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4520,8 +4520,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4542,8 +4542,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha /* apply mask, calc new C and 
issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4564,8 +4564,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4586,8 +4586,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= 
numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4608,8 +4608,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4630,8 +4630,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, 
s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4652,8 +4652,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4674,8 +4674,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4696,8 +4696,8 @@ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValu v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha /* apply mask, calc new C and issue writes */ -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[136:139], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -4716,11 +4716,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(0, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha @@ -4740,11 +4740,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(0,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(0, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha @@ -4766,15 +4766,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // 
coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(1, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha @@ -4794,11 +4794,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(1,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(1, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha @@ -4820,15 +4820,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(2, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha @@ -4848,11 +4848,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(2,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(2, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha @@ -4874,15 +4874,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // 
coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(3, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha @@ -4902,11 +4902,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(3,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(3, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha @@ -4928,15 +4928,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(4, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha @@ -4956,11 +4956,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(4,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(4, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha @@ -4982,15 +4982,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // 
coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(5, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha @@ -5010,11 +5010,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(5,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(5, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha @@ -5036,15 +5036,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(6, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha @@ -5064,11 +5064,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(6,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(6, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha @@ -5090,15 +5090,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // 
coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(7, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha @@ -5118,11 +5118,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(7,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(7, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha @@ -5144,15 +5144,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(8, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha @@ -5172,11 +5172,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(8,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(8, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha @@ -5198,15 +5198,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // 
coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(9, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha @@ -5226,11 +5226,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(9,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(9, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha @@ -5252,15 +5252,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(10, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha @@ -5280,11 +5280,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(10,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(10, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha @@ -5306,15 +5306,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(11, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha @@ -5334,11 +5334,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(11,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(11, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha @@ -5360,15 +5360,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(12, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha @@ -5388,11 +5388,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(12,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(12, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha @@ -5414,15 +5414,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(13, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha @@ -5442,11 +5442,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(13,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(13, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha @@ -5468,15 +5468,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(14, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha @@ -5496,11 +5496,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(14,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(14, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha @@ -5522,15 +5522,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(15, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha @@ -5550,11 +5550,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(15,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(15, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha @@ -5576,15 +5576,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(16, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha @@ -5604,11 +5604,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(16,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(16, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha @@ -5630,15 +5630,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(17, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha @@ -5658,11 +5658,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(17,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(17, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha @@ -5684,15 +5684,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(18, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha @@ -5712,11 +5712,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(18,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(18, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha @@ -5738,15 +5738,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(19, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha @@ -5766,11 +5766,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(19,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(19, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha @@ -5792,15 +5792,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(20, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha @@ -5820,11 +5820,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(20,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(20, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha @@ -5846,15 +5846,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(21, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha @@ -5874,11 +5874,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(21,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(21, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha @@ -5900,15 +5900,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(22, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha @@ -5928,11 +5928,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(22,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(22, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha @@ -5954,15 +5954,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] 
// coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(23, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha @@ -5982,11 +5982,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(23,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(23, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha @@ -6008,15 +6008,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(24, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha @@ -6036,11 +6036,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(24,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(24, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha @@ -6062,15 +6062,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, 
s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(25, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha @@ -6090,11 +6090,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(25,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(25, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha @@ -6116,15 +6116,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(26, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha @@ -6144,11 +6144,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(26,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(26, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha @@ -6170,15 +6170,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, 
s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(27, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha @@ -6198,11 +6198,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(27,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(27, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha @@ -6224,15 +6224,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(28, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha @@ -6252,11 +6252,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(28,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(28, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha @@ -6278,15 +6278,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, 
s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(29, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha @@ -6306,11 +6306,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(29,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(29, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha @@ -6332,15 +6332,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset /* rC *= alpha batchEements=[(30, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha @@ -6360,11 +6360,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(30,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(30, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha @@ -6386,15 +6386,15 @@ s_barrier _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, 
s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(31, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha @@ -6414,11 +6414,11 @@ s_barrier /* calc coords, apply mask, and issue loads (if necessary) */ /* (d1,vc1,d0,vc0)=(31,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. offset /* rC *= alpha batchEements=[(31, 0, 0, 1)] */ v_mul_f64 v[vgprValuC+136:vgprValuC+136+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha @@ -6428,17 +6428,17 @@ buffer_store_dwordx2 v[136:137], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset: s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst s_branch label_GW_End_45 // jump to end GW_Beta_46: -s_and_b32 s56, 127, s[sgprSizeI] // s56 = s[sgprSizeI] % 128 -s_add_u32 s57, -0x1, s[sgprNumWorkGroups0] // -s_cmp_ge_u32 s[sgprWorkGroup0], s57 // wg0 >= nwg0-1 ? 
-s_cselect_b32 s56, s56, 0 // set rMT0 -s_cmpk_gt_u32 s56, 0x0 // rMT0 > 0 +s_and_b32 s64, 127, s[sgprSizeI] // s64 = s[sgprSizeI] % 128 +s_add_u32 s65, -0x1, s[sgprNumWorkGroups0] // +s_cmp_ge_u32 s[sgprWorkGroup0], s65 // wg0 >= nwg0-1 ? +s_cselect_b32 s64, s64, 0 // set rMT0 +s_cmpk_gt_u32 s64, 0x0 // rMT0 > 0 s_cbranch_scc1 GW_B1_E1_44 // jump if edges required -s_and_b32 s56, 127, s[sgprSizeJ] // s56 = s[sgprSizeJ] % 128 -s_add_u32 s57, -0x1, s[sgprNumWorkGroups1] // -s_cmp_ge_u32 s[sgprWorkGroup1], s57 // wg1 >= nwg1-1 -s_cselect_b32 s56, s56, 0 // set rMT1 -s_cmpk_gt_u32 s56, 0x0 // rMT1 > 0 +s_and_b32 s64, 127, s[sgprSizeJ] // s64 = s[sgprSizeJ] % 128 +s_add_u32 s65, -0x1, s[sgprNumWorkGroups1] // +s_cmp_ge_u32 s[sgprWorkGroup1], s65 // wg1 >= nwg1-1 +s_cselect_b32 s64, s64, 0 // set rMT1 +s_cmpk_gt_u32 s64, 0x0 // rMT1 > 0 s_cbranch_scc1 GW_B1_E1_44 // jump if edges required GW_B1_E0_41: @@ -6485,8 +6485,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+2:vgprValuC+2+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(1,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6496,8 +6496,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], 
v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6516,8 +6516,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+4:vgprValuC+4+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(2,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6527,8 +6527,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], 
s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6547,8 +6547,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+6:vgprValuC+6+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(3,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6558,8 +6558,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // 
scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6578,8 +6578,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+16:vgprValuC+16+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(4,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6589,8 +6589,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) 
+s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6609,8 +6609,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+18:vgprValuC+18+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(5,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6620,8 +6620,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // 
incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6640,8 +6640,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+20:vgprValuC+20+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(6,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6651,8 +6651,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) 
buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6671,8 +6671,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+22:vgprValuC+22+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(7,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6682,8 +6682,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state 
required when next inst writes vgprs held by previous dwordx4 store inst @@ -6702,8 +6702,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+32:vgprValuC+32+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(8,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6713,8 +6713,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6733,8 +6733,8 @@ s_barrier v_mul_f64 
v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+34:vgprValuC+34+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(9,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6744,8 +6744,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6764,8 +6764,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+36:vgprValuC+36+1] // Multiply MI out reg with alpha 
v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(10,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6775,8 +6775,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6795,8 +6795,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+38:vgprValuC+38+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg 
with alpha /* (d1,vc1,d0,vc0)=(11,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6806,8 +6806,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6826,8 +6826,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+48:vgprValuC+48+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(12,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe 
-s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6837,8 +6837,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6857,8 +6857,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+50:vgprValuC+50+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(13,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // 
scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6868,8 +6868,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6888,8 +6888,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+52:vgprValuC+52+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(14,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) 
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6899,8 +6899,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6919,8 +6919,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+54:vgprValuC+54+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(15,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, 
s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6930,8 +6930,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6950,8 +6950,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+64:vgprValuC+64+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(16,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6961,8 
+6961,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -6981,8 +6981,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+66:vgprValuC+66+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(17,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -6992,8 +6992,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 
v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7012,8 +7012,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+68:vgprValuC+68+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(18,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7023,8 +7023,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha 
+ C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7043,8 +7043,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+70:vgprValuC+70+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(19,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7054,8 +7054,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // 
finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7074,8 +7074,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+80:vgprValuC+80+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(20,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7085,8 +7085,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 
s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7105,8 +7105,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+82:vgprValuC+82+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(21,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7116,8 +7116,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD 
*= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7136,8 +7136,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+84:vgprValuC+84+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(22,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7147,8 +7147,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 
s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7167,8 +7167,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+86:vgprValuC+86+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(23,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7178,8 +7178,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, 
s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7198,8 +7198,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+96:vgprValuC+96+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(24,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7209,8 +7209,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs 
held by previous dwordx4 store inst @@ -7229,8 +7229,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+98:vgprValuC+98+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(25,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7240,8 +7240,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7260,8 +7260,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], 
s[sgprAlpha:sgprAlpha+1], v[vgprValuC+100:vgprValuC+100+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(26,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7271,8 +7271,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7291,8 +7291,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+102:vgprValuC+102+1] // Multiply MI out reg with alpha v_mul_f64 
v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(27,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7302,8 +7302,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7322,8 +7322,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+112:vgprValuC+112+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with 
alpha /* (d1,vc1,d0,vc0)=(28,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7333,8 +7333,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7353,8 +7353,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+114:vgprValuC+114+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(29,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe 
-s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7364,8 +7364,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7384,8 +7384,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+116:vgprValuC+116+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(30,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 
// scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7395,8 +7395,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7415,8 +7415,8 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+118:vgprValuC+118+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(31,0,0,0) */ -s_mul_i32 s56, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe -s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideC1J], 32 // scale StrideC *= numRows(4) * bpe +s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s64 // incToNextRow: gra SRD += inc(lower) 
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait @@ -7426,8 +7426,8 @@ s_waitcnt vmcnt(0) // wait C /* apply mask, calc new C and issue writes */ v_fma_f64 v[vgprValuC+140:vgprValuC+140+1], v[136:137], s[sgprBeta:sgprBeta+1], v[vgprValuC+140:vgprValuC+140+1] // finalSum = sum*alpha + C*beta v_fma_f64 v[vgprValuC+142:vgprValuC+142+1], v[138:139], s[sgprBeta:sgprBeta+1], v[vgprValuC+142:vgprValuC+142+1] // finalSum = sum*alpha + C*beta -s_mul_i32 s56, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe -s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s56 // incToNextRow: gra SRD += inc(lower) +s_mul_i32 s64, s[sgprStrideD1J], 32 // scale StrideD *= numRows(4) * bpe +s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s64 // incToNextRow: gra SRD += inc(lower) s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0 // incToNextRow: gra SRD += inc(upper) buffer_store_dwordx4 v[140:143], v134, s[sgprSrdD:sgprSrdD+3], 0, offen, offset:0, glc slc // store D s_nop 0 // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst @@ -7436,7 +7436,7 @@ GW_B1_E1_44: // TODO in Generator // wider store if M % 2 == 0 -s_and_b32 s55, 0x1, s[sgprSizeI] +s_and_b32 s63, 0x1, s[sgprSizeI] s_cbranch_scc0 GW_B1_E1_VW2 // done shifting /* edge=1, allocate 6 sgpr. 
perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=1 */ @@ -7454,13 +7454,13 @@ s_barrier /* rC *= alpha batchEements=[(0, 0, 0, 0)] */ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7485,13 +7485,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(0,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7518,17 +7518,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7553,13 +7553,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+10:vgprValuC+10+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(1,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7586,17 +7586,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7621,13 +7621,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+12:vgprValuC+12+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(2,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7654,17 +7654,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7689,13 +7689,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+14:vgprValuC+14+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(3,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7722,17 +7722,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7757,13 +7757,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+24:vgprValuC+24+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(4,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7790,17 +7790,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7825,13 +7825,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+26:vgprValuC+26+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(5,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7858,17 +7858,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7893,13 +7893,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+28:vgprValuC+28+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(6,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7926,17 +7926,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7961,13 +7961,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+30:vgprValuC+30+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(7,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -7994,17 +7994,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8029,13 +8029,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+40:vgprValuC+40+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(8,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8062,17 +8062,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8097,13 +8097,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+42:vgprValuC+42+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(9,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8130,17 +8130,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8165,13 +8165,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+44:vgprValuC+44+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(10,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8198,17 +8198,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8233,13 +8233,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+46:vgprValuC+46+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(11,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8266,17 +8266,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8301,13 +8301,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+56:vgprValuC+56+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(12,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8334,17 +8334,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8369,13 +8369,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+58:vgprValuC+58+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(13,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8402,17 +8402,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8437,13 +8437,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+60:vgprValuC+60+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(14,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8470,17 +8470,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8505,13 +8505,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+62:vgprValuC+62+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(15,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8538,17 +8538,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8573,13 +8573,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+72:vgprValuC+72+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(16,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8606,17 +8606,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8641,13 +8641,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+74:vgprValuC+74+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(17,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8674,17 +8674,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8709,13 +8709,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+76:vgprValuC+76+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(18,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8742,17 +8742,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8777,13 +8777,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+78:vgprValuC+78+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(19,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8810,17 +8810,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8845,13 +8845,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+88:vgprValuC+88+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(20,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8878,17 +8878,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8913,13 +8913,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+90:vgprValuC+90+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(21,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8946,17 +8946,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -8981,13 +8981,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+92:vgprValuC+92+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(22,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9014,17 +9014,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9049,13 +9049,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+94:vgprValuC+94+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(23,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9082,17 +9082,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9117,13 +9117,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+104:vgprValuC+104+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(24,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9150,17 +9150,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9185,13 +9185,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+106:vgprValuC+106+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(25,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9218,17 +9218,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9253,13 +9253,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+108:vgprValuC+108+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(26,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9286,17 +9286,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9321,13 +9321,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+110:vgprValuC+110+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(27,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9354,17 +9354,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9389,13 +9389,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+120:vgprValuC+120+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(28,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9422,17 +9422,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9457,13 +9457,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+122:vgprValuC+122+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(29,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9490,17 +9490,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9525,13 +9525,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+124:vgprValuC+124+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(30,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9558,17 +9558,17 @@ v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9593,13 +9593,13 @@ s_barrier v_mul_f64 v[vgprValuC+138:vgprValuC+138+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+126:vgprValuC+126+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(31,0,0,1) */ _v_add_co_u32 v132, vcc, v128, 1 // coord0.1: coord0 += d0*sg0*VW + vc0 -v_cmp_lt_u32 s[56:57], v132, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v132, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v132, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx2 v[136:137], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9629,13 +9629,13 @@ s_barrier v_mul_f64 v[vgprValuC+140:vgprValuC+140+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+0:vgprValuC+0+1] // Multiply MI out reg with alpha v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValuC+8:vgprValuC+8+1] // Multiply MI out reg with alpha /* (d1,vc1,d0,vc0)=(0,0,0,0) */ -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9664,17 +9664,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9703,17 +9703,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9742,17 +9742,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9781,17 +9781,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9820,17 +9820,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9859,17 +9859,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9898,17 +9898,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9937,17 +9937,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -9976,17 +9976,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10015,17 +10015,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10054,17 +10054,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10093,17 +10093,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10132,17 +10132,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10171,17 +10171,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10210,17 +10210,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10249,17 +10249,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10288,17 +10288,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10327,17 +10327,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10366,17 +10366,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10405,17 +10405,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10444,17 +10444,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10483,17 +10483,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10522,17 +10522,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10561,17 +10561,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10600,17 +10600,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10639,17 +10639,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10678,17 +10678,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10717,17 +10717,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10756,17 +10756,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10795,17 +10795,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier @@ -10834,17 +10834,17 @@ v_mul_f64 v[vgprValuC+142:vgprValuC+142+1], s[sgprAlpha:sgprAlpha+1], v[vgprValu _v_add_co_u32 v129, vcc, v129, 4 // coord1.1: coord1Vgpr += d1*sg1*VW + vc1 /* Fix for UseInitialStridesCD, emitAddressSetupCode */ -s_mul_i32 s56, s[sgprStrideC1J], 4 // scale stride -_v_add_u32 v130, v130, s56 // ROWINC- Move cinRowPtr to next row -s_mul_i32 s56, s[sgprStrideD1J], 4 // scale stride -_v_add_u32 v131, v131, s56 // Move coutRowPtr to next row -v_cmp_lt_u32 s[56:57], v128, s[sgprSizeI] // coord0 < size0 -v_cmp_lt_u32 s[60:61], v129, s[sgprSizeJ] // coord1 < size1 -s_and_b64 s[60:61], s[56:57], s[60:61] // in0 && in1 +s_mul_i32 s64, s[sgprStrideC1J], 4 // scale stride +_v_add_u32 v130, v130, s64 // ROWINC- Move cinRowPtr to next row +s_mul_i32 s64, s[sgprStrideD1J], 4 // scale stride +_v_add_u32 v131, v131, s64 // Move coutRowPtr to next row +v_cmp_lt_u32 s[64:65], v128, s[sgprSizeI] // coord0 < size0 +v_cmp_lt_u32 s[38:39], v129, s[sgprSizeJ] // coord1 < size1 +s_and_b64 s[38:39], s[64:65], s[38:39] // in0 && in1 _v_add_lshl_u32 v135, v130, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v135, -1, v135, s[60:61] // LDC clip if OOB. offset +v_cndmask_b32 v135, -1, v135, s[38:39] // LDC clip if OOB. offset _v_add_lshl_u32 v134, v131, v128, 0x3 // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr -v_cndmask_b32 v134, -1, v134, s[60:61] // LDD clip if OOB. offset +v_cndmask_b32 v134, -1, v134, s[38:39] // LDD clip if OOB. 
offset buffer_load_dwordx4 v[136:139], v135, s[sgprSrdC:sgprSrdC+3], 0, offen offset:0, glc slc // load C for beta calc s_sleep 5 // optimization: sync and wait s_barrier diff --git a/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512Latest.s b/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512Latest.s index 51cbb61951..a1ac6d8279 100644 --- a/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512Latest.s +++ b/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512Latest.s @@ -136,149 +136,149 @@ amdhsa.kernels: .value_kind: global_buffer .value_type: f64 .address_space: generic - - .name: alpha + - .name: OffsetD .size: 8 .offset: 56 .value_kind: by_value + .value_type: u64 + - .name: OffsetC + .size: 8 + .offset: 64 + .value_kind: by_value + .value_type: u64 + - .name: OffsetA + .size: 8 + .offset: 72 + .value_kind: by_value + .value_type: u64 + - .name: OffsetB + .size: 8 + .offset: 80 + .value_kind: by_value + .value_type: u64 + - .name: alpha + .size: 8 + .offset: 88 + .value_kind: by_value .value_type: f64 - .name: beta .size: 8 - .offset: 64 + .offset: 96 .value_kind: by_value .value_type: f64 - .name: strideD0 .size: 4 - .offset: 72 + .offset: 104 .value_kind: by_value .value_type: u32 - .name: strideD1 .size: 4 - .offset: 76 + .offset: 108 .value_kind: by_value .value_type: u32 - .name: strideC0 .size: 4 - .offset: 80 + .offset: 112 .value_kind: by_value .value_type: u32 - .name: strideC1 .size: 4 - .offset: 84 + .offset: 116 .value_kind: by_value .value_type: u32 - .name: strideA0 .size: 4 - .offset: 88 + .offset: 120 .value_kind: by_value .value_type: u32 - .name: strideA1 .size: 4 - .offset: 92 + .offset: 124 .value_kind: by_value .value_type: u32 - .name: strideB0 .size: 4 - .offset: 96 + .offset: 128 .value_kind: by_value .value_type: u32 - .name: strideB1 .size: 4 - .offset: 100 + .offset: 132 .value_kind: by_value .value_type: u32 - .name: SizesFree0 .size: 4 - .offset: 104 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: 
SizesFree1 .size: 4 - .offset: 108 + .offset: 140 .value_kind: by_value .value_type: u32 - .name: SizesFree2 .size: 4 - .offset: 112 + .offset: 144 .value_kind: by_value .value_type: u32 - .name: SizesSum0 .size: 4 - .offset: 116 + .offset: 148 .value_kind: by_value .value_type: u32 - .name: OrigStaggerUIter .size: 4 - .offset: 120 + .offset: 152 .value_kind: by_value .value_type: i32 - .name: NumWorkGroups0 .size: 4 - .offset: 124 + .offset: 156 .value_kind: by_value .value_type: u32 - .name: NumWorkGroups1 .size: 4 - .offset: 128 + .offset: 160 .value_kind: by_value .value_type: u32 - .name: MagicNumberProblemNumGroupTiles0 .size: 4 - .offset: 132 + .offset: 164 .value_kind: by_value .value_type: u32 - .name: MagicShiftProblemNumGroupTiles0 .size: 4 - .offset: 136 + .offset: 168 .value_kind: by_value .value_type: u32 - .name: GridNumWorkGroups0 .size: 4 - .offset: 140 + .offset: 172 .value_kind: by_value .value_type: u32 - .name: NumFullBlocks .size: 4 - .offset: 144 + .offset: 176 .value_kind: by_value .value_type: u32 - .name: WgmRemainder1 .size: 4 - .offset: 148 + .offset: 180 .value_kind: by_value .value_type: u32 - .name: MagicNumberWgmRemainder1 .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: OffsetD - .size: 4 - .offset: 156 - .value_kind: by_value - .value_type: u32 - - .name: OffsetC - .size: 4 - .offset: 160 - .value_kind: by_value - .value_type: u32 - - .name: OffsetA - .size: 4 - .offset: 164 - .value_kind: by_value - .value_type: u32 - - .name: OffsetB - .size: 4 - .offset: 168 + .offset: 184 .value_kind: by_value .value_type: u32 - .name: padding .size: 4 - .offset: 172 + .offset: 188 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 60000 .kernarg_segment_align: 8 - .kernarg_segment_size: 160 + .kernarg_segment_size: 192 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 68 @@ -292,61 +292,79 @@ amdhsa.kernels: DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0x8601FF01, 0x0000FFFF 
-.long 0xC0020980, 0x00000068 -.long 0xC00209C0, 0x0000006C -.long 0xC0020A00, 0x00000074 -.long 0xC0020A40, 0x00000058 -.long 0xC0020A80, 0x00000060 -.long 0xC0020AC0, 0x00000050 -.long 0xC0060B00, 0x00000038 -.long 0xC0060B80, 0x00000040 +.long 0xC00207C0, 0x00000088 +.long 0xC0020800, 0x0000008C +.long 0xC0020840, 0x00000078 +.long 0xC0020880, 0x00000080 +.long 0xC00208C0, 0x00000070 +.long 0xC0060A80, 0x00000000 +.long 0xC0060B00, 0x00000008 +.long 0xC0060B80, 0x00000010 .long 0xC0060100, 0x00000028 .long 0xC0060200, 0x00000030 .long 0xC0060400, 0x00000020 -.long 0xC0020D00, 0x0000007C -.long 0xC0020D40, 0x00000080 -.long 0xC0020D80, 0x00000084 -.long 0xC0020DC0, 0x00000088 -.long 0xC0020E00, 0x0000008C +.long 0xC0061200, 0x00000048 +.long 0xC0061280, 0x00000050 +.long 0xC0061300, 0x00000040 +.long 0xC0020D00, 0x0000009C +.long 0xC0020D40, 0x000000A0 +.long 0xC0020D80, 0x000000A4 +.long 0xC0020DC0, 0x000000A8 +.long 0xC0020E00, 0x000000AC .long 0x20040086 .long 0x260000BF .long 0x7E600502 .long 0xBEB10002 .long 0xBF8CC07F -.long 0xBE8600FF, 0x80000000 -.long 0xBE8A00FF, 0x80000000 -.long 0xBE9200FF, 0x80000000 +.long 0x8EC88348 +.long 0x80044804 +.long 0x82054905 +.long 0x8ECA834A +.long 0x80084A08 +.long 0x82094B09 +.long 0x8ECC834C +.long 0x80104C10 +.long 0x82114D11 .long 0xBE8700FF, 0x00020000 .long 0xBE8B00FF, 0x00020000 .long 0xBE9300FF, 0x00020000 .long 0x8605FF05, 0x0000FFFF .long 0x8609FF09, 0x0000FFFF .long 0x8611FF11, 0x0000FFFF -.long 0x8705FF05, 0x00040000 -.long 0x8709FF09, 0x00040000 -.long 0x8711FF11, 0x00040000 -.long 0x921D3534 +.long 0x92173534 +.long 0xBF031731 +.long 0xBF85331E .long 0xBF033534 .long 0x85493534 .long 0x85483435 -.long 0xBECA0088 -.long 0xBECB00FF, 0x10000001 -.long 0xBECA0289 -.long 0xBECB02FF, 0x0E38E38F -.long 0xBE940090 -.long 0xBE9500FF, 0x08000001 -.long 0xBE940282 -.long 0xBE9502FF, 0x40000001 -.long 0xBF03FF1D, 0x00013880 -.long 0xBECA0214 -.long 0xBECB0215 -.long 0xBE97004B -.long 0x96151748 -.long 0x92141748 -.long 
0x8F949F14 -.long 0xBEB90014 -.long 0x92144A39 -.long 0x80BA1448 +.long 0xBECA008C +.long 0xBECB00FF, 0x0AAAAAAB +.long 0xBF033534 +.long 0xBECA028A +.long 0xBECB02FF, 0x0CCCCCCD +.long 0xBE8C0090 +.long 0xBE8D00FF, 0x08000001 +.long 0xBF033534 +.long 0xBE8C028A +.long 0xBE8D02FF, 0x0CCCCCCD +.long 0xBF03FF17, 0x00007530 +.long 0xBECA020C +.long 0xBECB020D +.long 0xBE8C0090 +.long 0xBE8D00FF, 0x08000001 +.long 0xBF033534 +.long 0xBE8C0282 +.long 0xBE8D02FF, 0x40000001 +.long 0xBF03FF17, 0x00013880 +.long 0xBECA020C +.long 0xBECB020D +.long 0xBE8F004B +.long 0x960D0F48 +.long 0x920C0F48 +.long 0x8F8C9F0C +.long 0xBEB9000C +.long 0x920C4A39 +.long 0x80BA0C48 .long 0xB13A0000 .long 0xBEBA024A .long 0xBEBB00FF, 0x05555556 @@ -398,12 +416,12 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBEBB02FF, 0x80000001 .long 0xBECC0031 .long 0xB04D0000 -.long 0x96153631 -.long 0x8F169F37 -.long 0x92141631 -.long 0x80141514 -.long 0x8616FF37, 0x7FFFFFFF -.long 0x8F331614 +.long 0x960D3631 +.long 0x8F0E9F37 +.long 0x920C0E31 +.long 0x800C0D0C +.long 0x860EFF37, 0x7FFFFFFF +.long 0x8F330E0C .long 0x92323433 .long 0x80B23231 .long 0xBF033534 @@ -412,100 +430,107 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF033849 .long 0xBEC6024C .long 0xBEC7024D -.long 0xBE97004B -.long 0x96151747 -.long 0x92141747 -.long 0x8F949F14 -.long 0x92154A14 -.long 0x80951547 -.long 0x92154915 -.long 0x80154615 -.long 0xBF093914 -.long 0xBE97023B -.long 0x85164A3A -.long 0x96471715 -.long 0x92461715 +.long 0xBE8F004B +.long 0x960D0F47 +.long 0x920C0F47 +.long 0x8F8C9F0C +.long 0x920D4A0C +.long 0x808D0D47 +.long 0x920D490D +.long 0x800D460D +.long 0xBF09390C +.long 0xBE8F023B +.long 0x850E4A3A +.long 0x96470F0D +.long 0x92460F0D .long 0x8FC69F46 -.long 0x92471646 -.long 0x80C74715 -.long 0x92144A14 -.long 0x80471447 +.long 0x92470E46 +.long 0x80C7470D +.long 0x920C4A0C +.long 0x80470C47 .long 0xBF033534 .long 0x85324746 .long 0x85334647 .long 0xBEC20108 -.long 0x8E15812A -.long 0x92151530 -.long 
0x8E228215 -.long 0xB0160100 -.long 0x92143316 -.long 0x96153316 -.long 0x8E948214 -.long 0x80142214 -.long 0x82158015 -.long 0x80081408 -.long 0x82091509 -.long 0x7E100300 -.long 0xB0140400 -.long 0x92151430 -.long 0xBE9C0015 -.long 0xBEFC001C -.long 0xD1130054, 0x00011F00 -.long 0x2418A883 -.long 0x20A80084 -.long 0xD1060054, 0x00002954 -.long 0x6818A90C -.long 0x8E1B852A +.long 0x8E0D8322 +.long 0x921B0D30 +.long 0x8E0C8A33 +.long 0x800C1B0C +.long 0x80080C08 +.long 0x82098009 +.long 0x8EAE832E +.long 0x8028422E +.long 0x8229432F +.long 0x240E0082 +.long 0xB00C0400 +.long 0x920D0C30 +.long 0xBE96000D +.long 0xBEFC0016 +.long 0xD1130010, 0x00011F00 +.long 0x241E2083 +.long 0x20200084 +.long 0xD1060010, 0x00001910 +.long 0x681E210F +.long 0x8E158522 +.long 0xBE8A00FF, 0x80000000 .long 0xBEC00104 -.long 0xB0160100 -.long 0x92143216 -.long 0x8E148214 -.long 0x9216C030 -.long 0x8E218216 -.long 0x81142114 +.long 0x8E0C8A32 +.long 0x921AFF30, 0x00000100 +.long 0x810C1A0C +.long 0x80040C04 +.long 0x82058005 +.long 0x8EAC832C +.long 0x8026402C +.long 0x8227412D +.long 0x8E0D8121 +.long 0xD1130006, 0x00011F00 +.long 0x240C0C82 +.long 0x20200084 +.long 0xD1060010, 0x00001B10 +.long 0x680C2106 +.long 0x240C0C82 +.long 0x8E148521 +.long 0xBE8600FF, 0x80000000 +.long 0xE0511000, 0x80020707 +.long 0xE0511100, 0x80020707 +.long 0xE0511200, 0x80020707 +.long 0xE0511300, 0x80020707 +.long 0xE05C1000, 0x80013406 +.long 0xB77C1020 .long 0x80041404 .long 0x82058005 -.long 0x8E158129 -.long 0xD1130007, 0x00011F00 -.long 0x240E0E82 -.long 0x20A80084 -.long 0xD1060054, 0x00002B54 -.long 0x680EA907 -.long 0x8E1A8529 -.long 0xBE8C0080 -.long 0xBE8D0080 -.long 0xE0512000, 0x0D020808 -.long 0xE0512100, 0x0D020808 -.long 0xE0512200, 0x0D020808 -.long 0xE0512300, 0x0D020808 -.long 0xE05C2000, 0x0C013407 -.long 0xB77C1000 -.long 0x810C1A0C -.long 0x810D1B0D -.long 0xE0512000, 0x0D020808 -.long 0xE0512100, 0x0D020808 -.long 0xE0512200, 0x0D020808 -.long 0xE0512300, 0x0D020808 -.long 
0xE05C2000, 0x0C013807 -.long 0xB77C1000 -.long 0x810C1A0C -.long 0x810D1B0D -.long 0xE0512000, 0x0D020808 -.long 0xE0512100, 0x0D020808 -.long 0xE0512200, 0x0D020808 -.long 0xE0512300, 0x0D020808 -.long 0xE05C2000, 0x0C013C07 -.long 0xB77C1000 -.long 0x810C1A0C -.long 0x810D1B0D -.long 0xE0512000, 0x0D020808 -.long 0xE0512100, 0x0D020808 -.long 0xE0512200, 0x0D020808 -.long 0xE0512300, 0x0D020808 -.long 0xE05C2000, 0x0C014007 -.long 0xB77C1000 -.long 0x810C1A0C -.long 0x810D1B0D +.long 0x80081508 +.long 0x82098009 +.long 0xE0511000, 0x80020707 +.long 0xE0511100, 0x80020707 +.long 0xE0511200, 0x80020707 +.long 0xE0511300, 0x80020707 +.long 0xE05C1000, 0x80013806 +.long 0xB77C1020 +.long 0x80041404 +.long 0x82058005 +.long 0x80081508 +.long 0x82098009 +.long 0xE0511000, 0x80020707 +.long 0xE0511100, 0x80020707 +.long 0xE0511200, 0x80020707 +.long 0xE0511300, 0x80020707 +.long 0xE05C1000, 0x80013C06 +.long 0xB77C1020 +.long 0x80041404 +.long 0x82058005 +.long 0x80081508 +.long 0x82098009 +.long 0xE0511000, 0x80020707 +.long 0xE0511100, 0x80020707 +.long 0xE0511200, 0x80020707 +.long 0xE0511300, 0x80020707 +.long 0xE05C1000, 0x80014006 +.long 0xB77C1020 +.long 0x80041404 +.long 0x82058005 +.long 0x80081508 +.long 0x82098009 .long 0xD3D94000, 0x18000080 .long 0xD3D94001, 0x18000080 .long 0xD3D94002, 0x18000080 @@ -764,69 +789,91 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3D940FF, 0x18000080 .long 0xBF8C4F70 .long 0xBF8A0000 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C -.long 0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F .long 0xBEC40110 -.long 0xB0140100 -.long 0x8E1E812B -.long 
0x92183214 -.long 0x9216FF1E, 0x00000080 -.long 0x92143316 -.long 0x96153316 -.long 0x80181418 -.long 0x82198015 -.long 0x921630C0 -.long 0x80181618 -.long 0x82198019 -.long 0x8E1F841E -.long 0xD1130054, 0x00011F00 -.long 0x2412A882 -.long 0x20A80084 -.long 0x0CA8A81E -.long 0x6812A909 -.long 0x8E988218 -.long 0xBEA40080 -.long 0xB3240078 -.long 0xBF84021A +.long 0x8E188123 +.long 0x8E198418 +.long 0xB00C0100 +.long 0x920F320C +.long 0x920EFF18, 0x00000080 +.long 0x920C330E +.long 0x960D330E +.long 0x800C0F0C +.long 0x820D800D +.long 0x8E8C820C +.long 0x920E30FF, 0x00000100 +.long 0x803E0E0C +.long 0x823F800D +.long 0x8EAA832A +.long 0x80A40C2A +.long 0x82A50D2B +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD1130010, 0x00011F00 +.long 0x24182082 +.long 0x20200084 +.long 0x0C202018 +.long 0x6818210C +.long 0x24181882 +.long 0xD134000D, 0x0001110C +.long 0x924EFF32, 0x00000080 +.long 0x80CE4E1F +.long 0xD1130010, 0x00011F00 +.long 0x24202081 +.long 0x8E0C8530 +.long 0xD134000A, 0x00001910 +.long 0xD134000B, 0x0001030A +.long 0x7E1C02FF, 0x80000000 +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 0xD0C9000C, 0x00009D0B +.long 0xD1000009, 0x00321B0E +.long 0xBE9D0080 .long 0xBF8C0F7B .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04026914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC1000, 0x2400000C -.long 0xD8EC1080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F .long 0xD3EE8008, 0x04226916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04426918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC1100, 0x2800000C -.long 0xD8EC1180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F .long 0xD3EE8018, 0x0462691A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482691C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC1200, 0x2C00000C -.long 0xD8EC1280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC1220, 
0x2C00000F +.long 0xD8EC12A0, 0x2E00000F .long 0xD3EE8028, 0x04A2691E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C26920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC1300, 0x3000000C -.long 0xD8EC1380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F .long 0xD3EE8038, 0x04E26922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05026D14 -.long 0xE05C2000, 0x0C014407 +.long 0xE05C1000, 0x80014406 .long 0xD3EE8048, 0x05226D16 .long 0xD3EE8050, 0x05426D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05626D1A .long 0xD3EE8060, 0x05826D1C .long 0xD3EE8068, 0x05A26D1E @@ -836,35 +883,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC2000, 0x1400000C -.long 0xD8EC2080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F .long 0xD3EE8008, 0x04227126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC2100, 0x1800000C -.long 0xD8EC2180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F .long 0xD3EE8018, 0x0462712A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482712C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC2200, 0x1C00000C -.long 0xD8EC2280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F .long 0xD3EE8028, 0x04A2712E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC2300, 0x2000000C -.long 0xD8EC2380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F .long 0xD3EE8038, 0x04E27132 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05027524 -.long 0xE05C2000, 0x0C014807 +.long 0xE05C1000, 0x80014806 .long 0xD3EE8048, 
0x05227526 .long 0xD3EE8050, 0x05427528 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562752A .long 0xD3EE8060, 0x0582752C .long 0xD3EE8068, 0x05A2752E @@ -874,35 +923,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC3000, 0x2400000C -.long 0xD8EC3080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F .long 0xD3EE8008, 0x04227916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC3100, 0x2800000C -.long 0xD8EC3180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F .long 0xD3EE8018, 0x0462791A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482791C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC3200, 0x2C00000C -.long 0xD8EC3280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F .long 0xD3EE8028, 0x04A2791E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC3300, 0x3000000C -.long 0xD8EC3380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F .long 0xD3EE8038, 0x04E27922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05027D14 -.long 0xE05C2000, 0x0C014C07 +.long 0xE05C1000, 0x80014C06 .long 0xD3EE8048, 0x05227D16 .long 0xD3EE8050, 0x05427D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05627D1A .long 0xD3EE8060, 0x05827D1C .long 0xD3EE8068, 0x05A27D1E @@ -912,35 +963,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC4000, 0x1400000C -.long 0xD8EC4080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F .long 0xD3EE8008, 0x04228126 .long 
0xBF8CC67F .long 0xD3EE8010, 0x04428128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC4100, 0x1800000C -.long 0xD8EC4180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F .long 0xD3EE8018, 0x0462812A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482812C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC4200, 0x1C00000C -.long 0xD8EC4280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F .long 0xD3EE8028, 0x04A2812E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC4300, 0x2000000C -.long 0xD8EC4380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F .long 0xD3EE8038, 0x04E28132 -.long 0xBEFC001C -.long 0x810D1B0D +.long 0xBEFC0016 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05028524 -.long 0xE05C2000, 0x0C015007 +.long 0xE05C1000, 0x80015006 .long 0xD3EE8048, 0x05228526 .long 0xD3EE8050, 0x05428528 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562852A .long 0xD3EE8060, 0x0582852C .long 0xD3EE8068, 0x05A2852E @@ -950,35 +1003,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F .long 0xD3EE8008, 0x04228916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F .long 0xD3EE8018, 0x0462891A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482891C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F .long 0xD3EE8028, 0x04A2891E .long 0xBF8CC67F 
.long 0xD3EE8030, 0x04C28920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F .long 0xD3EE8038, 0x04E28922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05028D14 -.long 0xE05C2000, 0x0C013407 +.long 0xE05C1000, 0x80013406 .long 0xD3EE8048, 0x05228D16 .long 0xD3EE8050, 0x05428D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05628D1A .long 0xD3EE8060, 0x05828D1C .long 0xD3EE8068, 0x05A28D1E @@ -988,35 +1043,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F .long 0xD3EE8008, 0x04229126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F .long 0xD3EE8018, 0x0462912A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482912C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F .long 0xD3EE8028, 0x04A2912E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F .long 0xD3EE8038, 0x04E29132 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05029524 -.long 0xE05C2000, 0x0C013807 +.long 0xE05C1000, 0x80013806 .long 0xD3EE8048, 0x05229526 .long 0xD3EE8050, 0x05429528 -.long 0x810C1A0C +.long 0x80041404 +.long 
0x82058005 .long 0xD3EE8058, 0x0562952A .long 0xD3EE8060, 0x0582952C .long 0xD3EE8068, 0x05A2952E @@ -1026,35 +1083,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F .long 0xD3EE8008, 0x04229916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F .long 0xD3EE8018, 0x0462991A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482991C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F .long 0xD3EE8028, 0x04A2991E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F .long 0xD3EE8038, 0x04E29922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05029D14 -.long 0xE05C2000, 0x0C013C07 +.long 0xE05C1000, 0x80013C06 .long 0xD3EE8048, 0x05229D16 .long 0xD3EE8050, 0x05429D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05629D1A .long 0xD3EE8060, 0x05829D1C .long 0xD3EE8068, 0x05A29D1E @@ -1064,75 +1123,80 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x0402A124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F .long 0xD3EE8008, 0x0422A126 .long 0xBF8CC67F .long 0xD3EE8010, 0x0442A128 -.long 0xE0512100, 0x0D020808 -.long 
0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F .long 0xD3EE8018, 0x0462A12A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482A12C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F .long 0xD3EE8028, 0x04A2A12E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C2A130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F .long 0xD3EE8038, 0x04E2A132 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x0502A524 -.long 0xE05C2000, 0x0C014007 +.long 0xE05C1000, 0x80014006 .long 0xD3EE8048, 0x0522A526 .long 0xD3EE8050, 0x0542A528 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562A52A .long 0xD3EE8060, 0x0582A52C .long 0xD3EE8068, 0x05A2A52E .long 0xD3EE8070, 0x05C2A530 .long 0xD3EE8078, 0x05E2A532 -.long 0xB7240008 -.long 0xBF82FDE4 +.long 0xB71D0008 +.long 0xB31D0078 +.long 0xBF85FDD5 .long 0xBF8C0F7B .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04026914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC1000, 0x2400000C -.long 0xD8EC1080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F .long 0xD3EE8008, 0x04226916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04426918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC1100, 0x2800000C -.long 0xD8EC1180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F .long 0xD3EE8018, 0x0462691A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482691C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC1200, 0x2C00000C -.long 0xD8EC1280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F .long 0xD3EE8028, 
0x04A2691E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C26920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC1300, 0x3000000C -.long 0xD8EC1380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F .long 0xD3EE8038, 0x04E26922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05026D14 -.long 0xE05C2000, 0x0C014407 +.long 0xE05C1000, 0x80014406 .long 0xD3EE8048, 0x05226D16 .long 0xD3EE8050, 0x05426D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05626D1A .long 0xD3EE8060, 0x05826D1C .long 0xD3EE8068, 0x05A26D1E @@ -1142,35 +1206,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC2000, 0x1400000C -.long 0xD8EC2080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F .long 0xD3EE8008, 0x04227126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC2100, 0x1800000C -.long 0xD8EC2180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F .long 0xD3EE8018, 0x0462712A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482712C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC2200, 0x1C00000C -.long 0xD8EC2280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F .long 0xD3EE8028, 0x04A2712E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC2300, 0x2000000C -.long 0xD8EC2380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F .long 0xD3EE8038, 0x04E27132 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05027524 -.long 0xE05C2000, 0x0C014807 +.long 0xE05C1000, 0x80014806 .long 0xD3EE8048, 0x05227526 .long 0xD3EE8050, 0x05427528 -.long 0x810C1A0C 
+.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562752A .long 0xD3EE8060, 0x0582752C .long 0xD3EE8068, 0x05A2752E @@ -1180,93 +1246,103 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC3000, 0x2400000C -.long 0xD8EC3080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F .long 0xD3EE8008, 0x04227916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC3100, 0x2800000C -.long 0xD8EC3180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F .long 0xD3EE8018, 0x0462791A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482791C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC3200, 0x2C00000C -.long 0xD8EC3280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F .long 0xD3EE8028, 0x04A2791E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC3300, 0x3000000C -.long 0xD8EC3380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F .long 0xD3EE8038, 0x04E27922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05027D14 -.long 0xE05C2000, 0x0C014C07 +.long 0xE05C1000, 0x80014C06 .long 0xD3EE8048, 0x05227D16 .long 0xD3EE8050, 0x05427D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05627D1A .long 0xD3EE8060, 0x05827D1C .long 0xD3EE8068, 0x05A27D1E .long 0xD3EE8070, 0x05C27D20 .long 0xD3EE8078, 0x05E27D22 +.long 0x808C0426 +.long 0x828D0527 +.long 0xBE86000C +.long 0x808C0828 +.long 0x828D0929 +.long 0xBE8A000C .long 0xBF8C0F7B .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC4000, 0x1400000C -.long 0xD8EC4080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC4080, 0x1400000F 
+.long 0xD8EC4100, 0x1600000F .long 0xD3EE8008, 0x04228126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC4100, 0x1800000C -.long 0xD8EC4180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F .long 0xD3EE8018, 0x0462812A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482812C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC4200, 0x1C00000C -.long 0xD8EC4280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F .long 0xD3EE8028, 0x04A2812E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC4300, 0x2000000C -.long 0xD8EC4380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F .long 0xD3EE8038, 0x04E28132 -.long 0xBEFC001C -.long 0x810D1B0D +.long 0xBEFC0016 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05028524 -.long 0xE05C2000, 0x0C015007 +.long 0xE05C1000, 0x80015006 .long 0xD3EE8048, 0x05228526 .long 0xD3EE8050, 0x05428528 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562852A .long 0xD3EE8060, 0x0582852C .long 0xD3EE8068, 0x05A2852E .long 0xD3EE8070, 0x05C28530 .long 0xD3EE8078, 0x05E28532 .long 0x81313831 -.long 0xBF031D31 -.long 0xBF850147 +.long 0xBF031731 +.long 0xBF85014B .long 0x814C384C -.long 0x8194494C -.long 0x8115814D +.long 0x818C494C +.long 0x810D814D .long 0xBF03494C -.long 0xBECC0214 -.long 0xBECD0215 -.long 0x96153631 -.long 0x8F169F37 -.long 0x92141631 -.long 0x80141514 -.long 0x8616FF37, 0x7FFFFFFF -.long 0x8F331614 +.long 0xBECC020C +.long 0xBECD020D +.long 0x960D3631 +.long 0x8F0E9F37 +.long 0x920C0E31 +.long 0x800C0D0C +.long 0x860EFF37, 0x7FFFFFFF +.long 0x8F330E0C .long 0x92323433 .long 0x80B23231 .long 0xBF033534 @@ -1275,76 +1351,72 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF033849 .long 0xBEC6024C .long 0xBEC7024D -.long 0xBE97004B -.long 0x96151747 -.long 0x92141747 -.long 
0x8F949F14 -.long 0x92154A14 -.long 0x80951547 -.long 0x92154915 -.long 0x80154615 -.long 0xBF093914 -.long 0xBE97023B -.long 0x85164A3A -.long 0x96471715 -.long 0x92461715 +.long 0xBE8F004B +.long 0x960D0F47 +.long 0x920C0F47 +.long 0x8F8C9F0C +.long 0x920D4A0C +.long 0x808D0D47 +.long 0x920D490D +.long 0x800D460D +.long 0xBF09390C +.long 0xBE8F023B +.long 0x850E4A3A +.long 0x96470F0D +.long 0x92460F0D .long 0x8FC69F46 -.long 0x92471646 -.long 0x80C74715 -.long 0x92144A14 -.long 0x80471447 +.long 0x92470E46 +.long 0x80C7470D +.long 0x920C4A0C +.long 0x80470C47 .long 0xBF033534 .long 0x85324746 .long 0x85334647 -.long 0xBE940032 -.long 0xBE950080 -.long 0x8E948A14 -.long 0x80142114 -.long 0x82158015 -.long 0x80041440 -.long 0x82051541 -.long 0xBE8C0080 -.long 0xBE940033 -.long 0xBE950080 -.long 0x8E948A14 -.long 0x80142214 -.long 0x82158015 -.long 0x80081442 -.long 0x82091543 -.long 0xBE8D0080 +.long 0x8E0C8A32 +.long 0x810C1A0C +.long 0x80040C40 +.long 0x82058041 +.long 0xBE8600FF, 0x80000000 +.long 0x8E0C8A33 +.long 0x800C1B0C +.long 0x80080C42 +.long 0x82098043 +.long 0xBE8A00FF, 0x80000000 .long 0xBF8C0F7B .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F .long 0xD3EE8008, 0x04228916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F .long 0xD3EE8018, 0x0462891A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482891C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F .long 0xD3EE8028, 0x04A2891E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28920 -.long 0xE0512300, 0x0D020808 -.long 
0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F .long 0xD3EE8038, 0x04E28922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05028D14 -.long 0xE05C2000, 0x0C013407 +.long 0xE05C1000, 0x80013406 .long 0xD3EE8048, 0x05228D16 .long 0xD3EE8050, 0x05428D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05628D1A .long 0xD3EE8060, 0x05828D1C .long 0xD3EE8068, 0x05A28D1E @@ -1354,35 +1426,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F .long 0xD3EE8008, 0x04229126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F .long 0xD3EE8018, 0x0462912A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482912C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F .long 0xD3EE8028, 0x04A2912E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F .long 0xD3EE8038, 0x04E29132 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05029524 -.long 0xE05C2000, 0x0C013807 +.long 0xE05C1000, 0x80013806 .long 0xD3EE8048, 0x05229526 .long 0xD3EE8050, 0x05429528 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562952A .long 0xD3EE8060, 0x0582952C 
.long 0xD3EE8068, 0x05A2952E @@ -1392,35 +1466,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F .long 0xD3EE8008, 0x04229916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xE0511100, 0x80020707 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F .long 0xD3EE8018, 0x0462991A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482991C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F .long 0xD3EE8028, 0x04A2991E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F .long 0xD3EE8038, 0x04E29922 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x05029D14 -.long 0xE05C2000, 0x0C013C07 +.long 0xE05C1000, 0x80013C06 .long 0xD3EE8048, 0x05229D16 .long 0xD3EE8050, 0x05429D18 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05629D1A .long 0xD3EE8060, 0x05829D1C .long 0xD3EE8068, 0x05A29D1E @@ -1430,35 +1506,37 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x0402A124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C +.long 0xE0511000, 0x80020707 +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F .long 0xD3EE8008, 0x0422A126 .long 0xBF8CC67F .long 0xD3EE8010, 0x0442A128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C +.long 0xE0511100, 
0x80020707 +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F .long 0xD3EE8018, 0x0462A12A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482A12C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C +.long 0xE0511200, 0x80020707 +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F .long 0xD3EE8028, 0x04A2A12E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C2A130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xE0511300, 0x80020707 +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F .long 0xD3EE8038, 0x04E2A132 -.long 0xB77C1000 -.long 0x810D1B0D +.long 0xB77C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8040, 0x0502A524 -.long 0xE05C2000, 0x0C014007 +.long 0xE05C1000, 0x80014006 .long 0xD3EE8048, 0x0522A526 .long 0xD3EE8050, 0x0542A528 -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562A52A .long 0xD3EE8060, 0x0582A52C .long 0xD3EE8068, 0x05A2A52E @@ -1469,23 +1547,23 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028914 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F .long 0xD3EE8008, 0x04228916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428918 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F .long 0xD3EE8018, 0x0462891A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482891C -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F .long 0xD3EE8028, 0x04A2891E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28920 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F .long 0xD3EE8038, 0x04E28922 .long 0xD3EE8040, 0x05028D14 .long 0xD3EE8048, 0x05228D16 @@ -1499,23 +1577,23 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 
0xBF8CC67F .long 0xD3EE8000, 0x04029124 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F .long 0xD3EE8008, 0x04229126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429128 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F .long 0xD3EE8018, 0x0462912A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482912C -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F .long 0xD3EE8028, 0x04A2912E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29130 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F .long 0xD3EE8038, 0x04E29132 .long 0xD3EE8040, 0x05029524 .long 0xD3EE8048, 0x05229526 @@ -1529,23 +1607,23 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029914 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F .long 0xD3EE8008, 0x04229916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429918 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F .long 0xD3EE8018, 0x0462991A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482991C -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F .long 0xD3EE8028, 0x04A2991E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29920 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F .long 0xD3EE8038, 0x04E29922 .long 0xD3EE8040, 0x05029D14 .long 0xD3EE8048, 0x05229D16 @@ -1577,529 +1655,589 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3EE8068, 0x05A2A52E .long 0xD3EE8070, 0x05C2A530 .long 0xD3EE8078, 0x05E2A532 -.long 0xBF820DA5 -.long 0xBEA3007C -.long 0xBEA40080 -.long 0xBEA50080 -.long 0x80104418 
-.long 0x82114519 -.long 0xBEBC0080 -.long 0xBEFC0023 +.long 0xBF8222AA +.long 0xBE9C007C +.long 0x860C811F +.long 0xB10C0001 +.long 0xBF851476 +.long 0xBE9D0080 +.long 0xBE9E0080 +.long 0x8010443E +.long 0x8211453F +.long 0xBEFC001C .long 0xBF8C0F7B .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06026914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC1000, 0x2400000C -.long 0xD8EC1080, 0x2600000C +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06226916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06426918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC1100, 0x2800000C -.long 0xD8EC1180, 0x2A00000C +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662691A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682691C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC1200, 0x2C00000C -.long 0xD8EC1280, 0x2E00000C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2691E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C26920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC1300, 0x3000000C -.long 0xD8EC1380, 0x3200000C +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E26922 .long 0xD3EE80C0, 0x07026D14 -.long 0xE05C2000, 0x0C014407 +.long 0xE05C1000, 0x80014406 .long 0xD3EE80C8, 0x07226D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07426D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07626D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE80E0, 0x07826D1C -.long 0xE05C2000, 0x3C045409 -.long 0x803D1F3C +.long 0xE05C1000, 0x80045408 .long 0xD3EE80E8, 0x07A26D1E .long 0xD3EE80F0, 0x07C26D20 .long 0xD3EE80F8, 0x07E26D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06027124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC2000, 0x1400000C -.long 0xD8EC2080, 
0x1600000C +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06227126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06427128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC2100, 0x1800000C -.long 0xD8EC2180, 0x1A00000C +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662712A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682712C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC2200, 0x1C00000C -.long 0xD8EC2280, 0x1E00000C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2712E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C27130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC2300, 0x2000000C -.long 0xD8EC2380, 0x2200000C +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E27132 .long 0xD3EE80C0, 0x07027524 -.long 0xE05C2000, 0x0C014807 +.long 0xE05C1000, 0x80014806 .long 0xD3EE80C8, 0x07227526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07427528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x0762752A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE80E0, 0x0782752C .long 0xD3EE80E8, 0x07A2752E .long 0xD3EE80F0, 0x07C27530 .long 0xD3EE80F8, 0x07E27532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06027914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC3000, 0x2400000C -.long 0xD8EC3080, 0x2600000C +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06227916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06427918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC3100, 0x2800000C -.long 0xD8EC3180, 0x2A00000C +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662791A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682791C -.long 0xE0512200, 0x0D020808 -.long 
0xD8EC3200, 0x2C00000C -.long 0xD8EC3280, 0x2E00000C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2791E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C27920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC3300, 0x3000000C -.long 0xD8EC3380, 0x3200000C +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E27922 .long 0xD3EE80C0, 0x07027D14 -.long 0xE05C2000, 0x0C014C07 +.long 0xE05C1000, 0x80014C06 .long 0xD3EE80C8, 0x07227D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07427D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07627D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D +.long 0xBF11011E +.long 0xD3D84010, 0x18000100 +.long 0xD3D84011, 0x18000101 +.long 0xD3D84012, 0x18000140 +.long 0xD3D84013, 0x18000141 +.long 0xBF800001 +.long 0xBF9C0000 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 .long 0xD3EE80E0, 0x07827D1C -.long 0xE05C2000, 0x3D045809 +.long 0xE07C1000, 0x80041008 .long 0xD3EE80E8, 0x07A27D1E +.long 0x80101910 +.long 0x82118011 .long 0xD3EE80F0, 0x07C27D20 +.long 0x80921912 +.long 0xBE920280 .long 0xD3EE80F8, 0x07E27D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06028124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC4000, 0x1400000C -.long 0xD8EC4080, 0x1600000C +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06228126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06428128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC4100, 0x1800000C -.long 0xD8EC4180, 0x1A00000C +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662812A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682812C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC4200, 0x1C00000C -.long 0xD8EC4280, 0x1E00000C +.long 0xD8EC4280, 0x1C00000F +.long 
0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2812E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C28130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC4300, 0x2000000C -.long 0xD8EC4380, 0x2200000C +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E28132 .long 0xD3EE80C0, 0x07028524 -.long 0xE05C2000, 0x0C015007 +.long 0xE05C1000, 0x80015006 .long 0xD3EE80C8, 0x07228526 +.long 0xBE9C0016 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07428528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x0762852A -.long 0xBEA3001C -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE80E0, 0x0782852C .long 0xD3EE80E8, 0x07A2852E .long 0xD3EE80F0, 0x07C28530 .long 0xD3EE80F8, 0x07E28532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06028914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06228916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06428918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662891A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682891C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2891E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C28920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E28922 .long 0xD3EE80C0, 0x07028D14 -.long 0xE05C2000, 0x0C013407 +.long 0xE05C1000, 0x80013406 .long 0xD3EE80C8, 0x07228D16 +.long 0xB71C1020 
+.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07428D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07628D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D -.long 0xBF110125 -.long 0xD3D84010, 0x18000100 -.long 0xD3D84011, 0x18000101 -.long 0xD3D84012, 0x18000140 -.long 0xD3D84013, 0x18000141 -.long 0xBF800001 -.long 0xBF9C0000 -.long 0xD2800010, 0x2002A910 -.long 0xD2800012, 0x2002AD12 .long 0xD3EE80E0, 0x07828D1C -.long 0xE07C2000, 0x3C041009 +.long 0xE05C1000, 0x80045808 .long 0xD3EE80E8, 0x07A28D1E .long 0xD3EE80F0, 0x07C28D20 .long 0xD3EE80F8, 0x07E28D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06029124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06229126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06429128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662912A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682912C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2912E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C29130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E29132 .long 0xD3EE80C0, 0x07029524 -.long 0xE05C2000, 0x0C013807 +.long 0xE05C1000, 0x80013806 .long 0xD3EE80C8, 0x07229526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07429528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x0762952A -.long 0xB7231000 -.long 0x810C1A0C -.long 
0x810D1B0D .long 0xD3EE80E0, 0x0782952C .long 0xD3EE80E8, 0x07A2952E .long 0xD3EE80F0, 0x07C29530 .long 0xD3EE80F8, 0x07E29532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06029914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06229916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06429918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662991A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682991C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2991E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C29920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E29922 .long 0xD3EE80C0, 0x07029D14 -.long 0xE05C2000, 0x0C013C07 +.long 0xE05C1000, 0x80013C06 .long 0xD3EE80C8, 0x07229D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07429D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07629D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D -.long 0xBF110125 +.long 0xBF11011E .long 0xD3D84010, 0x18000102 .long 0xD3D84011, 0x18000103 .long 0xD3D84012, 0x18000142 .long 0xD3D84013, 0x18000143 .long 0xBF800001 .long 0xBF9C0000 +.long 0xBF8C0F7A .long 0xD2800010, 0x2002B110 .long 0xD2800012, 0x2002B512 .long 0xD3EE80E0, 0x07829D1C -.long 0xE07C2000, 0x3D041009 -.long 0x803C1F3D +.long 0xE07C1000, 0x80041008 .long 0xD3EE80E8, 0x07A29D1E +.long 0x80101910 +.long 0x82118011 .long 0xD3EE80F0, 0x07C29D20 
+.long 0x80921912 +.long 0xBE920280 .long 0xD3EE80F8, 0x07E29D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x0602A124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x0622A126 .long 0xBF8CC67F .long 0xD3EE8090, 0x0642A128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662A12A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682A12C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2A12E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C2A130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E2A132 .long 0xD3EE80C0, 0x0702A524 -.long 0xE05C2000, 0x0C014007 +.long 0xE05C1000, 0x80014006 .long 0xD3EE80C8, 0x0722A526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x0742A528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x0762A52A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE80E0, 0x0782A52C .long 0xD3EE80E8, 0x07A2A52E .long 0xD3EE80F0, 0x07C2A530 .long 0xD3EE80F8, 0x07E2A532 -.long 0xB7250004 -.long 0xB7240008 -.long 0xB3240078 -.long 0xBF85FDB4 -.long 0xBEFC0023 -.long 0xBF8C0F7B +.long 0xB71E0004 +.long 0xB71D0008 +.long 0xB31D0078 +.long 0xBF85FD9C +.long 0xBEFC001C +.long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06026914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC1000, 0x2400000C -.long 0xD8EC1080, 0x2600000C +.long 0xD8EC1020, 0x2400000F +.long 
0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06226916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06426918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC1100, 0x2800000C -.long 0xD8EC1180, 0x2A00000C +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662691A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682691C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC1200, 0x2C00000C -.long 0xD8EC1280, 0x2E00000C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2691E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C26920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC1300, 0x3000000C -.long 0xD8EC1380, 0x3200000C +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E26922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x07026D14 -.long 0xE05C2000, 0x0C014407 +.long 0xE05C1000, 0x80014406 .long 0xD3EE80C8, 0x07226D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07426D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07626D1A -.long 0x810C1A0C .long 0xD3EE80E0, 0x07826D1C -.long 0xE05C2000, 0x3C045409 -.long 0x803D1F3C +.long 0xE05C1000, 0x80045408 .long 0xD3EE80E8, 0x07A26D1E .long 0xD3EE80F0, 0x07C26D20 .long 0xD3EE80F8, 0x07E26D22 -.long 0xBEFC0023 -.long 0xBF8C0F7C +.long 0xBEFC001C +.long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06027124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC2000, 0x1400000C -.long 0xD8EC2080, 0x1600000C +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06227126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06427128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC2100, 0x1800000C -.long 0xD8EC2180, 0x1A00000C +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662712A .long 0xBF8CC67F .long 0xD3EE80A0, 
0x0682712C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC2200, 0x1C00000C -.long 0xD8EC2280, 0x1E00000C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2712E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C27130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC2300, 0x2000000C -.long 0xD8EC2380, 0x2200000C +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E27132 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x07027524 -.long 0xE05C2000, 0x0C014807 +.long 0xE05C1000, 0x80014806 .long 0xD3EE80C8, 0x07227526 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07427528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x0762752A -.long 0x810C1A0C .long 0xD3EE80E0, 0x0782752C .long 0xD3EE80E8, 0x07A2752E .long 0xD3EE80F0, 0x07C27530 .long 0xD3EE80F8, 0x07E27532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06027914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC3000, 0x2400000C -.long 0xD8EC3080, 0x2600000C +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06227916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06427918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC3100, 0x2800000C -.long 0xD8EC3180, 0x2A00000C +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662791A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682791C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC3200, 0x2C00000C -.long 0xD8EC3280, 0x2E00000C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2791E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C27920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC3300, 0x3000000C -.long 0xD8EC3380, 0x3200000C +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 
0x06E27922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x07027D14 -.long 0xE05C2000, 0x0C014C07 +.long 0xE05C1000, 0x80014C06 .long 0xD3EE80C8, 0x07227D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07427D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07627D1A -.long 0x810C1A0C +.long 0xD3D84010, 0x1800013C +.long 0xD3D84011, 0x1800013D +.long 0xD3D84012, 0x1800017C +.long 0xD3D84013, 0x1800017D +.long 0xBF800001 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 .long 0xD3EE80E0, 0x07827D1C -.long 0xE05C2000, 0x3D045809 +.long 0xE07C1000, 0x80041008 .long 0xD3EE80E8, 0x07A27D1E +.long 0x80101910 +.long 0x82118011 .long 0xD3EE80F0, 0x07C27D20 +.long 0x80921912 +.long 0xBE920280 .long 0xD3EE80F8, 0x07E27D22 -.long 0xBEFC0023 +.long 0x808C0426 +.long 0x828D0527 +.long 0xBE86000C +.long 0x808C0828 +.long 0x828D0929 +.long 0xBE8A000C +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06028124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC4000, 0x1400000C -.long 0xD8EC4080, 0x1600000C +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06228126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06428128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC4100, 0x1800000C -.long 0xD8EC4180, 0x1A00000C +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662812A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682812C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC4200, 0x1C00000C -.long 0xD8EC4280, 0x1E00000C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2812E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C28130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC4300, 0x2000000C -.long 0xD8EC4380, 0x2200000C +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E28132 -.long 
0xBEA3001C -.long 0x810D1B0D +.long 0xBE9C0016 .long 0xD3EE80C0, 0x07028524 -.long 0xE05C2000, 0x0C015007 +.long 0xE05C1000, 0x80015006 .long 0xD3EE80C8, 0x07228526 .long 0xD3EE80D0, 0x07428528 .long 0xD3EE80D8, 0x0762852A -.long 0x810C1A0C .long 0xD3EE80E0, 0x0782852C .long 0xD3EE80E8, 0x07A2852E .long 0xD3EE80F0, 0x07C28530 .long 0xD3EE80F8, 0x07E28532 -.long 0xBEBE0032 -.long 0xBEBF0033 +.long 0xB00C0100 +.long 0x920F320C +.long 0x920EFF18, 0x00000080 +.long 0x920C330E +.long 0x960D330E +.long 0x800C0F0C +.long 0x820D800D +.long 0x8E8C820C +.long 0x920E30FF, 0x00000100 +.long 0x803E0E0C +.long 0x823F800D +.long 0x80A40C2A +.long 0x82A50D2B +.long 0x924EFF32, 0x00000080 +.long 0x80CE4E1F .long 0x81313831 -.long 0xBF031D31 -.long 0xBF850854 +.long 0xBF031731 +.long 0xBF850882 .long 0x814C384C -.long 0x8194494C -.long 0x8115814D +.long 0x818C494C +.long 0x810D814D .long 0xBF03494C -.long 0xBECC0214 -.long 0xBECD0215 -.long 0x96153631 -.long 0x8F169F37 -.long 0x92141631 -.long 0x80141514 -.long 0x8616FF37, 0x7FFFFFFF -.long 0x8F331614 +.long 0xBECC020C +.long 0xBECD020D +.long 0x960D3631 +.long 0x8F0E9F37 +.long 0x920C0E31 +.long 0x800C0D0C +.long 0x860EFF37, 0x7FFFFFFF +.long 0x8F330E0C .long 0x92323433 .long 0x80B23231 .long 0xBF033534 @@ -2108,226 +2246,218 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF033849 .long 0xBEC6024C .long 0xBEC7024D -.long 0xBE97004B -.long 0x96151747 -.long 0x92141747 -.long 0x8F949F14 -.long 0x92154A14 -.long 0x80951547 -.long 0x92154915 -.long 0x80154615 -.long 0xBF093914 -.long 0xBE97023B -.long 0x85164A3A -.long 0x96471715 -.long 0x92461715 +.long 0xBE8F004B +.long 0x960D0F47 +.long 0x920C0F47 +.long 0x8F8C9F0C +.long 0x920D4A0C +.long 0x808D0D47 +.long 0x920D490D +.long 0x800D460D +.long 0xBF09390C +.long 0xBE8F023B +.long 0x850E4A3A +.long 0x96470F0D +.long 0x92460F0D .long 0x8FC69F46 -.long 0x92471646 -.long 0x80C74715 -.long 0x92144A14 -.long 0x80471447 +.long 0x92470E46 +.long 0x80C7470D +.long 0x920C4A0C +.long 
0x80470C47 .long 0xBF033534 .long 0x85324746 .long 0x85334647 -.long 0xBE940032 -.long 0xBE950080 -.long 0x8E948A14 -.long 0x80142114 -.long 0x82158015 -.long 0x80041440 -.long 0x82051541 -.long 0xBE8C0080 -.long 0xBE940033 -.long 0xBE950080 -.long 0x8E948A14 -.long 0x80142214 -.long 0x82158015 -.long 0x80081442 -.long 0x82091543 -.long 0xBE8D0080 -.long 0xBEFC0023 +.long 0x8E0C8A32 +.long 0x810C1A0C +.long 0x80040C40 +.long 0x82058041 +.long 0xBE8600FF, 0x80000000 +.long 0x8E0C8A33 +.long 0x800C1B0C +.long 0x80080C42 +.long 0x82098043 +.long 0xBE8A00FF, 0x80000000 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06028914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06228916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06428918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662891A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682891C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2891E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C28920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E28922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x07028D14 -.long 0xE05C2000, 0x0C013407 +.long 0xE05C1000, 0x80013406 .long 0xD3EE80C8, 0x07228D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07428D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07628D1A -.long 0x810C1A0C -.long 0xD3D84010, 0x1800013C 
-.long 0xD3D84011, 0x1800013D -.long 0xD3D84012, 0x1800017C -.long 0xD3D84013, 0x1800017D -.long 0xBF800001 -.long 0xD2800010, 0x2002A910 -.long 0xD2800012, 0x2002AD12 .long 0xD3EE80E0, 0x07828D1C -.long 0xE07C2000, 0x3C041009 +.long 0xE05C1000, 0x80045808 .long 0xD3EE80E8, 0x07A28D1E .long 0xD3EE80F0, 0x07C28D20 .long 0xD3EE80F8, 0x07E28D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06029124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06229126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06429128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662912A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682912C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2912E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C29130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E29132 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x07029524 -.long 0xE05C2000, 0x0C013807 +.long 0xE05C1000, 0x80013806 .long 0xD3EE80C8, 0x07229526 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07429528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x0762952A -.long 0x810C1A0C .long 0xD3EE80E0, 0x0782952C .long 0xD3EE80E8, 0x07A2952E .long 0xD3EE80F0, 0x07C29530 .long 0xD3EE80F8, 0x07E29532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06029914 -.long 
0xE0512000, 0x0D020808 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8088, 0x06229916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06429918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662991A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682991C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2991E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C29920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E29922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x07029D14 -.long 0xE05C2000, 0x0C013C07 +.long 0xE05C1000, 0x80013C06 .long 0xD3EE80C8, 0x07229D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x07429D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE80D8, 0x07629D1A -.long 0x810C1A0C .long 0xD3D84010, 0x1800013E .long 0xD3D84011, 0x1800013F .long 0xD3D84012, 0x1800017E .long 0xD3D84013, 0x1800017F .long 0xBF800001 +.long 0xBF8C0F7A .long 0xD2800010, 0x2002B110 .long 0xD2800012, 0x2002B512 .long 0xD3EE80E0, 0x07829D1C -.long 0xE07C2000, 0x3D041009 -.long 0x803C1F3D +.long 0xE07C1000, 0x80041008 .long 0xD3EE80E8, 0x07A29D1E .long 0xD3EE80F0, 0x07C29D20 .long 0xD3EE80F8, 0x07E29D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x0602A124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 .long 
0xD3EE8088, 0x0622A126 .long 0xBF8CC67F .long 0xD3EE8090, 0x0642A128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8098, 0x0662A12A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682A12C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE80A8, 0x06A2A12E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C2A130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE80B8, 0x06E2A132 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE80C0, 0x0702A524 -.long 0xE05C2000, 0x0C014007 +.long 0xE05C1000, 0x80014006 .long 0xD3EE80C8, 0x0722A526 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE80D0, 0x0742A528 -.long 0xD3EE80D8, 0x0762A52A -.long 0x810C1A0C +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762A52A .long 0xD3EE80E0, 0x0782A52C .long 0xD3EE80E8, 0x07A2A52E .long 0xD3EE80F0, 0x07C2A530 .long 0xD3EE80F8, 0x07E2A532 -.long 0xB0140100 -.long 0x92183E14 -.long 0x9216FF1E, 0x00000080 -.long 0x92143F16 -.long 0x96153F16 -.long 0x921630C0 -.long 0x80141416 -.long 0x80181418 -.long 0x82198015 -.long 0x8E988218 +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E .long 0xD3D94000, 0x18000080 .long 0xD3D94001, 0x18000080 .long 0xD3D94002, 0x18000080 @@ -2456,527 +2586,584 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3D9407D, 0x18000080 .long 0xD3D9407E, 0x18000080 .long 0xD3D9407F, 0x18000080 -.long 0xBEA40080 -.long 0xBEA50080 -.long 0x80104418 -.long 0x82114519 -.long 0xBEBC0080 -.long 0xBEFC0023 -.long 0xBF8C0F7B +.long 0xBE9D0080 +.long 
0xBE9E0080 +.long 0x8010443E +.long 0x8211453F +.long 0xBEFC001C +.long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04026914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC1000, 0x2400000C -.long 0xD8EC1080, 0x2600000C +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04226916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04426918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC1100, 0x2800000C -.long 0xD8EC1180, 0x2A00000C +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462691A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482691C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC1200, 0x2C00000C -.long 0xD8EC1280, 0x2E00000C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2691E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C26920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC1300, 0x3000000C -.long 0xD8EC1380, 0x3200000C +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E26922 .long 0xD3EE8040, 0x05026D14 -.long 0xE05C2000, 0x0C014407 +.long 0xE05C1000, 0x80014406 .long 0xD3EE8048, 0x05226D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05426D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05626D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE8060, 0x05826D1C -.long 0xE05C2000, 0x3C045409 -.long 0x803D1F3C +.long 0xE05C1000, 0x80045408 .long 0xD3EE8068, 0x05A26D1E .long 0xD3EE8070, 0x05C26D20 .long 0xD3EE8078, 0x05E26D22 -.long 0xBEFC0023 -.long 0xBF8C0F7C +.long 0xBEFC001C +.long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC2000, 0x1400000C -.long 0xD8EC2080, 0x1600000C +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04227126 .long 0xBF8CC67F 
.long 0xD3EE8010, 0x04427128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC2100, 0x1800000C -.long 0xD8EC2180, 0x1A00000C +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462712A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482712C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC2200, 0x1C00000C -.long 0xD8EC2280, 0x1E00000C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2712E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC2300, 0x2000000C -.long 0xD8EC2380, 0x2200000C +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E27132 .long 0xD3EE8040, 0x05027524 -.long 0xE05C2000, 0x0C014807 +.long 0xE05C1000, 0x80014806 .long 0xD3EE8048, 0x05227526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05427528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562752A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE8060, 0x0582752C .long 0xD3EE8068, 0x05A2752E .long 0xD3EE8070, 0x05C27530 .long 0xD3EE8078, 0x05E27532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC3000, 0x2400000C -.long 0xD8EC3080, 0x2600000C +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04227916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC3100, 0x2800000C -.long 0xD8EC3180, 0x2A00000C +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462791A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482791C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC3200, 0x2C00000C -.long 0xD8EC3280, 0x2E00000C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 
0xD3EE8028, 0x04A2791E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC3300, 0x3000000C -.long 0xD8EC3380, 0x3200000C +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E27922 .long 0xD3EE8040, 0x05027D14 -.long 0xE05C2000, 0x0C014C07 +.long 0xE05C1000, 0x80014C06 .long 0xD3EE8048, 0x05227D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05427D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05627D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D +.long 0xBF11011E +.long 0xD3D84010, 0x18000180 +.long 0xD3D84011, 0x18000181 +.long 0xD3D84012, 0x180001C0 +.long 0xD3D84013, 0x180001C1 +.long 0xBF800001 +.long 0xBF9C0000 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 .long 0xD3EE8060, 0x05827D1C -.long 0xE05C2000, 0x3D045809 +.long 0xE07C1000, 0x80041008 .long 0xD3EE8068, 0x05A27D1E +.long 0x80101910 +.long 0x82118011 .long 0xD3EE8070, 0x05C27D20 +.long 0x80921912 +.long 0xBE920280 .long 0xD3EE8078, 0x05E27D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC4000, 0x1400000C -.long 0xD8EC4080, 0x1600000C +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04228126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC4100, 0x1800000C -.long 0xD8EC4180, 0x1A00000C +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462812A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482812C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC4200, 0x1C00000C -.long 0xD8EC4280, 0x1E00000C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2812E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28130 -.long 0xE0512300, 
0x0D020808 -.long 0xD8EC4300, 0x2000000C -.long 0xD8EC4380, 0x2200000C +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E28132 .long 0xD3EE8040, 0x05028524 -.long 0xE05C2000, 0x0C015007 +.long 0xE05C1000, 0x80015006 .long 0xD3EE8048, 0x05228526 +.long 0xBE9C0016 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05428528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562852A -.long 0xBEA3001C -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE8060, 0x0582852C .long 0xD3EE8068, 0x05A2852E .long 0xD3EE8070, 0x05C28530 .long 0xD3EE8078, 0x05E28532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04228916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462891A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482891C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2891E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E28922 .long 0xD3EE8040, 0x05028D14 -.long 0xE05C2000, 0x0C013407 +.long 0xE05C1000, 0x80013406 .long 0xD3EE8048, 0x05228D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05428D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05628D1A -.long 0xB7231000 
-.long 0x810C1A0C -.long 0x810D1B0D -.long 0xBF110125 -.long 0xD3D84010, 0x18000180 -.long 0xD3D84011, 0x18000181 -.long 0xD3D84012, 0x180001C0 -.long 0xD3D84013, 0x180001C1 -.long 0xBF800001 -.long 0xBF9C0000 -.long 0xD2800010, 0x2002A910 -.long 0xD2800012, 0x2002AD12 .long 0xD3EE8060, 0x05828D1C -.long 0xE07C2000, 0x3C041009 +.long 0xE05C1000, 0x80045808 .long 0xD3EE8068, 0x05A28D1E .long 0xD3EE8070, 0x05C28D20 .long 0xD3EE8078, 0x05E28D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04229126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462912A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482912C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2912E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E29132 .long 0xD3EE8040, 0x05029524 -.long 0xE05C2000, 0x0C013807 +.long 0xE05C1000, 0x80013806 .long 0xD3EE8048, 0x05229526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05429528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562952A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE8060, 0x0582952C .long 0xD3EE8068, 0x05A2952E .long 0xD3EE8070, 0x05C29530 .long 0xD3EE8078, 0x05E29532 -.long 0xBEFC0023 +.long 
0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04229916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462991A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482991C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2991E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E29922 .long 0xD3EE8040, 0x05029D14 -.long 0xE05C2000, 0x0C013C07 +.long 0xE05C1000, 0x80013C06 .long 0xD3EE8048, 0x05229D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05429D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05629D1A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D -.long 0xBF110125 +.long 0xBF11011E .long 0xD3D84010, 0x18000182 .long 0xD3D84011, 0x18000183 .long 0xD3D84012, 0x180001C2 .long 0xD3D84013, 0x180001C3 .long 0xBF800001 .long 0xBF9C0000 +.long 0xBF8C0F7A .long 0xD2800010, 0x2002B110 .long 0xD2800012, 0x2002B512 .long 0xD3EE8060, 0x05829D1C -.long 0xE07C2000, 0x3D041009 -.long 0x803C1F3D +.long 0xE07C1000, 0x80041008 .long 0xD3EE8068, 0x05A29D1E +.long 0x80101910 +.long 0x82118011 .long 0xD3EE8070, 0x05C29D20 +.long 0x80921912 +.long 0xBE920280 .long 0xD3EE8078, 0x05E29D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F 
.long 0xD3EE8000, 0x0402A124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x0422A126 .long 0xBF8CC67F .long 0xD3EE8010, 0x0442A128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462A12A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482A12C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2A12E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C2A130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E2A132 .long 0xD3EE8040, 0x0502A524 -.long 0xE05C2000, 0x0C014007 +.long 0xE05C1000, 0x80014006 .long 0xD3EE8048, 0x0522A526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x0542A528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562A52A -.long 0xB7231000 -.long 0x810C1A0C -.long 0x810D1B0D .long 0xD3EE8060, 0x0582A52C .long 0xD3EE8068, 0x05A2A52E .long 0xD3EE8070, 0x05C2A530 .long 0xD3EE8078, 0x05E2A532 -.long 0xB7250004 -.long 0xB7240008 -.long 0xB3240078 -.long 0xBF85FDB4 -.long 0xBEFC0023 -.long 0xBF8C0F7B +.long 0xB71E0004 +.long 0xB71D0008 +.long 0xB31D0078 +.long 0xBF85FD9C +.long 0xBEFC001C +.long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04026914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC1000, 0x2400000C -.long 0xD8EC1080, 0x2600000C +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04226916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04426918 -.long 0xE0512100, 
0x0D020808 -.long 0xD8EC1100, 0x2800000C -.long 0xD8EC1180, 0x2A00000C +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462691A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482691C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC1200, 0x2C00000C -.long 0xD8EC1280, 0x2E00000C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2691E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C26920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC1300, 0x3000000C -.long 0xD8EC1380, 0x3200000C +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E26922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x05026D14 -.long 0xE05C2000, 0x0C014407 +.long 0xE05C1000, 0x80014406 .long 0xD3EE8048, 0x05226D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05426D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05626D1A -.long 0x810C1A0C .long 0xD3EE8060, 0x05826D1C -.long 0xE05C2000, 0x3C045409 -.long 0x803D1F3C +.long 0xE05C1000, 0x80045408 .long 0xD3EE8068, 0x05A26D1E .long 0xD3EE8070, 0x05C26D20 .long 0xD3EE8078, 0x05E26D22 -.long 0xBEFC0023 -.long 0xBF8C0F7C +.long 0xBEFC001C +.long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC2000, 0x1400000C -.long 0xD8EC2080, 0x1600000C +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04227126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC2100, 0x1800000C -.long 0xD8EC2180, 0x1A00000C +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462712A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482712C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC2200, 0x1C00000C -.long 0xD8EC2280, 0x1E00000C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 
0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2712E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC2300, 0x2000000C -.long 0xD8EC2380, 0x2200000C +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E27132 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x05027524 -.long 0xE05C2000, 0x0C014807 +.long 0xE05C1000, 0x80014806 .long 0xD3EE8048, 0x05227526 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05427528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562752A -.long 0x810C1A0C .long 0xD3EE8060, 0x0582752C .long 0xD3EE8068, 0x05A2752E .long 0xD3EE8070, 0x05C27530 .long 0xD3EE8078, 0x05E27532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04027914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC3000, 0x2400000C -.long 0xD8EC3080, 0x2600000C +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04227916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04427918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC3100, 0x2800000C -.long 0xD8EC3180, 0x2A00000C +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462791A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482791C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC3200, 0x2C00000C -.long 0xD8EC3280, 0x2E00000C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2791E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C27920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC3300, 0x3000000C -.long 0xD8EC3380, 0x3200000C +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E27922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x05027D14 -.long 0xE05C2000, 0x0C014C07 +.long 0xE05C1000, 0x80014C06 .long 
0xD3EE8048, 0x05227D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05427D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05627D1A -.long 0x810C1A0C +.long 0xD3D84010, 0x180001BC +.long 0xD3D84011, 0x180001BD +.long 0xD3D84012, 0x180001FC +.long 0xD3D84013, 0x180001FD +.long 0xBF800001 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 .long 0xD3EE8060, 0x05827D1C -.long 0xE05C2000, 0x3D045809 +.long 0xE07C1000, 0x80041008 .long 0xD3EE8068, 0x05A27D1E +.long 0x80101910 +.long 0x82118011 .long 0xD3EE8070, 0x05C27D20 +.long 0x80921912 +.long 0xBE920280 .long 0xD3EE8078, 0x05E27D22 -.long 0xBEFC0023 +.long 0x808C0426 +.long 0x828D0527 +.long 0xBE86000C +.long 0x808C0828 +.long 0x828D0929 +.long 0xBE8A000C +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC4000, 0x1400000C -.long 0xD8EC4080, 0x1600000C +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04228126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC4100, 0x1800000C -.long 0xD8EC4180, 0x1A00000C +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462812A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482812C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC4200, 0x1C00000C -.long 0xD8EC4280, 0x1E00000C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2812E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC4300, 0x2000000C -.long 0xD8EC4380, 0x2200000C +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E28132 -.long 0xBEA3001C -.long 0x810D1B0D +.long 0xBE9C0016 .long 0xD3EE8040, 0x05028524 -.long 0xE05C2000, 0x0C015007 +.long 0xE05C1000, 0x80015006 .long 0xD3EE8048, 
0x05228526 .long 0xD3EE8050, 0x05428528 .long 0xD3EE8058, 0x0562852A -.long 0x810C1A0C .long 0xD3EE8060, 0x0582852C .long 0xD3EE8068, 0x05A2852E .long 0xD3EE8070, 0x05C28530 .long 0xD3EE8078, 0x05E28532 -.long 0xBEBE0032 -.long 0xBEBF0033 +.long 0xB00C0100 +.long 0x920F320C +.long 0x920EFF18, 0x00000080 +.long 0x920C330E +.long 0x960D330E +.long 0x800C0F0C +.long 0x820D800D +.long 0x8E8C820C +.long 0x920E30FF, 0x00000100 +.long 0x803E0E0C +.long 0x823F800D +.long 0x80A40C2A +.long 0x82A50D2B +.long 0x924EFF32, 0x00000080 +.long 0x80CE4E1F .long 0x81313831 -.long 0xBF031D31 -.long 0xBF850368 +.long 0xBF031731 +.long 0xBF850350 .long 0x814C384C -.long 0x8194494C -.long 0x8115814D +.long 0x818C494C +.long 0x810D814D .long 0xBF03494C -.long 0xBECC0214 -.long 0xBECD0215 -.long 0x96153631 -.long 0x8F169F37 -.long 0x92141631 -.long 0x80141514 -.long 0x8616FF37, 0x7FFFFFFF -.long 0x8F331614 +.long 0xBECC020C +.long 0xBECD020D +.long 0x960D3631 +.long 0x8F0E9F37 +.long 0x920C0E31 +.long 0x800C0D0C +.long 0x860EFF37, 0x7FFFFFFF +.long 0x8F330E0C .long 0x92323433 .long 0x80B23231 .long 0xBF033534 @@ -2985,226 +3172,218 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xBF033849 .long 0xBEC6024C .long 0xBEC7024D -.long 0xBE97004B -.long 0x96151747 -.long 0x92141747 -.long 0x8F949F14 -.long 0x92154A14 -.long 0x80951547 -.long 0x92154915 -.long 0x80154615 -.long 0xBF093914 -.long 0xBE97023B -.long 0x85164A3A -.long 0x96471715 -.long 0x92461715 +.long 0xBE8F004B +.long 0x960D0F47 +.long 0x920C0F47 +.long 0x8F8C9F0C +.long 0x920D4A0C +.long 0x808D0D47 +.long 0x920D490D +.long 0x800D460D +.long 0xBF09390C +.long 0xBE8F023B +.long 0x850E4A3A +.long 0x96470F0D +.long 0x92460F0D .long 0x8FC69F46 -.long 0x92471646 -.long 0x80C74715 -.long 0x92144A14 -.long 0x80471447 +.long 0x92470E46 +.long 0x80C7470D +.long 0x920C4A0C +.long 0x80470C47 .long 0xBF033534 .long 0x85324746 .long 0x85334647 -.long 0xBE940032 -.long 0xBE950080 -.long 0x8E948A14 -.long 0x80142114 -.long 0x82158015 -.long 
0x80041440 -.long 0x82051541 -.long 0xBE8C0080 -.long 0xBE940033 -.long 0xBE950080 -.long 0x8E948A14 -.long 0x80142214 -.long 0x82158015 -.long 0x80081442 -.long 0x82091543 -.long 0xBE8D0080 -.long 0xBEFC0023 +.long 0x8E0C8A32 +.long 0x810C1A0C +.long 0x80040C40 +.long 0x82058041 +.long 0xBE8600FF, 0x80000000 +.long 0x8E0C8A33 +.long 0x800C1B0C +.long 0x80080C42 +.long 0x82098043 +.long 0xBE8A00FF, 0x80000000 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04228916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462891A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482891C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2891E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E28922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x05028D14 -.long 0xE05C2000, 0x0C013407 +.long 0xE05C1000, 0x80013406 .long 0xD3EE8048, 0x05228D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05428D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05628D1A -.long 0x810C1A0C -.long 0xD3D84010, 0x180001BC -.long 0xD3D84011, 0x180001BD -.long 0xD3D84012, 0x180001FC -.long 0xD3D84013, 0x180001FD -.long 0xBF800001 -.long 0xD2800010, 0x2002A910 -.long 0xD2800012, 
0x2002AD12 .long 0xD3EE8060, 0x05828D1C -.long 0xE07C2000, 0x3C041009 +.long 0xE05C1000, 0x80045808 .long 0xD3EE8068, 0x05A28D1E .long 0xD3EE8070, 0x05C28D20 .long 0xD3EE8078, 0x05E28D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x04229126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462912A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482912C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2912E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E29132 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x05029524 -.long 0xE05C2000, 0x0C013807 +.long 0xE05C1000, 0x80013806 .long 0xD3EE8048, 0x05229526 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05429528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562952A -.long 0x810C1A0C .long 0xD3EE8060, 0x0582952C .long 0xD3EE8068, 0x05A2952E .long 0xD3EE8070, 0x05C29530 .long 0xD3EE8078, 0x05E29532 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029914 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 
0x80020707 .long 0xD3EE8008, 0x04229916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429918 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462991A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482991C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2991E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29920 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E29922 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x05029D14 -.long 0xE05C2000, 0x0C013C07 +.long 0xE05C1000, 0x80013C06 .long 0xD3EE8048, 0x05229D16 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x05429D18 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x05629D1A -.long 0x810C1A0C .long 0xD3D84010, 0x180001BE .long 0xD3D84011, 0x180001BF .long 0xD3D84012, 0x180001FE .long 0xD3D84013, 0x180001FF .long 0xBF800001 +.long 0xBF8C0F7A .long 0xD2800010, 0x2002B110 .long 0xD2800012, 0x2002B512 .long 0xD3EE8060, 0x05829D1C -.long 0xE07C2000, 0x3D041009 -.long 0x803C1F3D +.long 0xE07C1000, 0x80041008 .long 0xD3EE8068, 0x05A29D1E .long 0xD3EE8070, 0x05C29D20 .long 0xD3EE8078, 0x05E29D22 -.long 0xBEFC0023 +.long 0xBEFC001C .long 0xBF8C0F7D .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x0402A124 -.long 0xE0512000, 0x0D020808 -.long 0xD8EC0000, 0x1400000C -.long 0xD8EC0080, 0x1600000C +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 .long 0xD3EE8008, 0x0422A126 .long 0xBF8CC67F .long 0xD3EE8010, 0x0442A128 -.long 0xE0512100, 0x0D020808 -.long 0xD8EC0100, 0x1800000C -.long 0xD8EC0180, 0x1A00000C +.long 
0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 .long 0xD3EE8018, 0x0462A12A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482A12C -.long 0xE0512200, 0x0D020808 -.long 0xD8EC0200, 0x1C00000C -.long 0xD8EC0280, 0x1E00000C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 .long 0xD3EE8028, 0x04A2A12E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C2A130 -.long 0xE0512300, 0x0D020808 -.long 0xD8EC0300, 0x2000000C -.long 0xD8EC0380, 0x2200000C +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 .long 0xD3EE8038, 0x04E2A132 -.long 0xB7231000 -.long 0x810D1B0D +.long 0xB71C1020 .long 0xD3EE8040, 0x0502A524 -.long 0xE05C2000, 0x0C014007 +.long 0xE05C1000, 0x80014006 .long 0xD3EE8048, 0x0522A526 +.long 0x80081508 +.long 0x82098009 .long 0xD3EE8050, 0x0542A528 +.long 0x80041404 +.long 0x82058005 .long 0xD3EE8058, 0x0562A52A -.long 0x810C1A0C .long 0xD3EE8060, 0x0582A52C .long 0xD3EE8068, 0x05A2A52E .long 0xD3EE8070, 0x05C2A530 .long 0xD3EE8078, 0x05E2A532 -.long 0xB0140100 -.long 0x92183E14 -.long 0x9216FF1E, 0x00000080 -.long 0x92143F16 -.long 0x96153F16 -.long 0x921630C0 -.long 0x80141416 -.long 0x80181418 -.long 0x82198015 -.long 0x8E988218 +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E .long 0xD3D94080, 0x18000080 .long 0xD3D94081, 0x18000080 .long 0xD3D94082, 0x18000080 @@ -3333,66 +3512,59 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3D940FD, 0x18000080 .long 0xD3D940FE, 0x18000080 .long 0xD3D940FF, 0x18000080 -.long 0xBF82F441 -.long 0xBF8C0F7B +.long 0xBF82F3D1 +.long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06028914 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F .long 0xD3EE8088, 0x06228916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06428918 -.long 0xD8EC5100, 
0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F .long 0xD3EE8098, 0x0662891A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682891C -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F .long 0xD3EE80A8, 0x06A2891E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C28920 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F .long 0xD3EE80B8, 0x06E28922 .long 0xD3EE80C0, 0x07028D14 .long 0xD3EE80C8, 0x07228D16 .long 0xD3EE80D0, 0x07428D18 .long 0xD3EE80D8, 0x07628D1A -.long 0xD3D84010, 0x1800013C -.long 0xD3D84011, 0x1800013D -.long 0xD3D84012, 0x1800017C -.long 0xD3D84013, 0x1800017D -.long 0xBF800001 -.long 0xD2800010, 0x2002A910 -.long 0xD2800012, 0x2002AD12 .long 0xD3EE80E0, 0x07828D1C -.long 0xE07C2000, 0x3C041009 +.long 0xE05C1000, 0x80045808 .long 0xD3EE80E8, 0x07A28D1E .long 0xD3EE80F0, 0x07C28D20 .long 0xD3EE80F8, 0x07E28D22 -.long 0xBF8C0F76 +.long 0xBF8C0F78 .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06029124 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F .long 0xD3EE8088, 0x06229126 .long 0xBF8CC67F .long 0xD3EE8090, 0x06429128 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F .long 0xD3EE8098, 0x0662912A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682912C -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F .long 0xD3EE80A8, 0x06A2912E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C29130 -.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F .long 0xD3EE80B8, 0x06E29132 .long 0xD3EE80C0, 0x07029524 .long 0xD3EE80C8, 0x07229526 @@ -3402,27 +3574,27 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3EE80E8, 0x07A2952E .long 
0xD3EE80F0, 0x07C29530 .long 0xD3EE80F8, 0x07E29532 -.long 0xBF8C0F71 +.long 0xBF8C0F72 .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x06029914 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F .long 0xD3EE8088, 0x06229916 .long 0xBF8CC67F .long 0xD3EE8090, 0x06429918 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F .long 0xD3EE8098, 0x0662991A .long 0xBF8CC67F .long 0xD3EE80A0, 0x0682991C -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F .long 0xD3EE80A8, 0x06A2991E .long 0xBF8CC67F .long 0xD3EE80B0, 0x06C29920 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F .long 0xD3EE80B8, 0x06E29922 .long 0xD3EE80C0, 0x07029D14 .long 0xD3EE80C8, 0x07229D16 @@ -3433,15 +3605,15 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3D84012, 0x1800017E .long 0xD3D84013, 0x1800017F .long 0xBF800001 +.long 0xBF8C0F70 .long 0xD2800010, 0x2002B110 .long 0xD2800012, 0x2002B512 .long 0xD3EE80E0, 0x07829D1C -.long 0xE07C2000, 0x3D041009 -.long 0x803C1F3D +.long 0xE07C1000, 0x80041008 .long 0xD3EE80E8, 0x07A29D1E .long 0xD3EE80F0, 0x07C29D20 .long 0xD3EE80F8, 0x07E29D22 -.long 0xBF8C0F70 +.long 0xBF8C0F72 .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8080, 0x0602A124 @@ -3463,76 +3635,66 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3EE80E8, 0x07A2A52E .long 0xD3EE80F0, 0x07C2A530 .long 0xD3EE80F8, 0x07E2A532 -.long 0xB0140100 -.long 0x92183E14 -.long 0x9216FF1E, 0x00000080 -.long 0x92143F16 -.long 0x96153F16 -.long 0x921630C0 -.long 0x80141416 -.long 0x80181418 -.long 0x82198015 -.long 0x8E988218 -.long 0xBF82035A -.long 0xBF8C0F7B +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 
0xBF820423 +.long 0xBF8C0F7C .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04028914 -.long 0xD8EC5000, 0x2400000C -.long 0xD8EC5080, 0x2600000C +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F .long 0xD3EE8008, 0x04228916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04428918 -.long 0xD8EC5100, 0x2800000C -.long 0xD8EC5180, 0x2A00000C +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F .long 0xD3EE8018, 0x0462891A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482891C -.long 0xD8EC5200, 0x2C00000C -.long 0xD8EC5280, 0x2E00000C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F .long 0xD3EE8028, 0x04A2891E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C28920 -.long 0xD8EC5300, 0x3000000C -.long 0xD8EC5380, 0x3200000C +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F .long 0xD3EE8038, 0x04E28922 .long 0xD3EE8040, 0x05028D14 .long 0xD3EE8048, 0x05228D16 .long 0xD3EE8050, 0x05428D18 .long 0xD3EE8058, 0x05628D1A -.long 0xD3D84010, 0x180001BC -.long 0xD3D84011, 0x180001BD -.long 0xD3D84012, 0x180001FC -.long 0xD3D84013, 0x180001FD -.long 0xBF800001 -.long 0xD2800010, 0x2002A910 -.long 0xD2800012, 0x2002AD12 .long 0xD3EE8060, 0x05828D1C -.long 0xE07C2000, 0x3C041009 +.long 0xE05C1000, 0x80045808 .long 0xD3EE8068, 0x05A28D1E .long 0xD3EE8070, 0x05C28D20 .long 0xD3EE8078, 0x05E28D22 -.long 0xBF8C0F76 +.long 0xBF8C0F78 .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029124 -.long 0xD8EC6000, 0x1400000C -.long 0xD8EC6080, 0x1600000C +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F .long 0xD3EE8008, 0x04229126 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429128 -.long 0xD8EC6100, 0x1800000C -.long 0xD8EC6180, 0x1A00000C +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F .long 0xD3EE8018, 0x0462912A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482912C -.long 0xD8EC6200, 0x1C00000C -.long 0xD8EC6280, 0x1E00000C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F .long 0xD3EE8028, 0x04A2912E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29130 
-.long 0xD8EC6300, 0x2000000C -.long 0xD8EC6380, 0x2200000C +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F .long 0xD3EE8038, 0x04E29132 .long 0xD3EE8040, 0x05029524 .long 0xD3EE8048, 0x05229526 @@ -3542,27 +3704,27 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3EE8068, 0x05A2952E .long 0xD3EE8070, 0x05C29530 .long 0xD3EE8078, 0x05E29532 -.long 0xBF8C0F71 +.long 0xBF8C0F72 .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x04029914 -.long 0xD8EC7000, 0x2400000C -.long 0xD8EC7080, 0x2600000C +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F .long 0xD3EE8008, 0x04229916 .long 0xBF8CC67F .long 0xD3EE8010, 0x04429918 -.long 0xD8EC7100, 0x2800000C -.long 0xD8EC7180, 0x2A00000C +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F .long 0xD3EE8018, 0x0462991A .long 0xBF8CC67F .long 0xD3EE8020, 0x0482991C -.long 0xD8EC7200, 0x2C00000C -.long 0xD8EC7280, 0x2E00000C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F .long 0xD3EE8028, 0x04A2991E .long 0xBF8CC67F .long 0xD3EE8030, 0x04C29920 -.long 0xD8EC7300, 0x3000000C -.long 0xD8EC7380, 0x3200000C +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F .long 0xD3EE8038, 0x04E29922 .long 0xD3EE8040, 0x05029D14 .long 0xD3EE8048, 0x05229D16 @@ -3573,15 +3735,15 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3D84012, 0x180001FE .long 0xD3D84013, 0x180001FF .long 0xBF800001 +.long 0xBF8C0F70 .long 0xD2800010, 0x2002B110 .long 0xD2800012, 0x2002B512 .long 0xD3EE8060, 0x05829D1C -.long 0xE07C2000, 0x3D041009 -.long 0x803C1F3D +.long 0xE07C1000, 0x80041008 .long 0xD3EE8068, 0x05A29D1E .long 0xD3EE8070, 0x05C29D20 .long 0xD3EE8078, 0x05E29D22 -.long 0xBF8C0F70 +.long 0xBF8C0F72 .long 0xBF8A0000 .long 0xBF8CC67F .long 0xD3EE8000, 0x0402A124 @@ -3603,734 +3765,4655 @@ DGEMM_Aldebaran_PKFixedAtomic512Latest: .long 0xD3EE8068, 0x05A2A52E .long 0xD3EE8070, 0x05C2A530 .long 0xD3EE8078, 0x05E2A532 -.long 0xB0140100 -.long 0x92183E14 -.long 0x9216FF1E, 0x00000080 -.long 0x92143F16 
-.long 0x96153F16 -.long 0x921630C0 -.long 0x80141416 -.long 0x80181418 -.long 0x82198015 -.long 0x8E988218 -.long 0x80104418 -.long 0x82114519 -.long 0xBEBC0080 -.long 0xBE94003C -.long 0xE05C2000, 0x3C045409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C049009 -.long 0xD3D84094, 0x18000100 -.long 0xD3D84095, 0x18000101 -.long 0xD3D84096, 0x18000140 -.long 0xD3D84097, 0x18000141 -.long 0xD3D84098, 0x18000102 -.long 0xD3D84099, 0x18000103 -.long 0xD3D8409A, 0x18000142 -.long 0xD3D8409B, 0x18000143 -.long 0xD3D8409C, 0x18000104 -.long 0xD3D8409D, 0x18000105 -.long 0xD3D8409E, 0x18000144 -.long 0xD3D8409F, 0x18000145 -.long 0xD3D840A0, 0x18000106 -.long 0xD3D840A1, 0x18000107 -.long 0xD3D840A2, 0x18000146 -.long 0xD3D840A3, 0x18000147 -.long 0xD3D840A4, 0x18000108 -.long 0xD3D840A5, 0x18000109 -.long 0xD3D840A6, 0x18000148 -.long 0xD3D840A7, 0x18000149 -.long 0xD3D840A8, 0x1800010A -.long 0xD3D840A9, 0x1800010B -.long 0xD3D840AA, 0x1800014A -.long 0xD3D840AB, 0x1800014B -.long 0xD3D840AC, 0x1800010C -.long 0xD3D840AD, 0x1800010D -.long 0xD3D840AE, 0x1800014C -.long 0xD3D840AF, 0x1800014D -.long 0xD3D840B0, 0x1800010E -.long 0xD3D840B1, 0x1800010F -.long 0xD3D840B2, 0x1800014E -.long 0xD3D840B3, 0x1800014F -.long 0xD3D840B4, 0x18000110 -.long 0xD3D840B5, 0x18000111 -.long 0xD3D840B6, 0x18000150 -.long 0xD3D840B7, 
0x18000151 -.long 0xD3D840B8, 0x18000112 -.long 0xD3D840B9, 0x18000113 -.long 0xD3D840BA, 0x18000152 -.long 0xD3D840BB, 0x18000153 -.long 0xD3D840BC, 0x18000114 -.long 0xD3D840BD, 0x18000115 -.long 0xD3D840BE, 0x18000154 -.long 0xD3D840BF, 0x18000155 -.long 0xD3D840C0, 0x18000116 -.long 0xD3D840C1, 0x18000117 -.long 0xD3D840C2, 0x18000156 -.long 0xD3D840C3, 0x18000157 -.long 0xD3D840C4, 0x18000118 -.long 0xD3D840C5, 0x18000119 -.long 0xD3D840C6, 0x18000158 -.long 0xD3D840C7, 0x18000159 -.long 0xD3D840C8, 0x1800011A -.long 0xD3D840C9, 0x1800011B -.long 0xD3D840CA, 0x1800015A -.long 0xD3D840CB, 0x1800015B -.long 0xD3D840CC, 0x1800011C -.long 0xD3D840CD, 0x1800011D -.long 0xD3D840CE, 0x1800015C -.long 0xD3D840CF, 0x1800015D -.long 0xD3D840D0, 0x1800011E -.long 0xD3D840D1, 0x1800011F -.long 0xD3D840D2, 0x1800015E -.long 0xD3D840D3, 0x1800015F -.long 0xBF800001 -.long 0xBEBC0014 -.long 0xBF8C0F7F -.long 0xD2800094, 0x2002A994 -.long 0xD2800096, 0x2002AD96 -.long 0xE07C2000, 0x3C049409 -.long 0x803C1F3C -.long 0xBF8C0F7E -.long 0xD2800098, 0x2002B198 -.long 0xD280009A, 0x2002B59A -.long 0xE07C2000, 0x3C049809 -.long 0x803C1F3C -.long 0xBF8C0F7D -.long 0xD280009C, 0x2002B99C -.long 0xD280009E, 0x2002BD9E -.long 0xE07C2000, 0x3C049C09 -.long 0x803C1F3C -.long 0xBF8C0F7C -.long 0xD28000A0, 0x2002C1A0 -.long 0xD28000A2, 0x2002C5A2 -.long 0xE07C2000, 0x3C04A009 -.long 0x803C1F3C +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 0x8010443E +.long 0x8211453F +.long 0xBE8C0110 +.long 0xBE8E0012 +.long 0xE05C1000, 0x80045408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80045808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80045C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046008 +.long 0x80101910 +.long 0x82118011 
+.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004A008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004A408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004A808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004AC08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 
0x8004B008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004B408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004B808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004BC08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004C008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004C408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004C808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004CC08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004D008 +.long 0xBE90010C +.long 0xBE92000E +.long 0xD3D84010, 0x18000100 +.long 0xD3D84011, 0x18000101 +.long 0xD3D84012, 0x18000140 +.long 0xD3D84013, 0x18000141 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000102 +.long 0xD3D84011, 0x18000103 +.long 0xD3D84012, 0x18000142 +.long 0xD3D84013, 0x18000143 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000104 +.long 0xD3D84011, 0x18000105 +.long 0xD3D84012, 0x18000144 +.long 0xD3D84013, 0x18000145 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002B910 +.long 0xD2800012, 0x2002BD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000106 +.long 0xD3D84011, 0x18000107 +.long 0xD3D84012, 0x18000146 +.long 0xD3D84013, 0x18000147 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 
0x2002C110 +.long 0xD2800012, 0x2002C512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000108 +.long 0xD3D84011, 0x18000109 +.long 0xD3D84012, 0x18000148 +.long 0xD3D84013, 0x18000149 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002C910 +.long 0xD2800012, 0x2002CD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800010A +.long 0xD3D84011, 0x1800010B +.long 0xD3D84012, 0x1800014A +.long 0xD3D84013, 0x1800014B +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002D110 +.long 0xD2800012, 0x2002D512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800010C +.long 0xD3D84011, 0x1800010D +.long 0xD3D84012, 0x1800014C +.long 0xD3D84013, 0x1800014D +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002D910 +.long 0xD2800012, 0x2002DD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800010E +.long 0xD3D84011, 0x1800010F +.long 0xD3D84012, 0x1800014E +.long 0xD3D84013, 0x1800014F +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002E110 +.long 0xD2800012, 0x2002E512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000110 +.long 0xD3D84011, 0x18000111 +.long 0xD3D84012, 0x18000150 +.long 0xD3D84013, 0x18000151 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002E910 +.long 0xD2800012, 0x2002ED12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000112 +.long 0xD3D84011, 0x18000113 +.long 0xD3D84012, 0x18000152 +.long 0xD3D84013, 0x18000153 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002F110 +.long 0xD2800012, 0x2002F512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 
+.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000114 +.long 0xD3D84011, 0x18000115 +.long 0xD3D84012, 0x18000154 +.long 0xD3D84013, 0x18000155 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002F910 +.long 0xD2800012, 0x2002FD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000116 +.long 0xD3D84011, 0x18000117 +.long 0xD3D84012, 0x18000156 +.long 0xD3D84013, 0x18000157 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20030110 +.long 0xD2800012, 0x20030512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000118 +.long 0xD3D84011, 0x18000119 +.long 0xD3D84012, 0x18000158 +.long 0xD3D84013, 0x18000159 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20030910 +.long 0xD2800012, 0x20030D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800011A +.long 0xD3D84011, 0x1800011B +.long 0xD3D84012, 0x1800015A +.long 0xD3D84013, 0x1800015B +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20031110 +.long 0xD2800012, 0x20031512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800011C +.long 0xD3D84011, 0x1800011D +.long 0xD3D84012, 0x1800015C +.long 0xD3D84013, 0x1800015D +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20031910 +.long 0xD2800012, 0x20031D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800011E +.long 0xD3D84011, 0x1800011F +.long 0xD3D84012, 0x1800015E +.long 0xD3D84013, 0x1800015F +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20032110 +.long 0xD2800012, 0x20032512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000120 +.long 
0xD3D84011, 0x18000121 +.long 0xD3D84012, 0x18000160 +.long 0xD3D84013, 0x18000161 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20032910 +.long 0xD2800012, 0x20032D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000122 +.long 0xD3D84011, 0x18000123 +.long 0xD3D84012, 0x18000162 +.long 0xD3D84013, 0x18000163 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20033110 +.long 0xD2800012, 0x20033512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000124 +.long 0xD3D84011, 0x18000125 +.long 0xD3D84012, 0x18000164 +.long 0xD3D84013, 0x18000165 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20033910 +.long 0xD2800012, 0x20033D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000126 +.long 0xD3D84011, 0x18000127 +.long 0xD3D84012, 0x18000166 +.long 0xD3D84013, 0x18000167 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20034110 +.long 0xD2800012, 0x20034512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000128 +.long 0xD3D84011, 0x18000129 +.long 0xD3D84012, 0x18000168 +.long 0xD3D84013, 0x18000169 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20034910 +.long 0xD2800012, 0x20034D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800012A +.long 0xD3D84011, 0x1800012B +.long 0xD3D84012, 0x1800016A +.long 0xD3D84013, 0x1800016B +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20035110 +.long 0xD2800012, 0x20035512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800012C +.long 0xD3D84011, 0x1800012D +.long 0xD3D84012, 0x1800016C +.long 0xD3D84013, 0x1800016D +.long 
0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20035910 +.long 0xD2800012, 0x20035D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800012E +.long 0xD3D84011, 0x1800012F +.long 0xD3D84012, 0x1800016E +.long 0xD3D84013, 0x1800016F +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20036110 +.long 0xD2800012, 0x20036512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000130 +.long 0xD3D84011, 0x18000131 +.long 0xD3D84012, 0x18000170 +.long 0xD3D84013, 0x18000171 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20036910 +.long 0xD2800012, 0x20036D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000132 +.long 0xD3D84011, 0x18000133 +.long 0xD3D84012, 0x18000172 +.long 0xD3D84013, 0x18000173 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20037110 +.long 0xD2800012, 0x20037512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000134 +.long 0xD3D84011, 0x18000135 +.long 0xD3D84012, 0x18000174 +.long 0xD3D84013, 0x18000175 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20037910 +.long 0xD2800012, 0x20037D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000136 +.long 0xD3D84011, 0x18000137 +.long 0xD3D84012, 0x18000176 +.long 0xD3D84013, 0x18000177 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20038110 +.long 0xD2800012, 0x20038512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000138 +.long 0xD3D84011, 0x18000139 +.long 0xD3D84012, 0x18000178 +.long 0xD3D84013, 0x18000179 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20038910 +.long 0xD2800012, 0x20038D12 
+.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800013A +.long 0xD3D84011, 0x1800013B +.long 0xD3D84012, 0x1800017A +.long 0xD3D84013, 0x1800017B +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20039110 +.long 0xD2800012, 0x20039512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800013C +.long 0xD3D84011, 0x1800013D +.long 0xD3D84012, 0x1800017C +.long 0xD3D84013, 0x1800017D +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20039910 +.long 0xD2800012, 0x20039D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800013E +.long 0xD3D84011, 0x1800013F +.long 0xD3D84012, 0x1800017E +.long 0xD3D84013, 0x1800017F +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2003A110 +.long 0xD2800012, 0x2003A512 +.long 0xE07C1000, 0x80041008 +.long 0xBF82193C +.long 0x8010443E +.long 0x8211453F +.long 0xBE8C0110 +.long 0xBE8E0012 +.long 0xE05C1000, 0x80045408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80045808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80045C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80046C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 
0xE05C1000, 0x80047808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80047C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80048C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x80049C08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004A008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004A408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004A808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004AC08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004B008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004B408 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004B808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004BC08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004C008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004C408 +.long 0x80101910 +.long 0x82118011 
+.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004C808 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004CC08 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE05C1000, 0x8004D008 +.long 0xBE90010C +.long 0xBE92000E +.long 0xD3D84010, 0x18000180 +.long 0xD3D84011, 0x18000181 +.long 0xD3D84012, 0x180001C0 +.long 0xD3D84013, 0x180001C1 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000182 +.long 0xD3D84011, 0x18000183 +.long 0xD3D84012, 0x180001C2 +.long 0xD3D84013, 0x180001C3 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000184 +.long 0xD3D84011, 0x18000185 +.long 0xD3D84012, 0x180001C4 +.long 0xD3D84013, 0x180001C5 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002B910 +.long 0xD2800012, 0x2002BD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000186 +.long 0xD3D84011, 0x18000187 +.long 0xD3D84012, 0x180001C6 +.long 0xD3D84013, 0x180001C7 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002C110 +.long 0xD2800012, 0x2002C512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000188 +.long 0xD3D84011, 0x18000189 +.long 0xD3D84012, 0x180001C8 +.long 0xD3D84013, 0x180001C9 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002C910 +.long 0xD2800012, 0x2002CD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800018A +.long 0xD3D84011, 0x1800018B +.long 0xD3D84012, 0x180001CA +.long 
0xD3D84013, 0x180001CB +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002D110 +.long 0xD2800012, 0x2002D512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800018C +.long 0xD3D84011, 0x1800018D +.long 0xD3D84012, 0x180001CC +.long 0xD3D84013, 0x180001CD +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002D910 +.long 0xD2800012, 0x2002DD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800018E +.long 0xD3D84011, 0x1800018F +.long 0xD3D84012, 0x180001CE +.long 0xD3D84013, 0x180001CF +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002E110 +.long 0xD2800012, 0x2002E512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000190 +.long 0xD3D84011, 0x18000191 +.long 0xD3D84012, 0x180001D0 +.long 0xD3D84013, 0x180001D1 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002E910 +.long 0xD2800012, 0x2002ED12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000192 +.long 0xD3D84011, 0x18000193 +.long 0xD3D84012, 0x180001D2 +.long 0xD3D84013, 0x180001D3 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002F110 +.long 0xD2800012, 0x2002F512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000194 +.long 0xD3D84011, 0x18000195 +.long 0xD3D84012, 0x180001D4 +.long 0xD3D84013, 0x180001D5 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2002F910 +.long 0xD2800012, 0x2002FD12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000196 +.long 0xD3D84011, 0x18000197 +.long 0xD3D84012, 0x180001D6 +.long 0xD3D84013, 0x180001D7 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20030110 
+.long 0xD2800012, 0x20030512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000198 +.long 0xD3D84011, 0x18000199 +.long 0xD3D84012, 0x180001D8 +.long 0xD3D84013, 0x180001D9 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20030910 +.long 0xD2800012, 0x20030D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800019A +.long 0xD3D84011, 0x1800019B +.long 0xD3D84012, 0x180001DA +.long 0xD3D84013, 0x180001DB +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20031110 +.long 0xD2800012, 0x20031512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800019C +.long 0xD3D84011, 0x1800019D +.long 0xD3D84012, 0x180001DC +.long 0xD3D84013, 0x180001DD +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20031910 +.long 0xD2800012, 0x20031D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800019E +.long 0xD3D84011, 0x1800019F +.long 0xD3D84012, 0x180001DE +.long 0xD3D84013, 0x180001DF +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20032110 +.long 0xD2800012, 0x20032512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A0 +.long 0xD3D84011, 0x180001A1 +.long 0xD3D84012, 0x180001E0 +.long 0xD3D84013, 0x180001E1 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20032910 +.long 0xD2800012, 0x20032D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A2 +.long 0xD3D84011, 0x180001A3 +.long 0xD3D84012, 0x180001E2 +.long 0xD3D84013, 0x180001E3 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20033110 +.long 0xD2800012, 0x20033512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 
0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A4 +.long 0xD3D84011, 0x180001A5 +.long 0xD3D84012, 0x180001E4 +.long 0xD3D84013, 0x180001E5 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20033910 +.long 0xD2800012, 0x20033D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A6 +.long 0xD3D84011, 0x180001A7 +.long 0xD3D84012, 0x180001E6 +.long 0xD3D84013, 0x180001E7 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20034110 +.long 0xD2800012, 0x20034512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A8 +.long 0xD3D84011, 0x180001A9 +.long 0xD3D84012, 0x180001E8 +.long 0xD3D84013, 0x180001E9 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20034910 +.long 0xD2800012, 0x20034D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001AA +.long 0xD3D84011, 0x180001AB +.long 0xD3D84012, 0x180001EA +.long 0xD3D84013, 0x180001EB +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20035110 +.long 0xD2800012, 0x20035512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001AC +.long 0xD3D84011, 0x180001AD +.long 0xD3D84012, 0x180001EC +.long 0xD3D84013, 0x180001ED +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20035910 +.long 0xD2800012, 0x20035D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001AE +.long 0xD3D84011, 0x180001AF +.long 0xD3D84012, 0x180001EE +.long 0xD3D84013, 0x180001EF +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20036110 +.long 0xD2800012, 0x20036512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B0 +.long 
0xD3D84011, 0x180001B1 +.long 0xD3D84012, 0x180001F0 +.long 0xD3D84013, 0x180001F1 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20036910 +.long 0xD2800012, 0x20036D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B2 +.long 0xD3D84011, 0x180001B3 +.long 0xD3D84012, 0x180001F2 +.long 0xD3D84013, 0x180001F3 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20037110 +.long 0xD2800012, 0x20037512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B4 +.long 0xD3D84011, 0x180001B5 +.long 0xD3D84012, 0x180001F4 +.long 0xD3D84013, 0x180001F5 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20037910 +.long 0xD2800012, 0x20037D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B6 +.long 0xD3D84011, 0x180001B7 +.long 0xD3D84012, 0x180001F6 +.long 0xD3D84013, 0x180001F7 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20038110 +.long 0xD2800012, 0x20038512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B8 +.long 0xD3D84011, 0x180001B9 +.long 0xD3D84012, 0x180001F8 +.long 0xD3D84013, 0x180001F9 +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20038910 +.long 0xD2800012, 0x20038D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001BA +.long 0xD3D84011, 0x180001BB +.long 0xD3D84012, 0x180001FA +.long 0xD3D84013, 0x180001FB +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20039110 +.long 0xD2800012, 0x20039512 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001BC +.long 0xD3D84011, 0x180001BD +.long 0xD3D84012, 0x180001FC +.long 0xD3D84013, 0x180001FD +.long 
0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x20039910 +.long 0xD2800012, 0x20039D12 +.long 0xE07C1000, 0x80041008 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001BE +.long 0xD3D84011, 0x180001BF +.long 0xD3D84012, 0x180001FE +.long 0xD3D84013, 0x180001FF +.long 0xBF800001 +.long 0xBF8C4F7F +.long 0xD2800010, 0x2003A110 +.long 0xD2800012, 0x2003A512 +.long 0xE07C1000, 0x80041008 +.long 0xBF8215FD +.long 0xBE9D0080 +.long 0xBE9E0080 +.long 0x8010443E +.long 0x8211453F +.long 0xBEFC001C .long 0xBF8C0F7B -.long 0xD28000A4, 0x2002C9A4 -.long 0xD28000A6, 0x2002CDA6 -.long 0xE07C2000, 0x3C04A409 -.long 0x803C1F3C -.long 0xBF8C0F7A -.long 0xD28000A8, 0x2002D1A8 -.long 0xD28000AA, 0x2002D5AA -.long 0xE07C2000, 0x3C04A809 -.long 0x803C1F3C -.long 0xBF8C0F79 -.long 0xD28000AC, 0x2002D9AC -.long 0xD28000AE, 0x2002DDAE -.long 0xE07C2000, 0x3C04AC09 -.long 0x803C1F3C -.long 0xBF8C0F78 -.long 0xD28000B0, 0x2002E1B0 -.long 0xD28000B2, 0x2002E5B2 -.long 0xE07C2000, 0x3C04B009 -.long 0x803C1F3C -.long 0xBF8C0F77 -.long 0xD28000B4, 0x2002E9B4 -.long 0xD28000B6, 0x2002EDB6 -.long 0xE07C2000, 0x3C04B409 -.long 0x803C1F3C -.long 0xBF8C0F76 -.long 0xD28000B8, 0x2002F1B8 -.long 0xD28000BA, 0x2002F5BA -.long 0xE07C2000, 0x3C04B809 -.long 0x803C1F3C -.long 0xBF8C0F75 -.long 0xD28000BC, 0x2002F9BC -.long 0xD28000BE, 0x2002FDBE -.long 0xE07C2000, 0x3C04BC09 -.long 0x803C1F3C -.long 0xBF8C0F74 -.long 0xD28000C0, 0x200301C0 -.long 0xD28000C2, 0x200305C2 -.long 0xE07C2000, 0x3C04C009 -.long 0x803C1F3C -.long 0xBF8C0F73 -.long 0xD28000C4, 0x200309C4 -.long 0xD28000C6, 0x20030DC6 -.long 0xE07C2000, 0x3C04C409 -.long 0x803C1F3C -.long 0xBF8C0F72 -.long 0xD28000C8, 0x200311C8 -.long 0xD28000CA, 0x200315CA -.long 0xE07C2000, 0x3C04C809 -.long 0x803C1F3C -.long 0xBF8C0F71 -.long 0xD28000CC, 0x200319CC -.long 0xD28000CE, 0x20031DCE -.long 0xE07C2000, 0x3C04CC09 -.long 0x803C1F3C -.long 0xBF8C0F70 -.long 0xD28000D0, 0x200321D0 -.long 0xD28000D2, 
0x200325D2 -.long 0xE07C2000, 0x3C04D009 -.long 0x803C1F3C -.long 0xBE94003C -.long 0xE05C2000, 0x3C045409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C049009 -.long 0xD3D84094, 0x18000120 -.long 0xD3D84095, 0x18000121 -.long 0xD3D84096, 0x18000160 -.long 0xD3D84097, 0x18000161 -.long 0xD3D84098, 0x18000122 -.long 0xD3D84099, 0x18000123 -.long 0xD3D8409A, 0x18000162 -.long 0xD3D8409B, 0x18000163 -.long 0xD3D8409C, 0x18000124 -.long 0xD3D8409D, 0x18000125 -.long 0xD3D8409E, 0x18000164 -.long 0xD3D8409F, 0x18000165 -.long 0xD3D840A0, 0x18000126 -.long 0xD3D840A1, 0x18000127 -.long 0xD3D840A2, 0x18000166 -.long 0xD3D840A3, 0x18000167 -.long 0xD3D840A4, 0x18000128 -.long 0xD3D840A5, 0x18000129 -.long 0xD3D840A6, 0x18000168 -.long 0xD3D840A7, 0x18000169 -.long 0xD3D840A8, 0x1800012A -.long 0xD3D840A9, 0x1800012B -.long 0xD3D840AA, 0x1800016A -.long 0xD3D840AB, 0x1800016B -.long 0xD3D840AC, 0x1800012C -.long 0xD3D840AD, 0x1800012D -.long 0xD3D840AE, 0x1800016C -.long 0xD3D840AF, 0x1800016D -.long 0xD3D840B0, 0x1800012E -.long 0xD3D840B1, 0x1800012F -.long 0xD3D840B2, 0x1800016E -.long 0xD3D840B3, 0x1800016F -.long 0xD3D840B4, 0x18000130 -.long 0xD3D840B5, 0x18000131 -.long 0xD3D840B6, 0x18000170 -.long 0xD3D840B7, 0x18000171 -.long 0xD3D840B8, 0x18000132 -.long 0xD3D840B9, 0x18000133 -.long 0xD3D840BA, 0x18000172 
-.long 0xD3D840BB, 0x18000173 -.long 0xD3D840BC, 0x18000134 -.long 0xD3D840BD, 0x18000135 -.long 0xD3D840BE, 0x18000174 -.long 0xD3D840BF, 0x18000175 -.long 0xD3D840C0, 0x18000136 -.long 0xD3D840C1, 0x18000137 -.long 0xD3D840C2, 0x18000176 -.long 0xD3D840C3, 0x18000177 -.long 0xD3D840C4, 0x18000138 -.long 0xD3D840C5, 0x18000139 -.long 0xD3D840C6, 0x18000178 -.long 0xD3D840C7, 0x18000179 -.long 0xD3D840C8, 0x1800013A -.long 0xD3D840C9, 0x1800013B -.long 0xD3D840CA, 0x1800017A -.long 0xD3D840CB, 0x1800017B -.long 0xD3D840CC, 0x1800013C -.long 0xD3D840CD, 0x1800013D -.long 0xD3D840CE, 0x1800017C -.long 0xD3D840CF, 0x1800017D -.long 0xD3D840D0, 0x1800013E -.long 0xD3D840D1, 0x1800013F -.long 0xD3D840D2, 0x1800017E -.long 0xD3D840D3, 0x1800017F -.long 0xBF800001 -.long 0xBEBC0014 -.long 0xBF8C0F7F -.long 0xD2800094, 0x2002A994 -.long 0xD2800096, 0x2002AD96 -.long 0xE07C2000, 0x3C049409 -.long 0x803C1F3C -.long 0xBF8C0F7E -.long 0xD2800098, 0x2002B198 -.long 0xD280009A, 0x2002B59A -.long 0xE07C2000, 0x3C049809 -.long 0x803C1F3C +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06026914 +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06226916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06426918 +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662691A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682691C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2691E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C26920 +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E26922 +.long 0xD3EE80C0, 0x07026D14 +.long 0xE05C1000, 0x80014406 +.long 0xD3EE80C8, 0x07226D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07426D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07626D1A +.long 0xD3EE80E0, 0x07826D1C 
+.long 0xE0541000, 0x80045408 +.long 0xE0541000, 0x80045609 +.long 0xD3EE80E8, 0x07A26D1E +.long 0xD3EE80F0, 0x07C26D20 +.long 0xD3EE80F8, 0x07E26D22 +.long 0xBEFC001C .long 0xBF8C0F7D -.long 0xD280009C, 0x2002B99C -.long 0xD280009E, 0x2002BD9E -.long 0xE07C2000, 0x3C049C09 -.long 0x803C1F3C -.long 0xBF8C0F7C -.long 0xD28000A0, 0x2002C1A0 -.long 0xD28000A2, 0x2002C5A2 -.long 0xE07C2000, 0x3C04A009 -.long 0x803C1F3C -.long 0xBF8C0F7B -.long 0xD28000A4, 0x2002C9A4 -.long 0xD28000A6, 0x2002CDA6 -.long 0xE07C2000, 0x3C04A409 -.long 0x803C1F3C +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06027124 +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06227126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06427128 +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662712A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682712C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2712E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C27130 +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E27132 +.long 0xD3EE80C0, 0x07027524 +.long 0xE05C1000, 0x80014806 +.long 0xD3EE80C8, 0x07227526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07427528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762752A +.long 0xD3EE80E0, 0x0782752C +.long 0xD3EE80E8, 0x07A2752E +.long 0xD3EE80F0, 0x07C27530 +.long 0xD3EE80F8, 0x07E27532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06027914 +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06227916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06427918 +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662791A +.long 
0xBF8CC67F +.long 0xD3EE80A0, 0x0682791C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2791E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C27920 +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E27922 +.long 0xD3EE80C0, 0x07027D14 +.long 0xE05C1000, 0x80014C06 +.long 0xD3EE80C8, 0x07227D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07427D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07627D1A +.long 0xBF11011E +.long 0xD3D84010, 0x18000100 +.long 0xD3D84011, 0x18000101 +.long 0xD3D84012, 0x18000140 +.long 0xD3D84013, 0x18000141 +.long 0xBF800001 +.long 0xBF9C0000 .long 0xBF8C0F7A -.long 0xD28000A8, 0x2002D1A8 -.long 0xD28000AA, 0x2002D5AA -.long 0xE07C2000, 0x3C04A809 -.long 0x803C1F3C -.long 0xBF8C0F79 -.long 0xD28000AC, 0x2002D9AC -.long 0xD28000AE, 0x2002DDAE -.long 0xE07C2000, 0x3C04AC09 -.long 0x803C1F3C -.long 0xBF8C0F78 -.long 0xD28000B0, 0x2002E1B0 -.long 0xD28000B2, 0x2002E5B2 -.long 0xE07C2000, 0x3C04B009 -.long 0x803C1F3C -.long 0xBF8C0F77 -.long 0xD28000B4, 0x2002E9B4 -.long 0xD28000B6, 0x2002EDB6 -.long 0xE07C2000, 0x3C04B409 -.long 0x803C1F3C -.long 0xBF8C0F76 -.long 0xD28000B8, 0x2002F1B8 -.long 0xD28000BA, 0x2002F5BA -.long 0xE07C2000, 0x3C04B809 -.long 0x803C1F3C -.long 0xBF8C0F75 -.long 0xD28000BC, 0x2002F9BC -.long 0xD28000BE, 0x2002FDBE -.long 0xE07C2000, 0x3C04BC09 -.long 0x803C1F3C -.long 0xBF8C0F74 -.long 0xD28000C0, 0x200301C0 -.long 0xD28000C2, 0x200305C2 -.long 0xE07C2000, 0x3C04C009 -.long 0x803C1F3C -.long 0xBF8C0F73 -.long 0xD28000C4, 0x200309C4 -.long 0xD28000C6, 0x20030DC6 -.long 0xE07C2000, 0x3C04C409 -.long 0x803C1F3C -.long 0xBF8C0F72 -.long 0xD28000C8, 0x200311C8 -.long 0xD28000CA, 0x200315CA -.long 0xE07C2000, 0x3C04C809 -.long 0x803C1F3C -.long 0xBF8C0F71 -.long 0xD28000CC, 0x200319CC -.long 0xD28000CE, 0x20031DCE -.long 0xE07C2000, 0x3C04CC09 -.long 
0x803C1F3C -.long 0xBF8C0F70 -.long 0xD28000D0, 0x200321D0 -.long 0xD28000D2, 0x200325D2 -.long 0xE07C2000, 0x3C04D009 -.long 0x803C1F3C -.long 0xBF820267 -.long 0x80104418 -.long 0x82114519 -.long 0xBEBC0080 -.long 0xBE94003C -.long 0xE05C2000, 0x3C045409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C049009 -.long 0xD3D84094, 0x18000180 -.long 0xD3D84095, 0x18000181 -.long 0xD3D84096, 0x180001C0 -.long 0xD3D84097, 0x180001C1 -.long 0xD3D84098, 0x18000182 -.long 0xD3D84099, 0x18000183 -.long 0xD3D8409A, 0x180001C2 -.long 0xD3D8409B, 0x180001C3 -.long 0xD3D8409C, 0x18000184 -.long 0xD3D8409D, 0x18000185 -.long 0xD3D8409E, 0x180001C4 -.long 0xD3D8409F, 0x180001C5 -.long 0xD3D840A0, 0x18000186 -.long 0xD3D840A1, 0x18000187 -.long 0xD3D840A2, 0x180001C6 -.long 0xD3D840A3, 0x180001C7 -.long 0xD3D840A4, 0x18000188 -.long 0xD3D840A5, 0x18000189 -.long 0xD3D840A6, 0x180001C8 -.long 0xD3D840A7, 0x180001C9 -.long 0xD3D840A8, 0x1800018A -.long 0xD3D840A9, 0x1800018B -.long 0xD3D840AA, 0x180001CA -.long 0xD3D840AB, 0x180001CB -.long 0xD3D840AC, 0x1800018C -.long 0xD3D840AD, 0x1800018D -.long 0xD3D840AE, 0x180001CC -.long 0xD3D840AF, 0x180001CD -.long 0xD3D840B0, 0x1800018E -.long 0xD3D840B1, 0x1800018F -.long 0xD3D840B2, 0x180001CE -.long 0xD3D840B3, 0x180001CF -.long 0xD3D840B4, 0x18000190 -.long 0xD3D840B5, 0x18000191 
-.long 0xD3D840B6, 0x180001D0 -.long 0xD3D840B7, 0x180001D1 -.long 0xD3D840B8, 0x18000192 -.long 0xD3D840B9, 0x18000193 -.long 0xD3D840BA, 0x180001D2 -.long 0xD3D840BB, 0x180001D3 -.long 0xD3D840BC, 0x18000194 -.long 0xD3D840BD, 0x18000195 -.long 0xD3D840BE, 0x180001D4 -.long 0xD3D840BF, 0x180001D5 -.long 0xD3D840C0, 0x18000196 -.long 0xD3D840C1, 0x18000197 -.long 0xD3D840C2, 0x180001D6 -.long 0xD3D840C3, 0x180001D7 -.long 0xD3D840C4, 0x18000198 -.long 0xD3D840C5, 0x18000199 -.long 0xD3D840C6, 0x180001D8 -.long 0xD3D840C7, 0x180001D9 -.long 0xD3D840C8, 0x1800019A -.long 0xD3D840C9, 0x1800019B -.long 0xD3D840CA, 0x180001DA -.long 0xD3D840CB, 0x180001DB -.long 0xD3D840CC, 0x1800019C -.long 0xD3D840CD, 0x1800019D -.long 0xD3D840CE, 0x180001DC -.long 0xD3D840CF, 0x180001DD -.long 0xD3D840D0, 0x1800019E -.long 0xD3D840D1, 0x1800019F -.long 0xD3D840D2, 0x180001DE -.long 0xD3D840D3, 0x180001DF -.long 0xBF800001 -.long 0xBEBC0014 +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xD3EE80E0, 0x07827D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE80E8, 0x07A27D1E +.long 0x80101910 +.long 0x82118011 +.long 0xD3EE80F0, 0x07C27D20 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3EE80F8, 0x07E27D22 +.long 0xBEFC001C .long 0xBF8C0F7F -.long 0xD2800094, 0x2002A994 -.long 0xD2800096, 0x2002AD96 -.long 0xE07C2000, 0x3C049409 -.long 0x803C1F3C -.long 0xBF8C0F7E -.long 0xD2800098, 0x2002B198 -.long 0xD280009A, 0x2002B59A -.long 0xE07C2000, 0x3C049809 -.long 0x803C1F3C +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06028124 +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06228126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06428128 +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662812A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682812C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 
0x80020707 +.long 0xD3EE80A8, 0x06A2812E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C28130 +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E28132 +.long 0xD3EE80C0, 0x07028524 +.long 0xE05C1000, 0x80015006 +.long 0xD3EE80C8, 0x07228526 +.long 0xBE9C0016 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07428528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762852A +.long 0xD3EE80E0, 0x0782852C +.long 0xD3EE80E8, 0x07A2852E +.long 0xD3EE80F0, 0x07C28530 +.long 0xD3EE80F8, 0x07E28532 +.long 0xBEFC001C .long 0xBF8C0F7D -.long 0xD280009C, 0x2002B99C -.long 0xD280009E, 0x2002BD9E -.long 0xE07C2000, 0x3C049C09 -.long 0x803C1F3C -.long 0xBF8C0F7C -.long 0xD28000A0, 0x2002C1A0 -.long 0xD28000A2, 0x2002C5A2 -.long 0xE07C2000, 0x3C04A009 -.long 0x803C1F3C -.long 0xBF8C0F7B -.long 0xD28000A4, 0x2002C9A4 -.long 0xD28000A6, 0x2002CDA6 -.long 0xE07C2000, 0x3C04A409 -.long 0x803C1F3C -.long 0xBF8C0F7A -.long 0xD28000A8, 0x2002D1A8 -.long 0xD28000AA, 0x2002D5AA -.long 0xE07C2000, 0x3C04A809 -.long 0x803C1F3C -.long 0xBF8C0F79 -.long 0xD28000AC, 0x2002D9AC -.long 0xD28000AE, 0x2002DDAE -.long 0xE07C2000, 0x3C04AC09 -.long 0x803C1F3C -.long 0xBF8C0F78 -.long 0xD28000B0, 0x2002E1B0 -.long 0xD28000B2, 0x2002E5B2 -.long 0xE07C2000, 0x3C04B009 -.long 0x803C1F3C -.long 0xBF8C0F77 -.long 0xD28000B4, 0x2002E9B4 -.long 0xD28000B6, 0x2002EDB6 -.long 0xE07C2000, 0x3C04B409 -.long 0x803C1F3C -.long 0xBF8C0F76 -.long 0xD28000B8, 0x2002F1B8 -.long 0xD28000BA, 0x2002F5BA -.long 0xE07C2000, 0x3C04B809 -.long 0x803C1F3C -.long 0xBF8C0F75 -.long 0xD28000BC, 0x2002F9BC -.long 0xD28000BE, 0x2002FDBE -.long 0xE07C2000, 0x3C04BC09 -.long 0x803C1F3C -.long 0xBF8C0F74 -.long 0xD28000C0, 0x200301C0 -.long 0xD28000C2, 0x200305C2 -.long 0xE07C2000, 0x3C04C009 -.long 0x803C1F3C -.long 0xBF8C0F73 -.long 0xD28000C4, 0x200309C4 -.long 0xD28000C6, 0x20030DC6 -.long 0xE07C2000, 0x3C04C409 -.long 0x803C1F3C -.long 0xBF8C0F72 -.long 
0xD28000C8, 0x200311C8 -.long 0xD28000CA, 0x200315CA -.long 0xE07C2000, 0x3C04C809 -.long 0x803C1F3C -.long 0xBF8C0F71 -.long 0xD28000CC, 0x200319CC -.long 0xD28000CE, 0x20031DCE -.long 0xE07C2000, 0x3C04CC09 -.long 0x803C1F3C -.long 0xBF8C0F70 -.long 0xD28000D0, 0x200321D0 -.long 0xD28000D2, 0x200325D2 -.long 0xE07C2000, 0x3C04D009 -.long 0x803C1F3C -.long 0xBE94003C -.long 0xE05C2000, 0x3C045409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C045C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C046C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C047C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048009 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048409 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048809 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C048C09 -.long 0x803C1F3C -.long 0xE05C2000, 0x3C049009 -.long 0xD3D84094, 0x180001A0 -.long 0xD3D84095, 0x180001A1 -.long 0xD3D84096, 0x180001E0 -.long 0xD3D84097, 0x180001E1 -.long 0xD3D84098, 0x180001A2 -.long 0xD3D84099, 0x180001A3 -.long 0xD3D8409A, 0x180001E2 -.long 0xD3D8409B, 0x180001E3 -.long 0xD3D8409C, 0x180001A4 -.long 0xD3D8409D, 0x180001A5 -.long 0xD3D8409E, 0x180001E4 -.long 0xD3D8409F, 0x180001E5 -.long 0xD3D840A0, 0x180001A6 -.long 0xD3D840A1, 0x180001A7 -.long 0xD3D840A2, 0x180001E6 -.long 0xD3D840A3, 0x180001E7 -.long 0xD3D840A4, 0x180001A8 -.long 0xD3D840A5, 0x180001A9 -.long 0xD3D840A6, 0x180001E8 -.long 0xD3D840A7, 0x180001E9 -.long 0xD3D840A8, 0x180001AA -.long 0xD3D840A9, 0x180001AB -.long 0xD3D840AA, 0x180001EA -.long 0xD3D840AB, 0x180001EB -.long 0xD3D840AC, 0x180001AC -.long 0xD3D840AD, 0x180001AD -.long 0xD3D840AE, 0x180001EC -.long 0xD3D840AF, 0x180001ED -.long 0xD3D840B0, 0x180001AE -.long 
0xD3D840B1, 0x180001AF -.long 0xD3D840B2, 0x180001EE -.long 0xD3D840B3, 0x180001EF -.long 0xD3D840B4, 0x180001B0 -.long 0xD3D840B5, 0x180001B1 -.long 0xD3D840B6, 0x180001F0 -.long 0xD3D840B7, 0x180001F1 -.long 0xD3D840B8, 0x180001B2 -.long 0xD3D840B9, 0x180001B3 -.long 0xD3D840BA, 0x180001F2 -.long 0xD3D840BB, 0x180001F3 -.long 0xD3D840BC, 0x180001B4 -.long 0xD3D840BD, 0x180001B5 -.long 0xD3D840BE, 0x180001F4 -.long 0xD3D840BF, 0x180001F5 -.long 0xD3D840C0, 0x180001B6 -.long 0xD3D840C1, 0x180001B7 -.long 0xD3D840C2, 0x180001F6 -.long 0xD3D840C3, 0x180001F7 -.long 0xD3D840C4, 0x180001B8 -.long 0xD3D840C5, 0x180001B9 -.long 0xD3D840C6, 0x180001F8 -.long 0xD3D840C7, 0x180001F9 -.long 0xD3D840C8, 0x180001BA -.long 0xD3D840C9, 0x180001BB -.long 0xD3D840CA, 0x180001FA -.long 0xD3D840CB, 0x180001FB -.long 0xD3D840CC, 0x180001BC -.long 0xD3D840CD, 0x180001BD -.long 0xD3D840CE, 0x180001FC -.long 0xD3D840CF, 0x180001FD -.long 0xD3D840D0, 0x180001BE -.long 0xD3D840D1, 0x180001BF -.long 0xD3D840D2, 0x180001FE -.long 0xD3D840D3, 0x180001FF -.long 0xBF800001 -.long 0xBEBC0014 +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06028914 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06228916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06428918 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662891A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682891C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2891E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C28920 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E28922 +.long 0xD3EE80C0, 0x07028D14 +.long 0xE05C1000, 0x80013406 +.long 0xD3EE80C8, 0x07228D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07428D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 
0x07628D1A +.long 0xD3EE80E0, 0x07828D1C +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0xD3EE80E8, 0x07A28D1E +.long 0xD3EE80F0, 0x07C28D20 +.long 0xD3EE80F8, 0x07E28D22 +.long 0xBEFC001C .long 0xBF8C0F7F -.long 0xD2800094, 0x2002A994 -.long 0xD2800096, 0x2002AD96 -.long 0xE07C2000, 0x3C049409 -.long 0x803C1F3C -.long 0xBF8C0F7E -.long 0xD2800098, 0x2002B198 -.long 0xD280009A, 0x2002B59A -.long 0xE07C2000, 0x3C049809 -.long 0x803C1F3C +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06029124 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06229126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06429128 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662912A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682912C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2912E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C29130 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E29132 +.long 0xD3EE80C0, 0x07029524 +.long 0xE05C1000, 0x80013806 +.long 0xD3EE80C8, 0x07229526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07429528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762952A +.long 0xD3EE80E0, 0x0782952C +.long 0xD3EE80E8, 0x07A2952E +.long 0xD3EE80F0, 0x07C29530 +.long 0xD3EE80F8, 0x07E29532 +.long 0xBEFC001C .long 0xBF8C0F7D -.long 0xD280009C, 0x2002B99C -.long 0xD280009E, 0x2002BD9E -.long 0xE07C2000, 0x3C049C09 -.long 0x803C1F3C -.long 0xBF8C0F7C -.long 0xD28000A0, 0x2002C1A0 -.long 0xD28000A2, 0x2002C5A2 -.long 0xE07C2000, 0x3C04A009 -.long 0x803C1F3C -.long 0xBF8C0F7B -.long 0xD28000A4, 0x2002C9A4 -.long 0xD28000A6, 0x2002CDA6 -.long 0xE07C2000, 0x3C04A409 -.long 0x803C1F3C +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06029914 +.long 0xD8EC70E0, 
0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06229916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06429918 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662991A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682991C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2991E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C29920 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E29922 +.long 0xD3EE80C0, 0x07029D14 +.long 0xE05C1000, 0x80013C06 +.long 0xD3EE80C8, 0x07229D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07429D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07629D1A +.long 0xBF11011E +.long 0xD3D84010, 0x18000102 +.long 0xD3D84011, 0x18000103 +.long 0xD3D84012, 0x18000142 +.long 0xD3D84013, 0x18000143 +.long 0xBF800001 +.long 0xBF9C0000 .long 0xBF8C0F7A -.long 0xD28000A8, 0x2002D1A8 -.long 0xD28000AA, 0x2002D5AA -.long 0xE07C2000, 0x3C04A809 -.long 0x803C1F3C -.long 0xBF8C0F79 -.long 0xD28000AC, 0x2002D9AC -.long 0xD28000AE, 0x2002DDAE -.long 0xE07C2000, 0x3C04AC09 -.long 0x803C1F3C -.long 0xBF8C0F78 -.long 0xD28000B0, 0x2002E1B0 -.long 0xD28000B2, 0x2002E5B2 -.long 0xE07C2000, 0x3C04B009 -.long 0x803C1F3C -.long 0xBF8C0F77 -.long 0xD28000B4, 0x2002E9B4 -.long 0xD28000B6, 0x2002EDB6 -.long 0xE07C2000, 0x3C04B409 -.long 0x803C1F3C -.long 0xBF8C0F76 -.long 0xD28000B8, 0x2002F1B8 -.long 0xD28000BA, 0x2002F5BA -.long 0xE07C2000, 0x3C04B809 -.long 0x803C1F3C -.long 0xBF8C0F75 -.long 0xD28000BC, 0x2002F9BC -.long 0xD28000BE, 0x2002FDBE -.long 0xE07C2000, 0x3C04BC09 -.long 0x803C1F3C -.long 0xBF8C0F74 -.long 0xD28000C0, 0x200301C0 -.long 0xD28000C2, 0x200305C2 -.long 0xE07C2000, 0x3C04C009 -.long 0x803C1F3C -.long 0xBF8C0F73 -.long 0xD28000C4, 0x200309C4 -.long 0xD28000C6, 0x20030DC6 -.long 0xE07C2000, 
0x3C04C409 -.long 0x803C1F3C -.long 0xBF8C0F72 -.long 0xD28000C8, 0x200311C8 -.long 0xD28000CA, 0x200315CA -.long 0xE07C2000, 0x3C04C809 -.long 0x803C1F3C -.long 0xBF8C0F71 -.long 0xD28000CC, 0x200319CC -.long 0xD28000CE, 0x20031DCE -.long 0xE07C2000, 0x3C04CC09 -.long 0x803C1F3C -.long 0xBF8C0F70 -.long 0xD28000D0, 0x200321D0 -.long 0xD28000D2, 0x200325D2 -.long 0xE07C2000, 0x3C04D009 -.long 0x803C1F3C +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xD3EE80E0, 0x07829D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE80E8, 0x07A29D1E +.long 0x80101910 +.long 0x82118011 +.long 0xD3EE80F0, 0x07C29D20 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3EE80F8, 0x07E29D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x0602A124 +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x0622A126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x0642A128 +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662A12A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682A12C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2A12E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C2A130 +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E2A132 +.long 0xD3EE80C0, 0x0702A524 +.long 0xE05C1000, 0x80014006 +.long 0xD3EE80C8, 0x0722A526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x0742A528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762A52A +.long 0xD3EE80E0, 0x0782A52C +.long 0xD3EE80E8, 0x07A2A52E +.long 0xD3EE80F0, 0x07C2A530 +.long 0xD3EE80F8, 0x07E2A532 +.long 0xB71E0004 +.long 0xB71D0008 +.long 0xB31D0078 +.long 0xBF85FD94 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06026914 +.long 0xD8EC1020, 
0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06226916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06426918 +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662691A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682691C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2691E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C26920 +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E26922 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x07026D14 +.long 0xE05C1000, 0x80014406 +.long 0xD3EE80C8, 0x07226D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07426D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07626D1A +.long 0xD3EE80E0, 0x07826D1C +.long 0xE0541000, 0x80045408 +.long 0xE0541000, 0x80045609 +.long 0xD3EE80E8, 0x07A26D1E +.long 0xD3EE80F0, 0x07C26D20 +.long 0xD3EE80F8, 0x07E26D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06027124 +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06227126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06427128 +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662712A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682712C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2712E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C27130 +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E27132 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x07027524 +.long 0xE05C1000, 0x80014806 +.long 0xD3EE80C8, 0x07227526 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07427528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762752A +.long 
0xD3EE80E0, 0x0782752C +.long 0xD3EE80E8, 0x07A2752E +.long 0xD3EE80F0, 0x07C27530 +.long 0xD3EE80F8, 0x07E27532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06027914 +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06227916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06427918 +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662791A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682791C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2791E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C27920 +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E27922 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x07027D14 +.long 0xE05C1000, 0x80014C06 +.long 0xD3EE80C8, 0x07227D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07427D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07627D1A +.long 0xD3D84010, 0x1800013C +.long 0xD3D84011, 0x1800013D +.long 0xD3D84012, 0x1800017C +.long 0xD3D84013, 0x1800017D +.long 0xBF800001 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xD3EE80E0, 0x07827D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE80E8, 0x07A27D1E +.long 0x80101910 +.long 0x82118011 +.long 0xD3EE80F0, 0x07C27D20 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3EE80F8, 0x07E27D22 +.long 0x808C0426 +.long 0x828D0527 +.long 0xBE86000C +.long 0x808C0828 +.long 0x828D0929 +.long 0xBE8A000C +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06028124 +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06228126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06428128 +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 
0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662812A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682812C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2812E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C28130 +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E28132 +.long 0xBE9C0016 +.long 0xD3EE80C0, 0x07028524 +.long 0xE05C1000, 0x80015006 +.long 0xD3EE80C8, 0x07228526 +.long 0xD3EE80D0, 0x07428528 +.long 0xD3EE80D8, 0x0762852A +.long 0xD3EE80E0, 0x0782852C +.long 0xD3EE80E8, 0x07A2852E +.long 0xD3EE80F0, 0x07C28530 +.long 0xD3EE80F8, 0x07E28532 +.long 0xB00C0100 +.long 0x920F320C +.long 0x920EFF18, 0x00000080 +.long 0x920C330E +.long 0x960D330E +.long 0x800C0F0C +.long 0x820D800D +.long 0x8E8C820C +.long 0x920E30FF, 0x00000100 +.long 0x803E0E0C +.long 0x823F800D +.long 0x80A40C2A +.long 0x82A50D2B +.long 0x924EFF32, 0x00000080 +.long 0x80CE4E1F +.long 0x81313831 +.long 0xBF031731 +.long 0xBF85089E +.long 0x814C384C +.long 0x818C494C +.long 0x810D814D +.long 0xBF03494C +.long 0xBECC020C +.long 0xBECD020D +.long 0x960D3631 +.long 0x8F0E9F37 +.long 0x920C0E31 +.long 0x800C0D0C +.long 0x860EFF37, 0x7FFFFFFF +.long 0x8F330E0C +.long 0x92323433 +.long 0x80B23231 +.long 0xBF033534 +.long 0x85463332 +.long 0x85473233 +.long 0xBF033849 +.long 0xBEC6024C +.long 0xBEC7024D +.long 0xBE8F004B +.long 0x960D0F47 +.long 0x920C0F47 +.long 0x8F8C9F0C +.long 0x920D4A0C +.long 0x808D0D47 +.long 0x920D490D +.long 0x800D460D +.long 0xBF09390C +.long 0xBE8F023B +.long 0x850E4A3A +.long 0x96470F0D +.long 0x92460F0D +.long 0x8FC69F46 +.long 0x92470E46 +.long 0x80C7470D +.long 0x920C4A0C +.long 0x80470C47 +.long 0xBF033534 +.long 0x85324746 +.long 0x85334647 +.long 0x8E0C8A32 +.long 0x810C1A0C +.long 0x80040C40 +.long 0x82058041 +.long 0xBE8600FF, 0x80000000 +.long 0x8E0C8A33 +.long 0x800C1B0C +.long 0x80080C42 +.long 0x82098043 +.long 0xBE8A00FF, 0x80000000 +.long 
0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06028914 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06228916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06428918 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662891A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682891C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2891E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C28920 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E28922 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x07028D14 +.long 0xE05C1000, 0x80013406 +.long 0xD3EE80C8, 0x07228D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07428D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07628D1A +.long 0xD3EE80E0, 0x07828D1C +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0xD3EE80E8, 0x07A28D1E +.long 0xD3EE80F0, 0x07C28D20 +.long 0xD3EE80F8, 0x07E28D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06029124 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06229126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06429128 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662912A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682912C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2912E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C29130 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E29132 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x07029524 +.long 0xE05C1000, 0x80013806 +.long 0xD3EE80C8, 0x07229526 +.long 0x80081508 +.long 
0x82098009 +.long 0xD3EE80D0, 0x07429528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762952A +.long 0xD3EE80E0, 0x0782952C +.long 0xD3EE80E8, 0x07A2952E +.long 0xD3EE80F0, 0x07C29530 +.long 0xD3EE80F8, 0x07E29532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06029914 +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x06229916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06429918 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662991A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682991C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2991E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C29920 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E29922 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x07029D14 +.long 0xE05C1000, 0x80013C06 +.long 0xD3EE80C8, 0x07229D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x07429D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x07629D1A +.long 0xD3D84010, 0x1800013E +.long 0xD3D84011, 0x1800013F +.long 0xD3D84012, 0x1800017E +.long 0xD3D84013, 0x1800017F +.long 0xBF800001 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xD3EE80E0, 0x07829D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE80E8, 0x07A29D1E +.long 0xD3EE80F0, 0x07C29D20 +.long 0xD3EE80F8, 0x07E29D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x0602A124 +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8088, 0x0622A126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x0642A128 +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8098, 0x0662A12A +.long 0xBF8CC67F 
+.long 0xD3EE80A0, 0x0682A12C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE80A8, 0x06A2A12E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C2A130 +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE80B8, 0x06E2A132 +.long 0xB71C1020 +.long 0xD3EE80C0, 0x0702A524 +.long 0xE05C1000, 0x80014006 +.long 0xD3EE80C8, 0x0722A526 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE80D0, 0x0742A528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE80D8, 0x0762A52A +.long 0xD3EE80E0, 0x0782A52C +.long 0xD3EE80E8, 0x07A2A52E +.long 0xD3EE80F0, 0x07C2A530 +.long 0xD3EE80F8, 0x07E2A532 +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 0xD0C9000C, 0x00009D0B +.long 0xD1000009, 0x00321B0E +.long 0xD3D94000, 0x18000080 +.long 0xD3D94001, 0x18000080 +.long 0xD3D94002, 0x18000080 +.long 0xD3D94003, 0x18000080 +.long 0xD3D94004, 0x18000080 +.long 0xD3D94005, 0x18000080 +.long 0xD3D94006, 0x18000080 +.long 0xD3D94007, 0x18000080 +.long 0xD3D94008, 0x18000080 +.long 0xD3D94009, 0x18000080 +.long 0xD3D9400A, 0x18000080 +.long 0xD3D9400B, 0x18000080 +.long 0xD3D9400C, 0x18000080 +.long 0xD3D9400D, 0x18000080 +.long 0xD3D9400E, 0x18000080 +.long 0xD3D9400F, 0x18000080 +.long 0xD3D94010, 0x18000080 +.long 0xD3D94011, 0x18000080 +.long 0xD3D94012, 0x18000080 +.long 0xD3D94013, 0x18000080 +.long 0xD3D94014, 0x18000080 +.long 0xD3D94015, 0x18000080 +.long 0xD3D94016, 0x18000080 +.long 0xD3D94017, 0x18000080 +.long 0xD3D94018, 0x18000080 +.long 0xD3D94019, 0x18000080 +.long 0xD3D9401A, 0x18000080 +.long 0xD3D9401B, 0x18000080 +.long 0xD3D9401C, 0x18000080 +.long 0xD3D9401D, 0x18000080 +.long 0xD3D9401E, 0x18000080 +.long 0xD3D9401F, 0x18000080 +.long 0xD3D94020, 0x18000080 +.long 0xD3D94021, 0x18000080 +.long 0xD3D94022, 0x18000080 +.long 0xD3D94023, 0x18000080 +.long 0xD3D94024, 
0x18000080 +.long 0xD3D94025, 0x18000080 +.long 0xD3D94026, 0x18000080 +.long 0xD3D94027, 0x18000080 +.long 0xD3D94028, 0x18000080 +.long 0xD3D94029, 0x18000080 +.long 0xD3D9402A, 0x18000080 +.long 0xD3D9402B, 0x18000080 +.long 0xD3D9402C, 0x18000080 +.long 0xD3D9402D, 0x18000080 +.long 0xD3D9402E, 0x18000080 +.long 0xD3D9402F, 0x18000080 +.long 0xD3D94030, 0x18000080 +.long 0xD3D94031, 0x18000080 +.long 0xD3D94032, 0x18000080 +.long 0xD3D94033, 0x18000080 +.long 0xD3D94034, 0x18000080 +.long 0xD3D94035, 0x18000080 +.long 0xD3D94036, 0x18000080 +.long 0xD3D94037, 0x18000080 +.long 0xD3D94038, 0x18000080 +.long 0xD3D94039, 0x18000080 +.long 0xD3D9403A, 0x18000080 +.long 0xD3D9403B, 0x18000080 +.long 0xD3D9403C, 0x18000080 +.long 0xD3D9403D, 0x18000080 +.long 0xD3D9403E, 0x18000080 +.long 0xD3D9403F, 0x18000080 +.long 0xD3D94040, 0x18000080 +.long 0xD3D94041, 0x18000080 +.long 0xD3D94042, 0x18000080 +.long 0xD3D94043, 0x18000080 +.long 0xD3D94044, 0x18000080 +.long 0xD3D94045, 0x18000080 +.long 0xD3D94046, 0x18000080 +.long 0xD3D94047, 0x18000080 +.long 0xD3D94048, 0x18000080 +.long 0xD3D94049, 0x18000080 +.long 0xD3D9404A, 0x18000080 +.long 0xD3D9404B, 0x18000080 +.long 0xD3D9404C, 0x18000080 +.long 0xD3D9404D, 0x18000080 +.long 0xD3D9404E, 0x18000080 +.long 0xD3D9404F, 0x18000080 +.long 0xD3D94050, 0x18000080 +.long 0xD3D94051, 0x18000080 +.long 0xD3D94052, 0x18000080 +.long 0xD3D94053, 0x18000080 +.long 0xD3D94054, 0x18000080 +.long 0xD3D94055, 0x18000080 +.long 0xD3D94056, 0x18000080 +.long 0xD3D94057, 0x18000080 +.long 0xD3D94058, 0x18000080 +.long 0xD3D94059, 0x18000080 +.long 0xD3D9405A, 0x18000080 +.long 0xD3D9405B, 0x18000080 +.long 0xD3D9405C, 0x18000080 +.long 0xD3D9405D, 0x18000080 +.long 0xD3D9405E, 0x18000080 +.long 0xD3D9405F, 0x18000080 +.long 0xD3D94060, 0x18000080 +.long 0xD3D94061, 0x18000080 +.long 0xD3D94062, 0x18000080 +.long 0xD3D94063, 0x18000080 +.long 0xD3D94064, 0x18000080 +.long 0xD3D94065, 0x18000080 +.long 0xD3D94066, 0x18000080 +.long 
0xD3D94067, 0x18000080 +.long 0xD3D94068, 0x18000080 +.long 0xD3D94069, 0x18000080 +.long 0xD3D9406A, 0x18000080 +.long 0xD3D9406B, 0x18000080 +.long 0xD3D9406C, 0x18000080 +.long 0xD3D9406D, 0x18000080 +.long 0xD3D9406E, 0x18000080 +.long 0xD3D9406F, 0x18000080 +.long 0xD3D94070, 0x18000080 +.long 0xD3D94071, 0x18000080 +.long 0xD3D94072, 0x18000080 +.long 0xD3D94073, 0x18000080 +.long 0xD3D94074, 0x18000080 +.long 0xD3D94075, 0x18000080 +.long 0xD3D94076, 0x18000080 +.long 0xD3D94077, 0x18000080 +.long 0xD3D94078, 0x18000080 +.long 0xD3D94079, 0x18000080 +.long 0xD3D9407A, 0x18000080 +.long 0xD3D9407B, 0x18000080 +.long 0xD3D9407C, 0x18000080 +.long 0xD3D9407D, 0x18000080 +.long 0xD3D9407E, 0x18000080 +.long 0xD3D9407F, 0x18000080 +.long 0xBE9D0080 +.long 0xBE9E0080 +.long 0x8010443E +.long 0x8211453F +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04026914 +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04226916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04426918 +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462691A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482691C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2691E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C26920 +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E26922 +.long 0xD3EE8040, 0x05026D14 +.long 0xE05C1000, 0x80014406 +.long 0xD3EE8048, 0x05226D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05426D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05626D1A +.long 0xD3EE8060, 0x05826D1C +.long 0xE0541000, 0x80045408 +.long 0xE0541000, 0x80045609 +.long 0xD3EE8068, 0x05A26D1E +.long 0xD3EE8070, 0x05C26D20 +.long 0xD3EE8078, 0x05E26D22 +.long 0xBEFC001C +.long 0xBF8C0F7F 
+.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04027124 +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04227126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04427128 +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462712A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482712C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2712E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C27130 +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E27132 +.long 0xD3EE8040, 0x05027524 +.long 0xE05C1000, 0x80014806 +.long 0xD3EE8048, 0x05227526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05427528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562752A +.long 0xD3EE8060, 0x0582752C +.long 0xD3EE8068, 0x05A2752E +.long 0xD3EE8070, 0x05C27530 +.long 0xD3EE8078, 0x05E27532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04027914 +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04227916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04427918 +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462791A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482791C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2791E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C27920 +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E27922 +.long 0xD3EE8040, 0x05027D14 +.long 0xE05C1000, 0x80014C06 +.long 0xD3EE8048, 0x05227D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05427D18 +.long 0x80041404 +.long 0x82058005 +.long 
0xD3EE8058, 0x05627D1A +.long 0xBF11011E +.long 0xD3D84010, 0x18000180 +.long 0xD3D84011, 0x18000181 +.long 0xD3D84012, 0x180001C0 +.long 0xD3D84013, 0x180001C1 +.long 0xBF800001 +.long 0xBF9C0000 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xD3EE8060, 0x05827D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE8068, 0x05A27D1E +.long 0x80101910 +.long 0x82118011 +.long 0xD3EE8070, 0x05C27D20 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3EE8078, 0x05E27D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04028124 +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04228126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04428128 +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462812A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482812C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2812E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C28130 +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E28132 +.long 0xD3EE8040, 0x05028524 +.long 0xE05C1000, 0x80015006 +.long 0xD3EE8048, 0x05228526 +.long 0xBE9C0016 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05428528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562852A +.long 0xD3EE8060, 0x0582852C +.long 0xD3EE8068, 0x05A2852E +.long 0xD3EE8070, 0x05C28530 +.long 0xD3EE8078, 0x05E28532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04028914 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04228916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04428918 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 
0x0462891A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482891C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2891E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C28920 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E28922 +.long 0xD3EE8040, 0x05028D14 +.long 0xE05C1000, 0x80013406 +.long 0xD3EE8048, 0x05228D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05428D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05628D1A +.long 0xD3EE8060, 0x05828D1C +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0xD3EE8068, 0x05A28D1E +.long 0xD3EE8070, 0x05C28D20 +.long 0xD3EE8078, 0x05E28D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04029124 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04229126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04429128 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462912A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482912C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2912E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C29130 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E29132 +.long 0xD3EE8040, 0x05029524 +.long 0xE05C1000, 0x80013806 +.long 0xD3EE8048, 0x05229526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05429528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562952A +.long 0xD3EE8060, 0x0582952C +.long 0xD3EE8068, 0x05A2952E +.long 0xD3EE8070, 0x05C29530 +.long 0xD3EE8078, 0x05E29532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04029914 +.long 0xD8EC70E0, 0x2400000F +.long 
0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04229916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04429918 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462991A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482991C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2991E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C29920 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E29922 +.long 0xD3EE8040, 0x05029D14 +.long 0xE05C1000, 0x80013C06 +.long 0xD3EE8048, 0x05229D16 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05429D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05629D1A +.long 0xBF11011E +.long 0xD3D84010, 0x18000182 +.long 0xD3D84011, 0x18000183 +.long 0xD3D84012, 0x180001C2 +.long 0xD3D84013, 0x180001C3 +.long 0xBF800001 +.long 0xBF9C0000 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xD3EE8060, 0x05829D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE8068, 0x05A29D1E +.long 0x80101910 +.long 0x82118011 +.long 0xD3EE8070, 0x05C29D20 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3EE8078, 0x05E29D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x0402A124 +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x0422A126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x0442A128 +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462A12A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482A12C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2A12E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C2A130 +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F 
+.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E2A132 +.long 0xD3EE8040, 0x0502A524 +.long 0xE05C1000, 0x80014006 +.long 0xD3EE8048, 0x0522A526 +.long 0xB71C1020 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x0542A528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562A52A +.long 0xD3EE8060, 0x0582A52C +.long 0xD3EE8068, 0x05A2A52E +.long 0xD3EE8070, 0x05C2A530 +.long 0xD3EE8078, 0x05E2A532 +.long 0xB71E0004 +.long 0xB71D0008 +.long 0xB31D0078 +.long 0xBF85FD94 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04026914 +.long 0xD8EC1020, 0x2400000F +.long 0xD8EC10A0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04226916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04426918 +.long 0xD8EC1120, 0x2800000F +.long 0xD8EC11A0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462691A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482691C +.long 0xD8EC1220, 0x2C00000F +.long 0xD8EC12A0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2691E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C26920 +.long 0xD8EC1320, 0x3000000F +.long 0xD8EC13A0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E26922 +.long 0xB71C1020 +.long 0xD3EE8040, 0x05026D14 +.long 0xE05C1000, 0x80014406 +.long 0xD3EE8048, 0x05226D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05426D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05626D1A +.long 0xD3EE8060, 0x05826D1C +.long 0xE0541000, 0x80045408 +.long 0xE0541000, 0x80045609 +.long 0xD3EE8068, 0x05A26D1E +.long 0xD3EE8070, 0x05C26D20 +.long 0xD3EE8078, 0x05E26D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04027124 +.long 0xD8EC2040, 0x1400000F +.long 0xD8EC20C0, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04227126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04427128 +.long 0xD8EC2140, 0x1800000F +.long 0xD8EC21C0, 0x1A00000F +.long 0xE0511100, 0x80020707 
+.long 0xD3EE8018, 0x0462712A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482712C +.long 0xD8EC2240, 0x1C00000F +.long 0xD8EC22C0, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2712E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C27130 +.long 0xD8EC2340, 0x2000000F +.long 0xD8EC23C0, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E27132 +.long 0xB71C1020 +.long 0xD3EE8040, 0x05027524 +.long 0xE05C1000, 0x80014806 +.long 0xD3EE8048, 0x05227526 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05427528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562752A +.long 0xD3EE8060, 0x0582752C +.long 0xD3EE8068, 0x05A2752E +.long 0xD3EE8070, 0x05C27530 +.long 0xD3EE8078, 0x05E27532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04027914 +.long 0xD8EC3060, 0x2400000F +.long 0xD8EC30E0, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04227916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04427918 +.long 0xD8EC3160, 0x2800000F +.long 0xD8EC31E0, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462791A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482791C +.long 0xD8EC3260, 0x2C00000F +.long 0xD8EC32E0, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2791E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C27920 +.long 0xD8EC3360, 0x3000000F +.long 0xD8EC33E0, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E27922 +.long 0xB71C1020 +.long 0xD3EE8040, 0x05027D14 +.long 0xE05C1000, 0x80014C06 +.long 0xD3EE8048, 0x05227D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05427D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05627D1A +.long 0xD3D84010, 0x180001BC +.long 0xD3D84011, 0x180001BD +.long 0xD3D84012, 0x180001FC +.long 0xD3D84013, 0x180001FD +.long 0xBF800001 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xD3EE8060, 0x05827D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 
+.long 0xD3EE8068, 0x05A27D1E +.long 0x80101910 +.long 0x82118011 +.long 0xD3EE8070, 0x05C27D20 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3EE8078, 0x05E27D22 +.long 0x808C0426 +.long 0x828D0527 +.long 0xBE86000C +.long 0x808C0828 +.long 0x828D0929 +.long 0xBE8A000C +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04028124 +.long 0xD8EC4080, 0x1400000F +.long 0xD8EC4100, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04228126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04428128 +.long 0xD8EC4180, 0x1800000F +.long 0xD8EC4200, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462812A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482812C +.long 0xD8EC4280, 0x1C00000F +.long 0xD8EC4300, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2812E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C28130 +.long 0xD8EC4380, 0x2000000F +.long 0xD8EC4400, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E28132 +.long 0xBE9C0016 +.long 0xD3EE8040, 0x05028524 +.long 0xE05C1000, 0x80015006 +.long 0xD3EE8048, 0x05228526 +.long 0xD3EE8050, 0x05428528 +.long 0xD3EE8058, 0x0562852A +.long 0xD3EE8060, 0x0582852C +.long 0xD3EE8068, 0x05A2852E +.long 0xD3EE8070, 0x05C28530 +.long 0xD3EE8078, 0x05E28532 +.long 0xB00C0100 +.long 0x920F320C +.long 0x920EFF18, 0x00000080 +.long 0x920C330E +.long 0x960D330E +.long 0x800C0F0C +.long 0x820D800D +.long 0x8E8C820C +.long 0x920E30FF, 0x00000100 +.long 0x803E0E0C +.long 0x823F800D +.long 0x80A40C2A +.long 0x82A50D2B +.long 0x924EFF32, 0x00000080 +.long 0x80CE4E1F +.long 0x81313831 +.long 0xBF031731 +.long 0xBF850360 +.long 0x814C384C +.long 0x818C494C +.long 0x810D814D +.long 0xBF03494C +.long 0xBECC020C +.long 0xBECD020D +.long 0x960D3631 +.long 0x8F0E9F37 +.long 0x920C0E31 +.long 0x800C0D0C +.long 0x860EFF37, 0x7FFFFFFF +.long 0x8F330E0C +.long 0x92323433 +.long 0x80B23231 +.long 0xBF033534 +.long 0x85463332 +.long 0x85473233 +.long 0xBF033849 +.long 0xBEC6024C 
+.long 0xBEC7024D +.long 0xBE8F004B +.long 0x960D0F47 +.long 0x920C0F47 +.long 0x8F8C9F0C +.long 0x920D4A0C +.long 0x808D0D47 +.long 0x920D490D +.long 0x800D460D +.long 0xBF09390C +.long 0xBE8F023B +.long 0x850E4A3A +.long 0x96470F0D +.long 0x92460F0D +.long 0x8FC69F46 +.long 0x92470E46 +.long 0x80C7470D +.long 0x920C4A0C +.long 0x80470C47 +.long 0xBF033534 +.long 0x85324746 +.long 0x85334647 +.long 0x8E0C8A32 +.long 0x810C1A0C +.long 0x80040C40 +.long 0x82058041 +.long 0xBE8600FF, 0x80000000 +.long 0x8E0C8A33 +.long 0x800C1B0C +.long 0x80080C42 +.long 0x82098043 +.long 0xBE8A00FF, 0x80000000 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04028914 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04228916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04428918 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462891A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482891C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2891E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C28920 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E28922 +.long 0xB71C1020 +.long 0xD3EE8040, 0x05028D14 +.long 0xE05C1000, 0x80013406 +.long 0xD3EE8048, 0x05228D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05428D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05628D1A +.long 0xD3EE8060, 0x05828D1C +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0xD3EE8068, 0x05A28D1E +.long 0xD3EE8070, 0x05C28D20 +.long 0xD3EE8078, 0x05E28D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04029124 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04229126 +.long 0xBF8CC67F +.long 
0xD3EE8010, 0x04429128 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462912A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482912C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2912E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C29130 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E29132 +.long 0xB71C1020 +.long 0xD3EE8040, 0x05029524 +.long 0xE05C1000, 0x80013806 +.long 0xD3EE8048, 0x05229526 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05429528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562952A +.long 0xD3EE8060, 0x0582952C +.long 0xD3EE8068, 0x05A2952E +.long 0xD3EE8070, 0x05C29530 +.long 0xD3EE8078, 0x05E29532 +.long 0xBEFC001C +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04029914 +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x04229916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04429918 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462991A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482991C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2991E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C29920 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E29922 +.long 0xB71C1020 +.long 0xD3EE8040, 0x05029D14 +.long 0xE05C1000, 0x80013C06 +.long 0xD3EE8048, 0x05229D16 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x05429D18 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x05629D1A +.long 0xD3D84010, 0x180001BE +.long 0xD3D84011, 0x180001BF +.long 0xD3D84012, 0x180001FE +.long 0xD3D84013, 0x180001FF +.long 0xBF800001 +.long 0xBF8C0F7A +.long 0xD2800010, 0x2002B110 +.long 
0xD2800012, 0x2002B512 +.long 0xD3EE8060, 0x05829D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE8068, 0x05A29D1E +.long 0xD3EE8070, 0x05C29D20 +.long 0xD3EE8078, 0x05E29D22 +.long 0xBEFC001C +.long 0xBF8C0F7F +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x0402A124 +.long 0xD8EC0000, 0x1400000F +.long 0xD8EC0080, 0x1600000F +.long 0xE0511000, 0x80020707 +.long 0xD3EE8008, 0x0422A126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x0442A128 +.long 0xD8EC0100, 0x1800000F +.long 0xD8EC0180, 0x1A00000F +.long 0xE0511100, 0x80020707 +.long 0xD3EE8018, 0x0462A12A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482A12C +.long 0xD8EC0200, 0x1C00000F +.long 0xD8EC0280, 0x1E00000F +.long 0xE0511200, 0x80020707 +.long 0xD3EE8028, 0x04A2A12E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C2A130 +.long 0xD8EC0300, 0x2000000F +.long 0xD8EC0380, 0x2200000F +.long 0xE0511300, 0x80020707 +.long 0xD3EE8038, 0x04E2A132 +.long 0xB71C1020 +.long 0xD3EE8040, 0x0502A524 +.long 0xE05C1000, 0x80014006 +.long 0xD3EE8048, 0x0522A526 +.long 0x80081508 +.long 0x82098009 +.long 0xD3EE8050, 0x0542A528 +.long 0x80041404 +.long 0x82058005 +.long 0xD3EE8058, 0x0562A52A +.long 0xD3EE8060, 0x0582A52C +.long 0xD3EE8068, 0x05A2A52E +.long 0xD3EE8070, 0x05C2A530 +.long 0xD3EE8078, 0x05E2A532 +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 0xD0C9000C, 0x00009D0B +.long 0xD1000009, 0x00321B0E +.long 0xD3D94080, 0x18000080 +.long 0xD3D94081, 0x18000080 +.long 0xD3D94082, 0x18000080 +.long 0xD3D94083, 0x18000080 +.long 0xD3D94084, 0x18000080 +.long 0xD3D94085, 0x18000080 +.long 0xD3D94086, 0x18000080 +.long 0xD3D94087, 0x18000080 +.long 0xD3D94088, 0x18000080 +.long 0xD3D94089, 0x18000080 +.long 0xD3D9408A, 0x18000080 +.long 0xD3D9408B, 0x18000080 +.long 0xD3D9408C, 0x18000080 +.long 0xD3D9408D, 0x18000080 +.long 0xD3D9408E, 0x18000080 +.long 0xD3D9408F, 0x18000080 +.long 
0xD3D94090, 0x18000080 +.long 0xD3D94091, 0x18000080 +.long 0xD3D94092, 0x18000080 +.long 0xD3D94093, 0x18000080 +.long 0xD3D94094, 0x18000080 +.long 0xD3D94095, 0x18000080 +.long 0xD3D94096, 0x18000080 +.long 0xD3D94097, 0x18000080 +.long 0xD3D94098, 0x18000080 +.long 0xD3D94099, 0x18000080 +.long 0xD3D9409A, 0x18000080 +.long 0xD3D9409B, 0x18000080 +.long 0xD3D9409C, 0x18000080 +.long 0xD3D9409D, 0x18000080 +.long 0xD3D9409E, 0x18000080 +.long 0xD3D9409F, 0x18000080 +.long 0xD3D940A0, 0x18000080 +.long 0xD3D940A1, 0x18000080 +.long 0xD3D940A2, 0x18000080 +.long 0xD3D940A3, 0x18000080 +.long 0xD3D940A4, 0x18000080 +.long 0xD3D940A5, 0x18000080 +.long 0xD3D940A6, 0x18000080 +.long 0xD3D940A7, 0x18000080 +.long 0xD3D940A8, 0x18000080 +.long 0xD3D940A9, 0x18000080 +.long 0xD3D940AA, 0x18000080 +.long 0xD3D940AB, 0x18000080 +.long 0xD3D940AC, 0x18000080 +.long 0xD3D940AD, 0x18000080 +.long 0xD3D940AE, 0x18000080 +.long 0xD3D940AF, 0x18000080 +.long 0xD3D940B0, 0x18000080 +.long 0xD3D940B1, 0x18000080 +.long 0xD3D940B2, 0x18000080 +.long 0xD3D940B3, 0x18000080 +.long 0xD3D940B4, 0x18000080 +.long 0xD3D940B5, 0x18000080 +.long 0xD3D940B6, 0x18000080 +.long 0xD3D940B7, 0x18000080 +.long 0xD3D940B8, 0x18000080 +.long 0xD3D940B9, 0x18000080 +.long 0xD3D940BA, 0x18000080 +.long 0xD3D940BB, 0x18000080 +.long 0xD3D940BC, 0x18000080 +.long 0xD3D940BD, 0x18000080 +.long 0xD3D940BE, 0x18000080 +.long 0xD3D940BF, 0x18000080 +.long 0xD3D940C0, 0x18000080 +.long 0xD3D940C1, 0x18000080 +.long 0xD3D940C2, 0x18000080 +.long 0xD3D940C3, 0x18000080 +.long 0xD3D940C4, 0x18000080 +.long 0xD3D940C5, 0x18000080 +.long 0xD3D940C6, 0x18000080 +.long 0xD3D940C7, 0x18000080 +.long 0xD3D940C8, 0x18000080 +.long 0xD3D940C9, 0x18000080 +.long 0xD3D940CA, 0x18000080 +.long 0xD3D940CB, 0x18000080 +.long 0xD3D940CC, 0x18000080 +.long 0xD3D940CD, 0x18000080 +.long 0xD3D940CE, 0x18000080 +.long 0xD3D940CF, 0x18000080 +.long 0xD3D940D0, 0x18000080 +.long 0xD3D940D1, 0x18000080 +.long 0xD3D940D2, 
0x18000080 +.long 0xD3D940D3, 0x18000080 +.long 0xD3D940D4, 0x18000080 +.long 0xD3D940D5, 0x18000080 +.long 0xD3D940D6, 0x18000080 +.long 0xD3D940D7, 0x18000080 +.long 0xD3D940D8, 0x18000080 +.long 0xD3D940D9, 0x18000080 +.long 0xD3D940DA, 0x18000080 +.long 0xD3D940DB, 0x18000080 +.long 0xD3D940DC, 0x18000080 +.long 0xD3D940DD, 0x18000080 +.long 0xD3D940DE, 0x18000080 +.long 0xD3D940DF, 0x18000080 +.long 0xD3D940E0, 0x18000080 +.long 0xD3D940E1, 0x18000080 +.long 0xD3D940E2, 0x18000080 +.long 0xD3D940E3, 0x18000080 +.long 0xD3D940E4, 0x18000080 +.long 0xD3D940E5, 0x18000080 +.long 0xD3D940E6, 0x18000080 +.long 0xD3D940E7, 0x18000080 +.long 0xD3D940E8, 0x18000080 +.long 0xD3D940E9, 0x18000080 +.long 0xD3D940EA, 0x18000080 +.long 0xD3D940EB, 0x18000080 +.long 0xD3D940EC, 0x18000080 +.long 0xD3D940ED, 0x18000080 +.long 0xD3D940EE, 0x18000080 +.long 0xD3D940EF, 0x18000080 +.long 0xD3D940F0, 0x18000080 +.long 0xD3D940F1, 0x18000080 +.long 0xD3D940F2, 0x18000080 +.long 0xD3D940F3, 0x18000080 +.long 0xD3D940F4, 0x18000080 +.long 0xD3D940F5, 0x18000080 +.long 0xD3D940F6, 0x18000080 +.long 0xD3D940F7, 0x18000080 +.long 0xD3D940F8, 0x18000080 +.long 0xD3D940F9, 0x18000080 +.long 0xD3D940FA, 0x18000080 +.long 0xD3D940FB, 0x18000080 +.long 0xD3D940FC, 0x18000080 +.long 0xD3D940FD, 0x18000080 +.long 0xD3D940FE, 0x18000080 +.long 0xD3D940FF, 0x18000080 +.long 0xBF82F3A9 +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06028914 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xD3EE8088, 0x06228916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06428918 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xD3EE8098, 0x0662891A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682891C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xD3EE80A8, 0x06A2891E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C28920 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xD3EE80B8, 0x06E28922 +.long 0xD3EE80C0, 0x07028D14 
+.long 0xD3EE80C8, 0x07228D16 +.long 0xD3EE80D0, 0x07428D18 +.long 0xD3EE80D8, 0x07628D1A +.long 0xD3EE80E0, 0x07828D1C +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0xD3EE80E8, 0x07A28D1E +.long 0xD3EE80F0, 0x07C28D20 +.long 0xD3EE80F8, 0x07E28D22 +.long 0xBF8C0F7A +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06029124 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xD3EE8088, 0x06229126 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06429128 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xD3EE8098, 0x0662912A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682912C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xD3EE80A8, 0x06A2912E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C29130 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xD3EE80B8, 0x06E29132 +.long 0xD3EE80C0, 0x07029524 +.long 0xD3EE80C8, 0x07229526 +.long 0xD3EE80D0, 0x07429528 +.long 0xD3EE80D8, 0x0762952A +.long 0xD3EE80E0, 0x0782952C +.long 0xD3EE80E8, 0x07A2952E +.long 0xD3EE80F0, 0x07C29530 +.long 0xD3EE80F8, 0x07E29532 +.long 0xBF8C0F73 +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x06029914 +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xD3EE8088, 0x06229916 +.long 0xBF8CC67F +.long 0xD3EE8090, 0x06429918 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xD3EE8098, 0x0662991A +.long 0xBF8CC67F +.long 0xD3EE80A0, 0x0682991C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xD3EE80A8, 0x06A2991E +.long 0xBF8CC67F +.long 0xD3EE80B0, 0x06C29920 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xD3EE80B8, 0x06E29922 +.long 0xD3EE80C0, 0x07029D14 +.long 0xD3EE80C8, 0x07229D16 +.long 0xD3EE80D0, 0x07429D18 +.long 0xD3EE80D8, 0x07629D1A +.long 0xD3D84010, 0x1800013E +.long 0xD3D84011, 0x1800013F +.long 0xD3D84012, 0x1800017E +.long 0xD3D84013, 0x1800017F +.long 0xBF800001 +.long 0xBF8C0F70 +.long 0xD2800010, 0x2002B110 +.long 
0xD2800012, 0x2002B512 +.long 0xD3EE80E0, 0x07829D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE80E8, 0x07A29D1E +.long 0xD3EE80F0, 0x07C29D20 +.long 0xD3EE80F8, 0x07E29D22 +.long 0xBF8C0F74 +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8080, 0x0602A124 +.long 0xD3EE8088, 0x0622A126 +.long 0xBF8CC47F +.long 0xD3EE8090, 0x0642A128 +.long 0xD3EE8098, 0x0662A12A +.long 0xBF8CC27F +.long 0xD3EE80A0, 0x0682A12C +.long 0xD3EE80A8, 0x06A2A12E +.long 0xBF8CC07F +.long 0xD3EE80B0, 0x06C2A130 +.long 0xD3EE80B8, 0x06E2A132 +.long 0xD3EE80C0, 0x0702A524 +.long 0xD3EE80C8, 0x0722A526 +.long 0xD3EE80D0, 0x0742A528 +.long 0xD3EE80D8, 0x0762A52A +.long 0xD3EE80E0, 0x0782A52C +.long 0xD3EE80E8, 0x07A2A52E +.long 0xD3EE80F0, 0x07C2A530 +.long 0xD3EE80F8, 0x07E2A532 +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 0xD0C9000C, 0x00009D0B +.long 0xD1000009, 0x00321B0E +.long 0xBF8204D3 +.long 0xBF8C0F7D +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04028914 +.long 0xD8EC50A0, 0x2400000F +.long 0xD8EC5120, 0x2600000F +.long 0xD3EE8008, 0x04228916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04428918 +.long 0xD8EC51A0, 0x2800000F +.long 0xD8EC5220, 0x2A00000F +.long 0xD3EE8018, 0x0462891A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482891C +.long 0xD8EC52A0, 0x2C00000F +.long 0xD8EC5320, 0x2E00000F +.long 0xD3EE8028, 0x04A2891E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C28920 +.long 0xD8EC53A0, 0x3000000F +.long 0xD8EC5420, 0x3200000F +.long 0xD3EE8038, 0x04E28922 +.long 0xD3EE8040, 0x05028D14 +.long 0xD3EE8048, 0x05228D16 +.long 0xD3EE8050, 0x05428D18 +.long 0xD3EE8058, 0x05628D1A +.long 0xD3EE8060, 0x05828D1C +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0xD3EE8068, 0x05A28D1E +.long 0xD3EE8070, 0x05C28D20 +.long 0xD3EE8078, 0x05E28D22 +.long 0xBF8C0F7A +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 
0x04029124 +.long 0xD8EC60C0, 0x1400000F +.long 0xD8EC6140, 0x1600000F +.long 0xD3EE8008, 0x04229126 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04429128 +.long 0xD8EC61C0, 0x1800000F +.long 0xD8EC6240, 0x1A00000F +.long 0xD3EE8018, 0x0462912A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482912C +.long 0xD8EC62C0, 0x1C00000F +.long 0xD8EC6340, 0x1E00000F +.long 0xD3EE8028, 0x04A2912E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C29130 +.long 0xD8EC63C0, 0x2000000F +.long 0xD8EC6440, 0x2200000F +.long 0xD3EE8038, 0x04E29132 +.long 0xD3EE8040, 0x05029524 +.long 0xD3EE8048, 0x05229526 +.long 0xD3EE8050, 0x05429528 +.long 0xD3EE8058, 0x0562952A +.long 0xD3EE8060, 0x0582952C +.long 0xD3EE8068, 0x05A2952E +.long 0xD3EE8070, 0x05C29530 +.long 0xD3EE8078, 0x05E29532 +.long 0xBF8C0F73 +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x04029914 +.long 0xD8EC70E0, 0x2400000F +.long 0xD8EC7160, 0x2600000F +.long 0xD3EE8008, 0x04229916 +.long 0xBF8CC67F +.long 0xD3EE8010, 0x04429918 +.long 0xD8EC71E0, 0x2800000F +.long 0xD8EC7260, 0x2A00000F +.long 0xD3EE8018, 0x0462991A +.long 0xBF8CC67F +.long 0xD3EE8020, 0x0482991C +.long 0xD8EC72E0, 0x2C00000F +.long 0xD8EC7360, 0x2E00000F +.long 0xD3EE8028, 0x04A2991E +.long 0xBF8CC67F +.long 0xD3EE8030, 0x04C29920 +.long 0xD8EC73E0, 0x3000000F +.long 0xD8EC7460, 0x3200000F +.long 0xD3EE8038, 0x04E29922 +.long 0xD3EE8040, 0x05029D14 +.long 0xD3EE8048, 0x05229D16 +.long 0xD3EE8050, 0x05429D18 +.long 0xD3EE8058, 0x05629D1A +.long 0xD3D84010, 0x180001BE +.long 0xD3D84011, 0x180001BF +.long 0xD3D84012, 0x180001FE +.long 0xD3D84013, 0x180001FF +.long 0xBF800001 +.long 0xBF8C0F70 +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xD3EE8060, 0x05829D1C +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xD3EE8068, 0x05A29D1E +.long 0xD3EE8070, 0x05C29D20 +.long 0xD3EE8078, 0x05E29D22 +.long 0xBF8C0F74 +.long 0xBF8A0000 +.long 0xBF8CC67F +.long 0xD3EE8000, 0x0402A124 +.long 0xD3EE8008, 0x0422A126 +.long 0xBF8CC47F +.long 
0xD3EE8010, 0x0442A128 +.long 0xD3EE8018, 0x0462A12A +.long 0xBF8CC27F +.long 0xD3EE8020, 0x0482A12C +.long 0xD3EE8028, 0x04A2A12E +.long 0xBF8CC07F +.long 0xD3EE8030, 0x04C2A130 +.long 0xD3EE8038, 0x04E2A132 +.long 0xD3EE8040, 0x0502A524 +.long 0xD3EE8048, 0x0522A526 +.long 0xD3EE8050, 0x0542A528 +.long 0xD3EE8058, 0x0562A52A +.long 0xD3EE8060, 0x0582A52C +.long 0xD3EE8068, 0x05A2A52E +.long 0xD3EE8070, 0x05C2A530 +.long 0xD3EE8078, 0x05E2A532 +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xD0C9000C, 0x00009D0A +.long 0xD1000008, 0x0032190E +.long 0xD0C9000C, 0x00009D0B +.long 0xD1000009, 0x00321B0E +.long 0x8010443E +.long 0x8211453F +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xE0541000, 0x80045408 +.long 0xE0541000, 0x80045609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80045C08 +.long 0xE0541000, 0x80045E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046008 +.long 0xE0541000, 0x80046209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046408 +.long 0xE0541000, 0x80046609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046808 +.long 0xE0541000, 0x80046A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046C08 +.long 0xE0541000, 0x80046E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047008 +.long 0xE0541000, 0x80047209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047408 +.long 0xE0541000, 0x80047609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 
+.long 0xE0541000, 0x80047808 +.long 0xE0541000, 0x80047A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047C08 +.long 0xE0541000, 0x80047E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048008 +.long 0xE0541000, 0x80048209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048408 +.long 0xE0541000, 0x80048609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048808 +.long 0xE0541000, 0x80048A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048C08 +.long 0xE0541000, 0x80048E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049008 +.long 0xE0541000, 0x80049209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049408 +.long 0xE0541000, 0x80049609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049808 +.long 0xE0541000, 0x80049A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049C08 +.long 0xE0541000, 0x80049E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004A008 +.long 0xE0541000, 0x8004A209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004A408 +.long 0xE0541000, 0x8004A609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004A808 +.long 0xE0541000, 0x8004AA09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004AC08 +.long 0xE0541000, 0x8004AE09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004B008 +.long 0xE0541000, 0x8004B209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 
0x8004B408 +.long 0xE0541000, 0x8004B609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004B808 +.long 0xE0541000, 0x8004BA09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004BC08 +.long 0xE0541000, 0x8004BE09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004C008 +.long 0xE0541000, 0x8004C209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004C408 +.long 0xE0541000, 0x8004C609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004C808 +.long 0xE0541000, 0x8004CA09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004CC08 +.long 0xE0541000, 0x8004CE09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004D008 +.long 0xE0541000, 0x8004D209 +.long 0x8010443E +.long 0x8211453F +.long 0xBE9200FF, 0x80000000 +.long 0xBF068025 +.long 0xBE920224 +.long 0xD3D84010, 0x18000100 +.long 0xD3D84011, 0x18000101 +.long 0xD3D84012, 0x18000140 +.long 0xD3D84013, 0x18000141 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000102 +.long 0xD3D84011, 0x18000103 +.long 0xD3D84012, 0x18000142 +.long 0xD3D84013, 0x18000143 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000104 +.long 0xD3D84011, 0x18000105 +.long 0xD3D84012, 0x18000144 +.long 0xD3D84013, 0x18000145 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002B910 +.long 
0xD2800012, 0x2002BD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000106 +.long 0xD3D84011, 0x18000107 +.long 0xD3D84012, 0x18000146 +.long 0xD3D84013, 0x18000147 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002C110 +.long 0xD2800012, 0x2002C512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000108 +.long 0xD3D84011, 0x18000109 +.long 0xD3D84012, 0x18000148 +.long 0xD3D84013, 0x18000149 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002C910 +.long 0xD2800012, 0x2002CD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800010A +.long 0xD3D84011, 0x1800010B +.long 0xD3D84012, 0x1800014A +.long 0xD3D84013, 0x1800014B +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002D110 +.long 0xD2800012, 0x2002D512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800010C +.long 0xD3D84011, 0x1800010D +.long 0xD3D84012, 0x1800014C +.long 0xD3D84013, 0x1800014D +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002D910 +.long 0xD2800012, 0x2002DD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800010E +.long 0xD3D84011, 0x1800010F +.long 0xD3D84012, 0x1800014E +.long 0xD3D84013, 0x1800014F +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002E110 +.long 0xD2800012, 0x2002E512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 
0xBE920280 +.long 0xD3D84010, 0x18000110 +.long 0xD3D84011, 0x18000111 +.long 0xD3D84012, 0x18000150 +.long 0xD3D84013, 0x18000151 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002E910 +.long 0xD2800012, 0x2002ED12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000112 +.long 0xD3D84011, 0x18000113 +.long 0xD3D84012, 0x18000152 +.long 0xD3D84013, 0x18000153 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002F110 +.long 0xD2800012, 0x2002F512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000114 +.long 0xD3D84011, 0x18000115 +.long 0xD3D84012, 0x18000154 +.long 0xD3D84013, 0x18000155 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002F910 +.long 0xD2800012, 0x2002FD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000116 +.long 0xD3D84011, 0x18000117 +.long 0xD3D84012, 0x18000156 +.long 0xD3D84013, 0x18000157 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20030110 +.long 0xD2800012, 0x20030512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000118 +.long 0xD3D84011, 0x18000119 +.long 0xD3D84012, 0x18000158 +.long 0xD3D84013, 0x18000159 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20030910 +.long 0xD2800012, 0x20030D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800011A +.long 0xD3D84011, 0x1800011B +.long 0xD3D84012, 0x1800015A +.long 0xD3D84013, 0x1800015B +.long 0xBF800001 +.long 0xBF8C4F7E 
+.long 0xD2800010, 0x20031110 +.long 0xD2800012, 0x20031512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800011C +.long 0xD3D84011, 0x1800011D +.long 0xD3D84012, 0x1800015C +.long 0xD3D84013, 0x1800015D +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20031910 +.long 0xD2800012, 0x20031D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800011E +.long 0xD3D84011, 0x1800011F +.long 0xD3D84012, 0x1800015E +.long 0xD3D84013, 0x1800015F +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20032110 +.long 0xD2800012, 0x20032512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000120 +.long 0xD3D84011, 0x18000121 +.long 0xD3D84012, 0x18000160 +.long 0xD3D84013, 0x18000161 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20032910 +.long 0xD2800012, 0x20032D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000122 +.long 0xD3D84011, 0x18000123 +.long 0xD3D84012, 0x18000162 +.long 0xD3D84013, 0x18000163 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20033110 +.long 0xD2800012, 0x20033512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000124 +.long 0xD3D84011, 0x18000125 +.long 0xD3D84012, 0x18000164 +.long 0xD3D84013, 0x18000165 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20033910 +.long 0xD2800012, 0x20033D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 
0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000126 +.long 0xD3D84011, 0x18000127 +.long 0xD3D84012, 0x18000166 +.long 0xD3D84013, 0x18000167 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20034110 +.long 0xD2800012, 0x20034512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000128 +.long 0xD3D84011, 0x18000129 +.long 0xD3D84012, 0x18000168 +.long 0xD3D84013, 0x18000169 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20034910 +.long 0xD2800012, 0x20034D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800012A +.long 0xD3D84011, 0x1800012B +.long 0xD3D84012, 0x1800016A +.long 0xD3D84013, 0x1800016B +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20035110 +.long 0xD2800012, 0x20035512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800012C +.long 0xD3D84011, 0x1800012D +.long 0xD3D84012, 0x1800016C +.long 0xD3D84013, 0x1800016D +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20035910 +.long 0xD2800012, 0x20035D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800012E +.long 0xD3D84011, 0x1800012F +.long 0xD3D84012, 0x1800016E +.long 0xD3D84013, 0x1800016F +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20036110 +.long 0xD2800012, 0x20036512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000130 +.long 0xD3D84011, 0x18000131 +.long 0xD3D84012, 0x18000170 +.long 0xD3D84013, 0x18000171 
+.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20036910 +.long 0xD2800012, 0x20036D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000132 +.long 0xD3D84011, 0x18000133 +.long 0xD3D84012, 0x18000172 +.long 0xD3D84013, 0x18000173 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20037110 +.long 0xD2800012, 0x20037512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000134 +.long 0xD3D84011, 0x18000135 +.long 0xD3D84012, 0x18000174 +.long 0xD3D84013, 0x18000175 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20037910 +.long 0xD2800012, 0x20037D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000136 +.long 0xD3D84011, 0x18000137 +.long 0xD3D84012, 0x18000176 +.long 0xD3D84013, 0x18000177 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20038110 +.long 0xD2800012, 0x20038512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000138 +.long 0xD3D84011, 0x18000139 +.long 0xD3D84012, 0x18000178 +.long 0xD3D84013, 0x18000179 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20038910 +.long 0xD2800012, 0x20038D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800013A +.long 0xD3D84011, 0x1800013B +.long 0xD3D84012, 0x1800017A +.long 0xD3D84013, 0x1800017B +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20039110 +.long 0xD2800012, 0x20039512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 
0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800013C +.long 0xD3D84011, 0x1800013D +.long 0xD3D84012, 0x1800017C +.long 0xD3D84013, 0x1800017D +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20039910 +.long 0xD2800012, 0x20039D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800013E +.long 0xD3D84011, 0x1800013F +.long 0xD3D84012, 0x1800017E +.long 0xD3D84013, 0x1800017F +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2003A110 +.long 0xD2800012, 0x2003A512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0xBF8203E6 +.long 0x8010443E +.long 0x8211453F +.long 0xBE9200FF, 0x80000000 +.long 0xBF091224 +.long 0x850C2412 +.long 0xBF068025 +.long 0xBE92020C +.long 0xE0541000, 0x80045408 +.long 0xE0541000, 0x80045609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80045808 +.long 0xE0541000, 0x80045A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80045C08 +.long 0xE0541000, 0x80045E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046008 +.long 0xE0541000, 0x80046209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046408 +.long 0xE0541000, 0x80046609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046808 +.long 0xE0541000, 0x80046A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80046C08 +.long 0xE0541000, 0x80046E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047008 +.long 0xE0541000, 0x80047209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047408 +.long 
0xE0541000, 0x80047609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047808 +.long 0xE0541000, 0x80047A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80047C08 +.long 0xE0541000, 0x80047E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048008 +.long 0xE0541000, 0x80048209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048408 +.long 0xE0541000, 0x80048609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048808 +.long 0xE0541000, 0x80048A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80048C08 +.long 0xE0541000, 0x80048E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049008 +.long 0xE0541000, 0x80049209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049408 +.long 0xE0541000, 0x80049609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049808 +.long 0xE0541000, 0x80049A09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x80049C08 +.long 0xE0541000, 0x80049E09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004A008 +.long 0xE0541000, 0x8004A209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004A408 +.long 0xE0541000, 0x8004A609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004A808 +.long 0xE0541000, 0x8004AA09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004AC08 +.long 0xE0541000, 0x8004AE09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004B008 +.long 0xE0541000, 
0x8004B209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004B408 +.long 0xE0541000, 0x8004B609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004B808 +.long 0xE0541000, 0x8004BA09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004BC08 +.long 0xE0541000, 0x8004BE09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004C008 +.long 0xE0541000, 0x8004C209 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004C408 +.long 0xE0541000, 0x8004C609 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004C808 +.long 0xE0541000, 0x8004CA09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004CC08 +.long 0xE0541000, 0x8004CE09 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xE0541000, 0x8004D008 +.long 0xE0541000, 0x8004D209 +.long 0x8010443E +.long 0x8211453F +.long 0xBE9200FF, 0x80000000 +.long 0xBF068025 +.long 0xBE920224 +.long 0xD3D84010, 0x18000180 +.long 0xD3D84011, 0x18000181 +.long 0xD3D84012, 0x180001C0 +.long 0xD3D84013, 0x180001C1 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002A910 +.long 0xD2800012, 0x2002AD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000182 +.long 0xD3D84011, 0x18000183 +.long 0xD3D84012, 0x180001C2 +.long 0xD3D84013, 0x180001C3 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002B110 +.long 0xD2800012, 0x2002B512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000184 +.long 0xD3D84011, 0x18000185 +.long 0xD3D84012, 0x180001C4 +.long 
0xD3D84013, 0x180001C5 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002B910 +.long 0xD2800012, 0x2002BD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000186 +.long 0xD3D84011, 0x18000187 +.long 0xD3D84012, 0x180001C6 +.long 0xD3D84013, 0x180001C7 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002C110 +.long 0xD2800012, 0x2002C512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000188 +.long 0xD3D84011, 0x18000189 +.long 0xD3D84012, 0x180001C8 +.long 0xD3D84013, 0x180001C9 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002C910 +.long 0xD2800012, 0x2002CD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800018A +.long 0xD3D84011, 0x1800018B +.long 0xD3D84012, 0x180001CA +.long 0xD3D84013, 0x180001CB +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002D110 +.long 0xD2800012, 0x2002D512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800018C +.long 0xD3D84011, 0x1800018D +.long 0xD3D84012, 0x180001CC +.long 0xD3D84013, 0x180001CD +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002D910 +.long 0xD2800012, 0x2002DD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800018E +.long 0xD3D84011, 0x1800018F +.long 0xD3D84012, 0x180001CE +.long 0xD3D84013, 0x180001CF +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002E110 +.long 0xD2800012, 0x2002E512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 
0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000190 +.long 0xD3D84011, 0x18000191 +.long 0xD3D84012, 0x180001D0 +.long 0xD3D84013, 0x180001D1 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002E910 +.long 0xD2800012, 0x2002ED12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000192 +.long 0xD3D84011, 0x18000193 +.long 0xD3D84012, 0x180001D2 +.long 0xD3D84013, 0x180001D3 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002F110 +.long 0xD2800012, 0x2002F512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000194 +.long 0xD3D84011, 0x18000195 +.long 0xD3D84012, 0x180001D4 +.long 0xD3D84013, 0x180001D5 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2002F910 +.long 0xD2800012, 0x2002FD12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000196 +.long 0xD3D84011, 0x18000197 +.long 0xD3D84012, 0x180001D6 +.long 0xD3D84013, 0x180001D7 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20030110 +.long 0xD2800012, 0x20030512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x18000198 +.long 0xD3D84011, 0x18000199 +.long 0xD3D84012, 0x180001D8 +.long 0xD3D84013, 0x180001D9 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20030910 +.long 0xD2800012, 0x20030D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800019A +.long 0xD3D84011, 0x1800019B +.long 
0xD3D84012, 0x180001DA +.long 0xD3D84013, 0x180001DB +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20031110 +.long 0xD2800012, 0x20031512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800019C +.long 0xD3D84011, 0x1800019D +.long 0xD3D84012, 0x180001DC +.long 0xD3D84013, 0x180001DD +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20031910 +.long 0xD2800012, 0x20031D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x1800019E +.long 0xD3D84011, 0x1800019F +.long 0xD3D84012, 0x180001DE +.long 0xD3D84013, 0x180001DF +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20032110 +.long 0xD2800012, 0x20032512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A0 +.long 0xD3D84011, 0x180001A1 +.long 0xD3D84012, 0x180001E0 +.long 0xD3D84013, 0x180001E1 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20032910 +.long 0xD2800012, 0x20032D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A2 +.long 0xD3D84011, 0x180001A3 +.long 0xD3D84012, 0x180001E2 +.long 0xD3D84013, 0x180001E3 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20033110 +.long 0xD2800012, 0x20033512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A4 +.long 0xD3D84011, 0x180001A5 +.long 0xD3D84012, 0x180001E4 +.long 0xD3D84013, 0x180001E5 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20033910 +.long 0xD2800012, 0x20033D12 +.long 0xE0741000, 
0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A6 +.long 0xD3D84011, 0x180001A7 +.long 0xD3D84012, 0x180001E6 +.long 0xD3D84013, 0x180001E7 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20034110 +.long 0xD2800012, 0x20034512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001A8 +.long 0xD3D84011, 0x180001A9 +.long 0xD3D84012, 0x180001E8 +.long 0xD3D84013, 0x180001E9 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20034910 +.long 0xD2800012, 0x20034D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001AA +.long 0xD3D84011, 0x180001AB +.long 0xD3D84012, 0x180001EA +.long 0xD3D84013, 0x180001EB +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20035110 +.long 0xD2800012, 0x20035512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001AC +.long 0xD3D84011, 0x180001AD +.long 0xD3D84012, 0x180001EC +.long 0xD3D84013, 0x180001ED +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20035910 +.long 0xD2800012, 0x20035D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001AE +.long 0xD3D84011, 0x180001AF +.long 0xD3D84012, 0x180001EE +.long 0xD3D84013, 0x180001EF +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20036110 +.long 0xD2800012, 0x20036512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B0 +.long 
0xD3D84011, 0x180001B1 +.long 0xD3D84012, 0x180001F0 +.long 0xD3D84013, 0x180001F1 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20036910 +.long 0xD2800012, 0x20036D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B2 +.long 0xD3D84011, 0x180001B3 +.long 0xD3D84012, 0x180001F2 +.long 0xD3D84013, 0x180001F3 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20037110 +.long 0xD2800012, 0x20037512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B4 +.long 0xD3D84011, 0x180001B5 +.long 0xD3D84012, 0x180001F4 +.long 0xD3D84013, 0x180001F5 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20037910 +.long 0xD2800012, 0x20037D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B6 +.long 0xD3D84011, 0x180001B7 +.long 0xD3D84012, 0x180001F6 +.long 0xD3D84013, 0x180001F7 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20038110 +.long 0xD2800012, 0x20038512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001B8 +.long 0xD3D84011, 0x180001B9 +.long 0xD3D84012, 0x180001F8 +.long 0xD3D84013, 0x180001F9 +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20038910 +.long 0xD2800012, 0x20038D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001BA +.long 0xD3D84011, 0x180001BB +.long 0xD3D84012, 0x180001FA +.long 0xD3D84013, 0x180001FB +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20039110 +.long 0xD2800012, 
0x20039512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001BC +.long 0xD3D84011, 0x180001BD +.long 0xD3D84012, 0x180001FC +.long 0xD3D84013, 0x180001FD +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x20039910 +.long 0xD2800012, 0x20039D12 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 +.long 0x80101910 +.long 0x82118011 +.long 0x80921912 +.long 0xBE920280 +.long 0xD3D84010, 0x180001BE +.long 0xD3D84011, 0x180001BF +.long 0xD3D84012, 0x180001FE +.long 0xD3D84013, 0x180001FF +.long 0xBF800001 +.long 0xBF8C4F7E +.long 0xD2800010, 0x2003A110 +.long 0xD2800012, 0x2003A512 +.long 0xE0741000, 0x80041008 +.long 0xE0741000, 0x80041209 +.long 0xBF800000 .long 0xBF8C0000 .long 0xBF810000 diff --git a/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512_104.s b/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512_104.s index 9f35741256..b47caf0ebe 100644 --- a/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512_104.s +++ b/Tensile/CustomKernels/DGEMM_Aldebaran_PKFixedAtomic512_104.s @@ -136,149 +136,149 @@ amdhsa.kernels: .value_kind: global_buffer .value_type: f64 .address_space: generic - - .name: alpha + - .name: OffsetD .size: 8 .offset: 56 .value_kind: by_value + .value_type: u64 + - .name: OffsetC + .size: 8 + .offset: 64 + .value_kind: by_value + .value_type: u64 + - .name: OffsetA + .size: 8 + .offset: 72 + .value_kind: by_value + .value_type: u64 + - .name: OffsetB + .size: 8 + .offset: 80 + .value_kind: by_value + .value_type: u64 + - .name: alpha + .size: 8 + .offset: 88 + .value_kind: by_value .value_type: f64 - .name: beta .size: 8 - .offset: 64 + .offset: 96 .value_kind: by_value .value_type: f64 - .name: strideD0 .size: 4 - .offset: 72 + .offset: 104 .value_kind: by_value .value_type: u32 - .name: strideD1 .size: 4 - .offset: 76 + .offset: 108 .value_kind: by_value .value_type: u32 - 
.name: strideC0 .size: 4 - .offset: 80 + .offset: 112 .value_kind: by_value .value_type: u32 - .name: strideC1 .size: 4 - .offset: 84 + .offset: 116 .value_kind: by_value .value_type: u32 - .name: strideA0 .size: 4 - .offset: 88 + .offset: 120 .value_kind: by_value .value_type: u32 - .name: strideA1 .size: 4 - .offset: 92 + .offset: 124 .value_kind: by_value .value_type: u32 - .name: strideB0 .size: 4 - .offset: 96 + .offset: 128 .value_kind: by_value .value_type: u32 - .name: strideB1 .size: 4 - .offset: 100 + .offset: 132 .value_kind: by_value .value_type: u32 - .name: SizesFree0 .size: 4 - .offset: 104 + .offset: 136 .value_kind: by_value .value_type: u32 - .name: SizesFree1 .size: 4 - .offset: 108 + .offset: 140 .value_kind: by_value .value_type: u32 - .name: SizesFree2 .size: 4 - .offset: 112 + .offset: 144 .value_kind: by_value .value_type: u32 - .name: SizesSum0 .size: 4 - .offset: 116 + .offset: 148 .value_kind: by_value .value_type: u32 - .name: OrigStaggerUIter .size: 4 - .offset: 120 + .offset: 152 .value_kind: by_value .value_type: i32 - .name: NumWorkGroups0 .size: 4 - .offset: 124 + .offset: 156 .value_kind: by_value .value_type: u32 - .name: NumWorkGroups1 .size: 4 - .offset: 128 + .offset: 160 .value_kind: by_value .value_type: u32 - .name: MagicNumberProblemNumGroupTiles0 .size: 4 - .offset: 132 + .offset: 164 .value_kind: by_value .value_type: u32 - .name: MagicShiftProblemNumGroupTiles0 .size: 4 - .offset: 136 + .offset: 168 .value_kind: by_value .value_type: u32 - .name: GridNumWorkGroups0 .size: 4 - .offset: 140 + .offset: 172 .value_kind: by_value .value_type: u32 - .name: NumFullBlocks .size: 4 - .offset: 144 + .offset: 176 .value_kind: by_value .value_type: u32 - .name: WgmRemainder1 .size: 4 - .offset: 148 + .offset: 180 .value_kind: by_value .value_type: u32 - .name: MagicNumberWgmRemainder1 .size: 4 - .offset: 152 - .value_kind: by_value - .value_type: u32 - - .name: OffsetD - .size: 4 - .offset: 156 - .value_kind: by_value - .value_type: 
u32 - - .name: OffsetC - .size: 4 - .offset: 160 - .value_kind: by_value - .value_type: u32 - - .name: OffsetA - .size: 4 - .offset: 164 - .value_kind: by_value - .value_type: u32 - - .name: OffsetB - .size: 4 - .offset: 168 + .offset: 184 .value_kind: by_value .value_type: u32 - .name: padding .size: 4 - .offset: 172 + .offset: 188 .value_kind: by_value .value_type: u32 .group_segment_fixed_size: 60000 .kernarg_segment_align: 8 - .kernarg_segment_size: 160 + .kernarg_segment_size: 192 .max_flat_workgroup_size: 256 .private_segment_fixed_size: 0 .sgpr_count: 68 @@ -292,27 +292,39 @@ amdhsa.kernels: DGEMM_Aldebaran_PKFixedAtomic512_104: .long 0x8601FF01, 0x0000FFFF -.long 0xC0020980, 0x00000068 -.long 0xC00209C0, 0x0000006C -.long 0xC0020A00, 0x00000074 -.long 0xC0020A40, 0x00000058 -.long 0xC0020A80, 0x00000060 -.long 0xC0020AC0, 0x00000050 -.long 0xC0060B00, 0x00000038 -.long 0xC0060B80, 0x00000040 +.long 0xC0020980, 0x00000088 +.long 0xC00209C0, 0x0000008C +.long 0xC0020A00, 0x00000094 +.long 0xC0020A40, 0x00000078 +.long 0xC0020A80, 0x00000080 +.long 0xC0020AC0, 0x00000070 +.long 0xC0060B00, 0x00000058 +.long 0xC0060B80, 0x00000060 .long 0xC0060100, 0x00000028 .long 0xC0060200, 0x00000030 .long 0xC0060400, 0x00000020 -.long 0xC0020D00, 0x0000007C -.long 0xC0020D40, 0x00000080 -.long 0xC0020D80, 0x00000084 -.long 0xC0020DC0, 0x00000088 -.long 0xC0020E00, 0x0000008C +.long 0xC0061200, 0x00000048 +.long 0xC0061280, 0x00000050 +.long 0xC0061300, 0x00000040 +.long 0xC0020D00, 0x0000009C +.long 0xC0020D40, 0x000000A0 +.long 0xC0020D80, 0x000000A4 +.long 0xC0020DC0, 0x000000A8 +.long 0xC0020E00, 0x000000AC .long 0x20040086 .long 0x260000BF .long 0x7E600502 .long 0xBEB10002 .long 0xBF8CC07F +.long 0x8EC88348 +.long 0x80044804 +.long 0x82054905 +.long 0x8ECA834A +.long 0x80084A08 +.long 0x82094B09 +.long 0x8ECC834C +.long 0x80104C10 +.long 0x82114D11 .long 0xBEB800FF, 0x00000068 .long 0xBF033831 .long 0xBF851ADA diff --git 
a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml new file mode 100644 index 0000000000..a1da180ea1 --- /dev/null +++ b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn.yaml @@ -0,0 +1,57 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + MinimumRequiredVersion: 4.7.2 + +BenchmarkProblems: + - + - # ProblemType + Batched: True + ComputeDataType: d + DataType: d + DestDataType: d + HighPrecisionAccumulate: False + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + NumIndicesC: 3 + OperationType: GEMM + TransposeA: False + TransposeB: False + UseBeta: True + - # BenchmarkProblemSizeGroup + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - AssertMinApproxSize: [3] + - AssertSizeMultiple: [{3: 32}] + - AssertStrideAEqual: [{0: 1}] + - AssertStrideBEqual: [{0: 1}] + - AssertStrideCEqual: [{0: 1}] + - AssertStrideDEqual: [{0: 1}] + - CustomKernelName: ['DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4'] + - DepthU: [16] + - GlobalReadVectorWidth: [2] + - KernelLanguage: ['Assembly'] + - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16,16,4,1,1,2,8,4,1] + - OptPreLoopVmcnt: [0] + - SourceSwap: [1] + - StaggerU: [4] + - StaggerUStride: [128] + - StoreVectorWidth: [2] + - VectorAtomicWidth: [1] + - VectorWidth: [2] + - WorkGroupMapping: [4] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [512, 512, 1, 512] + - Exact: [513, 3584, 1, 512] # Eff: 33.22 Solution Index: 376 + - Exact: [1025, 2048, 1, 512] # Eff: 37.722 Solution Index: 376 + - Exact: [1537, 4608, 1, 512] # Eff: 60.346 Solution Index: 376 + - Exact: [3073, 7168, 1, 512] # Eff: 76.107 Solution Index: 376 + - Exact: [4609, 10752, 1, 512] # Eff: 82.875 
Solution Index: 376 + - Exact: [4608, 3841, 1, 768] # Eff: 83.664 Solution Index: 376 diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml new file mode 100644 index 0000000000..9e5542f176 --- /dev/null +++ b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_nn_large_offset.yaml @@ -0,0 +1,52 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + MinimumRequiredVersion: 4.7.2 + BufferOffsetB: 536877696 + +BenchmarkProblems: + - + - # ProblemType + Batched: True + ComputeDataType: d + DataType: d + DestDataType: d + HighPrecisionAccumulate: False + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + NumIndicesC: 3 + OperationType: GEMM + TransposeA: False + TransposeB: False + UseBeta: True + - # BenchmarkProblemSizeGroup + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - AssertMinApproxSize: [3] + - AssertSizeMultiple: [{3: 32}] + - AssertStrideAEqual: [{0: 1}] + - AssertStrideBEqual: [{0: 1}] + - AssertStrideCEqual: [{0: 1}] + - AssertStrideDEqual: [{0: 1}] + - CustomKernelName: ['DGEMM_Aldebaran_NN_MT128x128x16_MI16x16x4x1_GRVW2_SU4_SUS128_WGM4'] + - DepthU: [16] + - GlobalReadVectorWidth: [2] + - KernelLanguage: ['Assembly'] + - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16,16,4,1,1,2,8,4,1] + - OptPreLoopVmcnt: [0] + - SourceSwap: [1] + - StaggerU: [4] + - StaggerUStride: [128] + - StoreVectorWidth: [2] + - VectorAtomicWidth: [1] + - VectorWidth: [2] + - WorkGroupMapping: [4] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1024, 1024, 1, 1024] diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk.yaml b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk.yaml new file 
mode 100644 index 0000000000..dd84804699 --- /dev/null +++ b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk.yaml @@ -0,0 +1,59 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + MinimumRequiredVersion: 4.8.1 + PrintSolutionRejectionReason: True + CEqualD: True + DataInitTypeAlpha: 17 + DataInitTypeBeta: 1 + +BenchmarkProblems: + - + - # ProblemType + Batched: True + ComputeDataType: d + DataType: d + DestDataType: d + HighPrecisionAccumulate: False + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + NumIndicesC: 3 + OperationType: GEMM + TransposeA: False + TransposeB: True + UseBeta: True + - # BenchmarkProblemSizeGroup + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - AssertAlphaValue: [-1] + - AssertBetaValue: [1] + - AssertCEqualsD: [True] + - AssertMinApproxSize: [0] + - AssertSizeEqual: [{3: 512}] + - AssertSizeMultiple: [{0: 128, 1: 128}] + - AssertStrideAEqual: [{0: 1}] + - AssertStrideBEqual: [{0: 1}] + - AssertStrideCEqual: [{0: 1}] + - AssertStrideDEqual: [{0: 1}] + - CustomKernelName: ['DGEMM_Aldebaran_PKFixedAtomic512Latest'] + - DepthU: [8] + - GlobalReadVectorWidth: [1] + - GlobalSplitUAlgorithm: ['SingleBuffer'] + - KernelLanguage: ['Assembly'] + - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16,16,4,1,1,8,2,1,4] + - OptPreLoopVmcnt: [0] + - PersistentKernel: [1] + - StoreVectorWidth: [1] + - VectorAtomicWidth: [1] + - VectorWidth: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [7040, 4096, 1, 512] # Eff: 91.956 Solution Index: 20 + - Exact: [8448, 3840, 1, 512] # Eff: 92.567 Solution Index: 20 + - Exact: [7680, 4224, 1, 512] # Eff: 92.491 Solution Index: 20 diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104.yaml 
b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104.yaml new file mode 100644 index 0000000000..e6d9cf1206 --- /dev/null +++ b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104.yaml @@ -0,0 +1,60 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + MinimumRequiredVersion: 4.32.1 + # PrintSolutionRejectionReason: True + CEqualD: True + DataInitTypeAlpha: 17 + DataInitTypeBeta: 1 + +BenchmarkProblems: + - + - # ProblemType + Batched: True + ComputeDataType: d + DataType: d + DestDataType: d + HighPrecisionAccumulate: False + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + NumIndicesC: 3 + OperationType: GEMM + TransposeA: False + TransposeB: True + UseBeta: True + - # BenchmarkProblemSizeGroup + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - AssertAlphaValue: [-1] + - AssertBetaValue: [1] + - AssertCEqualsD: [True] + - AssertMinApproxSize: [0] + - AssertSizeEqual: [{3: 512}] + - AssertSizeMultiple: [{0: 128, 1: 128}] + - AssertStrideAEqual: [{0: 1}] + - AssertStrideBEqual: [{0: 1}] + - AssertStrideCEqual: [{0: 1}] + - AssertStrideDEqual: [{0: 1}] + - CustomKernelName: ['DGEMM_Aldebaran_PKFixedAtomic512_104'] + - DepthU: [8] + - GlobalReadVectorWidth: [1] + - GlobalSplitUAlgorithm: ['SingleBuffer'] + - KernelLanguage: ['Assembly'] + - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16,16,4,1,1,8,2,1,4] + - OptPreLoopVmcnt: [0] + - PersistentKernel: [1] + - StoreVectorWidth: [1] + - VectorAtomicWidth: [1] + - VectorWidth: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [6656, 4096, 1, 512] # Eff: 87.137 Solution Index: 19 + - Exact: [6144, 4992, 1, 512] # Eff: 87.63 Solution Index: 19 + - Exact: [8192, 3328, 1, 512] # Eff: 87.21 Solution Index: 19 + - Exact: [8320, 4096, 1, 
512] # Eff: 87.851 Solution Index: 19 diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104_offset.yaml b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104_offset.yaml new file mode 100644 index 0000000000..3d2842b4e3 --- /dev/null +++ b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk104_offset.yaml @@ -0,0 +1,61 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + MinimumRequiredVersion: 4.32.1 + # PrintSolutionRejectionReason: True + CEqualD: True + DataInitTypeAlpha: 17 + DataInitTypeBeta: 1 + BufferOffsetB: 128 + +BenchmarkProblems: + - + - # ProblemType + Batched: True + ComputeDataType: d + DataType: d + DestDataType: d + HighPrecisionAccumulate: False + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + NumIndicesC: 3 + OperationType: GEMM + TransposeA: False + TransposeB: True + UseBeta: True + - # BenchmarkProblemSizeGroup + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - AssertAlphaValue: [-1] + - AssertBetaValue: [1] + - AssertCEqualsD: [True] + - AssertMinApproxSize: [0] + - AssertSizeEqual: [{3: 512}] + - AssertSizeMultiple: [{0: 128, 1: 128}] + - AssertStrideAEqual: [{0: 1}] + - AssertStrideBEqual: [{0: 1}] + - AssertStrideCEqual: [{0: 1}] + - AssertStrideDEqual: [{0: 1}] + - CustomKernelName: ['DGEMM_Aldebaran_PKFixedAtomic512_104'] + - DepthU: [8] + - GlobalReadVectorWidth: [1] + - GlobalSplitUAlgorithm: ['SingleBuffer'] + - KernelLanguage: ['Assembly'] + - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16,16,4,1,1,8,2,1,4] + - OptPreLoopVmcnt: [0] + - PersistentKernel: [1] + - StoreVectorWidth: [1] + - VectorAtomicWidth: [1] + - VectorWidth: [1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [6656, 4096, 1, 512] # Eff: 87.137 Solution Index: 19 + 
- Exact: [6144, 4992, 1, 512] # Eff: 87.63 Solution Index: 19 + - Exact: [8192, 3328, 1, 512] # Eff: 87.21 Solution Index: 19 + - Exact: [8320, 4096, 1, 512] # Eff: 87.851 Solution Index: 19 diff --git a/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk_offset.yaml b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk_offset.yaml new file mode 100644 index 0000000000..df627f2823 --- /dev/null +++ b/Tensile/Tests/extended/custom_kernel/ck_dgemm_90a_pk_offset.yaml @@ -0,0 +1,60 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102] # not supported by arch + +GlobalParameters: + MinimumRequiredVersion: 4.8.1 + PrintSolutionRejectionReason: True + CEqualD: True + DataInitTypeAlpha: 17 + DataInitTypeBeta: 1 + BufferOffsetB: 128 + +BenchmarkProblems: + - + - # ProblemType + Batched: True + ComputeDataType: d + DataType: d + DestDataType: d + HighPrecisionAccumulate: False + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + NumIndicesC: 3 + OperationType: GEMM + TransposeA: False + TransposeB: True + UseBeta: True + - # BenchmarkProblemSizeGroup + InitialSolutionParameters: + BenchmarkCommonParameters: + ForkParameters: + - AssertAlphaValue: [-1] + - AssertBetaValue: [1] + - AssertCEqualsD: [True] + - AssertMinApproxSize: [0] + - AssertSizeEqual: [{3: 512}] + - AssertSizeMultiple: [{0: 128, 1: 128}] + - AssertStrideAEqual: [{0: 1}] + - AssertStrideBEqual: [{0: 1}] + - AssertStrideCEqual: [{0: 1}] + - AssertStrideDEqual: [{0: 1}] + - CustomKernelName: ['DGEMM_Aldebaran_PKFixedAtomic512Latest'] + - DepthU: [8] + - GlobalReadVectorWidth: [1] + - GlobalSplitUAlgorithm: ['SingleBuffer'] + - KernelLanguage: ['Assembly'] + - LocalReadVectorWidth: [1] + - MatrixInstruction: + - [16,16,4,1,1,8,2,1,4] + - OptPreLoopVmcnt: [0] + - PersistentKernel: [1] + - StoreVectorWidth: [1] + - VectorAtomicWidth: [1] + - VectorWidth: 
[1] + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [7040, 4096, 1, 512] # Eff: 91.956 Solution Index: 20 + - Exact: [8448, 3840, 1, 512] # Eff: 92.567 Solution Index: 20 + - Exact: [7680, 4224, 1, 512] # Eff: 92.491 Solution Index: 20 diff --git a/pytest.ini b/pytest.ini index badc63530c..19e643d42d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -52,6 +52,8 @@ markers = classic_source convolution convolution-vs-contraction + cov + custom_kernel direct_to_lds direct_to_vgpr dot2 From 38d444a9f2b6cddfeaeedcb39a5688150fa27093 Mon Sep 17 00:00:00 2001 From: Koji Nakajima <75698246+nakajee@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:46:30 -0700 Subject: [PATCH 8/8] Fix for memory access error with StaggerU + large stride (#1672) * use unsigned multiplications for StaggerU related calculations to avoid overflow --- Tensile/KernelWriterAssembly.py | 39 ++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/Tensile/KernelWriterAssembly.py b/Tensile/KernelWriterAssembly.py index 483c32940a..8d0272e29a 100644 --- a/Tensile/KernelWriterAssembly.py +++ b/Tensile/KernelWriterAssembly.py @@ -5454,14 +5454,14 @@ def calculateStagger(self, kernel, tP): imod.addComment1("SRDs += (StaggerUIter) * GlobalReadIncs%s+%u"% (tc, self.unrollIdx)) # Calculate the stagger byte offset - imod.addCode(self.s_mul_i64_i32( + imod.addCode(self.s_mul_u64_u32( sgpr(staggerTmp), sgpr(staggerTmp+1), \ sgpr("StaggerUIter"), sgpr("GlobalReadIncs%s+%u"%(tc, self.unrollIdx)), \ " stagger byte offset")) # Amount of bytes to add to get back to start.
# on the llop iteration which matches StaggerUIter, this offset added instead of GlobalReadInc - imod.addCode(self.s_mul_i64_i32(sgpr("WrapU%s+0"%tc), sgpr("WrapU%s+1"%tc), \ + imod.addCode(self.s_mul_u64_u32(sgpr("WrapU%s+0"%tc), sgpr("WrapU%s+1"%tc), \ self.loopCounter(kernel, self.unrollIdx), sgpr("GlobalReadIncs%s+%u"%(tc,self.unrollIdx)), \ "Number of bytes accessed by the unroll loop")) @@ -5513,17 +5513,30 @@ def removeStagger(self, kernel, tP): imod = Code.Module("removeStagger") if self.staggerU: tc = tP["tensorChar"] - tmp = self.getTmpSgpr(2).idx() - # might be able to refactor this to eliminate signed math - imod.addInst("s_sub_i32", sgpr(tmp), 3 if kernel["PrefetchGlobalRead"] else 2, \ - sgpr("StaggerUIter"), "") - imod.addCode(self.s_mul_i64_i32(sgpr(tmp), sgpr(tmp+1), \ - sgpr(tmp), sgpr("GlobalReadIncs%s+%u"%(tc,self.unrollIdx)), \ - "start offset S in bytes")) - imod.addInst("s_sub_u32", sgpr(tmp), sgpr(tmp), sgpr("WrapU%s"%tc), "S - WrapU") - imod.addInst("s_subb_u32", sgpr(tmp+1), sgpr(tmp+1), sgpr("WrapU%s+1"%(tc)), "S - WrapU") - - imod.addCode(self.incrementSrd(kernel, tP, sgpr(tmp), sgpr(tmp+1))) + tmp = self.getTmpSgpr(4).idx() + tmpForInc = tmp + tmpForExtra = tmp + 2 + # need to use extra 64bit mul to avoid negative value by subtraction + # ((3 or 2) - StaggerUIter) * GlobalReadIncs + # -> (3 or 2) * GlobalReadIncs - StaggerUIter * GlobalReadIncs + extra = 3 if kernel["PrefetchGlobalRead"] else 2 + # tmpForInc = extra * GlobalReadIncs + imod.addInst("s_mov_b32", sgpr(tmpForExtra), extra, "") + imod.addCode(self.s_mul_u64_u32(sgpr(tmpForInc), sgpr(tmpForInc+1), \ + sgpr(tmpForExtra), sgpr("GlobalReadIncs%s+%u"%(tc,self.unrollIdx)), \ + "%u * GlobalReadIncs"%extra)) + # tmpForExtra = StaggerUIter * GlobalReadIncs + imod.addCode(self.s_mul_u64_u32(sgpr(tmpForExtra), sgpr(tmpForExtra+1), \ + sgpr("StaggerUIter"), sgpr("GlobalReadIncs%s+%u"%(tc,self.unrollIdx)), \ + "StaggerUIter * GlobalReadIncs")) + # tmpForInc = tmpForInc - tmpForExtra = 
(extra - StaggerUIter) * GlobalReadIncs + imod.addInst("s_sub_u32", sgpr(tmpForInc), sgpr(tmpForInc), sgpr(tmpForExtra), "start offset S in bytes") + imod.addInst("s_subb_u32", sgpr(tmpForInc+1), sgpr(tmpForInc+1), sgpr(tmpForExtra+1), "start offset S in bytes") + # -= WrapU + imod.addInst("s_sub_u32", sgpr(tmpForInc), sgpr(tmpForInc), sgpr("WrapU%s"%tc), "S - WrapU") + imod.addInst("s_subb_u32", sgpr(tmpForInc+1), sgpr(tmpForInc+1), sgpr("WrapU%s+1"%(tc)), "S - WrapU") + + imod.addCode(self.incrementSrd(kernel, tP, sgpr(tmpForInc), sgpr(tmpForInc+1))) return imod