Skip to content

Commit

Permalink
Fix input indexes reuse for output
Browse files (browse the repository at this point in the history)
  • Loading branch information
sshlyapn committed Oct 11, 2024
1 parent 390502c commit b7cfd67
Showing 1 changed file with 4 additions and 5 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -60,20 +60,19 @@ KERNEL(dynamic_quantize_gpu_opt_generic)(
 half max_value = 0.0001h;
 half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE];

-const uint data_offset = INPUT0_GET_INDEX(b, f, y, x);
+const uint input_offset = INPUT0_GET_INDEX(b, f, y, x);
 unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
-// val[i] = input[data_offset + i * SUBGROUP_SIZE + sglid];
-val[i] = INPUT_BLOCK_READ(input, data_offset + i * SUBGROUP_SIZE);
+val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE);
 max_value = fmax(max_value, fabs(val[i]));
 }

 max_value = work_group_reduce_max(max_value);

 half scale = 127.0h / max_value;

+const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
 unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
-OUTPUT_BLOCK_WRITE(output, data_offset + i * SUBGROUP_SIZE, convert_char(val[i] * scale));
-// output[data_offset + i * SUBGROUP_SIZE + sglid] = convert_char(val[i] * scale);
+OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, convert_char(val[i] * scale));
 }

 #ifdef SCALES_OUTPUT_ORDER
Expand Down

0 comments on commit b7cfd67

Please sign in to comment.