WIP: change scales layout, enable more models
sshlyapn committed Oct 1, 2024
1 parent 3e1ae88 commit 8b5711e
Showing 30 changed files with 572 additions and 143 deletions.
@@ -22,20 +22,25 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
/// \param data Input tensor with data
/// \param group_sizes Group sizes for dynamic quantization
/// \param dt_scale Data type for scale output
DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale);
DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale, std::vector<uint64_t> scales_output_order = {});

void validate_and_infer_types() override;

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
const std::vector<uint64_t>& get_group_sizes() const {
return m_group_sizes;
};
const std::vector<uint64_t>& get_scales_output_order() const {
return m_scales_output_order;
};
static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
const std::vector<ov::PartialShape>& input_shapes,
const std::vector<uint64_t>& group_sizes);
const std::vector<uint64_t>& group_sizes,
const std::vector<uint64_t>& scales_output_order = {});

private:
std::vector<uint64_t> m_group_sizes;
std::vector<uint64_t> m_scales_output_order;
element::Type m_dt_scale;
};
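For illustration, a minimal usage sketch of the extended constructor; the input node and the concrete group sizes and order values are assumptions for the example, not values from this commit:

    // Hypothetical usage (assumes the ov_ops/dynamic_quantize.hpp header):
    // per-token quantization of a 4-D activation over the innermost axis,
    // with the scale tensor emitted in a transposed {0, 2, 1, 3} layout.
    auto dq = std::make_shared<ov::op::internal::DynamicQuantize>(
        activation,                                  // some Output<Node>
        std::vector<uint64_t>{1, 1, 1, UINT64_MAX},  // group_sizes
        ov::element::f16,                            // dt_scale
        std::vector<uint64_t>{0, 2, 1, 3});          // scales_output_order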

21 changes: 18 additions & 3 deletions src/common/transformations/src/ov_ops/dynamic_quantize.cpp
@@ -13,23 +13,25 @@ namespace ov {
namespace op {
namespace internal {

DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale)
DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale, std::vector<uint64_t> scales_output_order)
: Op({data}),
m_group_sizes(std::move(group_sizes)),
m_scales_output_order(std::move(scales_output_order)),
m_dt_scale(dt_scale) {
OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(),
"FC input rank should be same as the rank of group_size ",
data.get_tensor_ptr()->get_partial_shape().rank(),
" / ",
m_group_sizes.size());
OPENVINO_ASSERT(data.get_partial_shape().rank() == scales_output_order.size() || scales_output_order.empty());
set_output_size(2);
validate_and_infer_types();
}

void DynamicQuantize::validate_and_infer_types() {
std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0)};

auto out_shapes = shape_infer(this, input_shapes, m_group_sizes);
auto out_shapes = shape_infer(this, input_shapes, m_group_sizes, m_scales_output_order);
set_output_type(0, element::i8, out_shapes[0]);
set_output_type(1, m_dt_scale, out_shapes[1]);
}
@@ -41,7 +43,8 @@ std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVec

std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op,
const std::vector<ov::PartialShape>& input_shapes,
const std::vector<uint64_t>& group_sizes) {
const std::vector<uint64_t>& group_sizes,
const std::vector<uint64_t>& scales_output_order) {
std::vector<ov::PartialShape> out_shapes;
out_shapes.push_back(input_shapes[0]);

@@ -51,17 +54,29 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
scale_shape.size(),
" / ",
group_sizes.size());
OPENVINO_ASSERT(scale_shape.size() == scales_output_order.size() || scales_output_order.empty());

for (size_t i = 0; i < scale_shape.size(); i++) {
if (scale_shape[i].is_dynamic())
continue;

if (group_sizes[i] == UINT64_MAX)
scale_shape[i] = 1;
else {
if (scale_shape[i] == 0)
continue;

scale_shape[i] /= group_sizes[i]; // if group_size is larger than shape, scale_shape will be 1
scale_shape[i] = std::max(static_cast<int>(scale_shape[i].get_length()), 1);
}
}
if (!scales_output_order.empty()) {
auto non_transposed_scale_shape = scale_shape;
for (size_t i = 0; i < scales_output_order.size(); i++) {
OPENVINO_ASSERT(scales_output_order[i] < scale_shape.size());
scale_shape[i] = non_transposed_scale_shape[scales_output_order[i]];
}
}
out_shapes.push_back(scale_shape);
return out_shapes;
}
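To make the new transposition step concrete, a worked example with assumed shapes and order (a default-constructed op is used here the same way the GPU plugin's calc_output_layouts does later in this commit):

    ov::op::internal::DynamicQuantize op;
    std::vector<ov::PartialShape> in = {ov::PartialShape{2, 8, 32, 64}};
    // group_sizes {1, 1, 1, UINT64_MAX} yield a pre-transposition scale
    // shape of {2, 8, 32, 1}: each axis is divided by its group size and
    // the UINT64_MAX axis collapses to a single scale.
    // scales_output_order {0, 2, 1, 3} then assigns output axis i from
    // non_transposed_scale_shape[order[i]].
    auto shapes = ov::op::internal::DynamicQuantize::shape_infer(
        &op, in, {1, 1, 1, UINT64_MAX}, {0, 2, 1, 3});
    // shapes[0] == {2, 8, 32, 64}  (quantized data keeps the input shape)
    // shapes[1] == {2, 32, 8, 1}   (axes 1 and 2 of the scale shape swapped)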
@@ -53,7 +53,7 @@ struct kernel_impl_params final {
optional_layout weights_zero_points_layout = optional_layout();
optional_layout activations_zero_points_layout = optional_layout();
optional_layout compensation_layout = optional_layout();
optional_layout state_layout = optional_layout();
std::vector<layout> state_layouts;

std::map<size_t, memory::ptr> memory_deps = {};
size_t primary_input_idx = 0;
7 changes: 4 additions & 3 deletions src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
@@ -22,22 +22,23 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {

KVCache(const Output<Node>& past,
const Output<Node>& new_token_data,
const Output<Node>& beam_idx,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
int64_t gather_axis,
const ov::element::Type output_type = ov::element::undefined);

KVCache(const Output<Node>& past,
const Output<Node>& new_token_data,
const Output<Node>& beam_idx,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
int64_t gather_axis,
const ov::element::Type output_type = ov::element::undefined);

KVCache(const Output<Node>& past,
const Output<Node>& new_token_data,
const Output<Node>& new_token_scale,
const Output<Node>& beam_idx,
const Output<Node>& past_scale,
const Output<Node>& new_token_scale,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
int64_t gather_axis,
@@ -44,6 +44,7 @@ class VariableStateIndirectKVCache : public MultiTensorState {

VariableState::Ptr get_compression_scale_state() const;
ov::PartialShape get_compression_scale_shape(const ov::PartialShape& kv_cache_shape);
void set_scales_layout(const cldnn::layout& new_layout);

private:
size_t m_beam_axis = 0;
@@ -23,11 +23,14 @@ struct dynamic_quantize : public primitive_base<dynamic_quantize> {
dynamic_quantize(const primitive_id& id,
const input_info& input,
const std::vector<uint64_t>& group_sizes,
const std::vector<uint64_t>& scales_output_order,
const std::vector<optional_data_type> data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)})
: primitive_base(id, {input}, 2, data_types),
group_sizes(group_sizes) {}
group_sizes(group_sizes),
scales_output_order(scales_output_order) {}

std::vector<uint64_t> group_sizes;
std::vector<uint64_t> scales_output_order;

size_t hash() const override {
size_t seed = primitive::hash();
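A hedged sketch of creating the GPU primitive with the new argument; the primitive id, input name, and numeric values are illustrative assumptions:

    // Hypothetical: one scale per token, scale tensor stored transposed.
    auto dq_prim = cldnn::dynamic_quantize(
        "dyn_quan",                  // primitive id
        cldnn::input_info("input"),  // producing primitive
        {1, 1, 1, UINT64_MAX},       // group_sizes
        {0, 2, 1, 3});               // scales_output_order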
@@ -31,10 +31,12 @@ struct read_value : public primitive_base<read_value> {
: primitive_base(id, inputs, 1, {optional_data_type{output_layout.data_type}}),
variable_id{variable_id},
output_layout{output_layout},
compressed(false),
user_specified_type(user_specified_type) {}

std::string variable_id;
layout output_layout;
bool compressed;
ov::element::Type user_specified_type;

bool operator==(const primitive& rhs) const override {
19 changes: 15 additions & 4 deletions src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp
@@ -22,27 +22,38 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no
}

template<typename ShapeType>
std::vector<layout> dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector<uint64_t>& group_sizes) {
std::vector<layout> dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector<uint64_t>& group_sizes, const std::vector<uint64_t>& scales_output_order) {
ov::op::internal::DynamicQuantize op;
auto output_format = act_layout.format;

std::vector<ShapeType> input_shapes = {
act_layout.get<ShapeType>(),
};

auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes);
auto print_arr = [&](const std::vector<uint64_t>& vec, size_t max_len, std::string name) {
std::stringstream ss;
for (size_t i = 0; i < std::min(max_len, vec.size()); i++) {
ss << vec[i] << ", ";
}
std::cout << "Array " << name << " for calc_shape (len=" << vec.size() << ") content: " << ss.str() << "\n";
};

print_arr(scales_output_order, scales_output_order.size(), "scales_output_order");
print_arr(group_sizes, group_sizes.size(), "group_sizes");

auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes, scales_output_order);
GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n";

return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) };
}

template std::vector<layout> dynamic_quantize_inst::__calc_output_layouts<ov::PartialShape>(const layout &act_layout, const std::vector<uint64_t>& group_sizes);
template std::vector<layout> dynamic_quantize_inst::__calc_output_layouts<ov::PartialShape>(const layout &act_layout, const std::vector<uint64_t>& group_sizes, const std::vector<uint64_t>& scales_output_order);

template<typename ShapeType>
std::vector<layout> dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) {
auto desc = impl_param.typed_desc<dynamic_quantize>();
const auto& input_layout = impl_param.get_input_layout();
return __calc_output_layouts<ov::PartialShape>(input_layout, desc->group_sizes);
return __calc_output_layouts<ov::PartialShape>(input_layout, desc->group_sizes, desc->scales_output_order);
}

template std::vector<layout> dynamic_quantize_inst::calc_output_layouts<ov::PartialShape>(dynamic_quantize_node const& node,
@@ -37,6 +37,10 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl<dynamic_quantize> {
auto params = get_default_params<kernel_selector::dynamic_quantize_params>(impl_param, is_shape_agnostic);
params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1)));

const auto& desc = impl_param.typed_desc<dynamic_quantize>();
params.group_sizes = desc->group_sizes;
params.scales_output_order = desc->scales_output_order;

return params;
}

52 changes: 30 additions & 22 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -9,6 +9,7 @@
#include "multi_stage_primitive.hpp"

#include "kv_cache_inst.h"
#include "dynamic_quantize_inst.h"
#include "concatenation/concatenation_kernel_selector.h"
#include "concatenation/concatenation_kernel_base.h"
#include "beam_table_update/beam_table_update_kernel_selector.hpp"
@@ -69,9 +70,7 @@

cldnn::memory::ptr beam_table_prev = nullptr;
cldnn::memory::ptr beam_table_new = nullptr;

cldnn::memory::ptr compression_scale_prev = nullptr;
cldnn::memory::ptr scale_new = nullptr;
cldnn::memory::ptr compression_scale = nullptr;

void load(BinaryInputBuffer& ib) override {
parent::load(ib);
@@ -105,8 +104,8 @@
args.outputs = { beam_table_new };
} else if (stage == scale_stage) {
// FIXME: indirectness and compression are orthogonal feature.
args.inputs = { compression_scale_prev, instance.input_memory_ptr(3) };
args.outputs = { scale_new };
args.inputs = { instance.input_memory_ptr(3), instance.input_memory_ptr(4) }; // [past, new, beam_table, past_scale, new_scale]
args.outputs = { compression_scale };
}

return args;
@@ -204,27 +203,25 @@
dynamic_cast<ov::intel_gpu::VariableStateIndirectKVCache&>(variable).get_compression_scale_state();
auto comp_scale_layout = instance.get_impl_params()->output_layouts[2];
auto comp_scale_shape = comp_scale_layout.get_shape();
std::swap(compression_scale_prev, scale_new);

if (!scale_new || scale_new->count() < ov::shape_size(comp_scale_shape)) {
bool skip_first_kernel = true;
if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) {
const auto concat_axis = 2;
auto alloc_shape = comp_scale_shape;
alloc_shape[desc->concat_axis] += instance.get_prealloc_iter_num();
alloc_shape[concat_axis] += instance.get_prealloc_iter_num();
const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format};
GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl;
scale_new = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);
compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);

// Alloc prev mem too as it will be needed in the future
// That also simplifies arguments setting a little bit as we don't need to handle an optional past state
if (!compression_scale_prev) {
compression_scale_prev = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);
}
skip_first_kernel = comp_scale_state->get_layout().count() == 0;
}

instance.set_output_memory(scale_new, false, 2);
comp_scale_state->set_memory(scale_new, instance.get_impl_params()->output_layouts[2]);
instance.set_output_memory(compression_scale, false, 2);
comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]);

auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, comp_scale_state->is_set());
(_kernels_data[scale_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_stage]);
_kernels_data[scale_stage].kernels[0].skip_execution = skip_first_kernel;

execute_stage(events, instance, res_events, scale_stage);
comp_scale_state->set();
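As an illustration of the preallocation arithmetic above (all numbers assumed): if the present scale shape is {1, 32, 128, 1}, the concat axis is 2, and get_prealloc_iter_num() returns 128, the buffer is reallocated once to {1, 32, 256, 1}, so roughly the next 128 generated tokens can append their scales in place without further allocations.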
Expand Down Expand Up @@ -344,8 +341,8 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
params.indirect_axis = indirect_axis;

const bool compressed = impl_param.typed_desc<kv_cache>()->compressed;
const auto beam_table_past_idx = compressed ? 4 : 3;
const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, beam_table_past, compression_scale_new]]
const auto beam_table_past_idx = compressed ? 5 : 3;
const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, compression_scale_new, beam_table_past]]
const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present]
std::map<size_t, size_t> in_tensor_to_offset_map = {
{0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past
@@ -364,27 +361,38 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
const auto& primitive = impl_param.typed_desc<kv_cache>();
auto params = get_default_params<kernel_selector::concatenation_params>(impl_param, is_shape_agnostic);

const auto concat_axis = 2;
params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank());

auto inputs_count = 2;
auto comp_scale_past_layout = impl_param.input_layouts[3];
auto comp_scale_new_layout = impl_param.input_layouts[4];
auto comp_scale_present_layout = impl_param.output_layouts[2];
layout comp_scale_past_layout = get_compression_scale_layout(impl_param);

GPU_DEBUG_TRACE_DETAIL << "Past scale: " << comp_scale_past_layout.to_short_string() << "\n";
GPU_DEBUG_TRACE_DETAIL << "New scale: " << comp_scale_new_layout.to_short_string() << "\n";
GPU_DEBUG_TRACE_DETAIL << "Present scale: " << comp_scale_present_layout.to_short_string() << "\n";

params.inputs.resize(inputs_count);
params.inputs[0] = convert_data_tensor(comp_scale_past_layout);
params.inputs[1] = convert_data_tensor(impl_param.input_layouts[3]);
params.inputs[1] = convert_data_tensor(comp_scale_new_layout);
params.outputs[0] = convert_data_tensor(comp_scale_present_layout);

const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset;
const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset;

// FIXME: need to handle the index properly when indirect is off
std::map<size_t, size_t> in_tensor_to_offset_map = {
{0, in_offsets_map.at(5)}, // compression_scale_past
{1, in_offsets_map.at(3)}, // compression_scale_new
{0, in_offsets_map.at(3)}, // compression_scale_past
{1, in_offsets_map.at(4)}, // compression_scale_new
};
std::map<size_t, size_t> out_tensor_to_offset_map = {
{0, out_offsets_map.at(2)}, // compression_scale_present
};

GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in0 " << in_offsets_map.at(3) << "\n";
GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in1 " << in_offsets_map.at(4) << "\n";
GPU_DEBUG_TRACE_DETAIL << "Dynamic shape offset " << out_offsets_map.at(2) << "\n";
params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map);

return params;
@@ -35,7 +35,7 @@ class typed_primitive_inst<dynamic_quantize> : public typed_primitive_inst_base<

// Internal function to be used from fakealignment
template<typename ShapeType>
static std::vector<layout> __calc_output_layouts(const layout &act_layout, const std::vector<uint64_t>& group_size);
static std::vector<layout> __calc_output_layouts(const layout &act_layout, const std::vector<uint64_t>& group_size, const std::vector<uint64_t>& scales_output_order);
static std::string to_string(dynamic_quantize_node const& node);

typed_primitive_inst(network& network, dynamic_quantize_node const& node);
4 changes: 1 addition & 3 deletions src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h
@@ -36,9 +36,7 @@ struct typed_program_node<kv_cache> : public typed_program_node_base<kv_cache> {
res.push_back(layout(ov::PartialShape::dynamic(4), data_types::i32, format::bfyx));
}

if (get_primitive()->compressed) { // insert an additional input with compressed_scale past layout
res.push_back(layout(ov::PartialShape::dynamic(4), data_types::f16, format::bfyx));
}
GPU_DEBUG_TRACE_DETAIL << "Total shape info input layouts: " << res.size() << "\n";

return res;
}