Commit
Add KVCacheCompressed operation, replace QuantizationConfig with Attribute structure
sshlyapn committed Oct 28, 2024
1 parent 07976ba commit e02e320
Showing 30 changed files with 534 additions and 479 deletions.
75 changes: 35 additions & 40 deletions src/common/transformations/include/ov_ops/dynamic_quantize.hpp
@@ -11,30 +11,18 @@ namespace ov {
namespace op {
namespace internal {

struct QuantizationConfig {
enum class QuantizationType { Symmetric, Asymmetric };

QuantizationType type = QuantizationType::Symmetric;
element::Type quantization_dt = element::undefined;
element::Type scale_dt = element::undefined;
element::Type zp_dt = element::undefined;
std::vector<uint64_t> group_sizes = {};

bool operator==(const QuantizationConfig& rhs) const {
return type == rhs.type && quantization_dt == rhs.quantization_dt && scale_dt == rhs.scale_dt &&
zp_dt == rhs.zp_dt && group_sizes == rhs.group_sizes;
}

bool is_asymmetric_quantization() const {
return type == QuantizationType::Asymmetric;
}
};

/// \brief Operator performing Dynamic Quantize
class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
public:
OPENVINO_OP("DynamicQuantize", "ie_internal_opset");

/**
* @brief Configuration for the type of quantization applied to the data:
* - Symmetric: Quantization where the zero point is fixed at zero, and the range is symmetric around zero.
* - Asymmetric: Quantization where the zero point is not fixed at zero.
*/
enum class QuantizationType { Symmetric, Asymmetric };

/**
* @brief Configuration for how Activations, Scales and Zero Points will be stored in output buffers:
* - Planar: Activations, Scales, and Zero Points are stored in independent buffers.
@@ -43,51 +31,58 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
*/
enum class OutputStorageType { Planar, InterleavedScalesZP, /* InterleavedActivationsScalesZP */ };

/// \brief Structure that specifies attributes for the DynamicQuantize operation
struct Attributes {
QuantizationType quantization_type = QuantizationType::Symmetric;
element::Type quantization_dt = element::undefined;
element::Type scale_dt = element::undefined;
element::Type zp_dt = element::undefined;

std::vector<uint64_t> group_sizes = {};
std::vector<uint64_t> scales_zp_output_order = {};
OutputStorageType output_storage_type = OutputStorageType::Planar;
};

DynamicQuantize() = default;
/// \brief Constructs a DynamicQuantize operation.
///
/// \param data Input tensor with data
/// \param attrs Dynamic quantization attributes
DynamicQuantize(const Output<Node>& data,
const QuantizationConfig& config,
const OutputStorageType& output_storage = OutputStorageType::Planar,
const std::vector<uint64_t>& scales_zp_output_order = {});
DynamicQuantize(const Output<Node>& data, const Attributes& attrs);

void validate_and_infer_types() override;

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

const std::vector<uint64_t>& get_group_sizes() const {
return m_config.group_sizes;
const Attributes& get_attrs() const {
return m_attrs;
}

void set_attrs(Attributes attrs) {
m_attrs = std::move(attrs);
}

QuantizationConfig::QuantizationType get_quantization_type() const {
return m_config.type;
const std::vector<uint64_t>& get_group_sizes() const {
return m_attrs.group_sizes;
}

QuantizationConfig get_quantization_config() const {
return m_config;
QuantizationType get_quantization_type() const {
return m_attrs.quantization_type;
}

OutputStorageType get_output_storage_type() const {
return m_output_storage_type;
return m_attrs.output_storage_type;
}

const std::vector<uint64_t>& get_scales_zp_output_order() const {
return m_scales_zp_output_order;
return m_attrs.scales_zp_output_order;
}

static std::vector<ov::PartialShape> shape_infer(
const DynamicQuantize* op,
const std::vector<ov::PartialShape>& input_shapes,
const QuantizationConfig& config,
const OutputStorageType& output_storage = OutputStorageType::Planar,
const std::vector<uint64_t>& scales_zp_output_order = {});
static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
const std::vector<ov::PartialShape>& input_shapes);

protected:
OutputStorageType m_output_storage_type;
std::vector<uint64_t> m_scales_zp_output_order;
QuantizationConfig m_config;
Attributes m_attrs;
};

} // namespace internal
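For reference, a minimal usage sketch of the reworked header: the former constructor arguments (quantization config, output storage type, scales/zp output order) now travel inside a single Attributes struct. The data node and the attribute values below are illustrative placeholders, not taken from this commit.

using ov::op::internal::DynamicQuantize;

DynamicQuantize::Attributes attrs;
attrs.quantization_type = DynamicQuantize::QuantizationType::Asymmetric;
attrs.quantization_dt = ov::element::i8;      // quantized activations
attrs.scale_dt = ov::element::f16;            // per-group scales
attrs.zp_dt = ov::element::f16;               // zero points (asymmetric only)
attrs.group_sizes = {1, 1, 1, 32};            // groups of 32 elements along the last axis
attrs.scales_zp_output_order = {0, 1, 2, 3};
attrs.output_storage_type = DynamicQuantize::OutputStorageType::Planar;

// data is assumed to be an existing rank-4 ov::Output<ov::Node>
auto dyn_quant = std::make_shared<DynamicQuantize>(data, attrs);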
63 changes: 30 additions & 33 deletions src/common/transformations/src/ov_ops/dynamic_quantize.cpp
@@ -13,35 +13,30 @@ namespace ov {
namespace op {
namespace internal {

DynamicQuantize::DynamicQuantize(const Output<Node>& data,
const QuantizationConfig& config,
const OutputStorageType& output_storage,
const std::vector<uint64_t>& scales_zp_output_order)
: Op({data}),
m_output_storage_type(output_storage),
m_scales_zp_output_order(scales_zp_output_order),
m_config(config) {
if (m_scales_zp_output_order.empty()) {
m_scales_zp_output_order.resize(data.get_partial_shape().size());
std::iota(m_scales_zp_output_order.begin(), m_scales_zp_output_order.end(), 0);
DynamicQuantize::DynamicQuantize(const Output<Node>& data, const Attributes& attrs) : Op({data}), m_attrs(attrs) {
if (m_attrs.scales_zp_output_order.empty()) {
m_attrs.scales_zp_output_order.resize(data.get_partial_shape().size());
std::iota(m_attrs.scales_zp_output_order.begin(), m_attrs.scales_zp_output_order.end(), 0);
}

OPENVINO_ASSERT(data.get_partial_shape().rank() == m_config.group_sizes.size(),
OPENVINO_ASSERT(data.get_partial_shape().rank() == m_attrs.group_sizes.size(),
"DQ input rank should be same as the rank of group_size ",
data.get_tensor_ptr()->get_partial_shape().rank(),
" / ",
m_config.group_sizes.size());
m_attrs.group_sizes.size());

OPENVINO_ASSERT(data.get_partial_shape().size() == m_scales_zp_output_order.size(),
OPENVINO_ASSERT(data.get_partial_shape().size() == m_attrs.scales_zp_output_order.size(),
"DQ input rank should be same as the rank of scales and zero points output order)");

size_t outputs_number = 2;
if (config.is_asymmetric_quantization() && output_storage == OutputStorageType::Planar)
if (m_attrs.quantization_type == QuantizationType::Asymmetric &&
m_attrs.output_storage_type == OutputStorageType::Planar)
outputs_number = 3;

OPENVINO_ASSERT((output_storage == OutputStorageType::Planar) ||
(config.is_asymmetric_quantization() && config.scale_dt == config.zp_dt),
"Scales and Zero Points should have the same data type to be stored in the single buffer");
OPENVINO_ASSERT(
(m_attrs.output_storage_type == OutputStorageType::Planar) ||
(m_attrs.quantization_type == QuantizationType::Asymmetric && m_attrs.scale_dt == m_attrs.zp_dt),
"Scales and Zero Points should have the same data type to be stored in the single buffer");

set_output_size(outputs_number);
validate_and_infer_types();
@@ -50,29 +45,27 @@ DynamicQuantize::DynamicQuantize(const Output<Node>& data,
void DynamicQuantize::validate_and_infer_types() {
std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0)};

auto out_shapes = shape_infer(this, input_shapes, m_config, m_output_storage_type, m_scales_zp_output_order);
set_output_type(0, m_config.quantization_dt, out_shapes[0]);
set_output_type(1, m_config.scale_dt, out_shapes[1]);
auto out_shapes = shape_infer(this, input_shapes);
set_output_type(0, m_attrs.quantization_dt, out_shapes[0]);
set_output_type(1, m_attrs.scale_dt, out_shapes[1]);

if (m_config.is_asymmetric_quantization() && m_output_storage_type == OutputStorageType::Planar)
set_output_type(2, m_config.zp_dt, out_shapes[2]);
if (m_attrs.quantization_type == QuantizationType::Asymmetric &&
m_attrs.output_storage_type == OutputStorageType::Planar)
set_output_type(2, m_attrs.zp_dt, out_shapes[2]);
}

std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const {
check_new_args_count(this, new_args);
return std::make_shared<DynamicQuantize>(new_args.at(0), m_config, m_output_storage_type, m_scales_zp_output_order);
return std::make_shared<DynamicQuantize>(new_args.at(0), m_attrs);
}

std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op,
const std::vector<ov::PartialShape>& input_shapes,
const QuantizationConfig& config,
const OutputStorageType& output_storage,
const std::vector<uint64_t>& scales_zp_output_order) {
const auto& group_sizes = config.group_sizes;
const std::vector<ov::PartialShape>& input_shapes) {
std::vector<ov::PartialShape> out_shapes;
out_shapes.push_back(input_shapes[0]);

auto scale_shape = input_shapes[0];
const auto& group_sizes = op->m_attrs.group_sizes;
OPENVINO_ASSERT(scale_shape.size() == group_sizes.size(),
"Scale_shape and group_size are supposed to have same rank: ",
scale_shape.size(),
@@ -91,7 +84,8 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
out_shapes.push_back(scale_shape);

// Add zero points shape, same as the scales
if (config.is_asymmetric_quantization() && output_storage == OutputStorageType::Planar)
if (op->m_attrs.quantization_type == QuantizationType::Asymmetric &&
op->m_attrs.output_storage_type == OutputStorageType::Planar)
out_shapes.push_back(scale_shape);

auto transpose_shape = [](const ov::PartialShape& shape, const std::vector<uint64_t>& scales_zp_output_order) {
Expand All @@ -105,14 +99,17 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
};

// Transpose scales and zero points shapes
const auto& scales_zp_output_order = op->m_attrs.scales_zp_output_order;
for (size_t i = 1; i < out_shapes.size(); i++) {
out_shapes[i] = transpose_shape(out_shapes[i], scales_zp_output_order);
}

if (config.is_asymmetric_quantization() && output_storage != OutputStorageType::Planar) {
if (op->m_attrs.quantization_type == QuantizationType::Asymmetric &&
op->m_attrs.output_storage_type != OutputStorageType::Planar) {
// Currently scales and zero points are supposed to be combined over the last dimension only
const auto combine_axis = out_shapes[1].size() - 1;
OPENVINO_ASSERT(config.group_sizes[scales_zp_output_order[combine_axis]] != 1);
const auto combine_axis = scales_zp_output_order.empty() ? out_shapes[1].size() - 1
: scales_zp_output_order[out_shapes[1].size() - 1];
OPENVINO_ASSERT(group_sizes[combine_axis] != 1);

out_shapes[1][combine_axis] *= 2; // [scale, zero_point] pairs
}
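A short sketch of the trimmed-down two-argument shape_infer, which now reads all quantization parameters from the op's attributes rather than from extra arguments. The shapes below are illustrative; dyn_quant is assumed to be the asymmetric, Planar-storage node built in the sketch above.

std::vector<ov::PartialShape> in_shapes = {ov::PartialShape{-1, 32, -1, 128}};
auto out_shapes = ov::op::internal::DynamicQuantize::shape_infer(dyn_quant.get(), in_shapes);

// out_shapes[0]: quantized data, same shape as the input
// out_shapes[1]: scales, one element per group, transposed by attrs.scales_zp_output_order
// out_shapes[2]: zero points, emitted only for asymmetric quantization with Planar storage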
3 changes: 1 addition & 2 deletions src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp
@@ -37,8 +37,7 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA {
const std::vector<int64_t>& order_k,
const std::vector<int64_t>& order_v,
const std::vector<int64_t>& order_out,
const QuantizationConfig& quantization_config,
const bool combine_scales_and_zp,
const QuantizationAttribute& quantization_attribute,
const ov::element::Type output_type = ov::element::undefined);

bool visit_attributes(ov::AttributeVisitor &visitor) override;
34 changes: 7 additions & 27 deletions src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
@@ -19,8 +19,6 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {
public:
OPENVINO_OP("KVCache", "gpu_opset");

using QuantizationConfig = ov::op::internal::QuantizationConfig;

KVCache() = default;

KVCache(const Output<Node>& past,
@@ -37,15 +35,6 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {
int64_t gather_axis,
const ov::element::Type output_type = ov::element::undefined);

KVCache(const OutputVector& inputs,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
int64_t gather_axis,
bool combine_scales_and_zp,
const QuantizationConfig& config,
const std::vector<uint64_t>& scales_zp_output_order,
const ov::element::Type output_type = ov::element::undefined);

bool visit_attributes(ov::AttributeVisitor& visitor) override;

void validate_and_infer_types() override;
@@ -65,32 +54,23 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {

bool get_indirect() const { return m_indirect; }

bool get_kv_compressed() const { return m_compressed; }
bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; }
QuantizationConfig get_quantization_config() const { return m_quantization_config; }
std::vector<uint64_t> get_scales_zp_output_order() const { return m_scales_zp_output_order; }
protected:
KVCache(const OutputVector& inputs,
const std::shared_ptr<ov::op::util::Variable>& past_values,
bool indirect,
int64_t concat_axis,
int64_t gather_axis,
const ov::element::Type output_type = ov::element::undefined);

private:
int64_t m_concat_axis = 0;
int64_t m_gather_axis = 0;
bool m_indirect = false;

bool m_compressed = false;
bool m_combine_scales_and_zp = false;
QuantizationConfig m_quantization_config = {};
std::vector<uint64_t> m_scales_zp_output_order = {};

ov::element::Type m_output_type;
};

std::vector<ov::PartialShape> shape_infer(const KVCache* op, const std::vector<ov::PartialShape>& input_shapes);

std::vector<ov::PartialShape> shape_infer(const KVCache* op,
const std::vector<ov::PartialShape>& input_shapes,
const ov::op::internal::QuantizationConfig& config,
const std::vector<uint64_t>& scales_output_order = {},
bool combine_scales_and_zp = false);

} // namespace op
} // namespace intel_gpu
} // namespace ov
56 changes: 56 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/op/kv_cache_compressed.hpp
@@ -0,0 +1,56 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "intel_gpu/op/kv_cache.hpp"
#include "ov_ops/dynamic_quantize.hpp"

namespace ov {
namespace intel_gpu {
namespace op {

/// \brief Operator that implements Key-Values cache subgraph for large language models.
/// This operation updates data of the corresponding Variable
class KVCacheCompressed : public ov::intel_gpu::op::KVCache {
public:
OPENVINO_OP("KVCacheCompressed", "gpu_opset");

using QuantizationAttrs = ov::op::internal::DynamicQuantize::Attributes;

KVCacheCompressed() = default;

KVCacheCompressed(const OutputVector& inputs,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
int64_t gather_axis,
const QuantizationAttrs& quantization_attrs,
const ov::element::Type output_type = ov::element::undefined);

void validate_and_infer_types() override;

std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

bool get_kv_compressed() const { return m_compressed; }
bool get_combine_scales_and_zp() const {
return m_quantization_attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric &&
m_quantization_attrs.output_storage_type != ov::op::internal::DynamicQuantize::OutputStorageType::Planar;
}

QuantizationAttrs get_quantization_attrs() const { return m_quantization_attrs; }
void set_quantization_attrs(QuantizationAttrs attrs) { m_quantization_attrs = std::move(attrs); }

std::vector<uint64_t> get_scales_zp_output_order() const { return m_quantization_attrs.scales_zp_output_order; }

private:
bool m_compressed;
QuantizationAttrs m_quantization_attrs = {};
};

std::vector<ov::PartialShape> shape_infer(const KVCacheCompressed* op,
const std::vector<ov::PartialShape>& input_shapes);

} // namespace op
} // namespace intel_gpu
} // namespace ov
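A minimal construction sketch for the new operation. The quantization attribute values mirror the DynamicQuantize example above, while the inputs vector, variable, and axis values are placeholders, since the expected input layout is not shown in this hunk.

using ov::intel_gpu::op::KVCacheCompressed;
using ov::op::internal::DynamicQuantize;

KVCacheCompressed::QuantizationAttrs attrs;
attrs.quantization_type = DynamicQuantize::QuantizationType::Asymmetric;
attrs.quantization_dt = ov::element::i8;
attrs.scale_dt = ov::element::f16;
attrs.zp_dt = ov::element::f16;
attrs.group_sizes = {1, 1, 1, 32};
attrs.scales_zp_output_order = {0, 1, 2, 3};
attrs.output_storage_type = DynamicQuantize::OutputStorageType::InterleavedScalesZP;

// inputs (ov::OutputVector) and past_variable are assumed to exist; axes are placeholders
auto kv_cache = std::make_shared<KVCacheCompressed>(inputs, past_variable,
                                                    /*concat_axis=*/2, /*gather_axis=*/0, attrs);

// With Asymmetric quantization and non-Planar storage, get_combine_scales_and_zp() returns true,
// i.e. scales and zero points are packed into a single interleaved buffer.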