forked from openvinotoolkit/openvino
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
WIP: [GPU] KV-cache compression support
- Loading branch information
Showing
70 changed files
with
3,303 additions
and
219 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
// Copyright (C) 2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "openvino/op/op.hpp" | ||
#include "ov_ops/dynamic_quantize.hpp" | ||
|
||
namespace ov { | ||
namespace intel_gpu { | ||
namespace op { | ||
|
||
/// \brief GPU-plugin DynamicQuantize operation, registered as "DynamicQuantize" in "gpu_opset". | ||
/// | ||
/// Derives from ov::op::internal::DynamicQuantize and additionally stores an output order | ||
/// for scales / zero points and a flag requesting that scales and zero points be combined. | ||
class DynamicQuantize : public ov::op::internal::DynamicQuantize { | ||
public: | ||
OPENVINO_OP("DynamicQuantize", "gpu_opset"); | ||
|
||
// Shorthand for the quantization configuration type of the base (internal) operation. | ||
using QuantizationConfig = ov::op::internal::QuantizationConfig; | ||
|
||
DynamicQuantize() = default; | ||
/// \brief Constructs a DynamicQuantize operation. | ||
/// | ||
/// \param data Input tensor with data | ||
/// \param config Dynamic quantization configuration | ||
/// \param scales_zp_output_order Non default order of scales (empty by default) | ||
/// \param combine_scales_and_zp Save scales and zero points into single buffer by pairs (scale, zp) | ||
DynamicQuantize(const Output<Node>& data, | ||
const QuantizationConfig& config, | ||
const std::vector<uint64_t>& scales_zp_output_order = {}, | ||
const bool combine_scales_and_zp = false); | ||
|
||
// Overrides the base-class hook; the definition is not part of this header. | ||
void validate_and_infer_types() override; | ||
|
||
// Overrides the base-class hook; returns a node built from new_args (definition not in this header). | ||
std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override; | ||
|
||
/// \brief Returns a const reference to the stored scales / zero-points output order. | ||
const std::vector<uint64_t>& get_scales_zp_output_order() const { | ||
return m_scales_zp_output_order; | ||
}; | ||
|
||
/// \brief Returns the stored combine-scales-and-zero-points flag. | ||
bool get_combine_scales_and_zp() const { | ||
return m_combine_scales_and_zp; | ||
}; | ||
|
||
/// \brief Static shape-inference helper (declared here, defined elsewhere). | ||
/// | ||
/// \param op Operation the shapes are inferred for | ||
/// \param input_shapes Partial shapes of the inputs | ||
/// \param config Dynamic quantization configuration | ||
/// \param scales_zp_output_order Order of scales / zero points in the output | ||
/// \param combine_scales_and_zp Whether scales and zero points share a single buffer (false by default) | ||
/// \return Vector of partial shapes - presumably one per output; confirm against the definition | ||
static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op, | ||
const std::vector<ov::PartialShape>& input_shapes, | ||
const QuantizationConfig& config, | ||
const std::vector<uint64_t>& scales_zp_output_order, | ||
const bool combine_scales_and_zp = false); | ||
|
||
private: | ||
// Value returned by get_combine_scales_and_zp(); false unless set otherwise. | ||
bool m_combine_scales_and_zp = false; | ||
// Value returned by get_scales_zp_output_order(). | ||
std::vector<uint64_t> m_scales_zp_output_order; | ||
}; | ||
|
||
} // namespace op | ||
} // namespace intel_gpu | ||
} // namespace ov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Copyright (C) 2023 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "intel_gpu/op/read_value.hpp" | ||
|
||
namespace ov { | ||
namespace intel_gpu { | ||
namespace op { | ||
|
||
/// \brief This operation handles the OpenVINO GPU Plugin's custom variable representation (which can store multiple states in a single variable) at the graph level. | ||
/// | ||
/// Derives from the plugin's ReadValue operation and is registered as "ReadValues" in "gpu_opset". | ||
class ReadValues : public ReadValue { | ||
public: | ||
OPENVINO_OP("ReadValues", "gpu_opset"); | ||
|
||
ReadValues() = default; | ||
|
||
/// \brief Constructs the operation without input arguments. | ||
/// | ||
/// \param variable Variable this operation refers to | ||
/// \param internal_states_infos Infos of the variable's internal states | ||
ReadValues(const std::shared_ptr<ov::op::util::Variable>& variable, | ||
const std::vector<ov::op::util::VariableInfo>& internal_states_infos); | ||
|
||
/// \brief Constructs the operation with input arguments. | ||
/// | ||
/// \param variable_initializers Input outputs of the node - presumably the initial values of the states; confirm against the definition | ||
/// \param variable Variable this operation refers to | ||
/// \param internal_states_infos Infos of the variable's internal states | ||
ReadValues(const OutputVector& variable_initializers, | ||
const std::shared_ptr<ov::op::util::Variable>& variable, | ||
const std::vector<ov::op::util::VariableInfo>& internal_states_infos); | ||
|
||
// The three overrides below are only declared here; their definitions are not part of this header. | ||
bool visit_attributes(ov::AttributeVisitor& visitor) override; | ||
|
||
void validate_and_infer_types() override; | ||
|
||
std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override; | ||
|
||
private: | ||
// Infos of the internal states - presumably filled from the constructor argument of the same name. | ||
std::vector<ov::op::util::VariableInfo> m_internal_states_infos; | ||
}; | ||
|
||
} // namespace op | ||
} // namespace intel_gpu | ||
} // namespace ov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.