[GPU] Fix sd1.5_controlnet_lora bad image. #26881

Merged
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/gemm.cpp
@@ -229,7 +229,8 @@ layout gemm_inst::transform_output_layout(const std::shared_ptr<const gemm> prim
                              (i == 1) ? transposed_input1_pshape :
                                         input_layouts[i].get_partial_shape();
         for (size_t j = 0; j != input_pshape.size(); ++j) {
-            ov::Dimension::merge(output_pshape[j], output_pshape[j], input_pshape[j]);
+            if (input_pshape[j].get_max_length() != input_pshape[j].get_min_length())
+                ov::Dimension::merge(output_pshape[j], output_pshape[j], input_pshape[j]);
         }
     }

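Why this fixes the gemm shape: in the sd1.5_controlnet_lora pattern one gemm input is fully dynamic while the other is static with a broadcastable batch of 1. Unconditionally merging every input dimension into the output pinned the still-dynamic output batch to 1 instead of leaving it to be resolved at runtime. The guard skips static dimensions (min length == max length) and merges only dynamic intervals. Below is a minimal standalone sketch of the failure mode built on the public ov::Dimension API; it is illustrative only, not code from this PR.

#include <openvino/core/dimension.hpp>
#include <iostream>

int main() {
    ov::Dimension out_batch = ov::Dimension::dynamic();  // output batch not known yet
    ov::Dimension in1_batch{1};                          // static batch of the second input

    // Old behavior: merge ran unconditionally. Merging a dynamic dimension
    // with a static 1 yields 1, pinning the output batch even though
    // broadcasting can pair it with a batch-2 input at runtime.
    ov::Dimension::merge(out_batch, out_batch, in1_batch);
    std::cout << out_batch << std::endl;  // prints 1

    // New behavior: in1_batch.get_min_length() == in1_batch.get_max_length(),
    // so the merge is skipped and out_batch stays dynamic.
    return 0;
}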
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -274,6 +274,9 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st
     }
     node.set_output_layout(concat_layout);
     node.can_be_optimized(true);
+    if (node.is_dynamic()) {
+        node.set_runtime_skippable(true);
+    }
     GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized" << std::endl;
 }

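This marks every dynamic in-place-fused concatenation as runtime-skippable at graph-optimization time. The primitive_inst.cpp changes below consume this flag so that only such concatenations take the runtime in-place path; see the sketch after that section.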
7 changes: 4 additions & 3 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -502,7 +502,7 @@ event::ptr primitive_inst::realloc_if_needed() {
 
     event::ptr ev = nullptr;
     const auto& users = get_user_insts();
-    if (users.size() == 1 && users.front()->get_node().is_type<concatenation>()) {
+    if (users.size() == 1 && users.front()->get_node().is_type<concatenation>() && users.front()->get_node().is_runtime_skippable()) {
         auto concat_inst = users.front();
         if (concat_inst->can_be_optimized()) {
             if (!concat_inst->allocation_done_by_other) {
@@ -656,7 +656,7 @@ event::ptr primitive_inst::realloc_if_needed() {
     }
 
     // Clear out memory if it was previously reused, but now primitive can't be optimized
-    if (_node->is_runtime_skippable() || _node->is_type<crop>()) {
+    if (!_node->is_type<concatenation>() && (_node->is_runtime_skippable() || _node->is_type<crop>())) {
         if (can_be_optimized()) {
             _max_output_layout_count = _deps[0].first->_max_output_layout_count;
             GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized");
@@ -1351,7 +1351,8 @@ void primitive_inst::do_runtime_in_place_concat() {
     if (get_users().size() != 1) return;
 
     auto concat_inst = get_user_insts().front();
-    if (!concat_inst->get_node().is_type<concatenation>() || !concat_inst->get_node().can_be_optimized())
+
+    if (!concat_inst->get_node().is_type<concatenation>() || !(concat_inst->get_node().can_be_optimized() && concat_inst->get_node().is_runtime_skippable()))
         return;
 
     if (has_subgraph_dependency(concat_inst->dependencies())) {
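Net effect of the three hunks above: the special handling for a concat user now requires the runtime-skippable mark (set only for dynamic concats in prepare_buffer_fusing), and the memory-clearing branch in realloc_if_needed() explicitly excludes concatenation nodes. A condensed, hypothetical sketch of the new gate in do_runtime_in_place_concat(); the helper name and flat bool parameters are mine, for illustration only:

#include <cassert>

// Hypothetical condensation of the predicate guarding the runtime
// in-place-concat path after this PR.
static bool takes_in_place_concat_path(bool is_concatenation,
                                       bool can_be_optimized,
                                       bool is_runtime_skippable) {
    return is_concatenation && can_be_optimized && is_runtime_skippable;
}

int main() {
    // A dynamic concat fused by prepare_buffer_fusing: handled in place.
    assert(takes_in_place_concat_path(true, true, true));
    // A static concat may still be optimizable, but it is no longer marked
    // runtime-skippable, so the runtime path leaves it alone.
    assert(!takes_in_place_concat_path(true, true, false));
    assert(!takes_in_place_concat_path(false, true, true));
    return 0;
}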
12 changes: 12 additions & 0 deletions src/plugins/intel_gpu/src/graph/program_node.cpp
@@ -1548,6 +1548,18 @@ void program_node::create_onednn_primitive_attributes(
                                                  mem_desc.get_dims(), mem_desc.get_data_type());
         } else if (is_type<gemm>()) {
             size_t rank = cldnn::format::dimension(in.format);
+            auto in_pshape = in.get_partial_shape();
+            auto out_pshape = get_output_layout().get_partial_shape();
+            size_t ones_to_add = std::max(out_pshape.size(), static_cast<size_t>(rank)) - in_pshape.size();
+            if (ones_to_add > 0) {
+                layout new_layout = in;
+                ov::PartialShape new_input_pshape;
+                std::vector<ov::Dimension> dims(in_pshape.begin(), in_pshape.begin() + in_pshape.size());
+                new_input_pshape = ov::PartialShape(dims);
+                new_input_pshape.insert(new_input_pshape.begin(), ones_to_add, 1ul);
+                new_layout.set_partial_shape(new_input_pshape);
+                in = new_layout;
+            }
             size_t in_batched_size = in.count() / (in.spatial(0) * in.spatial(1));
             dnnl::memory::dims dims = onednn::convert_gemm_tensor(in.get_tensor(), rank, in_batched_size == 1);
             dnnl::memory::data_type dt = onednn::convert_data_type(in.data_type);
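The added block aligns input rank before building the oneDNN gemm descriptor: when an input has fewer dimensions than the output (or the format rank), it is padded with leading 1s, so a static {1, K, N} input paired with a 4-D output becomes {1, 1, K, N} rather than being misinterpreted during dims conversion. A standalone sketch of the same alignment over the public ov::PartialShape API (align_rank is an illustrative helper, not plugin code):

#include <openvino/core/partial_shape.hpp>
#include <iostream>

// Illustrative helper mirroring the ones_to_add logic above: pad `in`
// with leading 1s until it reaches target_rank.
static ov::PartialShape align_rank(const ov::PartialShape& in, size_t target_rank) {
    if (in.size() >= target_rank)
        return in;
    ov::PartialShape padded = in;
    padded.insert(padded.begin(), target_rank - in.size(), ov::Dimension(1));
    return padded;
}

int main() {
    ov::PartialShape input1{1, 2, 2};                 // static gemm input from the new unit test
    std::cout << align_rank(input1, 4) << std::endl;  // prints [1,1,2,2]
    return 0;
}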
74 changes: 74 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
@@ -1577,6 +1577,76 @@ class gemm_gpu_tests: public ::testing::Test {
             ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i;
         }
     }

+    void test_dynamic_static_broadcast_3dim(std::vector<size_t> BMKN, bool is_caching_test, const double abs_error = 0.0001) {
+        tests::random_generator rg;
+        rg.set_seed(GET_SUITE_NAME);
+
+        auto& engine = get_test_engine();
+        cldnn::layout input0_layout;
+        cldnn::layout input1_layout;
+
+        std::vector<int64_t> input0_order = {0, 1, 2};
+        std::vector<int64_t> input1_order = {0, 1, 2};
+        std::vector<int64_t> output_order = {0, 1, 2};
+
+        size_t BATCH_SIZE = BMKN[0];
+        size_t M_SIZE = BMKN[1];
+        size_t K_SIZE = BMKN[2];
+        size_t N_SIZE = BMKN[3];
+
+        ov::Shape input0_shape = { BATCH_SIZE, M_SIZE, K_SIZE };
+        ov::Shape input1_shape = { 1, K_SIZE, N_SIZE };
+        ov::Shape output_shape = { BATCH_SIZE, M_SIZE, N_SIZE };
+
+        input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f16, format::bfyx};
+        input1_layout = layout{ov::PartialShape(input1_shape), data_types::f16, format::bfyx};
+
+        auto input0_mem = engine.allocate_memory(layout{ov::PartialShape(input0_shape), data_types::f16, format::bfyx});
+        auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(input1_shape), data_types::f16, format::bfyx});
+
+        auto input_0_data = rg.generate_random_1d<ov::float16>(ov::shape_size(input0_shape), -2, 2);
+        auto input_1_data = rg.generate_random_1d<ov::float16>(ov::shape_size(input1_shape), -2, 2);
+
+        set_values(input0_mem, input_0_data);
+        set_values(input1_mem, input_1_data);
+
+        topology topology;
+        topology.add(input_layout("input0", input0_layout),
+                     input_layout("input1", input1_layout),
+                     gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f16, input0_order, input1_order, output_order)
+        );
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input0", input0_mem);
+        network->set_input_data("input1", input1_mem);
+
+        auto outputs = network->execute();
+
+        auto output_mem = outputs.at("gemm").get_memory();
+        cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+
+        std::vector<ov::float16> ref_out_data;
+        ref_out_data.resize(ov::shape_size(output_shape));
+
+        ov::reference::matmul<ov::float16>(input_0_data.data(),
+                                           input_1_data.data(),
+                                           ref_out_data.data(),
+                                           input0_shape,
+                                           input1_shape,
+                                           output_shape,
+                                           false,
+                                           false);
+
+        ASSERT_EQ(output_ptr.size(), ref_out_data.size());
+
+        for (uint32_t i = 0; i < ref_out_data.size(); ++i) {
+            ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i;
+        }
+    }
 };
 
 TEST_F(gemm_gpu_tests, basic_bfyx_t2_inplace_crop_with_pad) {
@@ -1710,6 +1780,10 @@ TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f32_n_tile_32_input1_ylast) {
     this->test_transpose_matmul_f32(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 1, 2, 3}, /*input1_order*/{0, 1, 3, 2});
 }

+TEST_F(gemm_gpu_tests, test_dynamic_static_broadcast_3dim) {
+    this->test_dynamic_static_broadcast_3dim(/*BMKN*/{2, 16, 2, 2}, false);
+}
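The new registration runs the case with BMKN = {2, 16, 2, 2}: input0 is declared fully dynamic and fed a batch-2 tensor while the static input1 keeps batch 1, forcing the gemm to broadcast across the batch. This is the shape pattern behind the bad sd1.5_controlnet_lora image.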

 TEST_F(gemm_gpu_tests, transpose_matmul_in0_indirect) {
     this->test_transpose_indirect(false, true, false);
 }