
Commit

refine
Signed-off-by: fishbell <[email protected]>
songbell committed Feb 18, 2025
1 parent fd2117a commit 2464c03
Showing 5 changed files with 150 additions and 74 deletions.
@@ -1,28 +1,26 @@
// Copyright (C) 2018-2024 Intel Corporation
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

#define TO_OUTPUT_TYPE CAT(convert_, OUTPUT_TYPE)
#define INPUT0_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
#define INPUT1_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, 8)
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)

#define TO_VECTOR_TYPE_IMPL_8(elem_type) CAT(convert_##elem_type, 8)
#define TO_VECTOR_TYPE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_, size)(elem_type)

#define TO_VECTOR_TYPE_IMPL_SAT_8(elem_type) CAT(convert_##elem_type, 8##_sat)
#define TO_VECTOR_TYPE_IMPL_SAT_RTE_8(elem_type) CAT(convert_##elem_type, 8##_sat_rte)
#define TO_VECTOR_TYPE_SAT(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_, size)(elem_type)
#define TO_VECTOR_TYPE_SAT_RTE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_RTE_, size)(elem_type)
#define VLOAD_DECLS vload8(global_id, input)
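// For reference, with OUTPUT_TYPE == uchar the helper macros above should expand to:
//   TO_VECTOR_TYPE(uchar, 8)         -> convert_uchar8
//   TO_VECTOR_TYPE_SAT(uchar, 8)     -> convert_uchar8_sat
//   TO_VECTOR_TYPE_SAT_RTE(uchar, 8) -> convert_uchar8_sat_rte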

#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
#endif
#ifndef IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
#endif

KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
const __global INPUT1_TYPE* input_low,
@@ -37,7 +35,8 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
{
const int global_id = get_global_id(0);

const INPUT0_VEC_TYPE in0 = VLOAD_DECLS;
const INPUT0_VEC_TYPE in0 = _sub_group_block_read8(input);
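// _sub_group_block_read8 is assumed here to be the batch-header wrapper around
// intel_sub_group_block_read8: a sub-group-wide load of 8 elements per work-item
// from an aligned address, which is presumably why Validate() below requires the
// first-element offset to be a multiple of the vector size.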

OUTPUT_VEC_TYPE res;

@@ -113,21 +112,11 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG

#endif // CAN_USE_OUTPUT_RANGE

// *********************************** //
// Common section with results writing //
// *********************************** //

#if FEATURE_BLOCKED_FORMAT
//if (of < OUTPUT_FEATURE_NUM)
#endif
#if OUTPUT_IS_FP
res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
#else
res = TO_VECTOR_TYPE_SAT_RTE(OUTPUT_TYPE, 8)(val);
#endif

vstore8(res, global_id, output);
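// vstore8 writes the 8 converted results back at element offset global_id * 8,
// mirroring the 8-element-per-work-item read at the top of the kernel.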
}

#undef TO_OUTPUT_TYPE
#undef TO_OUTPUT_TYPE_SAT_RTE
}
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2024 Intel Corporation
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -9,8 +9,7 @@

#include "kernel_selector_utils.h"

static const size_t sub_group_size = 32;
static const size_t feature_size = 32;
static const size_t vec_size = 8;

namespace kernel_selector {
ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
@@ -30,20 +29,43 @@ ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableQuantizeScaleShiftOpt();
k.EnableDynamicShapesSupport();
return k;
}

auto parse_block_size = [](int index, DataLayout dl) {
std::string format_str = toString(dl);
auto get_block_size = [&] (std::string substr) {
auto start_pos = format_str.find(substr);
if (start_pos != std::string::npos) {
auto end_pos = format_str.find("_", start_pos);
auto sub_string = format_str.substr(start_pos + substr.length(), end_pos - (start_pos + substr.length()));
return std::atoi(sub_string.c_str());
}
return 1;
};
return index == 0 ? get_block_size("BSV") : (index == 1 ? get_block_size("FSV") : 1);
};
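// Assuming toString() yields the upper-case layout name, e.g. "BS_FS_YX_BSV32_FSV16",
// parse_block_size(0, ...) == 32 (batch block) and parse_block_size(1, ...) == 16
// (feature block); plain formats such as BFYX contain neither token, so both return 1.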

auto get_total_size = [](const quantize_params& params) {
const auto& input = params.inputs[0];
size_t total_size = input.LogicalSize();
auto feature_block_size = parse_block_size(1, input.GetLayout());
auto feature_division = feature_block_size > 1 ? (input.Feature().v ? input.Feature().v : 1) : 1;
auto feature_align_multiplier = feature_block_size > 1 ? Align(input.Feature().v, feature_block_size) : 1;
auto batch_block_size = parse_block_size(0, input.GetLayout());
auto batch_division = batch_block_size > 1 ? (input.Batch().v ? input.Batch().v : 1) : 1;
auto batch_align_multiplier = batch_block_size > 1 ? Align(input.Batch().v, batch_block_size) : 1;
return (total_size / (feature_division * batch_division)) * feature_align_multiplier * batch_align_multiplier;
};
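// Worked example (hypothetical 1x20x2x2 b_fs_yx_fsv16 input): LogicalSize() == 80,
// feature_division == 20, Align(20, 16) == 32, so the lambda returns
// 80 / 20 * 32 == 128 -- the physical element count including block padding.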

CommonDispatchData QuantizeKernelScaleShift_vload8::SetDefault(const quantize_params& params) const {
CommonDispatchData dispatchData;
// blocked formats are covered by get_total_size(), which accounts for block padding
dispatchData.gws[0] = std::max(params.outputs[0].LogicalSize() / 8, (size_t)1);
dispatchData.gws[0] = std::max(get_total_size(params) / vec_size, (size_t)1);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]},
params.engineInfo);
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
return dispatchData;
}
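// Example (assumed static shape): for a 1x16x2x2 b_fs_yx_fsv16 input,
// get_total_size() == 64 and vec_size == 8, giving gws == { 8, 1, 1 };
// each work-item then processes one 8-element chunk.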

@@ -83,56 +105,41 @@ bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
// this kernel is optimized for per-tensor quantization params for now
if (!params.per_tensor_input_range || !params.per_tensor_output_range || !params.per_tensor_input_scale ||
!params.per_tensor_output_scale || !params.per_tensor_output_shift ||
(params.has_pre_shift && !params.per_tensor_input_shift))
return false;
/*auto check_blocked_format = [] (const DataTensor& dt) -> bool {
// if padding is present for a blocked format, unnecessary calculations are introduced when directly using vector compute
auto feature_block_size = 16;
auto feature_size = dt.Feature().v;
if (feature_size % feature_block_size != 0)
return false;
if (dt.DoubleBlockedLayout()) {
auto batch_size = dt.Batch().v;
if (batch_size % feature_block_size != 0)
return false;
}
return true;
};*/
if (!params.outputs[0].SimpleLayout() || params.outputs[0].GetLayout() != params.inputs[0].GetLayout() || params.outputs[0].PhysicalSize() % 8 != 0)
(params.has_pre_shift && !params.per_tensor_input_shift) ||
params.outputs[0].GetLayout() != params.inputs[0].GetLayout())
return false;
/*if (!params.outputs[0].SimpleLayout()) {
//return check_blocked_format(params.outputs[0]);
return false;
}*/
return true;
}

KernelsData QuantizeKernelScaleShift_vload8::GetKernelsData(const Params& params) const {
assert(params.GetType() == KernelType::QUANTIZE);

KernelData kd = KernelData::Default<quantize_params>(params);
quantize_params& nparams = *static_cast<quantize_params*>(kd.params.get());

if (!Validate(params)) {
return {};
// for blocked formats, if extra padding exists in a block, it will be optimized in a separate kernel
if (!params.inputs[0].SimpleLayout()) {
const auto input_layout = params.inputs[0].GetLayout();
const auto batch_size = params.inputs[0].Batch().v;
const auto feature_size = params.inputs[0].Feature().v;
if (((input_layout == DataLayout::b_fs_yx_fsv16 || input_layout == DataLayout::b_fs_zyx_fsv16) &&
feature_size % 16 != 0) ||
((input_layout == DataLayout::b_fs_yx_fsv32 || input_layout == DataLayout::b_fs_zyx_fsv32) &&
feature_size % 32 != 0) ||
(input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
input_layout == DataLayout::fs_b_yx_fsv32 ||
((input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 ||
input_layout == DataLayout::bs_fs_zyx_bsv32_fsv16) &&
(feature_size % 16 != 0 || batch_size % 32 != 0)) ||
((input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 ||
input_layout == DataLayout::bs_fs_zyx_bsv32_fsv32) &&
(feature_size % 32 != 0 || batch_size % 32 != 0)) ||
((input_layout == DataLayout::bs_fs_yx_bsv16_fsv16 ||
input_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) &&
(feature_size % 16 != 0 || batch_size % 16 != 0)) ||
((input_layout == DataLayout::bs_fs_yx_bsv16_fsv32 ||
input_layout == DataLayout::bs_fs_zyx_bsv16_fsv32) &&
(feature_size % 32 != 0 || batch_size % 16 != 0)))
return false;
}
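// In short: blocked layouts are accepted only when feature (and, for double-blocked
// layouts, batch) fills its block exactly; e.g. b_fs_yx_fsv16 with feature_size == 20
// is rejected because the trailing block would carry padding.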

auto dispatchData = SetDefault(nparams);
auto entry_point = GetEntryPoint(kernelName, nparams.layerID, params);
auto cldnn_jit = GetJitConstants(nparams, dispatchData);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

GetUpdateDispatchDataFunc(kd);

auto& kernel = kd.kernels[0];

kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
kernel.params.arguments =
GetArgsDesc(static_cast<int>(nparams.inputs.size()), false, false, 0, 1, nparams.has_dynamic_tensors());

return {kd};
auto total_size = get_total_size(params);
if ((total_size % vec_size) != 0 || (params.inputs[0].GetFirstElementOffset() % vec_size) != 0)
return false;
return true;
}
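// Example: a bfyx 1x3x5x5 tensor has 75 elements; 75 % 8 != 0, so this kernel is
// rejected and the selector presumably falls back to a non-vectorized quantize kernel.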

KernelsPriority QuantizeKernelScaleShift_vload8::GetKernelsPriority(const Params& /*params*/) const {
@@ -17,7 +17,7 @@ class QuantizeKernelScaleShift_vload8 : public QuantizeKernelBase {
CommonDispatchData SetDefault(const quantize_params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
KernelsData GetKernelsData(const Params& params) const override;

protected:
bool Validate(const Params& p) const override;
JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
@@ -13,6 +13,7 @@ quantize_kernel_selector::quantize_kernel_selector() {
Attach<QuantizeKernelRef>();
Attach<QuantizeKernelScaleShift>();
Attach<QuantizeKernelScaleShift_vload8>();
//Attach<QuantizeKernelScaleShift_vload8_opt>();
}

KernelsData quantize_kernel_selector::GetBestKernels(const Params& params) const {
83 changes: 81 additions & 2 deletions src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
@@ -758,7 +758,7 @@ TEST(quantize_gpu, opt_vec_kernel) {
auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });

layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
layout in_dyn_layout { ov::PartialShape{1, 16, 2, 2}, data_types::f32, format::bfyx };

set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
@@ -810,7 +810,6 @@
auto inst = network.get_primitive("quantize");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());

auto outputs = network.execute();

@@ -828,6 +827,86 @@
}
}

TEST(quantize_gpu, opt_vec_kernel_fsv16) {
auto& engine = get_test_engine();

auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::b_fs_yx_fsv16 });
auto input_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto input_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });

layout in_dyn_layout { ov::PartialShape{1, 16, 2, 2}, data_types::f32, format::b_fs_yx_fsv16 };

set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,

1.0f, 1.0f, 1.0f, 1.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f,

1.0f, 2.0f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,

1.0f, 1.0f, 1.0f, 1.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f });

set_values(input_low, { 0.0f });
set_values(input_high, { 10.0f });

set_values(output_low, { 0.0f });
set_values(output_high, { 255.0f });

std::vector<uint8_t> ref_data = {0, 53, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 25, 25,
25, 25, 25, 25, 102, 153, 76, 76, 76, 128, 25, 25, 25, 25, 25, 25,
25, 51, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 25, 25,
25, 25, 25, 25, 102, 153, 76, 76, 76, 128, 25, 25, 25, 25, 25, 25};
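// The reference values appear to follow the FakeQuantize math with levels == 255:
// q = round(clamp(x, 0, 10) / 10 * 254) and out = round_rte(q * 255 / 254);
// e.g. x == 2.1f -> round(53.34) == 53, and x == 5.0f -> 127 -> 127.5 -> 128.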

topology topology;
topology.add(
input_layout("input", in_dyn_layout),
data("input_low", input_low),
data("input_high", input_high),
data("output_low", output_low),
data("output_high", output_high),
quantize("quantize", input_info("input"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 255, data_types::u8),
reorder("reorder", input_info("quantize"), format::bfyx, data_types::f32),
softmax("softmax", input_info("reorder"))
);

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
network network(engine, topology, config);
network.set_input_data("input", input);

auto inst = network.get_primitive("quantize");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);

auto outputs = network.execute();

auto output = outputs.at("softmax").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());

// Check that the layout and memory report the logical size of the tensor
ASSERT_EQ(output->count(), (size_t)64);
ASSERT_EQ(output->get_layout().count(), (size_t)64);

ASSERT_EQ(output->size(), ref_data.size() * sizeof(float));

for (size_t i = 0; i < ref_data.size(); ++i) {
ASSERT_NEAR(output_ptr[i], ref_data[i], 1) << " index = " << i;
}
}

TEST(quantize_gpu, dynamic_fsv16) {
auto& engine = get_test_engine();

