
Commit

refine
Signed-off-by: fishbell <[email protected]>
songbell committed Feb 18, 2025
1 parent fd2117a commit 2464c03
Showing 5 changed files with 150 additions and 74 deletions.
@@ -1,28 +1,26 @@
// Copyright (C) 2018-2024 Intel Corporation
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

#define TO_OUTPUT_TYPE CAT(convert_, OUTPUT_TYPE)
#define INPUT0_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
#define INPUT1_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, 8)
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)

#define TO_VECTOR_TYPE_IMPL_8(elem_type) CAT(convert_##elem_type, 8)
#define TO_VECTOR_TYPE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_, size)(elem_type)

#define TO_VECTOR_TYPE_IMPL_SAT_8(elem_type) CAT(convert_##elem_type, 8##_sat)
#define TO_VECTOR_TYPE_IMPL_SAT_RTE_8(elem_type) CAT(convert_##elem_type, 8##_sat_rte)
#define TO_VECTOR_TYPE_SAT(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_, size)(elem_type)
#define TO_VECTOR_TYPE_SAT_RTE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_RTE_, size)(elem_type)
#define VLOAD_DECLS vload8(global_id, input)
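// For reference, with OUTPUT_TYPE == uchar the helper macros above should expand to:
//   TO_VECTOR_TYPE(uchar, 8)         -> convert_uchar8
//   TO_VECTOR_TYPE_SAT(uchar, 8)     -> convert_uchar8_sat
//   TO_VECTOR_TYPE_SAT_RTE(uchar, 8) -> convert_uchar8_sat_rte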

#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
#endif
#ifndef IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
#endif

KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
const __global INPUT1_TYPE* input_low,
@@ -37,7 +35,8 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
{
const int global_id = get_global_id(0);

const INPUT0_VEC_TYPE in0 = VLOAD_DECLS;
const INPUT0_VEC_TYPE in0 = _sub_group_block_read8(input);
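// _sub_group_block_read8 is assumed here to be the batch-header wrapper around
// intel_sub_group_block_read8: a sub-group-wide load of 8 elements per work-item
// from an aligned address, which is presumably why Validate() below requires the
// first-element offset to be a multiple of the vector size.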

OUTPUT_VEC_TYPE res;

@@ -113,21 +112,11 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG

#endif // CAN_USE_OUTPUT_RANGE

// *********************************** //
// Common section with results writing //
// *********************************** //

#if FEATURE_BLOCKED_FORMAT
//if (of < OUTPUT_FEATURE_NUM)
#endif
#if OUTPUT_IS_FP
res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
#else
res = TO_VECTOR_TYPE_SAT_RTE(OUTPUT_TYPE, 8)(val);
#endif

vstore8(res, global_id, output);
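// vstore8 writes the 8 converted results back at element offset global_id * 8,
// mirroring the 8-element-per-work-item read at the top of the kernel.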
}

#undef TO_OUTPUT_TYPE
#undef TO_OUTPUT_TYPE_SAT_RTE
}
@@ -1,4 +1,4 @@
// Copyright (C) 2018-2024 Intel Corporation
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

@@ -9,8 +9,7 @@

#include "kernel_selector_utils.h"

static const size_t sub_group_size = 32;
static const size_t feature_size = 32;
static const size_t vec_size = 8;

namespace kernel_selector {
ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
@@ -30,20 +29,43 @@ ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableQuantizeScaleShiftOpt();
k.EnableDynamicShapesSupport();
return k;
}

auto parse_block_size = [](int index, DataLayout dl) {
std::string format_str = toString(dl);
auto get_block_size = [&] (std::string substr) {
auto start_pos = format_str.find(substr);
if (start_pos != std::string::npos) {
auto end_pos = format_str.find("_", start_pos);
auto sub_string = format_str.substr(start_pos + substr.length(), end_pos - (start_pos + substr.length()));
return std::atoi(sub_string.c_str());
}
return 1;
};
return index == 0 ? get_block_size("BSV") : (index == 1 ? get_block_size("FSV") : 1);
};
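// Assuming toString() yields the upper-case layout name, e.g. "BS_FS_YX_BSV32_FSV16",
// parse_block_size(0, ...) == 32 (batch block) and parse_block_size(1, ...) == 16
// (feature block); plain formats such as BFYX contain neither token, so both return 1.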

auto get_total_size = [](const quantize_params& params) {
const auto& input = params.inputs[0];
size_t total_size = input.LogicalSize();
auto feature_block_size = parse_block_size(1, input.GetLayout());
auto feature_division = feature_block_size > 1 ? (input.Feature().v ? input.Feature().v : 1) : 1;
auto feature_align_multiplier = feature_block_size > 1 ? Align(input.Feature().v, feature_block_size) : 1;
auto batch_block_size = parse_block_size(0, input.GetLayout());
auto batch_division = batch_block_size > 1 ? (input.Batch().v ? input.Batch().v : 1) : 1;
auto batch_align_multiplier = batch_block_size > 1 ? Align(input.Batch().v, batch_block_size) : 1;
return (total_size / (feature_division * batch_division)) * feature_align_multiplier * batch_align_multiplier;
};
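// Worked example (hypothetical 1x20x2x2 b_fs_yx_fsv16 input): LogicalSize() == 80,
// feature_division == 20, Align(20, 16) == 32, so the lambda returns
// 80 / 20 * 32 == 128 -- the physical element count including block padding.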

CommonDispatchData QuantizeKernelScaleShift_vload8::SetDefault(const quantize_params& params) const {
CommonDispatchData dispatchData;
// blocked formats are covered by get_total_size(), which accounts for block padding
dispatchData.gws[0] = std::max(params.outputs[0].LogicalSize() / 8, (size_t)1);
dispatchData.gws[0] = std::max(get_total_size(params) / vec_size, (size_t)1);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]},
params.engineInfo);
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
return dispatchData;
}
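// Example (assumed static shape): for a 1x16x2x2 b_fs_yx_fsv16 input,
// get_total_size() == 64 and vec_size == 8, giving gws == { 8, 1, 1 };
// each work-item then processes one 8-element chunk.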

@@ -83,56 +105,41 @@ bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
// this kernel is optimized for per-tensor quantization params for now
if (!params.per_tensor_input_range || !params.per_tensor_output_range || !params.per_tensor_input_scale ||
!params.per_tensor_output_scale || !params.per_tensor_output_shift ||
(params.has_pre_shift && !params.per_tensor_input_shift))
return false;
/*auto check_blocked_format = [] (const DataTensor& dt) -> bool {
// if padding is present for a blocked format, unnecessary calculations are introduced when directly using vector compute
auto feature_block_size = 16;
auto feature_size = dt.Feature().v;
if (feature_size % feature_block_size != 0)
return false;
if (dt.DoubleBlockedLayout()) {
auto batch_size = dt.Batch().v;
if (batch_size % feature_block_size != 0)
return false;
}
return true;
};*/
if (!params.outputs[0].SimpleLayout() || params.outputs[0].GetLayout() != params.inputs[0].GetLayout() || params.outputs[0].PhysicalSize() % 8 != 0)
(params.has_pre_shift && !params.per_tensor_input_shift) ||
params.outputs[0].GetLayout() != params.inputs[0].GetLayout())
return false;
/*if (!params.outputs[0].SimpleLayout()) {
//return check_blocked_format(params.outputs[0]);
return false;
}*/
return true;
}

KernelsData QuantizeKernelScaleShift_vload8::GetKernelsData(const Params& params) const {
assert(params.GetType() == KernelType::QUANTIZE);

KernelData kd = KernelData::Default<quantize_params>(params);
quantize_params& nparams = *static_cast<quantize_params*>(kd.params.get());

if (!Validate(params)) {
return {};
// for blocked formats, if extra padding exists in a block, it will be optimized in a separate kernel
if (!params.inputs[0].SimpleLayout()) {
const auto input_layout = params.inputs[0].GetLayout();
const auto batch_size = params.inputs[0].Batch().v;
const auto feature_size = params.inputs[0].Feature().v;
if (((input_layout == DataLayout::b_fs_yx_fsv16 || input_layout == DataLayout::b_fs_zyx_fsv16) &&
feature_size % 16 != 0) ||
((input_layout == DataLayout::b_fs_yx_fsv32 || input_layout == DataLayout::b_fs_zyx_fsv32) &&
feature_size % 32 != 0) ||
(input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
input_layout == DataLayout::fs_b_yx_fsv32 ||
((input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 ||
input_layout == DataLayout::bs_fs_zyx_bsv32_fsv16) &&
(feature_size % 16 != 0 || batch_size % 32 != 0)) ||
((input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 ||
input_layout == DataLayout::bs_fs_zyx_bsv32_fsv32) &&
(feature_size % 32 != 0 || batch_size % 32 != 0)) ||
((input_layout == DataLayout::bs_fs_yx_bsv16_fsv16 ||
input_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) &&
(feature_size % 16 != 0 || batch_size % 16 != 0)) ||
((input_layout == DataLayout::bs_fs_yx_bsv16_fsv32 ||
input_layout == DataLayout::bs_fs_zyx_bsv16_fsv32) &&
(feature_size % 32 != 0 || batch_size % 16 != 0)))
return false;
}
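// In short: blocked layouts are accepted only when feature (and, for double-blocked
// layouts, batch) fills its block exactly; e.g. b_fs_yx_fsv16 with feature_size == 20
// is rejected because the trailing block would carry padding.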

auto dispatchData = SetDefault(nparams);
auto entry_point = GetEntryPoint(kernelName, nparams.layerID, params);
auto cldnn_jit = GetJitConstants(nparams, dispatchData);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

GetUpdateDispatchDataFunc(kd);

auto& kernel = kd.kernels[0];

kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
kernel.params.arguments =
GetArgsDesc(static_cast<int>(nparams.inputs.size()), false, false, 0, 1, nparams.has_dynamic_tensors());

return {kd};
auto total_size = get_total_size(params);
if ((total_size % vec_size) != 0 || (params.inputs[0].GetFirstElementOffset() % vec_size) != 0)
return false;
return true;
}
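// Example: a bfyx 1x3x5x5 tensor has 75 elements; 75 % 8 != 0, so this kernel is
// rejected and the selector presumably falls back to a non-vectorized quantize kernel.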

KernelsPriority QuantizeKernelScaleShift_vload8::GetKernelsPriority(const Params& /*params*/) const {
@@ -17,7 +17,7 @@ class QuantizeKernelScaleShift_vload8 : public QuantizeKernelBase {
CommonDispatchData SetDefault(const quantize_params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
KernelsData GetKernelsData(const Params& params) const override;

protected:
bool Validate(const Params& p) const override;
JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
@@ -13,6 +13,7 @@ quantize_kernel_selector::quantize_kernel_selector() {
Attach<QuantizeKernelRef>();
Attach<QuantizeKernelScaleShift>();
Attach<QuantizeKernelScaleShift_vload8>();
//Attach<QuantizeKernelScaleShift_vload8_opt>();
}

KernelsData quantize_kernel_selector::GetBestKernels(const Params& params) const {
83 changes: 81 additions & 2 deletions src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
@@ -758,7 +758,7 @@ TEST(quantize_gpu, opt_vec_kernel) {
auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });

layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
layout in_dyn_layout { ov::PartialShape{1, 16, 2, 2}, data_types::f32, format::bfyx };

set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
@@ -810,7 +810,6 @@
auto inst = network.get_primitive("quantize");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());

auto outputs = network.execute();

@@ -828,6 +827,86 @@
}
}

TEST(quantize_gpu, opt_vec_kernel_fsv16) {
auto& engine = get_test_engine();

auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::b_fs_yx_fsv16 });
auto input_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto input_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });

layout in_dyn_layout { ov::PartialShape{1, 16, 2, 2}, data_types::f32, format::b_fs_yx_fsv16 };

set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,

1.0f, 1.0f, 1.0f, 1.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f,

1.0f, 2.0f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,

1.0f, 1.0f, 1.0f, 1.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f });

set_values(input_low, { 0.0f });
set_values(input_high, { 10.0f });

set_values(output_low, { 0.0f });
set_values(output_high, { 255.0f });

std::vector<uint8_t> ref_data = {0, 53, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 25, 25,
25, 25, 25, 25, 102, 153, 76, 76, 76, 128, 25, 25, 25, 25, 25, 25,
25, 51, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 25, 25,
25, 25, 25, 25, 102, 153, 76, 76, 76, 128, 25, 25, 25, 25, 25, 25};
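// The reference values appear to follow the FakeQuantize math with levels == 255:
// q = round(clamp(x, 0, 10) / 10 * 254) and out = round_rte(q * 255 / 254);
// e.g. x == 2.1f -> round(53.34) == 53, and x == 5.0f -> 127 -> 127.5 -> 128.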

topology topology;
topology.add(
input_layout("input", in_dyn_layout),
data("input_low", input_low),
data("input_high", input_high),
data("output_low", output_low),
data("output_high", output_high),
quantize("quantize", input_info("input"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 255, data_types::u8),
reorder("reorder", input_info("quantize"), format::bfyx, data_types::f32),
softmax("softmax", input_info("reorder"))
);

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
network network(engine, topology, config);
network.set_input_data("input", input);

auto inst = network.get_primitive("quantize");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);

auto outputs = network.execute();

auto output = outputs.at("softmax").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());

// Check that the layout and memory report the logical size of the tensor
ASSERT_EQ(output->count(), (size_t)64);
ASSERT_EQ(output->get_layout().count(), (size_t)64);

ASSERT_EQ(output->size(), ref_data.size() * sizeof(float));

for (size_t i = 0; i < ref_data.size(); ++i) {
ASSERT_NEAR(output_ptr[i], ref_data[i], 1) << " index = " << i;
}
}

TEST(quantize_gpu, dynamic_fsv16) {
auto& engine = get_test_engine();

