Merge branch 'master' into cmake_refactor

openvinotoolkit · Feb 19, 2025 · ccdd193 · ccdd193
2 parents feaaa55 + 6d19e8b
commit ccdd193
Show file tree

Hide file tree

Showing 7 changed files with 115 additions and 78 deletions.
diff --git a/src/common/util/include/openvino/util/common_util.hpp b/src/common/util/include/openvino/util/common_util.hpp
@@ -180,5 +180,13 @@ constexpr std::array<std::conditional_t<std::is_void_v<T>, std::common_type_t<Ar
     return {std::forward<Args>(args)...};
 }
 
+#if defined(_WIN32)
+bool may_i_use_dynamic_code();  // Windows: declared here, defined out of line in common_util.cpp
+#else
+constexpr bool may_i_use_dynamic_code() {  // non-Windows builds: compile-time constant, always true
+    return true;
+}
+#endif
+
 }  // namespace util
 }  // namespace ov
diff --git a/src/common/util/include/openvino/util/file_util.hpp b/src/common/util/include/openvino/util/file_util.hpp
@@ -239,7 +239,11 @@ inline std::string from_file_path(const ov::util::Path& path) {
 
 // TODO: remove this function after all calls use Path
 inline FilePath to_file_path(const ov::util::Path& path) {
+#if defined(_WIN32) && defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT)
+    return ov::util::string_to_wstring(path.string());  // Windows with Unicode path support: pass path.string() through string_to_wstring instead of using path.native()
+#else
     return path.native();
+#endif
 }
 
 #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT

diff --git a/src/common/util/src/common_util.cpp b/src/common/util/src/common_util.cpp
@@ -6,6 +6,10 @@
 
 #include <algorithm>
 
+#if defined(_WIN32)
+#    include <windows.h>
+#endif
+
 std::string ov::util::to_lower(const std::string& s) {
     std::string rc = s;
     std::transform(rc.begin(), rc.end(), rc.begin(), ::tolower);
@@ -60,3 +64,12 @@ std::string ov::util::filter_lines_by_prefix(const std::string& str, const std::
     }
     return res.str();
 }
+
+#if defined(_WIN32)
+bool ov::util::may_i_use_dynamic_code() {  // returns true unless the current process's dynamic-code policy has ProhibitDynamicCode set
+    HANDLE handle = GetCurrentProcess();
+    PROCESS_MITIGATION_DYNAMIC_CODE_POLICY dynamic_code_policy = {0};  // zero-initialized before the query
+    GetProcessMitigationPolicy(handle, ProcessDynamicCodePolicy, &dynamic_code_policy, sizeof(dynamic_code_policy));  // NOTE(review): return value is not checked; presumably a failed query leaves the zeroed policy untouched so the result is true - confirm
+    return dynamic_code_policy.ProhibitDynamicCode != TRUE;
+}
+#endif
diff --git a/src/core/reference/src/op/convert.cpp b/src/core/reference/src/op/convert.cpp
@@ -8,6 +8,7 @@
 
 #ifdef OV_CORE_USE_XBYAK_JIT
 #    include "openvino/reference/utils/jit_generator.hpp"
+#    include "openvino/util/common_util.hpp"
 #endif
 
 #ifdef OV_CORE_USE_INTRINSICS
@@ -480,14 +481,15 @@ class jit_count_out_of_range : public jit::Generator {
 template <class Clamp, typename TI, typename TO>
 void convert_impl(const TI* arg, TO* out, size_t count) {
 #ifdef OV_CORE_USE_XBYAK_JIT
-    if (auto converter = jit_convert_array::get<TI, TO, Clamp::enabled>()) {
-        jit_convert_array::args_t args = {arg, out, count};
-        converter(&args);
-    } else
-#endif
-    {
-        Converter<TI, TO>::template apply<Clamp>(arg, out, count);
+    if (util::may_i_use_dynamic_code()) {  // try the JIT converter only when dynamic code is permitted
+        if (auto converter = jit_convert_array::get<TI, TO, Clamp::enabled>()) {
+            jit_convert_array::args_t args = {arg, out, count};
+            converter(&args);
+            return;  // the JIT converter handled the request; skip the fallback below
+        }
     }
+#endif  // OV_CORE_USE_XBYAK_JIT
+    Converter<TI, TO>::template apply<Clamp>(arg, out, count);  // fallback: JIT compiled out, dynamic code not permitted, or get() returned no converter
 }
 }  // namespace
 
@@ -544,11 +546,13 @@ void convert_from_bf16_to_f16_with_clamp(const bfloat16* arg, float16* out, size
 
 size_t count_out_of_f16_range(const float* arg, size_t count) {
 #ifdef OV_CORE_USE_XBYAK_JIT
-    if (auto converter = jit_count_out_of_range::get<float, float16>()) {
-        size_t num_out_of_range = 0;
-        jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
-        converter(&args);
-        return num_out_of_range;
+    if (util::may_i_use_dynamic_code()) {
+        if (auto converter = jit_count_out_of_range::get<float, float16>()) {
+            size_t num_out_of_range = 0;
+            jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
+            converter(&args);
+            return num_out_of_range;
+        }
     }
 #endif  // OV_CORE_USE_XBYAK_JIT
     const auto is_out_of_f16_range = [](const float v) {

diff --git a/src/core/src/runtime/compute_hash.cpp b/src/core/src/runtime/compute_hash.cpp
@@ -21,6 +21,7 @@
 #ifdef OV_CORE_USE_XBYAK_JIT
 #    include "openvino/core/parallel.hpp"
 #    include "openvino/reference/utils/registers_pool.hpp"
+#    include "openvino/util/common_util.hpp"
 #endif  // OV_CORE_USE_XBYAK_JIT
 
 namespace ov {
@@ -822,77 +823,79 @@ void ComputeHash<isa>::fold_to_64(const Vmm& v_dst) {
 
 size_t compute_hash(const void* src, size_t size) {
 #ifdef OV_CORE_USE_XBYAK_JIT
-    if (Generator::mayiuse(avx2)) {
-        uint64_t result = 0lu;
-
-        // Parallel section
-        constexpr uint64_t min_wa_per_thread = 131072lu;  // 2^17
-        const uint64_t size_u64 = static_cast<uint64_t>(size);
-        if (size_u64 >= min_wa_per_thread * 2lu) {
-            static auto first_thr_kernel = Generator::mayiuse(avx512_core)
-                                               ? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD})
-                                               : jit::ComputeHash<avx2>::create({jit::FIRST_THREAD});
-            static auto n_thr_kernel = Generator::mayiuse(avx512_core)
-                                           ? jit::ComputeHash<avx512_core>::create({jit::N_THREAD})
-                                           : jit::ComputeHash<avx2>::create({jit::N_THREAD});
-            static auto final_fold_kernel = Generator::mayiuse(avx512_core)
-                                                ? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD})
-                                                : jit::ComputeHash<avx2>::create({jit::FINAL_FOLD});
-
-            static const uint64_t max_thr_num = 2lu;
-            uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num);
-            const uint64_t el_per_thread =
-                first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen());
-            std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen());
-
-            parallel_nt_static(static_cast<int>(thr_num), [&](const int ithr, const int nthr) {
-                uint64_t start = el_per_thread * ithr;
-                if (start >= size_u64) {
-                    return;
-                }
-                uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread;
+    if (util::may_i_use_dynamic_code()) {
+        if (Generator::mayiuse(avx2)) {
+            uint64_t result = 0lu;
+
+            // Parallel section
+            constexpr uint64_t min_wa_per_thread = 131072lu;  // 2^17
+            const uint64_t size_u64 = static_cast<uint64_t>(size);
+            if (size_u64 >= min_wa_per_thread * 2lu) {
+                static auto first_thr_kernel = Generator::mayiuse(avx512_core)
+                                                   ? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD})
+                                                   : jit::ComputeHash<avx2>::create({jit::FIRST_THREAD});
+                static auto n_thr_kernel = Generator::mayiuse(avx512_core)
+                                               ? jit::ComputeHash<avx512_core>::create({jit::N_THREAD})
+                                               : jit::ComputeHash<avx2>::create({jit::N_THREAD});
+                static auto final_fold_kernel = Generator::mayiuse(avx512_core)
+                                                    ? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD})
+                                                    : jit::ComputeHash<avx2>::create({jit::FINAL_FOLD});
+
+                static const uint64_t max_thr_num = 2lu;
+                uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num);
+                const uint64_t el_per_thread =
+                    first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen());
+                std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen());
+
+                parallel_nt_static(static_cast<int>(thr_num), [&](const int ithr, const int nthr) {
+                    uint64_t start = el_per_thread * ithr;
+                    if (start >= size_u64) {
+                        return;
+                    }
+                    uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread;
+
+                    jit::ComputeHashCallArgs args;
+
+                    args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr;
+                    args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]);
+                    args.k_ptr = jit::K_PULL;
+                    args.work_amount = work_amount;
+                    args.size = size_u64;
+                    args.threads_num = thr_num;
+
+                    if (ithr == 0) {
+                        (*first_thr_kernel)(&args);
+                    } else {
+                        (*n_thr_kernel)(&args);
+                    }
+                });
 
                 jit::ComputeHashCallArgs args;
+                args.work_amount = size_u64 - el_per_thread * thr_num;
+                args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size_u64 - args.work_amount;
+                args.dst_ptr = &result;
+                args.k_ptr = jit::K_PULL;
+                args.size = size_u64;
+                args.intermediate_ptr = intermediate.data();
 
-                args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr;
-                args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]);
+                (*final_fold_kernel)(&args);
+            } else {
+                static auto single_thr_kernel = Generator::mayiuse(avx512_core)
+                                                    ? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD})
+                                                    : jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD});
+
+                jit::ComputeHashCallArgs args;
+                args.src_ptr = src;
+                args.dst_ptr = &result;
                 args.k_ptr = jit::K_PULL;
-                args.work_amount = work_amount;
+                args.work_amount = size_u64;
                 args.size = size_u64;
-                args.threads_num = thr_num;
-
-                if (ithr == 0) {
-                    (*first_thr_kernel)(&args);
-                } else {
-                    (*n_thr_kernel)(&args);
-                }
-            });
-
-            jit::ComputeHashCallArgs args;
-            args.work_amount = size_u64 - el_per_thread * thr_num;
-            args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size_u64 - args.work_amount;
-            args.dst_ptr = &result;
-            args.k_ptr = jit::K_PULL;
-            args.size = size_u64;
-            args.intermediate_ptr = intermediate.data();
-
-            (*final_fold_kernel)(&args);
-        } else {
-            static auto single_thr_kernel = Generator::mayiuse(avx512_core)
-                                                ? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD})
-                                                : jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD});
-
-            jit::ComputeHashCallArgs args;
-            args.src_ptr = src;
-            args.dst_ptr = &result;
-            args.k_ptr = jit::K_PULL;
-            args.work_amount = size_u64;
-            args.size = size_u64;
-
-            (*single_thr_kernel)(&args);
-        }
 
-        return result;
+                (*single_thr_kernel)(&args);
+            }
+
+            return result;
+        }
     }
 
 #endif  // OV_CORE_USE_XBYAK_JIT

diff --git a/src/core/src/type/float8_e4m3.cpp b/src/core/src/type/float8_e4m3.cpp
@@ -8,8 +8,8 @@
 #include <cmath>
 #include <limits>
 
-#include "openvino/core/type/float_util.hpp"
 #include "openvino/core/type/float16.hpp"
+#include "openvino/core/type/float_util.hpp"
 
 namespace ov {
 

diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1460,6 +1460,11 @@ void primitive_inst::do_runtime_skip_gather() {
                 GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because idx_data [" << i << "] (" << idx_data[i] << ") != " << i << std::endl;
                 if (_impl_params->output_layouts[0].data_padding.is_dynamic())
                     _impl_params->output_layouts[0].data_padding = padding();
+                // For runtime-skippable nodes: if the previous iteration was skipped while this one is not, the output memory needs to be revalidated,
+                // because memory optimization/release may have been applied to these nodes in previous iterations to reduce the memory footprint.
+                if (can_be_optimized()) {
+                    set_flag(ExecutionFlags::SHAPE_CHANGED);
+                }
                 set_can_be_optimized(false);
                 return;
             }