Skip to content

Commit

Permalink
Merge branch 'master' into cmake_refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
akladiev authored Feb 19, 2025
2 parents feaaa55 + 6d19e8b commit ccdd193
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 78 deletions.
8 changes: 8 additions & 0 deletions src/common/util/include/openvino/util/common_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,5 +180,13 @@ constexpr std::array<std::conditional_t<std::is_void_v<T>, std::common_type_t<Ar
return {std::forward<Args>(args)...};
}

/**
 * @brief Tells whether the current process may use dynamically generated code (e.g. JIT kernels).
 *
 * On Windows the function is only declared here and is defined out of line.
 * On every other platform it is a compile-time constant.
 *
 * @return Always true on non-Windows platforms; on Windows, the value computed by the out-of-line definition.
 */
#ifndef _WIN32
constexpr bool may_i_use_dynamic_code() {
    return true;
}
#else
bool may_i_use_dynamic_code();
#endif

} // namespace util
} // namespace ov
4 changes: 4 additions & 0 deletions src/common/util/include/openvino/util/file_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,11 @@ inline std::string from_file_path(const ov::util::Path& path) {

// TODO: remove this function after all calls use Path
/**
 * @brief Converts an ov::util::Path into the legacy FilePath representation.
 *
 * When building for Windows with OPENVINO_ENABLE_UNICODE_PATH_SUPPORT, the path is first
 * rendered as a narrow string and then widened with ov::util::string_to_wstring.
 * In every other configuration the path's native string is returned as is.
 *
 * @param path Path to convert.
 * @return The path expressed as FilePath.
 */
inline FilePath to_file_path(const ov::util::Path& path) {
#if !defined(_WIN32) || !defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT)
    return path.native();
#else
    const auto narrow_path = path.string();
    return ov::util::string_to_wstring(narrow_path);
#endif
}

#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
Expand Down
13 changes: 13 additions & 0 deletions src/common/util/src/common_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

#include <algorithm>

#if defined(_WIN32)
# include <windows.h>
#endif

std::string ov::util::to_lower(const std::string& s) {
std::string rc = s;
std::transform(rc.begin(), rc.end(), rc.begin(), ::tolower);
Expand Down Expand Up @@ -60,3 +64,12 @@ std::string ov::util::filter_lines_by_prefix(const std::string& str, const std::
}
return res.str();
}

#if defined(_WIN32)
bool ov::util::may_i_use_dynamic_code() {
HANDLE handle = GetCurrentProcess();
PROCESS_MITIGATION_DYNAMIC_CODE_POLICY dynamic_code_policy = {0};
GetProcessMitigationPolicy(handle, ProcessDynamicCodePolicy, &dynamic_code_policy, sizeof(dynamic_code_policy));
return dynamic_code_policy.ProhibitDynamicCode != TRUE;
}
#endif
28 changes: 16 additions & 12 deletions src/core/reference/src/op/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#ifdef OV_CORE_USE_XBYAK_JIT
# include "openvino/reference/utils/jit_generator.hpp"
# include "openvino/util/common_util.hpp"
#endif

#ifdef OV_CORE_USE_INTRINSICS
Expand Down Expand Up @@ -480,14 +481,15 @@ class jit_count_out_of_range : public jit::Generator {
template <class Clamp, typename TI, typename TO>
void convert_impl(const TI* arg, TO* out, size_t count) {
#ifdef OV_CORE_USE_XBYAK_JIT
if (auto converter = jit_convert_array::get<TI, TO, Clamp::enabled>()) {
jit_convert_array::args_t args = {arg, out, count};
converter(&args);
} else
#endif
{
Converter<TI, TO>::template apply<Clamp>(arg, out, count);
if (util::may_i_use_dynamic_code()) {
if (auto converter = jit_convert_array::get<TI, TO, Clamp::enabled>()) {
jit_convert_array::args_t args = {arg, out, count};
converter(&args);
return;
}
}
#endif // OV_CORE_USE_XBYAK_JIT
Converter<TI, TO>::template apply<Clamp>(arg, out, count);
}
} // namespace

Expand Down Expand Up @@ -544,11 +546,13 @@ void convert_from_bf16_to_f16_with_clamp(const bfloat16* arg, float16* out, size

size_t count_out_of_f16_range(const float* arg, size_t count) {
#ifdef OV_CORE_USE_XBYAK_JIT
if (auto converter = jit_count_out_of_range::get<float, float16>()) {
size_t num_out_of_range = 0;
jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
converter(&args);
return num_out_of_range;
if (util::may_i_use_dynamic_code()) {
if (auto converter = jit_count_out_of_range::get<float, float16>()) {
size_t num_out_of_range = 0;
jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
converter(&args);
return num_out_of_range;
}
}
#endif // OV_CORE_USE_XBYAK_JIT
const auto is_out_of_f16_range = [](const float v) {
Expand Down
133 changes: 68 additions & 65 deletions src/core/src/runtime/compute_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#ifdef OV_CORE_USE_XBYAK_JIT
# include "openvino/core/parallel.hpp"
# include "openvino/reference/utils/registers_pool.hpp"
# include "openvino/util/common_util.hpp"
#endif // OV_CORE_USE_XBYAK_JIT

namespace ov {
Expand Down Expand Up @@ -822,77 +823,79 @@ void ComputeHash<isa>::fold_to_64(const Vmm& v_dst) {

size_t compute_hash(const void* src, size_t size) {
#ifdef OV_CORE_USE_XBYAK_JIT
if (Generator::mayiuse(avx2)) {
uint64_t result = 0lu;

// Parallel section
constexpr uint64_t min_wa_per_thread = 131072lu; // 2^17
const uint64_t size_u64 = static_cast<uint64_t>(size);
if (size_u64 >= min_wa_per_thread * 2lu) {
static auto first_thr_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD})
: jit::ComputeHash<avx2>::create({jit::FIRST_THREAD});
static auto n_thr_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::N_THREAD})
: jit::ComputeHash<avx2>::create({jit::N_THREAD});
static auto final_fold_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD})
: jit::ComputeHash<avx2>::create({jit::FINAL_FOLD});

static const uint64_t max_thr_num = 2lu;
uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num);
const uint64_t el_per_thread =
first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen());
std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen());

parallel_nt_static(static_cast<int>(thr_num), [&](const int ithr, const int nthr) {
uint64_t start = el_per_thread * ithr;
if (start >= size_u64) {
return;
}
uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread;
if (util::may_i_use_dynamic_code()) {
if (Generator::mayiuse(avx2)) {
uint64_t result = 0lu;

// Parallel section
constexpr uint64_t min_wa_per_thread = 131072lu; // 2^17
const uint64_t size_u64 = static_cast<uint64_t>(size);
if (size_u64 >= min_wa_per_thread * 2lu) {
static auto first_thr_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::FIRST_THREAD})
: jit::ComputeHash<avx2>::create({jit::FIRST_THREAD});
static auto n_thr_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::N_THREAD})
: jit::ComputeHash<avx2>::create({jit::N_THREAD});
static auto final_fold_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::FINAL_FOLD})
: jit::ComputeHash<avx2>::create({jit::FINAL_FOLD});

static const uint64_t max_thr_num = 2lu;
uint64_t thr_num = std::min(size_u64 / min_wa_per_thread, max_thr_num);
const uint64_t el_per_thread =
first_thr_kernel->get_vlen() * ((size_u64 / thr_num) / first_thr_kernel->get_vlen());
std::vector<uint8_t> intermediate(thr_num * first_thr_kernel->get_vlen());

parallel_nt_static(static_cast<int>(thr_num), [&](const int ithr, const int nthr) {
uint64_t start = el_per_thread * ithr;
if (start >= size_u64) {
return;
}
uint64_t work_amount = (el_per_thread + start > size_u64) ? size_u64 - start : el_per_thread;

jit::ComputeHashCallArgs args;

args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr;
args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]);
args.k_ptr = jit::K_PULL;
args.work_amount = work_amount;
args.size = size_u64;
args.threads_num = thr_num;

if (ithr == 0) {
(*first_thr_kernel)(&args);
} else {
(*n_thr_kernel)(&args);
}
});

jit::ComputeHashCallArgs args;
args.work_amount = size_u64 - el_per_thread * thr_num;
args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size_u64 - args.work_amount;
args.dst_ptr = &result;
args.k_ptr = jit::K_PULL;
args.size = size_u64;
args.intermediate_ptr = intermediate.data();

args.src_ptr = reinterpret_cast<const uint8_t*>(src) + first_thr_kernel->get_vlen() * ithr;
args.dst_ptr = &(intermediate[first_thr_kernel->get_vlen() * ithr]);
(*final_fold_kernel)(&args);
} else {
static auto single_thr_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD})
: jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD});

jit::ComputeHashCallArgs args;
args.src_ptr = src;
args.dst_ptr = &result;
args.k_ptr = jit::K_PULL;
args.work_amount = work_amount;
args.work_amount = size_u64;
args.size = size_u64;
args.threads_num = thr_num;

if (ithr == 0) {
(*first_thr_kernel)(&args);
} else {
(*n_thr_kernel)(&args);
}
});

jit::ComputeHashCallArgs args;
args.work_amount = size_u64 - el_per_thread * thr_num;
args.src_ptr = reinterpret_cast<const uint8_t*>(src) + size_u64 - args.work_amount;
args.dst_ptr = &result;
args.k_ptr = jit::K_PULL;
args.size = size_u64;
args.intermediate_ptr = intermediate.data();

(*final_fold_kernel)(&args);
} else {
static auto single_thr_kernel = Generator::mayiuse(avx512_core)
? jit::ComputeHash<avx512_core>::create({jit::SINGLE_THREAD})
: jit::ComputeHash<avx2>::create({jit::SINGLE_THREAD});

jit::ComputeHashCallArgs args;
args.src_ptr = src;
args.dst_ptr = &result;
args.k_ptr = jit::K_PULL;
args.work_amount = size_u64;
args.size = size_u64;

(*single_thr_kernel)(&args);
}

return result;
(*single_thr_kernel)(&args);
}

return result;
}
}

#endif // OV_CORE_USE_XBYAK_JIT
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/type/float8_e4m3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
#include <cmath>
#include <limits>

#include "openvino/core/type/float_util.hpp"
#include "openvino/core/type/float16.hpp"
#include "openvino/core/type/float_util.hpp"

namespace ov {

Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1460,6 +1460,11 @@ void primitive_inst::do_runtime_skip_gather() {
GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because idx_data [" << i << "] (" << idx_data[i] << ") != " << i << std::endl;
if (_impl_params->output_layouts[0].data_padding.is_dynamic())
_impl_params->output_layouts[0].data_padding = padding();
// For runtime-skippable nodes: if the previous iteration was skipped but this one is not, the output memory needs to be revalidated,
// because memory optimization/release may have been applied to these nodes in previous iterations to reduce the memory footprint.
if (can_be_optimized()) {
set_flag(ExecutionFlags::SHAPE_CHANGED);
}
set_can_be_optimized(false);
return;
}
Expand Down

0 comments on commit ccdd193

Please sign in to comment.