From 15f5a4858b3073e6dc96c9258c0145776f420917 Mon Sep 17 00:00:00 2001
From: Xu Han
Date: Fri, 30 Aug 2024 17:51:46 +0000
Subject: [PATCH] [inductor] enable Intel Compiler(icx-cl) for inductor windows (#134772)

This PR enables the Intel Compiler (`icx-cl`) for Windows inductor, like the previous PR https://github.com/pytorch/pytorch/pull/134444, which enabled clang.

Changes:
1. Fix the icx-cl crash caused by wrong decode args; the right encoding is "utf-8".
2. Add an Intel compiler check, and an Intel compiler Windows driver check (icx-cl).
3. Add Intel compiler OpenMP args config.
4. Add Intel compiler OpenMP binary preload.

For the Intel compiler OpenMP binary path:
image

For performance, the Intel compiler (`icx-cl`) performs much better than MSVC (`cl`):
image

Appended `clang-cl` performance data:
image

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134772
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 torch/_inductor/cpp_builder.py | 74 +++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py
index f78bc354cc90cc..693f9b06418285 100644
--- a/torch/_inductor/cpp_builder.py
+++ b/torch/_inductor/cpp_builder.py
@@ -58,7 +58,7 @@ def use_global_cache() -> bool:
 _IS_MACOS = sys.platform.startswith("darwin")
 _IS_WINDOWS = sys.platform == "win32"
 
-SUBPROCESS_DECODE_ARGS = ("oem",) if _IS_WINDOWS else ()
+SUBPROCESS_DECODE_ARGS = ("utf-8",) if _IS_WINDOWS else ()
 
 log = logging.getLogger(__name__)
 
@@ -198,6 +198,36 @@ def _is_msvc_cl(cpp_compiler: str) -> bool:
     return False
 
 
+@functools.lru_cache(None)
+def _is_intel_compiler(cpp_compiler: str) -> bool:
+    """Return True if `cpp_compiler --version` reports an Intel compiler.
+
+    Returns False when the compiler cannot be found or `--version` fails.
+    Raises RuntimeError on Windows for `icx`/`icx-cc`; only `icx-cl` is accepted.
+    """
+    try:
+        output_msg = (
+            subprocess.check_output(
+                [cpp_compiler, "--version"], stderr=subprocess.DEVNULL
+            )
+            .strip()
+            .decode(*SUBPROCESS_DECODE_ARGS)
+        )
+    except (FileNotFoundError, subprocess.SubprocessError):
+        # Compiler not found, or `--version` is not supported.
+        return False
+
+    # Empty output must not raise IndexError on the first-line lookup.
+    version_lines = output_msg.splitlines()
+    is_intel_compiler = bool(version_lines) and "Intel" in version_lines[0]
+    if is_intel_compiler and _IS_WINDOWS:
+        if re.search(r"((icx$)|(icx-cc$))", cpp_compiler):
+            raise RuntimeError(
+                "Please use icx-cl, due to torch.compile only support MSVC-like CLI (compiler flags syntax)."
+            )
+    return is_intel_compiler
+
+
 @functools.lru_cache(None)
 def is_gcc() -> bool:
     return _is_gcc(get_cpp_compiler())
@@ -208,6 +238,12 @@ def is_clang() -> bool:
     return _is_clang(get_cpp_compiler())
 
 
+@functools.lru_cache(None)
+def is_intel_compiler() -> bool:
+    """Return True if the compiler from `get_cpp_compiler()` is an Intel compiler."""
+    return _is_intel_compiler(get_cpp_compiler())
+
+
 @functools.lru_cache(None)
 def is_apple_clang() -> bool:
     return _is_apple_clang(get_cpp_compiler())
@@ -798,6 +834,26 @@ def perload_clang_libomp_win(cpp_compiler: str, omp_name: str) -> None:
         pass
 
 
+@functools.lru_cache(None)
+def perload_icx_libomp_win(cpp_compiler: str) -> None:
+    """Best-effort preload of the Intel OpenMP runtime (`libiomp5md.dll`).
+
+    Does nothing if the compiler query fails or the reported path is not a file.
+    """
+    try:
+        output = subprocess.check_output(
+            [cpp_compiler, "-print-file-name=libiomp5md.dll"], stderr=subprocess.DEVNULL
+        ).decode(*SUBPROCESS_DECODE_ARGS)
+    except (FileNotFoundError, subprocess.SubprocessError):
+        return
+
+    omp_path = output.rstrip()
+    if os.path.isfile(omp_path):
+        # Let the process continue if another OpenMP runtime is already loaded.
+        os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+        cdll.LoadLibrary(omp_path)
+
+
 def _get_openmp_args(
     cpp_compiler: str,
 ) -> Tuple[List[str], List[str], List[str], List[str], List[str], List[str]]:
@@ -854,10 +910,26 @@ def _get_openmp_args(
         # if openmp is still not available, we let the compiler to have a try,
         # and raise error together with instructions at compilation error later
     elif _IS_WINDOWS:
+        # On Windows, `clang` and `icx` each ship their own OpenMP implementation,
+        # and the OpenMP library lives in a sub-directory of the compiler.
+        # DLLs are loaded through the native `LoadLibraryA`/`LoadLibraryExA` APIs,
+        # whose dependency search follows these rules:
+        # https://learn.microsoft.com/en-us/windows/win32/api/libloaderapi/nf-libloaderapi-loadlibraryexa#searching-for-dlls-and-dependencies
+        # In some cases the rules do not cover the compiler's sub-directories, so
+        # the compiler's OpenMP library cannot be found and loaded, which breaks
+        # the whole application.
+        #
+        # To avoid that load failure, locate the OpenMP binary and preload it:
+        # 1. For clang, the function is `perload_clang_libomp_win`.
+        # 2. For icx, the function is `perload_icx_libomp_win`.
         if _is_clang(cpp_compiler):
             cflags.append("openmp")
             libs.append("libomp")
             perload_clang_libomp_win(cpp_compiler, "libomp.dll")
+        elif _is_intel_compiler(cpp_compiler):
+            cflags.append("Qiopenmp")
+            libs.append("libiomp5md")
+            perload_icx_libomp_win(cpp_compiler)
         else:
             # /openmp, /openmp:llvm
             # llvm on Windows, new openmp: https://devblogs.microsoft.com/cppblog/msvc-openmp-update/