Skip to content

Commit

Permalink
[inductor] enable Intel Compiler(icx-cl) for inductor windows (pytorc…
Browse files Browse the repository at this point in the history
…h#134772)

This PR enables the Intel Compiler (`icx-cl`) for Inductor on Windows, like the previous PR pytorch#134444, which enabled clang.

Changes:
1. Fix an icx-cl crash caused by the wrong decode arguments; the correct encoding is "utf-8".
2. Add an Intel compiler check, and a check for the Intel compiler's Windows driver (icx-cl).
3. Add the OpenMP argument configuration for the Intel compiler.
4. Add preloading of the Intel compiler's OpenMP binary.

Location of the Intel compiler's OpenMP binary:
<img width="788" alt="image" src="https://github.com/user-attachments/assets/54c76356-018d-4bef-a9b7-0ea150fd7aba">

Performance: the Intel compiler (`icx-cl`) performs much better than MSVC (`cl`):
<img width="875" alt="image" src="https://github.com/user-attachments/assets/67865faf-b1de-4535-917a-486b72527204">

Additional `clang-cl` performance data:
<img width="821" alt="image" src="https://github.com/user-attachments/assets/476f4568-bf58-457f-b73d-4e57f49be384">

Pull Request resolved: pytorch#134772
Approved by: https://github.com/jgong5, https://github.com/jansel
  • Loading branch information
xuhancn authored and pytorchmergebot committed Aug 30, 2024
1 parent 9e0ddc0 commit 15f5a48
Showing 1 changed file with 65 additions and 1 deletion.
66 changes: 65 additions & 1 deletion torch/_inductor/cpp_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def use_global_cache() -> bool:
# Host-platform flags, evaluated once at import time.
_IS_MACOS = sys.platform.startswith("darwin")
_IS_WINDOWS = sys.platform == "win32"

# Positional arguments forwarded to ``bytes.decode`` when reading the output of
# compiler subprocesses: "utf-8" on Windows, and no arguments elsewhere (so
# ``decode()`` falls back to its default codec).
SUBPROCESS_DECODE_ARGS = ("utf-8",) if _IS_WINDOWS else ()

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -198,6 +198,33 @@ def _is_msvc_cl(cpp_compiler: str) -> bool:
return False


@functools.lru_cache(None)
def _is_intel_compiler(cpp_compiler: str) -> bool:
    """Return True when ``cpp_compiler`` identifies itself as an Intel compiler.

    The compiler is run with ``--version`` and the first line of its standard
    output is searched for the substring "Intel". Returned values are memoized
    per compiler string by ``functools.lru_cache``.

    Args:
        cpp_compiler: Compiler command or path to probe.

    Returns:
        True if the first ``--version`` line contains "Intel". False if it
        does not, if the compiler printed nothing to stdout, if the executable
        cannot be found, or if the ``--version`` invocation fails.

    Raises:
        RuntimeError: On Windows, when an Intel compiler is detected but the
            command ends in ``icx`` or ``icx-cc`` instead of ``icx-cl``.
    """
    # Keep the try body limited to the call that can actually raise these.
    try:
        raw_output = subprocess.check_output(
            [cpp_compiler, "--version"], stderr=subprocess.DEVNULL
        )
    except FileNotFoundError:
        return False
    except subprocess.SubprocessError:
        # --version args not support.
        return False

    version_lines = raw_output.strip().decode(*SUBPROCESS_DECODE_ARGS).splitlines()
    # ``splitlines()`` yields an empty list when stdout was empty; indexing it
    # unconditionally would raise IndexError instead of answering "no".
    if not version_lines or "Intel" not in version_lines[0]:
        return False

    if _IS_WINDOWS and re.search(r"((icx$)|(icx-cc$))", cpp_compiler):
        raise RuntimeError(
            "Please use icx-cl, due to torch.compile only support MSVC-like CLI (compiler flags syntax)."
        )
    return True


@functools.lru_cache(None)
def is_gcc() -> bool:
    """Report whether the compiler returned by ``get_cpp_compiler()`` passes ``_is_gcc``.

    The function takes no arguments, so ``functools.lru_cache`` keeps the
    first returned value and reuses it on later calls.
    """
    active_compiler = get_cpp_compiler()
    return _is_gcc(active_compiler)
Expand All @@ -208,6 +235,11 @@ def is_clang() -> bool:
return _is_clang(get_cpp_compiler())


@functools.lru_cache(None)
def is_intel_compiler() -> bool:
    """Report whether the compiler returned by ``get_cpp_compiler()`` passes ``_is_intel_compiler``.

    The function takes no arguments, so ``functools.lru_cache`` keeps the
    first returned value and reuses it on later calls.
    """
    active_compiler = get_cpp_compiler()
    return _is_intel_compiler(active_compiler)


@functools.lru_cache(None)
def is_apple_clang() -> bool:
    """Report whether the compiler returned by ``get_cpp_compiler()`` passes ``_is_apple_clang``.

    The function takes no arguments, so ``functools.lru_cache`` keeps the
    first returned value and reuses it on later calls.
    """
    active_compiler = get_cpp_compiler()
    return _is_apple_clang(active_compiler)
Expand Down Expand Up @@ -798,6 +830,20 @@ def perload_clang_libomp_win(cpp_compiler: str, omp_name: str) -> None:
pass


@functools.lru_cache(None)
def perload_icx_libomp_win(cpp_compiler: str) -> None:
    """Best-effort preload of the OpenMP DLL ``libiomp5md.dll`` reported by ``cpp_compiler``.

    The compiler is queried with ``-print-file-name=libiomp5md.dll``. When the
    reported path is an existing file, ``KMP_DUPLICATE_LIB_OK`` is set to
    "TRUE" in the process environment and the DLL is loaded with
    ``cdll.LoadLibrary``. If the compiler executable cannot be found or the
    query fails, the function returns without loading anything.

    Args:
        cpp_compiler: Compiler command or path used to locate the DLL.
    """
    # Only the subprocess query is guarded; an error raised while loading the
    # DLL itself still propagates to the caller, as before.
    try:
        query_output = subprocess.check_output(
            [cpp_compiler, "-print-file-name=libiomp5md.dll"],
            stderr=subprocess.DEVNULL,
        ).decode(*SUBPROCESS_DECODE_ARGS)
    except (FileNotFoundError, subprocess.SubprocessError):
        # Nothing to preload when the compiler cannot be run or rejects the query.
        return

    omp_path = query_output.rstrip()
    if os.path.isfile(omp_path):
        # NOTE(review): presumably this lets the process tolerate a second
        # OpenMP runtime being loaded -- confirm against Intel OpenMP docs.
        os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
        cdll.LoadLibrary(omp_path)


def _get_openmp_args(
cpp_compiler: str,
) -> Tuple[List[str], List[str], List[str], List[str], List[str], List[str]]:
Expand Down Expand Up @@ -854,10 +900,28 @@ def _get_openmp_args(
# if openmp is still not available, we let the compiler to have a try,
# and raise error together with instructions at compilation error later
elif _IS_WINDOWS:
"""
On Windows, `clang` and `icx` each have their own OpenMP implementation,
and the OpenMP library is located in a sub-directory of the compiler.
Dynamic libraries (DLLs) are loaded through the Windows native APIs `LoadLibraryA` and `LoadLibraryExA`, whose
dependency search follows these rules:
https://learn.microsoft.com/en-us/windows/win32/api/libloaderapi/nf-libloaderapi-loadlibraryexa#searching-for-dlls-and-dependencies
In some cases, these rules do not include the compiler's sub-directories,
so the compiler's OpenMP library cannot be found and loaded correctly,
and the whole application would then be broken.
To avoid this OpenMP load failure, we automatically locate the OpenMP binary and preload it.
1. For clang, the function is `perload_clang_libomp_win`.
2. For icx, the function is `perload_icx_libomp_win`.
"""
if _is_clang(cpp_compiler):
cflags.append("openmp")
libs.append("libomp")
perload_clang_libomp_win(cpp_compiler, "libomp.dll")
elif _is_intel_compiler(cpp_compiler):
cflags.append("Qiopenmp")
libs.append("libiomp5md")
perload_icx_libomp_win(cpp_compiler)
else:
# /openmp, /openmp:llvm
# llvm on Windows, new openmp: https://devblogs.microsoft.com/cppblog/msvc-openmp-update/
Expand Down

0 comments on commit 15f5a48

Please sign in to comment.