From cd06f1fd842606a39d5d3e846e788c89c248fbf8 Mon Sep 17 00:00:00 2001
From: Laurent El Shafey
Date: Tue, 30 Apr 2024 10:32:01 -0700
Subject: [PATCH] Revert "[NVIDIA ] Remove references to deprecated XLA flags."

---
 README.md                                                    | 4 ++--
 paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh     | 5 +++--
 paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh      | 5 +++--
 .../contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh  | 5 +++--
 paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh          | 5 +++--
 paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh         | 5 +++--
 6 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 3c709c2a0..0d76e1119 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ The workflow to use the Profile Guided Latency Estimator workflow in XLA/GPU is:
 You could do so by setting:
 
 ```bash
-export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true"
+export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true"
 ```
 
 - 2. Collect and post process a profile by using JAX profiler, saving the extracted instruction latencies into a binary protobuf file.
@@ -172,7 +172,7 @@ After this step, you will get a `profile.pb` file under the `rundir` printed in the command output.
 You need to pass the `profile.pb` file to the `--xla_gpu_pgle_profile_file_or_directory_path` flag.
 
 ```bash
- export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_pgle_profile_file_or_directory_path=/path/to/profile/profile.pb"
+ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_pgle_profile_file_or_directory_path=/path/to/profile/profile.pb"
 ```
 
 To enable logging in the XLA and check if the profile is good, set the logging level to include `INFO`:
diff --git a/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh b/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
index 512773960..0ab901588 100644
--- a/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
@@ -28,9 +28,10 @@ export VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
diff --git a/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh b/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
index 4eef7d5ed..055c1670b 100755
--- a/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
@@ -27,9 +27,10 @@ LOG_DIR=$6
 export VOCAB_PATH=$VOCAB_PATH
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
diff --git a/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh b/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
index 77182cc7d..f97660f2a 100755
--- a/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
@@ -27,9 +27,10 @@ CONFIG=${7:-LLaMA7B}
 export VOCAB_PATH=$VOCAB_PATH
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
 ## LLaMA currently incompatible with TE
diff --git a/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh b/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
index 536173dd2..48cf130b9 100644
--- a/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
@@ -27,9 +27,10 @@ LOG_DIR=${6:-"test_logdir"}
 export VOCAB_PATH=$VOCAB_PATH
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
diff --git a/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh b/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
index 4538dd33a..df8a91e54 100644
--- a/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
@@ -27,9 +27,10 @@ LOG_DIR=${6:-"test_logdir"}
 export VOCAB_PATH=$VOCAB_PATH
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
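The shell-script hunks above all follow the same pattern: the async collective flags are appended back into the default BASE_XLA_FLAGS string, which is then prepended to any user-supplied XLA_FLAGS. A minimal sketch of that pattern is below; the flag list is abridged for illustration, and each script's actual default string is exactly the one shown in its hunk.

```bash
# Sketch of the BASE_XLA_FLAGS / XLA_FLAGS pattern restored by this revert.
# Flag list abridged; per-script defaults are as shown in the hunks above.
BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true
    --xla_gpu_enable_async_all_gather=true
    --xla_gpu_enable_async_reduce_scatter=true
    --xla_gpu_enable_async_all_reduce=true"}

# Any XLA_FLAGS already set in the environment are appended after the defaults.
export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
```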