From cd06f1fd842606a39d5d3e846e788c89c248fbf8 Mon Sep 17 00:00:00 2001
From: Laurent El Shafey
Date: Tue, 30 Apr 2024 10:32:01 -0700
Subject: [PATCH] Revert "[NVIDIA ] Remove references to deprecated XLA flags."

---
 README.md                                                    | 4 ++--
 paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh     | 5 +++--
 paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh      | 5 +++--
 .../contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh  | 5 +++--
 paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh          | 5 +++--
 paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh         | 5 +++--
 6 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 3c709c2a0..0d76e1119 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ The workflow to use the Profile Guided Latency Estimator workflow in XLA/GPU is:
 You could do so by setting:
 
 ```bash
-export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true"
+export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true"
 ```
 
 - 2. Collect and post process a profile by using JAX profiler, saving the extracted instruction latencies into a binary protobuf file.
@@ -172,7 +172,7 @@ After this step, you will get a `profile.pb` file under the `rundir` printed in the command output.
 You need to pass the `profile.pb` file to the `--xla_gpu_pgle_profile_file_or_directory_path` flag.
 
 ```bash
- export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_pgle_profile_file_or_directory_path=/path/to/profile/profile.pb"
+ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_pgle_profile_file_or_directory_path=/path/to/profile/profile.pb"
 ```
 
 To enable logging in the XLA and check if the profile is good, set the logging level to include `INFO`:
diff --git a/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh b/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
index 512773960..0ab901588 100644
--- a/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
@@ -28,9 +28,10 @@ export VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
diff --git a/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh b/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
index 4eef7d5ed..055c1670b 100755
--- a/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
@@ -27,9 +27,10 @@ LOG_DIR=$6
 export VOCAB_PATH=$VOCAB_PATH
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
diff --git a/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh b/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
index 77182cc7d..f97660f2a 100755
--- a/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
@@ -27,9 +27,10 @@ CONFIG=${7:-LLaMA7B}
 export VOCAB_PATH=$VOCAB_PATH
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
 ## LLaMA currently incompatible with TE
diff --git a/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh b/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
index 536173dd2..48cf130b9 100644
--- a/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
@@ -27,9 +27,10 @@ LOG_DIR=${6:-"test_logdir"}
 export VOCAB_PATH=$VOCAB_PATH
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
diff --git a/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh b/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
index 4538dd33a..df8a91e54 100644
--- a/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
+++ b/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
@@ -27,9 +27,10 @@ LOG_DIR=${6:-"test_logdir"}
 export VOCAB_PATH=$VOCAB_PATH
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-    --xla_gpu_enable_highest_priority_async_stream=true
+    --xla_gpu_enable_async_all_gather=true
+    --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_highest_priority_async_stream=true
     --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_all_reduce_combine_threshold_bytes=51200
-    --xla_gpu_graph_level=0"}
+    --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
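The shell-script hunks above all follow the same pattern: the async collective flags are appended back into the default BASE_XLA_FLAGS string, which is then prepended to any user-supplied XLA_FLAGS. A minimal sketch of that pattern is below; the flag list is abridged for illustration, and each script's actual default string is exactly the one shown in its hunk.

```bash
# Sketch of the BASE_XLA_FLAGS / XLA_FLAGS pattern restored by this revert.
# Flag list abridged; per-script defaults are as shown in the hunks above.
BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true
    --xla_gpu_enable_async_all_gather=true
    --xla_gpu_enable_async_reduce_scatter=true
    --xla_gpu_enable_async_all_reduce=true"}

# Any XLA_FLAGS already set in the environment are appended after the defaults.
export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
```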