google · laurentes · Apr 30, 2024 · Apr 30, 2024
diff --git a/README.md b/README.md
@@ -128,7 +128,7 @@ The workflow to use the Profile Guided Latency Estimator workflow in XLA/GPU is:
 You could do so by setting:
 
 ```bash
-export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true"
+export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true"
 ```
 
 - 2. Collect and post process a profile by using JAX profiler, saving the extracted instruction latencies into a binary protobuf file.
@@ -172,7 +172,7 @@ After this step, you will get a `profile.pb` file under the `rundir` printed in
 You need to pass the `profile.pb` file to the `--xla_gpu_pgle_profile_file_or_directory_path` flag.
 
 ```bash
- export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_pgle_profile_file_or_directory_path=/path/to/profile/profile.pb"
+ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_pgle_profile_file_or_directory_path=/path/to/profile/profile.pb"
 ```
 
 To enable logging in the XLA and check if the profile is good, set the logging level to include `INFO`:

diff --git a/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh b/paxml/contrib/gpu/scripts_gpu/benchmark_gpt_multinode.sh
@@ -28,9 +28,10 @@ export VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model
 
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-                       --xla_gpu_enable_highest_priority_async_stream=true
+                       --xla_gpu_enable_async_all_gather=true
+                       --xla_gpu_enable_async_reduce_scatter=true  --xla_gpu_enable_highest_priority_async_stream=true
                        --xla_gpu_enable_triton_softmax_fusion=false  --xla_gpu_all_reduce_combine_threshold_bytes=51200
-                       --xla_gpu_graph_level=0"}
+                       --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
 

diff --git a/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh b/paxml/contrib/gpu/scripts_gpu/run_lambada_singlenode.sh
@@ -27,9 +27,10 @@ LOG_DIR=$6
 
 export VOCAB_PATH=$VOCAB_PATH
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-                       --xla_gpu_enable_highest_priority_async_stream=true
+                       --xla_gpu_enable_async_all_gather=true
+                       --xla_gpu_enable_async_reduce_scatter=true  --xla_gpu_enable_highest_priority_async_stream=true
                        --xla_gpu_enable_triton_softmax_fusion=false  --xla_gpu_all_reduce_combine_threshold_bytes=51200
-                       --xla_gpu_graph_level=0"}
+                       --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
 

diff --git a/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh b/paxml/contrib/gpu/scripts_gpu/run_llama_boolq_multiprocess.sh
@@ -27,9 +27,10 @@ CONFIG=${7:-LLaMA7B}
 export VOCAB_PATH=$VOCAB_PATH
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-                       --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_highest_priority_async_stream=true
+                       --xla_gpu_simplify_all_fp_conversions --xla_gpu_enable_async_all_gather=true
+                       --xla_gpu_enable_async_reduce_scatter=true  --xla_gpu_enable_highest_priority_async_stream=true
                        --xla_gpu_enable_triton_softmax_fusion=false  --xla_gpu_all_reduce_combine_threshold_bytes=51200
-                       --xla_gpu_graph_level=0"}
+                       --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
 ## LLaMA currently incompatible with TE

diff --git a/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh b/paxml/contrib/gpu/scripts_gpu/run_pile_multinode.sh
@@ -27,9 +27,10 @@ LOG_DIR=${6:-"test_logdir"}
 export VOCAB_PATH=$VOCAB_PATH
 export XLA_PYTHON_CLIENT_MEM_FRACTION=${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.85}
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-                       --xla_gpu_enable_highest_priority_async_stream=true
+                       --xla_gpu_enable_async_all_gather=true
+                       --xla_gpu_enable_async_reduce_scatter=true  --xla_gpu_enable_highest_priority_async_stream=true
                        --xla_gpu_enable_triton_softmax_fusion=false  --xla_gpu_all_reduce_combine_threshold_bytes=51200
-                       --xla_gpu_graph_level=0"}
+                       --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
 
 

diff --git a/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh b/paxml/contrib/gpu/scripts_gpu/run_pile_singlenode.sh
@@ -27,9 +27,10 @@ LOG_DIR=${6:-"test_logdir"}
 export VOCAB_PATH=$VOCAB_PATH
 
 BASE_XLA_FLAGS=${BASE_XLA_FLAGS:-"--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false
-                       --xla_gpu_enable_highest_priority_async_stream=true
+                       --xla_gpu_enable_async_all_gather=true
+                       --xla_gpu_enable_async_reduce_scatter=true  --xla_gpu_enable_highest_priority_async_stream=true
                        --xla_gpu_enable_triton_softmax_fusion=false  --xla_gpu_all_reduce_combine_threshold_bytes=51200
-                       --xla_gpu_graph_level=0"}
+                       --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true"}
 export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"