flexflow · goliaro · Feb 12, 2025 · Feb 11, 2025 · Feb 12, 2025
diff --git a/conda/flexflow.yml b/conda/flexflow.yml
@@ -20,7 +20,7 @@ dependencies:
     - torchvision>=0.14.1
     - regex
     - onnx
-    - transformers>=4.31.0
+    - transformers>=4.47.1
     - sentencepiece
     - einops
     - requests

diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile
@@ -121,7 +121,7 @@ RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \
     fi
 RUN rm /usr/local/bin/install_pytorch.sh
 # Various dependencies
-RUN pip3 install transformers>=4.31.0 sentencepiece einops
+RUN pip3 install "transformers>=4.47.1" sentencepiece einops
 RUN pip3 install tensorflow notebook
 # PEFT-related
 RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft

diff --git a/requirements.txt b/requirements.txt
@@ -11,7 +11,7 @@ torch>=1.13.1
 torchaudio>=0.13.1
 torchvision>=0.14.1
 onnx
-transformers>=4.31.0
+transformers>=4.47.1
 sentencepiece
 einops
 pip

diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu
@@ -328,7 +328,7 @@ void peft_bwd_kernel(SoftmaxMeta const *m,
       token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id;
     }
 
-    DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1);
+    DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch);
     // ignore last token
     checkCUDA(cudaMemsetAsync(
         input_grad_ptr + (tokens_previous_requests +

diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py
@@ -14,9 +14,9 @@
 import argparse
 import transformers
 
-if transformers.__version__ < "4.31.0":
+if tuple(int(p) for p in transformers.__version__.split(".")[:3] if p.isdigit()) < (4, 47, 1):  # numeric, not lexicographic
     raise RuntimeError(
-        "Please update the transformers library version to 4.31.0 or above"
+        "Please update the transformers library version to 4.47.1 or above"
     )
 from datasets import load_dataset
 

diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py
@@ -746,8 +746,8 @@ def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4):
             ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name)
             ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE)
 
-            lora_low_rank_activation_fwd_path = f"/usr/.cache/flexflow/debug/flexflow/fwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation"
-            lora_low_rank_activation_bwd_path = f"/usr/.cache/flexflow/debug/flexflow/bwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation"
+            lora_low_rank_activation_fwd_path = f"fwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation"
+            lora_low_rank_activation_bwd_path = f"bwd/step_{step_idx}/shard_0/layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation"
             lora_low_rank_activation_fwd = load_ff_tensor(lora_low_rank_activation_fwd_path, [16, 128])[:,:self.num_tokens]
             lora_low_rank_activation_fwd = torch.from_numpy(lora_low_rank_activation_fwd)
             lora_low_rank_activation_bwd = load_ff_tensor(lora_low_rank_activation_bwd_path, [16, 24])