From e544fe0425266aee198f255a326e32248acf7b41 Mon Sep 17 00:00:00 2001
From: Lukas Kuhn
Date: Mon, 13 Nov 2023 20:09:39 +0100
Subject: [PATCH 1/6] feat: added tracemalloc arg to train_dreambooth

---
 examples/lora_dreambooth/train_dreambooth.py | 40 ++++++++++++--------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py
index 3e350b0313..eefcffbf5e 100644
--- a/examples/lora_dreambooth/train_dreambooth.py
+++ b/examples/lora_dreambooth/train_dreambooth.py
@@ -9,6 +9,7 @@
 import warnings
 from pathlib import Path
 from typing import Optional
+from contextlib import nullcontext
 
 import datasets
 import diffusers
@@ -217,6 +218,13 @@ def parse_args(input_args=None):
         "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader."
     )
 
+    parser.add_argument(
+        "--trace_memory_allocation",
+        default=True,
+        action="store_true",
+        help="Flag to track memory allocation during training.",
+    )
+
     parser.add_argument(
         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
     )
@@ -897,7 +905,7 @@ def main(args):
         unet.train()
         if args.train_text_encoder:
             text_encoder.train()
-        with TorchTracemalloc() as tracemalloc:
+        with TorchTracemalloc() if args.trace_memory_allocation else nullcontext() as tracemalloc:
             for step, batch in enumerate(train_dataloader):
                 # Skip steps until we reach the resumed step
                 if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
@@ -1038,23 +1046,25 @@ def main(args):
             if global_step >= args.max_train_steps:
                 break
         # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
-        accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin)))
-        accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used))
-        accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked))
-        accelerator.print(
-            "GPU Total Peak Memory consumed during the train (max): {}".format(
-                tracemalloc.peaked + b2mb(tracemalloc.begin)
+
+        if args.trace_memory_allocation:
+            accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin)))
+            accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used))
+            accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked))
+            accelerator.print(
+                "GPU Total Peak Memory consumed during the train (max): {}".format(
+                    tracemalloc.peaked + b2mb(tracemalloc.begin)
+                )
             )
-        )
-        accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin)))
-        accelerator.print("CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used))
-        accelerator.print("CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked))
-        accelerator.print(
-            "CPU Total Peak Memory consumed during the train (max): {}".format(
-                tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)
+            accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin)))
+            accelerator.print("CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used))
+            accelerator.print("CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked))
+            accelerator.print(
+                "CPU Total Peak Memory consumed during the train (max): {}".format(
+                    tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)
+                )
             )
-        )
 
     # Create the pipeline using using the trained modules and save it.
     accelerator.wait_for_everyone()
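Note on the pattern introduced in PATCH 1/6: the expression "with A() if cond else nullcontext() as x" picks the context manager at runtime. When the flag is off, contextlib.nullcontext() is entered instead; it binds None to the "as" target, which is why the memory-report block above is guarded by the same flag. A minimal, self-contained sketch of the control flow (the Tracker class here is a hypothetical stand-in, not the script's TorchTracemalloc helper):

    from contextlib import nullcontext

    class Tracker:
        # Hypothetical stand-in for TorchTracemalloc, only to show the flow.
        def __enter__(self):
            print("tracking on")   # a real tracker would snapshot memory here
            return self

        def __exit__(self, *exc):
            print("tracking off")  # ... and compute the deltas here

    trace_memory_allocation = False
    with Tracker() if trace_memory_allocation else nullcontext() as tracemalloc:
        # tracemalloc is a Tracker when tracing, None otherwise, so any
        # tracemalloc.* access after the loop must stay behind the flag.
        pass  # training loop would run here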
From abd01deb8f9596fc40c5ed8a482cac6537cb0215 Mon Sep 17 00:00:00 2001
From: Lukas Kuhn
Date: Mon, 13 Nov 2023 20:13:21 +0100
Subject: [PATCH 2/6] fix: added help for arg

---
 examples/lora_dreambooth/train_dreambooth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py
index eefcffbf5e..be092225d8 100644
--- a/examples/lora_dreambooth/train_dreambooth.py
+++ b/examples/lora_dreambooth/train_dreambooth.py
@@ -222,7 +222,7 @@ def parse_args(input_args=None):
         "--trace_memory_allocation",
         default=True,
         action="store_true",
-        help="Flag to track memory allocation during training.",
+        help="Flag to track memory allocation during training. This could slow down training on Windows.",
     )
 
     parser.add_argument(

From 3087dfa1cfcdddeac71b86410ecde4069e6a5aa1 Mon Sep 17 00:00:00 2001
From: Lukas Kuhn
Date: Mon, 13 Nov 2023 20:25:06 +0100
Subject: [PATCH 3/6] fix: changed arg name

---
 examples/lora_dreambooth/train_dreambooth.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py
index be092225d8..0819661d13 100644
--- a/examples/lora_dreambooth/train_dreambooth.py
+++ b/examples/lora_dreambooth/train_dreambooth.py
@@ -219,10 +219,10 @@ def parse_args(input_args=None):
     )
 
     parser.add_argument(
-        "--trace_memory_allocation",
-        default=True,
+        "--no_tracemalloc",
+        default=False,
         action="store_true",
-        help="Flag to track memory allocation during training. This could slow down training on Windows.",
+        help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.",
     )
 
     parser.add_argument(
@@ -905,7 +905,7 @@ def main(args):
         unet.train()
         if args.train_text_encoder:
             text_encoder.train()
-        with TorchTracemalloc() if args.trace_memory_allocation else nullcontext() as tracemalloc:
+        with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc:
             for step, batch in enumerate(train_dataloader):
                 # Skip steps until we reach the resumed step
                 if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
@@ -1047,7 +1047,7 @@ def main(args):
                 break
 
         # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
-        if args.trace_memory_allocation:
+        if not args.no_tracemalloc:
             accelerator.print("GPU Memory before entering the train : {}".format(b2mb(tracemalloc.begin)))
             accelerator.print("GPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.used))
             accelerator.print("GPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.peaked))
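Note on PATCH 3/6: the rename is not cosmetic. An option declared with action="store_true" together with default=True can never evaluate to False — passing the flag stores True, and omitting it keeps the True default — so the tracing added in PATCH 1/6 and 2/6 could not actually be switched off. Inverting the option into a "--no_*" flag restores both states. A short standalone sketch of the two behaviors, assuming nothing beyond argparse:

    import argparse

    parser = argparse.ArgumentParser()

    # Broken variant from PATCH 1/6: both parses below would yield True.
    # parser.add_argument("--trace_memory_allocation", default=True, action="store_true")

    # Fixed variant from PATCH 3/6: the flag's presence toggles the value.
    parser.add_argument("--no_tracemalloc", default=False, action="store_true")

    assert parser.parse_args([]).no_tracemalloc is False                   # tracing on
    assert parser.parse_args(["--no_tracemalloc"]).no_tracemalloc is True  # tracing off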
From 439410030be66f40bd1c1adbc088efdf225d8d96 Mon Sep 17 00:00:00 2001
From: Lukas Kuhn
Date: Tue, 14 Nov 2023 09:12:28 +0100
Subject: [PATCH 4/6] fix formatting

---
 examples/lora_dreambooth/train_dreambooth.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py
index 0819661d13..ede2cb9448 100644
--- a/examples/lora_dreambooth/train_dreambooth.py
+++ b/examples/lora_dreambooth/train_dreambooth.py
@@ -1058,8 +1058,12 @@ def main(args):
             )
 
             accelerator.print("CPU Memory before entering the train : {}".format(b2mb(tracemalloc.cpu_begin)))
-            accelerator.print("CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used))
-            accelerator.print("CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked))
+            accelerator.print(
+                "CPU Memory consumed at the end of the train (end-begin): {}".format(tracemalloc.cpu_used)
+            )
+            accelerator.print(
+                "CPU Peak Memory consumed during the train (max-begin): {}".format(tracemalloc.cpu_peaked)
+            )
             accelerator.print(
                 "CPU Total Peak Memory consumed during the train (max): {}".format(
                     tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)

From 3abfc90466b879939f9438b04c2c6e6be4895c41 Mon Sep 17 00:00:00 2001
From: Lukas Kuhn
Date: Tue, 14 Nov 2023 11:28:13 +0100
Subject: [PATCH 5/6] fix: import order

---
 examples/lora_dreambooth/train_dreambooth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lora_dreambooth/train_dreambooth.py b/examples/lora_dreambooth/train_dreambooth.py
index ede2cb9448..73d827a4c3 100644
--- a/examples/lora_dreambooth/train_dreambooth.py
+++ b/examples/lora_dreambooth/train_dreambooth.py
@@ -7,9 +7,9 @@
 import os
 import threading
 import warnings
+from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
-from contextlib import nullcontext
 
 import datasets
 import diffusers
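Note on PATCH 5/6: the move restores the sorted standard-library block that the repository's style checks appear to enforce (isort-style grouping is an assumption here): plain "import" statements first, then "from ... import ..." statements alphabetized by module, with third-party imports in a separate block. The block as it reads after the patch:

    import os
    import threading
    import warnings
    from contextlib import nullcontext
    from pathlib import Path
    from typing import Optional

    import datasets
    import diffusers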
From 00564f4f3d08b4b08113a42cb306ad1ee418a43c Mon Sep 17 00:00:00 2001
From: Lukas Kuhn
Date: Tue, 21 Nov 2023 22:17:35 +0100
Subject: [PATCH 6/6] fix: dataset was loaded twice in ft script

---
 examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py b/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py
index f0fc29d8e1..018cc53b05 100755
--- a/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py
+++ b/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py
@@ -131,11 +131,6 @@ def print_trainable_parameters(model):
 
 """### Training"""
 
-
-data = load_dataset("Abirate/english_quotes")
-data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
-
-
 data = load_dataset("Abirate/english_quotes")
 data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
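Note on PATCH 6/6: the fp4 fine-tuning script built the tokenized dataset twice back to back; the first load_dataset/map pair was immediately overwritten by the identical second one, so the removed lines only repeated the Hub download (on a cold cache) and the tokenization pass. A standalone sketch of the surviving logic (the OPT tokenizer name below is an assumption for illustration, not taken from the script):

    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")  # assumed model id

    # Load once, tokenize once; rebinding "data" with a second identical
    # load/map pair, as the removed lines did, would discard this result
    # and redo the same work.
    data = load_dataset("Abirate/english_quotes")
    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)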