shallowdream204 · YHX021014 · Nov 12, 2024
diff --git a/test_1024.py b/test_1024.py
@@ -238,7 +238,7 @@ def parse_args():
         log_with=args.report_to,
         project_dir=os.path.join(config.work_dir, 'logs'),
         fsdp_plugin=fsdp_plugin,
-        even_batches=even_batches,
+        # even_batches=even_batches,
         kwargs_handlers=[init_handler]
     )
     logger = get_root_logger(os.path.join(config.work_dir, 'eval_dreamclear.log'))

diff --git a/tools/extract_t5_features.py b/tools/extract_t5_features.py
@@ -32,7 +32,8 @@ def extract_caption_t5_job(item):
             caption = [caption]
 
         # save_path = item.replace('.caption','')
-        save_path = os.path.join(args.save_npz_folder, os.path.basename(item).replace('.caption','.npz'))
+        # save_path = os.path.join(args.save_npz_folder, os.path.basename(item).replace('.caption','.npz'))
+        save_path = os.path.join(args.save_npz_folder, os.path.basename(item).replace('.txt','.npz'))
         if os.path.exists(f"{save_path}.npz"):
             return
         try:
@@ -51,7 +52,7 @@ def get_caption_files(folder_path):
     caption_files = []
     for root, dirs, files in os.walk(folder_path):
         for file in files:
-            if file.endswith('.caption'):
+            if file.endswith('.txt'):
                 caption_files.append(os.path.join(root, file))
     caption_files.sort()
     return caption_files

diff --git a/tools/llavaInfer.py b/tools/llavaInfer.py
@@ -0,0 +1,55 @@
+from PIL import Image
+import os
+from pathlib import Path
+import torch
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+import argparse
+
+def inferLLaVa_oneImage(img_path, prompt):
+    """Caption a single image with LLaVA, print the caption and return it.
+
+    Relies on the module-level ``processor`` and ``model`` objects that the
+    ``__main__`` section of this script creates.
+
+    Args:
+        img_path: Path of the image file to describe.
+        prompt: Full LLaVA prompt text. The part of the decoded output after
+            the last ``"ASSISTANT:"`` marker is taken as the answer.
+
+    Returns:
+        The generated description string (also printed to stdout).
+    """
+    # Image.open is lazy, so the processor has to run while the file is still
+    # open; the context manager then releases the file handle.
+    with Image.open(img_path) as image:
+        # Follow the device the model was placed on instead of assuming "cuda:0".
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
+
+    # Debug output: name and shape of every tensor handed to the model.
+    for k, v in inputs.items():
+        print(k, v.shape)
+
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=100)
+
+    description = processor.batch_decode(outputs, skip_special_tokens=True)
+    # The decoded text still contains the prompt; keep only the model's answer.
+    description = description[0].split("ASSISTANT:")[-1]
+    print(description)
+    return description
+
+
+def generate_descriptions_for_directory(image_dir, output_dir):
+    """Write a LLaVA caption ``<stem>.txt`` for every ``*.png`` in a directory.
+
+    Relies on the module-level ``processor`` and ``model`` objects that the
+    ``__main__`` section of this script creates.
+
+    Args:
+        image_dir: Directory searched (non-recursively) for ``*.png`` files.
+        output_dir: Directory that receives one ``.txt`` caption per image;
+            it is created if it does not exist.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+
+    # The prompt is identical for every image, so build it once.
+    prompt = "USER: <image>\nDescribe this image and its style in a very detailed manner.\nASSISTANT:"
+
+    # Sorted so the processing order (and the log) is reproducible across runs.
+    for image_file in sorted(Path(image_dir).glob("*.png")):
+        # Image.open is lazy, so preprocess while the file is open and let the
+        # context manager release the handle before moving to the next image.
+        with Image.open(image_file) as image:
+            # Follow the device the model was placed on instead of assuming "cuda:0".
+            inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
+
+        with torch.no_grad():
+            outputs = model.generate(**inputs, max_new_tokens=100)
+
+        description = processor.batch_decode(outputs, skip_special_tokens=True)
+        # The decoded text still contains the prompt; keep only the model's answer.
+        description = description[0].split("ASSISTANT:")[-1]
+
+        output_file = Path(output_dir) / f"{image_file.stem}.txt"
+        # Explicit UTF-8 so captions are written the same way on every platform.
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(description)
+        print(f"Generated description for {image_file.name} saved to {output_file}")
+
+
+if __name__ == "__main__":
+    # Command-line interface: where to read images and where to write captions.
+    cli = argparse.ArgumentParser(description="Generate image descriptions using Llava model.")
+    for flag, help_text in (
+        ('--images_dir', "Directory containing input images."),
+        ('--caption_dir', "Directory to save generated captions."),
+    ):
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+    args = cli.parse_args()
+
+    # processor and model are module-level globals read by the functions above.
+    checkpoint = "llava-hf/llava-1.5-7b-hf"
+    processor = AutoProcessor.from_pretrained(checkpoint)
+    model = LlavaForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")
+
+    generate_descriptions_for_directory(args.images_dir, args.caption_dir)