diff --git a/datadreamer/pipelines/generate_dataset_from_scratch.py b/datadreamer/pipelines/generate_dataset_from_scratch.py
index d675b9c..19da5b9 100644
--- a/datadreamer/pipelines/generate_dataset_from_scratch.py
+++ b/datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -21,7 +21,11 @@
     TinyLlamaLMPromptGenerator,
 )
 
-prompt_generators = {"simple": SimplePromptGenerator, "lm": LMPromptGenerator, 'tiny': TinyLlamaLMPromptGenerator}
+prompt_generators = {
+    "simple": SimplePromptGenerator,
+    "lm": LMPromptGenerator,
+    "tiny": TinyLlamaLMPromptGenerator,
+}
 
 image_generators = {
     "sdxl": StableDiffusionImageGenerator,
diff --git a/datadreamer/prompt_generation/__init__.py b/datadreamer/prompt_generation/__init__.py
index b71c16f..8c1422e 100644
--- a/datadreamer/prompt_generation/__init__.py
+++ b/datadreamer/prompt_generation/__init__.py
@@ -3,4 +3,9 @@
 from .synonym_generator import SynonymGenerator
 from .tinyllama_lm_prompt_generator import TinyLlamaLMPromptGenerator
 
-__all__ = ["SimplePromptGenerator", "LMPromptGenerator", "SynonymGenerator", "TinyLlamaLMPromptGenerator"]
+__all__ = [
+    "SimplePromptGenerator",
+    "LMPromptGenerator",
+    "SynonymGenerator",
+    "TinyLlamaLMPromptGenerator",
+]
diff --git a/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py b/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
index 9a1faff..8a6554b 100644
--- a/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
+++ b/datadreamer/prompt_generation/tinyllama_lm_prompt_generator.py
@@ -1,9 +1,7 @@
-import random
 import re
 from typing import List, Optional
 
 import torch
-from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from datadreamer.prompt_generation.lm_prompt_generator import LMPromptGenerator
@@ -48,7 +46,8 @@ def _init_lang_model(self):
             model = AutoModelForCausalLM.from_pretrained(
                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                 torch_dtype="auto",
-                low_cpu_mem_usage=True
+                device_map="cpu",
+                low_cpu_mem_usage=True,
             )
         else:
             print("Loading language model on GPU...")
@@ -56,17 +55,29 @@
                 "TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16
             )
 
-        tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            "TinyLlama/TinyLlama-1.1B-Chat-v1.0", trust_remote_code=True
+        )
         print("Done!")
         return model.to(self.device), tokenizer
 
     def remove_incomplete_sentence(self, text):
         # Define the regex pattern to capture up to the last sentence-ending punctuation
-        pattern = r'^(.*[.!?])'
-        # pattern = r'^(.*[.!?;:])'
+        pattern = r"^(.*[.!?])"
         match = re.search(pattern, text)
         return match.group(0) if match else text
 
+    def remove_caption_sentences(self, text):
+        # Pattern to find sentences that start with "Caption reads: "
+        # \s* matches any whitespace characters at the beginning of the string (including none)
+        # re.IGNORECASE makes the search case-insensitive
+        # [^\.!?]* matches any sequence of characters that are not a period, exclamation mark, or question mark
+        # [\.\!?] matches a period, exclamation mark, or question mark, indicating the end of a sentence
+        pattern = re.compile(r"\s*Caption reads: [^\.!?]*[\.\!?]", re.IGNORECASE)
+        # Replace the matched sentences with an empty string
+        cleaned_text = re.sub(pattern, "", text)
+        return cleaned_text
+
     def _create_lm_prompt_text(self, selected_objects: List[str]) -> str:
         """Creates a language model text prompt based on selected objects.
 
@@ -77,7 +88,6 @@ def _create_lm_prompt_text(self, selected_objects: List[str]) -> str:
             str: A text prompt for the language model.
         """
         return f"<|system|>\nYou are a chatbot who describes content of images!\n<|user|>\nGenerate a short and concise caption for an image. Follow this template: 'A photo of {', '.join(selected_objects)}', where the objects interact in a meaningful way within a scene, complete with a short scene description. The caption must be short in length and start with the words: 'A photo of '! Do not use the phrase 'Caption reads'.\n<|assistant|>\n"
-
 
     def generate_prompt(self, prompt_text: str) -> str:
         """Generates a single prompt using the language model.
@@ -110,7 +120,9 @@ def generate_prompt(self, prompt_text: str) -> str:
             .replace("'", "")
         )
 
-        return self.remove_incomplete_sentence(decoded_prompt)
+        return self.remove_caption_sentences(
+            self.remove_incomplete_sentence(decoded_prompt)
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
index 56f10b3..fcd64b9 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/integration/test_pipeline.py
@@ -356,6 +356,97 @@ def test_cuda_lm_sdxl_detection_pipeline():
     _check_detection_pipeline(cmd, target_folder)
 
 
+# =========================================================
+# DETECTION - TinyLlama LLM
+# =========================================================
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_tiny_sdxl_turbo_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cpu-tiny-sdxl-turbo/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl-turbo "
+        f"--use_image_tester "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_tiny_sdxl_turbo_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cuda-tiny-sdxl-turbo/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl-turbo "
+        f"--use_image_tester "
+        f"--device cuda"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_tiny_sdxl_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cpu-tiny-sdxl/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl "
+        f"--use_image_tester "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_tiny_sdxl_detection_pipeline():
+    # Define target folder
+    target_folder = "data/data-det-cuda-tiny-sdxl/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl "
+        f"--use_image_tester "
+        f"--device cuda"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
 # =========================================================
 # CLASSIFICATION - SIMPLE LM
 # =========================================================
@@ -544,3 +635,98 @@ def test_cuda_lm_sdxl_classification_pipeline():
     )
     # Check the run of the pipeline
     _check_detection_pipeline(cmd, target_folder)
+
+
+# =========================================================
+# CLASSIFICATION - TinyLlama LLM
+# =========================================================
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_tiny_sdxl_turbo_classification_pipeline():
+    # Define target folder
+    target_folder = "data/data-cls-cpu-tiny-sdxl-turbo/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --task classification "
+        f"--save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl-turbo "
+        f"--use_image_tester "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_tiny_sdxl_turbo_classification_pipeline():
+    # Define target folder
+    target_folder = "data/data-cls-cuda-tiny-sdxl-turbo/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --task classification "
+        f"--save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl-turbo "
+        f"--use_image_tester "
+        f"--device cuda"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    total_memory < 16 or total_disk_space < 35,
+    reason="Test requires at least 16GB of RAM and 35GB of HDD",
+)
+def test_cpu_tiny_sdxl_classification_pipeline():
+    # Define target folder
+    target_folder = "data/data-cls-cpu-tiny-sdxl/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --task classification "
+        f"--save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
+        f"--prompt_generator tiny "
+        f"--num_objects_range 1 2 "
+        f"--image_generator sdxl "
+        f"--use_image_tester "
+        f"--device cpu"
+    )
+    # Check the run of the pipeline
+    _check_detection_pipeline(cmd, target_folder)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or total_memory < 16 or total_disk_space < 35,
+    reason="Test requires GPU, at least 16GB of RAM and 35GB of HDD",
+)
+def test_cuda_tiny_sdxl_classification_pipeline():
+    # Define target folder
+    target_folder = "data/data-cls-cuda-tiny-sdxl/"
+    # Define the command to run the datadreamer
+    cmd = (
+        f"datadreamer --task classification "
+        f"--save_dir {target_folder} "
+        f"--class_names alien mars cat "
+        f"--prompts_number 1 "
" + f"--prompt_generator tiny " + f"--num_objects_range 1 2 " + f"--image_generator sdxl " + f"--use_image_tester " + f"--device cuda" + ) + # Check the run of the pipeline + _check_detection_pipeline(cmd, target_folder) diff --git a/tests/unittests/test_prompt_generation.py b/tests/unittests/test_prompt_generation.py index 9e3423c..92e921a 100644 --- a/tests/unittests/test_prompt_generation.py +++ b/tests/unittests/test_prompt_generation.py @@ -5,6 +5,9 @@ from datadreamer.prompt_generation.lm_prompt_generator import LMPromptGenerator from datadreamer.prompt_generation.simple_prompt_generator import SimplePromptGenerator from datadreamer.prompt_generation.synonym_generator import SynonymGenerator +from datadreamer.prompt_generation.tinyllama_lm_prompt_generator import ( + TinyLlamaLMPromptGenerator, +) # Get the total memory in GB total_memory = psutil.virtual_memory().total / (1024**3) @@ -32,9 +35,9 @@ def test_simple_prompt_generator(): assert prompt_text == f"A photo of a {', a '.join(selected_objects)}" -def _check_lm_prompt_generator(device: str): +def _check_lm_prompt_generator(device: str, prompt_generator_class=LMPromptGenerator): object_names = ["aeroplane", "bicycle", "bird", "boat"] - prompt_generator = LMPromptGenerator( + prompt_generator = prompt_generator_class( class_names=object_names, prompts_number=2, device=device ) prompts = prompt_generator.generate_prompts() @@ -73,6 +76,22 @@ def test_cpu_lm_prompt_generator(): _check_lm_prompt_generator("cpu") +@pytest.mark.skipif( + total_memory < 8 or not torch.cuda.is_available() or total_disk_space < 12, + reason="Test requires at least 8GB of RAM, 12GB of HDD and CUDA support", +) +def test_cuda_tinyllama_lm_prompt_generator(): + _check_lm_prompt_generator("cuda", TinyLlamaLMPromptGenerator) + + +@pytest.mark.skipif( + total_memory < 12 or total_disk_space < 12, + reason="Test requires at least 12GB of RAM and 12GB of HDD for running on CPU", +) +def test_cpu_tinyllama_lm_prompt_generator(): + _check_lm_prompt_generator("cpu", TinyLlamaLMPromptGenerator) + + def _check_synonym_generator(device: str): synonyms_num = 3 generator = SynonymGenerator(synonyms_number=synonyms_num, device=device)