diff --git a/README.md b/README.md index fffba0e..9ae75af 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The goal of this project is to make Stable Diffusion more accessible, simple and **Installation:** ``` -pip install stablepy==0.4.1 +pip install stablepy==0.5.0 ``` **Usage:** @@ -80,6 +80,16 @@ images[1] ``` **📖 News:** +🔥 Version 0.5.0: New Update Details + +- Fix LoRA SDXL compatibility. +- Latent upscaler and variants. +- Perturbed Attention Guidance (PAG) enhances image generation quality without the need for training. +- Multiple images for one FaceID adapter. +- ControlNet for SDXL: MLSD, Segmentation, Normalbae. +- ControlNet "lineart_anime" task accessible and able to load a model different from the "lineart" task. +- ControlNet Tile and Recolor for SD1.5 and SDXL ("tile" replaces the previous task called "sdxl_tile_realistic"). + 🔥 Version 0.4.0: New Update Details - IP Adapter with the variants FaceID and Instant-Style diff --git a/poetry.lock b/poetry.lock index 5766a57..5cec11d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -960,13 +960,13 @@ files = [ [[package]] name = "diffusers" -version = "0.29.0" +version = "0.30.2" description = "State-of-the-art diffusion in PyTorch and JAX." optional = false python-versions = ">=3.8.0" files = [ - {file = "diffusers-0.29.0-py3-none-any.whl", hash = "sha256:4c194d2379644a0f7ef9b4ff12c8cf5de4c6324e811265754f08e2f839b8cedb"}, - {file = "diffusers-0.29.0.tar.gz", hash = "sha256:0212030a8fabe7a07d1c8e925ccdf7a529a98b0425fa078900a679b2451a8ac2"}, + {file = "diffusers-0.30.2-py3-none-any.whl", hash = "sha256:739826043147c2b59560944591dfdea5d24cd4fb15e751abbe20679a289bece8"}, + {file = "diffusers-0.30.2.tar.gz", hash = "sha256:641875f78f36bdfa4b9af752b124d1fd6d431eadd5547fe0a3f354ae0af2636c"}, ] [package.dependencies] @@ -980,13 +980,13 @@ requests = "*" safetensors = ">=0.3.1" [package.extras] -dev = ["GitPython (<3.1.19)", "Jinja2", "accelerate (>=0.29.3)", "compel (==0.1.8)", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.1.5)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "torch (>=1.4)", "torchvision", "transformers (>=4.25.1)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.1.5)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<=2.0.0)"] -test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece 
(>=0.1.91,!=0.1.92)", "torchvision", "transformers (>=4.25.1)"] -torch = ["accelerate (>=0.29.3)", "torch (>=1.4)"] -training = ["Jinja2", "accelerate (>=0.29.3)", "datasets", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "tensorboard"] +test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (>=0.0.12)", "librosa", "parameterized", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "torchvision", "transformers (>=4.41.2)"] +torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] +training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.6.0)", "protobuf (>=3.20.3,<4)", "tensorboard"] [[package]] name = "easydict" @@ -2605,9 +2605,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -2628,9 +2628,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -2670,9 +2670,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, ] [[package]] @@ -2791,8 +2791,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4961,4 +4961,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "efe3f385cdce67877df7018b7eeb79e52a4af03253e41be9c3dbe06f1124a7fc" +content-hash = "229667b0f529f65314c6f98a9b8cadee9bb40d03b2545f5ddec9b839eb803e94" diff --git a/pyproject.toml b/pyproject.toml index e3411c9..a518ea7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "stablepy" -version = "0.4.1" +version = "0.5.0" description = "A tool for easy use of stable diffusion" authors = ["Roger Condori(R3gm) "] readme = "README.md" @@ -11,7 +11,7 @@ torch = {version = "*", source = "pytorch-gpu-src"} torchvision = {version = "*", source = "pytorch-gpu-src"} torchaudio = {version = "*", source = "pytorch-gpu-src"} omegaconf = "2.3.0" -diffusers = "0.29.0" +diffusers = "0.30.2" compel = "2.0.2" invisible-watermark = "^0.2.0" transformers = "^4.41.2" @@ -28,6 +28,7 @@ peft = "^0.11.1" 
torchsde = "^0.2.6" onnxruntime = "^1.18.0" insightface = "^0.7.3" +opencv-contrib-python = "^4.8.0.76" [[tool.poetry.source]] name = "pytorch-gpu-src" diff --git a/stablepy/__init__.py b/stablepy/__init__.py index 108ff01..92093a4 100644 --- a/stablepy/__init__.py +++ b/stablepy/__init__.py @@ -4,6 +4,7 @@ from .diffusers_vanilla import utils from .upscalers.esrgan import UpscalerESRGAN, UpscalerLanczos, UpscalerNearest from .logging.logging_setup import logger +from .diffusers_vanilla.high_resolution import LATENT_UPSCALERS from .diffusers_vanilla.constants import ( CONTROLNET_MODEL_IDS, VALID_TASKS, diff --git a/stablepy/__version__.py b/stablepy/__version__.py index 3d26edf..3d18726 100644 --- a/stablepy/__version__.py +++ b/stablepy/__version__.py @@ -1 +1 @@ -__version__ = "0.4.1" +__version__ = "0.5.0" diff --git a/stablepy/diffusers_vanilla/constants.py b/stablepy/diffusers_vanilla/constants.py index 1f2788d..ed306cc 100644 --- a/stablepy/diffusers_vanilla/constants.py +++ b/stablepy/diffusers_vanilla/constants.py @@ -18,18 +18,68 @@ EDMEulerScheduler, TCDScheduler, ) +from diffusers import ( + StableDiffusionControlNetPipeline, + StableDiffusionControlNetInpaintPipeline, + StableDiffusionPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLAdapterPipeline, + StableDiffusionXLPipeline, + StableDiffusionXLControlNetPipeline, + StableDiffusionPAGPipeline, + # StableDiffusionControlNetPAGInpaintPipeline, + StableDiffusionControlNetPAGPipeline, + # StableDiffusionControlNetImg2ImgPAGPipeline, + StableDiffusionXLPAGPipeline, + StableDiffusionXLPAGInpaintPipeline, + StableDiffusionXLControlNetPAGPipeline, + # StableDiffusionXLAdapterPAGPipeline, + # StableDiffusionXLControlNetImg2ImgPAGPipeline, +) + +CLASS_DIFFUSERS_TASK = { + "StableDiffusionPipeline": { + "base": StableDiffusionPipeline, + "inpaint": StableDiffusionControlNetInpaintPipeline, + "controlnet": StableDiffusionControlNetPipeline, + # "controlnet_img2img": StableDiffusionControlNetImg2ImgPipeline, + }, + "StableDiffusionXLPipeline": { + "base": StableDiffusionXLPipeline, + "inpaint": StableDiffusionXLInpaintPipeline, + "controlnet": StableDiffusionXLControlNetPipeline, + "adapter": StableDiffusionXLAdapterPipeline, + # "controlnet_img2img": StableDiffusionXLControlNetImg2ImgPipeline, + }, +} + +CLASS_PAG_DIFFUSERS_TASK = { + "StableDiffusionPipeline": { + "base": StableDiffusionPAGPipeline, + "inpaint": StableDiffusionControlNetInpaintPipeline, + "controlnet": StableDiffusionControlNetPAGPipeline, + # "controlnet_img2img": StableDiffusionControlNetImg2ImgPAGPipeline, + }, + "StableDiffusionXLPipeline": { + "base": StableDiffusionXLPAGPipeline, + "inpaint": StableDiffusionXLPAGInpaintPipeline, + "controlnet": StableDiffusionXLControlNetPAGPipeline, + # "adapter": StableDiffusionXLAdapterPAGPipeline, + # "controlnet_img2img": StableDiffusionXLControlNetImg2ImgPAGPipeline, + }, +} CONTROLNET_MODEL_IDS = { "openpose": ["lllyasviel/control_v11p_sd15_openpose", "r3gm/controlnet-openpose-sdxl-1.0-fp16"], "canny": ["lllyasviel/control_v11p_sd15_canny", "r3gm/controlnet-canny-scribble-integrated-sdxl-v2-fp16"], - "mlsd": "lllyasviel/control_v11p_sd15_mlsd", + "mlsd": ["lllyasviel/control_v11p_sd15_mlsd", "r3gm/controlnet-union-sdxl-1.0-fp16"], "scribble": ["lllyasviel/control_v11p_sd15_scribble", "r3gm/controlnet-canny-scribble-integrated-sdxl-v2-fp16"], - "softedge": ["lllyasviel/control_v11p_sd15_softedge", "r3gm/controlnet-canny-scribble-integrated-sdxl-v2-fp16"], - "segmentation": "lllyasviel/control_v11p_sd15_seg", 
- "depth": ["lllyasviel/control_v11f1p_sd15_depth", "diffusers/controlnet-depth-sdxl-1.0-mid"], - "normalbae": "lllyasviel/control_v11p_sd15_normalbae", - "lineart": ["lllyasviel/control_v11p_sd15_lineart", "r3gm/controlnet-lineart-anime-sdxl-fp16"], - "lineart_anime": "lllyasviel/control_v11p_sd15s2_lineart_anime", + "softedge": ["lllyasviel/control_v11p_sd15_softedge", "r3gm/controlnet-union-sdxl-1.0-fp16"], + "segmentation": ["lllyasviel/control_v11p_sd15_seg", "r3gm/controlnet-union-sdxl-1.0-fp16"], + "depth": ["lllyasviel/control_v11f1p_sd15_depth", "r3gm/controlnet-union-sdxl-1.0-fp16"], + "normalbae": ["lllyasviel/control_v11p_sd15_normalbae", "r3gm/controlnet-union-sdxl-1.0-fp16"], + "lineart": ["lllyasviel/control_v11p_sd15_lineart", "r3gm/controlnet-union-sdxl-1.0-fp16"], + "lineart_anime": ["lllyasviel/control_v11p_sd15s2_lineart_anime", "r3gm/controlnet-lineart-anime-sdxl-fp16"], "shuffle": "lllyasviel/control_v11e_sd15_shuffle", "ip2p": "lllyasviel/control_v11e_sd15_ip2p", "inpaint": "lllyasviel/control_v11p_sd15_inpaint", @@ -41,7 +91,8 @@ "sdxl_openpose_t2i": "TencentARC/t2i-adapter-openpose-sdxl-1.0", "img2img": "Nothinghere", "pattern": ["monster-labs/control_v1p_sd15_qrcode_monster", "r3gm/control_v1p_sdxl_qrcode_monster_fp16"], - "sdxl_tile_realistic": "Yakonrus/SDXL_Controlnet_Tile_Realistic_v2", + "tile": ["lllyasviel/control_v11f1e_sd15_tile", "r3gm/controlnet-tile-sdxl-1.0-fp16"], # "sdxl_tile_realistic": "Yakonrus/SDXL_Controlnet_Tile_Realistic_v2", + "recolor": ["latentcat/control_v1p_sd15_brightness", "r3gm/controlnet-recolor-sdxl-fp16"], # "sdxl_depth-zoe_t2i": "TencentARC/t2i-adapter-depth-zoe-sdxl-1.0", # "sdxl_recolor_t2i": "TencentARC/t2i-adapter-recolor-sdxl-1.0", } @@ -89,16 +140,17 @@ OLD_PROMPT_WEIGHT_OPTIONS = ALL_PROMPT_WEIGHT_OPTIONS[0:2] SCHEDULER_CONFIG_MAP = { - "DPM++ 2M": (DPMSolverMultistepScheduler, {"use_karras_sigmas": False}), - "DPM++ 2M Karras": (DPMSolverMultistepScheduler, {"use_karras_sigmas": True}), + "DPM++ 2M": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "use_karras_sigmas": False}), + "DPM++ 2M Karras": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "use_karras_sigmas": True}), "DPM++ 2M SDE": (DPMSolverMultistepScheduler, {"use_karras_sigmas": False, "algorithm_type": "sde-dpmsolver++"}), "DPM++ 2M SDE Karras": (DPMSolverMultistepScheduler, {"use_karras_sigmas": True, "algorithm_type": "sde-dpmsolver++"}), - "DPM++ 2S": (DPMSolverSinglestepScheduler, {"use_karras_sigmas": False}), - "DPM++ 2S Karras": (DPMSolverSinglestepScheduler, {"use_karras_sigmas": True}), - "DPM++ 1S": (DPMSolverMultistepScheduler, {"solver_order": 1}), - "DPM++ 1S Karras": (DPMSolverMultistepScheduler, {"solver_order": 1, "use_karras_sigmas": True}), - "DPM++ 3M": (DPMSolverMultistepScheduler, {"solver_order": 3}), - "DPM++ 3M Karras": (DPMSolverMultistepScheduler, {"solver_order": 3, "use_karras_sigmas": True}), + "DPM++ 2S": (DPMSolverSinglestepScheduler, {"algorithm_type": "dpmsolver++", "use_karras_sigmas": False}), + "DPM++ 2S Karras": (DPMSolverSinglestepScheduler, {"algorithm_type": "dpmsolver++", "use_karras_sigmas": True}), + "DPM++ 1S": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "solver_order": 1}), + "DPM++ 1S Karras": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "solver_order": 1, "use_karras_sigmas": True}), + "DPM++ 3M": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "solver_order": 3}), + "DPM++ 3M Karras": (DPMSolverMultistepScheduler, 
{"algorithm_type": "dpmsolver++", "solver_order": 3, "use_karras_sigmas": True}), + "DPM 3M": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver", "final_sigmas_type": "sigma_min", "solver_order": 3}), "DPM++ SDE": (DPMSolverSDEScheduler, {"use_karras_sigmas": False}), "DPM++ SDE Karras": (DPMSolverSDEScheduler, {"use_karras_sigmas": True}), "DPM2": (KDPM2DiscreteScheduler, {}), @@ -125,8 +177,8 @@ "DPM++ 2M EDM Karras": (EDMDPMSolverMultistepScheduler, {"use_karras_sigmas": True, "solver_order": 2, "solver_type": "midpoint", "final_sigmas_type": "zero", "algorithm_type": "dpmsolver++"}), "DDPM": (DDPMScheduler, {}), - "DPM++ 2M Lu": (DPMSolverMultistepScheduler, {"use_lu_lambdas": True}), - "DPM++ 2M Ef": (DPMSolverMultistepScheduler, {"euler_at_final": True}), + "DPM++ 2M Lu": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "use_lu_lambdas": True}), + "DPM++ 2M Ef": (DPMSolverMultistepScheduler, {"algorithm_type": "dpmsolver++", "euler_at_final": True}), "DPM++ 2M SDE Lu": (DPMSolverMultistepScheduler, {"use_lu_lambdas": True, "algorithm_type": "sde-dpmsolver++"}), "DPM++ 2M SDE Ef": (DPMSolverMultistepScheduler, {"algorithm_type": "sde-dpmsolver++", "euler_at_final": True}), @@ -153,9 +205,9 @@ "base_light_v2": ["h94/IP-Adapter", "models", "ip-adapter_sd15_light_v11.bin", "H"], "faceid_plus": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-plus_sd15.bin", "H"], "faceid_plus_v2": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-plusv2_sd15.bin", "H"], - "faceid": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid_sd15.bin", None], - "faceid_portrait_v2": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait-v11_sd15.bin", None], # last portrait - "faceid_portrait": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait_sd15.bin", None], + "faceid": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid_sd15.bin", "H"], # None + "faceid_portrait_v2": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait-v11_sd15.bin", "H"], # None + "faceid_portrait": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait_sd15.bin", "H"], # None "composition_plus": ["ostris/ip-composition-adapter", "", "ip_plus_composition_sd15.safetensors", "H"] }, "StableDiffusionXLPipeline": { @@ -165,9 +217,9 @@ "base_vit_G": ["h94/IP-Adapter", "sdxl_models", "ip-adapter_sdxl.safetensors", "G"], "base": ["h94/IP-Adapter", "sdxl_models", "ip-adapter_sdxl_vit-h.safetensors", "H"], "faceid_plus_v2": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-plusv2_sdxl.bin", "H"], - "faceid": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid_sdxl.bin", None], - "faceid_portrait": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait_sdxl.bin", None], - "faceid_portrait_v2": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait_sdxl_unnorm.bin", None], + "faceid": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid_sdxl.bin", "H"], # None + "faceid_portrait": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait_sdxl.bin", "H"], # None + "faceid_portrait_v2": ["h94/IP-Adapter-FaceID", "", "ip-adapter-faceid-portrait_sdxl_unnorm.bin", "H"], # None "composition_plus": ["ostris/ip-composition-adapter", "", "ip_plus_composition_sdxl.safetensors", "H"] } } # no suffix lora diff --git a/stablepy/diffusers_vanilla/extra_model_loaders.py b/stablepy/diffusers_vanilla/extra_model_loaders.py index 2a32e85..bb72d4e 100644 --- a/stablepy/diffusers_vanilla/extra_model_loaders.py +++ b/stablepy/diffusers_vanilla/extra_model_loaders.py @@ -64,7 +64,7 @@ def custom_task_model_loader( elif model_category in 
["hires", "detailfix_img2img"]: # Pipe hires detailfix_pipe img2img - if task_name != "txt2img": + if task_name != "txt2img" or hasattr(pipe, "set_pag_applied_layers"): if not hasattr(pipe, "text_encoder_2"): hires_pipe = StableDiffusionPipeline( vae=pipe.vae, @@ -91,9 +91,9 @@ def custom_task_model_loader( image_encoder=pipe.image_encoder, ) - hires_pipe = AutoPipelineForImage2Image.from_pipe(hires_pipe) + hires_pipe = AutoPipelineForImage2Image.from_pipe(hires_pipe, enable_pag=False) else: - hires_pipe = AutoPipelineForImage2Image.from_pipe(pipe) + hires_pipe = AutoPipelineForImage2Image.from_pipe(pipe, enable_pag=False) if hasattr(hires_pipe, "text_encoder_2"): hires_pipe.enable_vae_slicing() diff --git a/stablepy/diffusers_vanilla/high_resolution.py b/stablepy/diffusers_vanilla/high_resolution.py index dd23848..1ebf563 100644 --- a/stablepy/diffusers_vanilla/high_resolution.py +++ b/stablepy/diffusers_vanilla/high_resolution.py @@ -1,7 +1,21 @@ from ..upscalers.esrgan import UpscalerESRGAN, UpscalerLanczos, UpscalerNearest from ..logging.logging_setup import logger -import torch, gc +import torch +import gc from diffusers import DDIMScheduler +from diffusers.image_processor import VaeImageProcessor + +latent_upscale_modes = { + "Latent": {"mode": "bilinear", "antialias": False}, + "Latent (antialiased)": {"mode": "bilinear", "antialias": True}, + "Latent (bicubic)": {"mode": "bicubic", "antialias": False}, + "Latent (bicubic antialiased)": {"mode": "bicubic", "antialias": True}, + "Latent (nearest)": {"mode": "nearest", "antialias": False}, + "Latent (nearest-exact)": {"mode": "nearest-exact", "antialias": False}, +} + +LATENT_UPSCALERS = latent_upscale_modes.keys() + def process_images_high_resolution( images, @@ -11,10 +25,10 @@ def process_images_high_resolution( task_name=None, generator=None, hires_pipe=None, - ): +): def upscale_images(images, upscaler_model_path, esrgan_tile, esrgan_tile_overlap): - if upscaler_model_path != None: + if upscaler_model_path is not None: if upscaler_model_path == "Lanczos": scaler = UpscalerLanczos() elif upscaler_model_path == "Nearest": @@ -37,6 +51,7 @@ def upscale_images(images, upscaler_model_path, esrgan_tile, esrgan_tile_overlap def hires_fix(images): if hires_steps > 1: + control_image_up = None if task_name not in ["txt2img", "inpaint", "img2img"]: control_image_up = images[0] images = images[1:] @@ -61,7 +76,7 @@ def hires_fix(images): **hires_params_config, ).images[0] elif "The size of tensor a (0) must match the size of tensor b (3) at non-singleton" in e or "cannot reshape tensor of 0 elements into shape [0, -1, 1, 512] because the unspecified dimensi" in e: - logger.error(f"strength or steps too low for the model to produce a satisfactory response, returning image only with upscaling.") + logger.error("Strength or steps too low for the model to produce a satisfactory response, returning image only with upscaling.") img_pos_hires = img_pre_hires else: logger.error(e) @@ -72,11 +87,63 @@ def hires_fix(images): result_hires.append(img_pos_hires) images = result_hires - if task_name not in ["txt2img", "inpaint", "img2img"]: + if control_image_up: images = [control_image_up] + images + return images - images = upscale_images(images, upscaler_model_path, esrgan_tile, esrgan_tile_overlap) + if upscaler_model_path in LATENT_UPSCALERS: + control_image_up_pre_latent = None + if task_name not in ["txt2img", "inpaint", "img2img"]: + control_image_up_pre_latent = [images[0]] + images = images[1:] + + image_processor = VaeImageProcessor() + 
images_conversion = [] + for img_base in images: + if not isinstance(img_base, torch.Tensor): + prep_image = image_processor.preprocess(img_base) + prep_image = prep_image.to(device=hires_pipe.vae.device.type, dtype=hires_pipe.vae.dtype) + + with torch.no_grad(): + img_base = hires_pipe.vae.encode(prep_image).latent_dist.sample() + + img_base = hires_pipe.vae.config.scaling_factor * img_base + + images_conversion.append(img_base) + + config_latent = latent_upscale_modes[upscaler_model_path] + + logger.debug(str(images_conversion[0].shape)) + + images = [ + torch.nn.functional.interpolate( + im_l, + size=( + int(images_conversion[0].shape[2] * upscaler_increases_size), # maybe round instead of int + int(images_conversion[0].shape[3] * upscaler_increases_size), + ), + mode=config_latent["mode"], + antialias=config_latent["antialias"], + ) for im_l in images_conversion + ] + + logger.debug(str(images[0].shape)) + logger.info( + "Latent resolution: " + f"{images[0].shape[2] * 8}x{images[0].shape[3] * 8}" + ) + + if control_image_up_pre_latent: + images = control_image_up_pre_latent + images + + torch.cuda.empty_cache() + + else: + images = upscale_images( + images, upscaler_model_path, esrgan_tile, esrgan_tile_overlap + ) + images = hires_fix(images) return images diff --git a/stablepy/diffusers_vanilla/lora_loader.py b/stablepy/diffusers_vanilla/lora_loader.py index 4c83fda..f153a93 100644 --- a/stablepy/diffusers_vanilla/lora_loader.py +++ b/stablepy/diffusers_vanilla/lora_loader.py @@ -5,6 +5,31 @@ from safetensors.torch import load_file from collections import defaultdict from ..logging.logging_setup import logger +import safetensors + +valid_layers = [ + "input_blocks", + "middle_block", + "output_blocks", + "text_model", + ".down_blocks", + ".mid_block", + ".up_blocks", + # "text_projection", # text encoder 2 layer + # "conv_in", # unet extra layers + # "time_proj", + # "time_embedding", + # "time_embedding.linear_1", + # "time_embedding.act", + # "time_embedding.linear_2", + # "add_time_proj", + # "add_embedding", + # "add_embedding.linear_1", + # "add_embedding.linear_2", + # "conv_norm_out", + # "conv_out" +] + def load_lora_weights(pipeline, checkpoint_path, multiplier, device, dtype): LORA_PREFIX_UNET = "lora_unet" @@ -83,9 +108,28 @@ def load_lora_weights(pipeline, checkpoint_path, multiplier, device, dtype): def lora_mix_load(pipe, lora_path, alpha_scale=1.0, device="cuda", dtype=torch.float16): if hasattr(pipe, "text_encoder_2"): # sdxl lora - pipe.load_lora_weights(lora_path) - pipe.fuse_lora(lora_scale=alpha_scale) - pipe.unload_lora_weights() + try: + pipe.load_lora_weights(lora_path) + pipe.fuse_lora(lora_scale=alpha_scale) + pipe.unload_lora_weights() + except Exception as e: + if "size mismatch for" in str(e): + raise e + + logger.debug(str(e)) + + state_dict = safetensors.torch.load_file(lora_path, device="cpu") + state_dict = { + k: w for k, w in state_dict.items() + if any(ly in k for ly in valid_layers) + } + + if not state_dict: + raise ValueError("No valid layers were found.") + + pipe.load_lora_weights(state_dict) + pipe.fuse_lora(lora_scale=alpha_scale) + pipe.unload_lora_weights() else: # sd lora try: diff --git a/stablepy/diffusers_vanilla/model.py b/stablepy/diffusers_vanilla/model.py index e0f306e..4a0c29a 100644 --- a/stablepy/diffusers_vanilla/model.py +++ b/stablepy/diffusers_vanilla/model.py @@ -5,16 +5,11 @@ from diffusers import ( ControlNetModel, DiffusionPipeline, - StableDiffusionControlNetPipeline, - StableDiffusionControlNetInpaintPipeline, 
StableDiffusionPipeline, AutoencoderKL, - StableDiffusionXLInpaintPipeline, - StableDiffusionXLAdapterPipeline, T2IAdapter, StableDiffusionXLPipeline, AutoPipelineForImage2Image, - StableDiffusionXLControlNetPipeline, ) from huggingface_hub import hf_hub_download import torch @@ -38,9 +33,10 @@ import cv2 from diffusers import ( DDIMScheduler, - UniPCMultistepScheduler, ) from .constants import ( + CLASS_DIFFUSERS_TASK, + CLASS_PAG_DIFFUSERS_TASK, CONTROLNET_MODEL_IDS, VALID_TASKS, SD15_TASKS, @@ -66,7 +62,7 @@ from .adetailer import ad_model_process from ..logging.logging_setup import logger from .extra_model_loaders import custom_task_model_loader -from .high_resolution import process_images_high_resolution +from .high_resolution import process_images_high_resolution, LATENT_UPSCALERS from .style_prompt_config import ( styles_data, STYLE_NAMES, @@ -162,6 +158,33 @@ def __call__(self, image: np.ndarray, **kwargs) -> PIL.Image.Image: return PIL.Image.fromarray(color_seg) +def apply_gaussian_blur(image_np, ksize=5): + sigmaX = ksize / 2 + ksize = int(ksize) + if ksize % 2 == 0: + ksize += 1 + blurred_image_np = cv2.GaussianBlur(image_np, (ksize, ksize), sigmaX=sigmaX) + return blurred_image_np + + +def recolor_luminance(img, thr_a=1.0, **kwargs): + result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2LAB) + result = result[:, :, 0].astype(np.float32) / 255.0 + result = result ** thr_a + result = (result * 255.0).clip(0, 255).astype(np.uint8) + result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB) + return result + + +def recolor_intensity(img, thr_a=1.0, **kwargs): + result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2HSV) + result = result[:, :, 2].astype(np.float32) / 255.0 + result = result ** thr_a + result = (result * 255.0).clip(0, 255).astype(np.uint8) + result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB) + return result + + class Preprocessor: MODEL_ID = "lllyasviel/Annotators" @@ -280,13 +303,18 @@ def convert_image_to_numpy_array(image, gui_active=False): class Model_Diffusers: def __init__( self, - base_model_id: str = "runwayml/stable-diffusion-v1-5", + base_model_id: str = "Lykon/dreamshaper-8", task_name: str = "txt2img", vae_model=None, type_model_precision=torch.float16, retain_task_model_in_cache=True, + device=None, ): - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.device = ( + torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if device is None + else torch.device(device) + ) self.base_model_id = "" self.task_name = "" self.vae_model = None @@ -310,6 +338,80 @@ def __init__( self.image_encoder_name = None self.image_encoder_module = None + def switch_pipe_class( + self, + class_name, + task_name, + model_id, + enable_pag, + ): + + tk = "base" + model_components = dict( + vae=self.pipe.vae, + text_encoder=self.pipe.text_encoder, + tokenizer=self.pipe.tokenizer, + unet=self.pipe.unet, + scheduler=self.pipe.scheduler, + feature_extractor=self.pipe.feature_extractor, + image_encoder=self.pipe.image_encoder, + ) + if class_name == "StableDiffusionPipeline": + model_components["safety_checker"] = self.pipe.safety_checker + model_components["requires_safety_checker"] = self.pipe.config.requires_safety_checker + + if task_name not in ["txt2img", "img2img"]: + model_components["controlnet"] = ControlNetModel.from_pretrained( + model_id, torch_dtype=self.type_model_precision + ) + tk = "controlnet" + + elif class_name == "StableDiffusionXLPipeline": + model_components["text_encoder_2"] = self.pipe.text_encoder_2 + model_components["tokenizer_2"] 
= self.pipe.tokenizer_2 + + if task_name not in ["txt2img", "inpaint", "img2img"]: + if "t2i" not in task_name: + model_components["controlnet"] = ControlNetModel.from_pretrained( + model_id, torch_dtype=torch.float16, variant="fp16" + ).to(self.device) + tk = "controlnet" + else: + model_components["adapter"] = T2IAdapter.from_pretrained( + model_id, + torch_dtype=torch.float16, + varient="fp16", + ).to(self.device) + tk = "adapter" + + if task_name == "inpaint": + tk = "inpaint" + + if enable_pag: + if ( + tk == "adapter" or + (task_name in ["inpaint", "img2img"] and "XL" not in class_name) + ): + logger.warning( + f"PAG is not enabled for {class_name} with {task_name}." + ) + enable_pag = False + + # Load Pipeline + if enable_pag: + model_components["pag_applied_layers"] = "mid" + self.pipe = CLASS_PAG_DIFFUSERS_TASK[class_name][tk](**model_components).to(self.device) + else: + self.pipe = CLASS_DIFFUSERS_TASK[class_name][tk](**model_components).to(self.device) + + if task_name == "img2img": + self.pipe = AutoPipelineForImage2Image.from_pipe(self.pipe, enable_pag=enable_pag) + + # Create new base values + self.pipe.to(self.device) + torch.cuda.empty_cache() + gc.collect() + def load_pipe( self, base_model_id: str, @@ -357,6 +459,7 @@ def load_pipe( else: # Unload previous model and stuffs self.pipe = None + self.task_name = "" self.model_memory = {} self.lora_memory = [None, None, None, None, None] self.lora_scale_memory = [1.0, 1.0, 1.0, 1.0, 1.0] @@ -496,138 +599,18 @@ def load_pipe( else: model_id = model_id[0] - if task_name == "inpaint": - match class_name: - case "StableDiffusionPipeline": - - controlnet = ControlNetModel.from_pretrained( - model_id, torch_dtype=self.type_model_precision - ) - - self.pipe = StableDiffusionControlNetInpaintPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - tokenizer=self.pipe.tokenizer, - unet=self.pipe.unet, - controlnet=controlnet, - scheduler=self.pipe.scheduler, - safety_checker=self.pipe.safety_checker, - feature_extractor=self.pipe.feature_extractor, - requires_safety_checker=self.pipe.config.requires_safety_checker, - image_encoder=self.pipe.image_encoder, - ) - case "StableDiffusionXLPipeline": - - self.pipe = StableDiffusionXLInpaintPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - text_encoder_2=self.pipe.text_encoder_2, - tokenizer=self.pipe.tokenizer, - tokenizer_2=self.pipe.tokenizer_2, - unet=self.pipe.unet, - # controlnet=self.controlnet, - scheduler=self.pipe.scheduler, - feature_extractor=self.pipe.feature_extractor, - image_encoder=self.pipe.image_encoder, - ) - - if task_name not in ["txt2img", "inpaint", "img2img"]: - match class_name: - case "StableDiffusionPipeline": - - controlnet = ControlNetModel.from_pretrained( - model_id, torch_dtype=self.type_model_precision - ) - - self.pipe = StableDiffusionControlNetPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - tokenizer=self.pipe.tokenizer, - unet=self.pipe.unet, - controlnet=controlnet, - scheduler=self.pipe.scheduler, - safety_checker=self.pipe.safety_checker, - feature_extractor=self.pipe.feature_extractor, - requires_safety_checker=self.pipe.config.requires_safety_checker, - image_encoder=self.pipe.image_encoder, - ) - self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) - - case "StableDiffusionXLPipeline": - if "t2i" not in task_name: - controlnet = ControlNetModel.from_pretrained( - model_id, torch_dtype=torch.float16, variant="fp16" - ).to(self.device) - - self.pipe = 
StableDiffusionXLControlNetPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - text_encoder_2=self.pipe.text_encoder_2, - tokenizer=self.pipe.tokenizer, - tokenizer_2=self.pipe.tokenizer_2, - unet=self.pipe.unet, - scheduler=self.pipe.scheduler, - controlnet=controlnet, - feature_extractor=self.pipe.feature_extractor, - image_encoder=self.pipe.image_encoder, - ).to(self.device) - - else: - adapter = T2IAdapter.from_pretrained( - model_id, - torch_dtype=torch.float16, - varient="fp16", - ).to(self.device) - - self.pipe = StableDiffusionXLAdapterPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - text_encoder_2=self.pipe.text_encoder_2, - tokenizer=self.pipe.tokenizer, - tokenizer_2=self.pipe.tokenizer_2, - unet=self.pipe.unet, - adapter=adapter, - scheduler=self.pipe.scheduler, - feature_extractor=self.pipe.feature_extractor, - image_encoder=self.pipe.image_encoder, - ).to(self.device) - - if task_name in ["txt2img", "img2img"]: - match class_name: - - case "StableDiffusionPipeline": - self.pipe = StableDiffusionPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - tokenizer=self.pipe.tokenizer, - unet=self.pipe.unet, - scheduler=self.pipe.scheduler, - safety_checker=self.pipe.safety_checker, - feature_extractor=self.pipe.feature_extractor, - requires_safety_checker=self.pipe.config.requires_safety_checker, - image_encoder=self.pipe.image_encoder, - ) - - case "StableDiffusionXLPipeline": - self.pipe = StableDiffusionXLPipeline( - vae=self.pipe.vae, - text_encoder=self.pipe.text_encoder, - text_encoder_2=self.pipe.text_encoder_2, - tokenizer=self.pipe.tokenizer, - tokenizer_2=self.pipe.tokenizer_2, - unet=self.pipe.unet, - scheduler=self.pipe.scheduler, - feature_extractor=self.pipe.feature_extractor, - image_encoder=self.pipe.image_encoder, - ) - - if task_name == "img2img": - self.pipe = AutoPipelineForImage2Image.from_pipe(self.pipe) + if ( + (self.task_name != task_name) + or (self.class_name != class_name) + ): + self.switch_pipe_class( + class_name, + task_name, + model_id, + enable_pag=False, + ) - # Create new base values - self.pipe.to(self.device) - torch.cuda.empty_cache() - gc.collect() + self.model_id_task = model_id self.base_model_id = base_model_id self.task_name = task_name @@ -672,16 +655,17 @@ def get_image_preprocess( value_threshold: float, distance_threshold: float, t2i_adapter_preprocessor: bool, + recolor_gamma_correction: float, ) -> list[PIL.Image.Image]: if image is None: raise ValueError("No reference image found.") - if self.class_name == "StableDiffusionPipeline" and self.task_name in ["lineart", "lineart_anime"]: - if "anime" in preprocessor_name: - self.load_controlnet_weight("lineart_anime") - logger.info("Linear anime") - else: - self.load_controlnet_weight("lineart") + # if self.class_name == "StableDiffusionPipeline" and self.task_name in ["lineart", "lineart_anime"]: + # if "anime" in preprocessor_name: + # self.load_controlnet_weight("lineart_anime") + # logger.info("Linear anime") + # else: + # self.load_controlnet_weight("lineart") if "t2i" in self.task_name: preprocessor_name = T2I_PREPROCESSOR_NAME[self.task_name] if t2i_adapter_preprocessor else "None" @@ -778,6 +762,26 @@ def get_image_preprocess( image=image, image_resolution=image_resolution, ) + elif self.task_name == "tile": + image_np = resize_image(image, resolution=image_resolution) + blur_names = { + "Mild Blur": 5, + "Moderate Blur": 15, + "Heavy Blur": 27, + } + image_np = apply_gaussian_blur( + image_np, 
ksize=blur_names[preprocessor_name] + ) + control_image = PIL.Image.fromarray(image_np) + elif self.task_name == "recolor": + image_np = resize_image(image, resolution=image_resolution) + + if preprocessor_name == "Recolor luminance": + image_np = recolor_luminance(image_np, thr_a=recolor_gamma_correction) + elif preprocessor_name == "Recolor intensity": + image_np = recolor_intensity(image_np, thr_a=recolor_gamma_correction) + + control_image = PIL.Image.fromarray(image_np) else: raise ValueError("No valid preprocessor name") @@ -1247,45 +1251,58 @@ def get_ip_embeds( image_embeds = [] for i, (image, ip_weight) in enumerate(zip(ip_images, self.ip_adapter_config)): - if "plus" in ip_weight: + if not isinstance(image, list): + image = [image] - ref_images_embeds = [] - ip_adapter_images = [] + image_embeds_single = [] + image_projection = [] + for j, single_image in enumerate(image): - image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB) - faces = app.get(image) - ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)) # if not detected face error - image = torch.from_numpy(faces[0].normed_embedding) - ref_images_embeds.append(image.unsqueeze(0)) + single_image = cv2.cvtColor(np.asarray(single_image), cv2.COLOR_BGR2RGB) + faces = app.get(single_image) + + if len(faces) == 0: + num_batch_image = "" if len(image) == 1 else f", subimage {j+1}" + raise ValueError(f"No face detected in image number {i+1}{num_batch_image}") + + if "plus" in ip_weight: + face_crop_align = face_align.norm_crop(single_image, landmark=faces[0].kps, image_size=224) + image_projection.append(face_crop_align) + + single_image = torch.from_numpy(faces[0].normed_embedding) + ref_images_embeds = [] + ref_images_embeds.append(single_image.unsqueeze(0)) ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0) + neg_ref_images_embeds = torch.zeros_like(ref_images_embeds) + id_embed = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=self.type_model_precision, device=self.device) - image_embeds.append(id_embed) + image_embeds_single.append(id_embed) + image_embeds.append(torch.cat(image_embeds_single, dim=1)) + + if image_projection: clip_embeds = self.pipe.prepare_ip_adapter_image_embeds( - [ip_adapter_images] * len(ip_images), + [image_projection] * len(ip_images), None, torch.device(self.device), num_images, do_classifier_free_guidance )[0] + gc.collect() + torch.cuda.empty_cache() + self.pipe.unet.encoder_hid_proj.image_projection_layers[i].clip_embeds = clip_embeds.to(dtype=self.type_model_precision) if "plusv2" in ip_weight: self.pipe.unet.encoder_hid_proj.image_projection_layers[i].shortcut = True else: self.pipe.unet.encoder_hid_proj.image_projection_layers[i].shortcut = False - else: - ref_images_embeds = [] - image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB) - faces = app.get(image) - image = torch.from_numpy(faces[0].normed_embedding) - ref_images_embeds.append(image.unsqueeze(0)) - ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0) - neg_ref_images_embeds = torch.zeros_like(ref_images_embeds) - id_embed = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=self.type_model_precision, device=self.device) - image_embeds.append(id_embed) + gc.collect() + torch.cuda.empty_cache() + + # average_embedding = torch.mean(torch.stack(faceid_all_embeds, dim=0), dim=0) processed_masks = [] if ip_masks and ip_masks[0] is not None: # fix this auto generate mask if any have it... 
@@ -1362,6 +1379,7 @@ def __call__( adetailer_B_params: Dict[str, Any] = {}, style_prompt: Optional[Any] = [""], style_json_file: Optional[Any] = "", + pag_scale: float = 0., image: Optional[Any] = None, preprocessor_name: Optional[str] = "None", @@ -1373,6 +1391,7 @@ def __call__( high_threshold: int = 200, value_threshold: float = 0.1, distance_threshold: float = 0.1, + recolor_gamma_correction: float = 1.0, controlnet_conditioning_scale: float = 1.0, control_guidance_start: float = 0.0, control_guidance_end: float = 1.0, @@ -1447,7 +1466,7 @@ def __call__( print(scheduler_names) ``` syntax_weights (str, optional, defaults to "Classic"): - Specifies the type of syntax weights and emphasis used during generation. + Specifies the type of syntax weights and emphasis used during generation. "Classic" is (word:weight), "Compel" is (word)weight. To see all the valid syntax weight options, use the following code: @@ -1505,8 +1524,18 @@ def __call__( If a style that is in STYLE_NAMES is specified, it will be added to the original prompt and negative prompt. style_json_file (str, optional): JSON with styles to be applied and used in style_prompt. + pag_scale (float, optional): + Perturbed Attention Guidance (PAG) enhances image generation quality without the need for training. + If it is used, it is recommended to use values close to 3.0 for good results. upscaler_model_path (str, optional): - Placeholder for upscaler model path. + This is the path of the ESRGAN model that will be used for the upscale; on the other hand, + you can also use simply 'Lanczos', 'Nearest,' or 'Latent,' the latter of which has variants + that can be consulted in the following code: + + ```python + from stablepy import LATENT_UPSCALERS + print(LATENT_UPSCALERS) + ``` upscaler_increases_size (float, optional, defaults to 1.5): Placeholder for upscaler increases size parameter. esrgan_tile (int, optional, defaults to 100): @@ -1662,6 +1691,8 @@ def __call__( raise ValueError( "You need to specify the for this task." ) + if hires_steps < 2 and upscaler_model_path in LATENT_UPSCALERS: + raise ValueError("Latent upscaler requires hires_steps. 
Use at least 2 steps.") if img_height % 8 != 0: img_height = img_height + (8 - img_height % 8) logger.warning(f"Height must be divisible by 8, changed to {str(img_height)}") @@ -1691,6 +1722,16 @@ def __call__( reload=True, ) + pag_scale_is_true = bool(pag_scale) + hasattr_pipe_pag = hasattr(self.pipe, "set_pag_applied_layers") + if pag_scale_is_true != hasattr_pipe_pag: + self.switch_pipe_class( + self.class_name, + self.task_name, + self.model_id_task, + enable_pag=bool(pag_scale), + ) + self.pipe.set_progress_bar_config(leave=leave_progress_bar) self.pipe.set_progress_bar_config(disable=disable_progress_bar) @@ -1907,6 +1948,7 @@ def __call__( value_threshold=value_threshold, distance_threshold=distance_threshold, t2i_adapter_preprocessor=t2i_adapter_preprocessor, + recolor_gamma_correction=recolor_gamma_correction, ) # Task Parameters @@ -1919,6 +1961,9 @@ def __call__( "num_images_per_prompt": num_images, } + if hasattr(self.pipe, "set_pag_applied_layers"): + pipe_params_config["pag_scale"] = float(pag_scale) + if self.task_name == "txt2img": pipe_params_config["height"] = img_height pipe_params_config["width"] = img_width @@ -2059,7 +2104,7 @@ def __call__( "prompt": adetailer_A_params["prompt"], "negative_prompt": adetailer_A_params["negative_prompt"], "strength": adetailer_A_params["strength"], - "num_inference_steps": num_steps, + "num_inference_steps": int(num_steps * 1.5), "guidance_scale": guidance_scale, } @@ -2142,7 +2187,7 @@ def __call__( "prompt": adetailer_B_params["prompt"], "negative_prompt": adetailer_B_params["negative_prompt"], "strength": adetailer_B_params["strength"], - "num_inference_steps": num_steps, + "num_inference_steps": int(num_steps * 1.5), "guidance_scale": guidance_scale, } @@ -2296,6 +2341,13 @@ def __call__( hires_pipe.to(self.device) torch.cuda.empty_cache() gc.collect() + + if ( + upscaler_model_path in LATENT_UPSCALERS + and ((not adetailer_A and not adetailer_B) or hires_before_adetailer) + ): + pipe_params_config["output_type"] = "latent" + else: hires_params_config = {} hires_pipe = None @@ -2350,8 +2402,7 @@ def __call__( images = self.pipe( **pipe_params_config, ).images - if self.task_name not in ["txt2img", "inpaint", "img2img"]: - images = [control_image] + images + except Exception as e: e = str(e) if "Tensor with 2 elements cannot be converted to Scalar" in e: @@ -2362,15 +2413,21 @@ def __call__( images = self.pipe( **pipe_params_config, ).images - if self.task_name not in ["txt2img", "inpaint", "img2img"]: - images = [control_image] + images + elif "The size of tensor a (0) must match the size of tensor b (3) at non-singleton" in e: raise ValueError( "steps / strength too low for the model to produce a satisfactory response" ) + else: raise ValueError(e) + if isinstance(images, torch.Tensor): + images = [tl.unsqueeze(0) for tl in torch.unbind(images, dim=0)] + + if self.task_name not in ["txt2img", "inpaint", "img2img"]: + images = [control_image] + images + torch.cuda.empty_cache() gc.collect() diff --git a/stablepy/diffusers_vanilla/utils.py b/stablepy/diffusers_vanilla/utils.py index 863f12c..a704b10 100644 --- a/stablepy/diffusers_vanilla/utils.py +++ b/stablepy/diffusers_vanilla/utils.py @@ -2,7 +2,7 @@ from PIL import Image from PIL.PngImagePlugin import PngInfo from ..logging.logging_setup import logger -import torch + def save_pil_image_with_metadata(image, folder_path, metadata_list): if not os.path.exists(folder_path): @@ -29,7 +29,7 @@ def save_pil_image_with_metadata(image, folder_path, metadata_list): # 
metadata.add_text("Seed", str(metadata_list[7])) image.save(image_path, pnginfo=metadata) - except: + except Exception: logger.info("Saving image without metadata") image.save(image_path) @@ -61,7 +61,8 @@ def checkpoint_model_type(checkpoint_path): elif key_name_sd_xl_refiner in checkpoint: # only refiner xl has embedder and one text embedders model_type = "refiner" - + del checkpoint return model_type + \ No newline at end of file
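
Below is a minimal, hedged sketch (not part of the patch itself) of how the 0.5.0 additions introduced in this diff might be combined: `pag_scale` for Perturbed Attention Guidance, a latent upscaler from the newly exported `LATENT_UPSCALERS`, and the new `tile` ControlNet task. Parameter names are taken from the `__call__` and `__init__` signatures visible above; the prompt, input image path, and chosen values are placeholders, and the return value should be unpacked as shown in the README "Usage" section at the top of this diff.

```python
import torch
from stablepy import Model_Diffusers, LATENT_UPSCALERS

# Latent upscaler variants defined in high_resolution.py,
# e.g. "Latent", "Latent (bicubic)", "Latent (nearest-exact)", ...
print(list(LATENT_UPSCALERS))

model = Model_Diffusers(
    base_model_id="Lykon/dreamshaper-8",  # new default base model in this release
    task_name="tile",                     # "tile" replaces the old "sdxl_tile_realistic" task
    type_model_precision=torch.float16,
)

output = model(
    prompt="a detailed photograph of a medieval castle",  # placeholder prompt
    image="castle_lowres.png",             # placeholder reference image for the tile task
    preprocessor_name="Mild Blur",         # tile preprocessors: "Mild Blur", "Moderate Blur", "Heavy Blur"
    pag_scale=3.0,                         # PAG; the docstring suggests values close to 3.0
    upscaler_model_path="Latent (bicubic)",
    upscaler_increases_size=1.5,
    hires_steps=25,                        # latent upscalers require hires_steps >= 2
)
# `output` holds the generated images; unpack it as in the README usage above.
```

Note that a latent upscaler only makes sense together with a hires pass: the interpolated latents are re-denoised by the hires pipeline, which is why `__call__` raises a ValueError when `hires_steps` is below 2 and `upscaler_model_path` is one of the `LATENT_UPSCALERS`.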