From 41ad43304f831ceefa6a8b667032d45ea1711bf2 Mon Sep 17 00:00:00 2001 From: Cui-yshoho Date: Mon, 6 Jan 2025 16:12:52 +0800 Subject: [PATCH] add pipe_test --- ...table_diffusion_3_controlnet_inpainting.py | 7 +- .../pipeline_pag_controlnet_sd_xl_img2img.py | 15 - .../pipelines/__init__.py | 0 .../pipelines/animatediff/__init__.py | 0 .../pipelines/animatediff/test_animatediff.py | 0 .../test_animatediff_controlnet.py | 0 .../animatediff/test_animatediff_sdxl.py | 0 .../test_animatediff_sparsectrl.py | 0 .../test_animatediff_video2video.py | 0 ...test_animatediff_video2video_controlnet.py | 198 +++++++++++++ .../pipelines/aura_flow/__init__.py | 0 .../aura_flow/test_pipeline_aura_flow.py | 0 .../pipelines/blipdiffusion/__init__.py | 0 .../blipdiffusion/test_blipdiffusion.py | 0 .../pipelines/cogvideo}/__init__.py | 0 .../pipelines/cogvideo/test_cogvideox.py | 176 ++++++++++++ .../cogvideo/test_cogvideox_fun_control.py | 183 ++++++++++++ .../cogvideo/test_cogvideox_image2video.py | 185 ++++++++++++ .../cogvideo/test_cogvideox_video2video.py | 186 +++++++++++++ .../pipelines/cogview3}/__init__.py | 0 .../pipelines/cogview3/test_cogview3plus.py | 159 +++++++++++ .../pipelines/consistency_models}/__init__.py | 0 .../test_consistency_models.py | 0 .../pipelines/controlnet}/__init__.py | 0 .../pipelines/controlnet/test_controlnet.py | 0 .../test_controlnet_blip_diffusion.py | 0 .../controlnet/test_controlnet_img2img.py | 0 .../controlnet/test_controlnet_inpaint.py | 0 .../test_controlnet_inpaint_sdxl.py | 0 .../controlnet/test_controlnet_sdxl.py | 0 .../test_controlnet_sdxl_img2img.py | 0 .../pipelines/controlnet_sd3}/__init__.py | 0 .../test_controlnet_inpaint_sd3.py | 263 ++++++++++++++++++ .../controlnet_sd3/test_controlnet_sd3.py | 0 .../pipelines/controlnet_xs}/__init__.py | 0 .../controlnet_xs/test_controlnetxs.py | 0 .../controlnet_xs/test_controlnetxs_sdxl.py | 0 .../pipelines/dance_diffusion}/__init__.py | 0 .../dance_diffusion/test_dance_diffusion.py | 0 
.../pipelines/ddim}/__init__.py | 0 .../pipelines/ddim/test_ddim.py | 0 .../pipelines/ddpm}/__init__.py | 0 .../pipelines/ddpm/test_ddpm.py | 0 .../pipelines/deepfloyd_if/__init__.py | 0 .../pipelines/deepfloyd_if/test_if.py | 0 .../pipelines/deepfloyd_if/test_if_img2img.py | 0 .../test_if_img2img_superresolution.py | 0 .../deepfloyd_if/test_if_inpainting.py | 0 .../test_if_inpainting_superresolution.py | 0 .../deepfloyd_if/test_if_superresolution.py | 0 .../pipelines/dit}/__init__.py | 0 .../pipelines/dit/test_dit.py | 0 .../pipelines/flux}/__init__.py | 0 .../pipelines/flux/test_pipeline_flux.py | 167 +++++++++++ .../flux/test_pipeline_flux_controlnet.py | 201 +++++++++++++ ...pipeline_flux_controlnet_image_to_image.py | 210 ++++++++++++++ ...est_pipeline_flux_controlnet_inpainting.py | 216 ++++++++++++++ .../flux/test_pipeline_flux_img2img.py | 186 +++++++++++++ .../flux/test_pipeline_flux_inpaint.py | 190 +++++++++++++ .../pipelines/hunyuan_dit}/__init__.py | 0 .../pipelines/hunyuan_dit/test_hunyuan_dit.py | 0 .../pipelines/i2vgen_xl}/__init__.py | 0 .../pipelines/i2vgen_xl/test_i2vgenxl.py | 0 .../pipelines/kandinsky}/__init__.py | 0 .../pipelines/kandinsky/test_kandinsky.py | 0 .../kandinsky/test_kandinsky_combined.py | 0 .../kandinsky/test_kandinsky_img2img.py | 0 .../kandinsky/test_kandinsky_inpaint.py | 0 .../kandinsky/test_kandinsky_prior.py | 0 .../pipelines/kandinsky2_2}/__init__.py | 0 .../pipelines/kandinsky2_2/test_kandinsky.py | 0 .../kandinsky2_2/test_kandinsky_combined.py | 0 .../kandinsky2_2/test_kandinsky_controlnet.py | 0 .../test_kandinsky_controlnet_img2img.py | 0 .../kandinsky2_2/test_kandinsky_img2img.py | 0 .../kandinsky2_2/test_kandinsky_inpaint.py | 0 .../kandinsky2_2/test_kandinsky_prior.py | 0 .../test_kandinsky_prior_emb2emb.py | 0 .../pipelines/kandinsky3}/__init__.py | 0 .../pipelines/kandinsky3/test_kandinsky3.py | 0 .../kandinsky3/test_kandinsky3_img2img.py | 0 .../latent_consistency_models}/__init__.py | 0 
.../test_latent_consistency_models.py | 0 .../test_latent_consistency_models_img2img.py | 0 .../pipelines/latent_diffusion}/__init__.py | 0 .../latent_diffusion/test_latent_diffusion.py | 0 .../test_latent_diffusion_superresolution.py | 0 .../pipelines/marigold}/__init__.py | 0 .../pipelines/marigold/test_marigold_depth.py | 0 .../marigold/test_marigold_normals.py | 0 .../pipelines/pag}/__init__.py | 0 .../pag/test_pag_controlnet_sd_inpaint.py | 218 +++++++++++++++ .../pag/test_pag_controlnet_sdxl_img2img.py | 256 +++++++++++++++++ .../pipelines/pag/test_pag_sd_img2img.py | 186 +++++++++++++ .../pipelines/pipeline_test_utils.py | 10 +- .../pipelines/pixart_alpha}/__init__.py | 0 .../pipelines/pixart_alpha/test_pixart.py | 0 .../pipelines/pixart_sigma}/__init__.py | 0 .../pipelines/pixart_sigma/test_pixart.py | 0 .../pipelines/shap_e}/__init__.py | 0 .../pipelines/shap_e/test_shap_e.py | 0 .../pipelines/shap_e/test_shap_e_img2img.py | 0 .../pipelines/stable_cascade}/__init__.py | 0 .../test_stable_cascade_combined.py | 0 .../test_stable_cascade_decoder.py | 0 .../test_stable_cascade_prior.py | 0 .../pipelines/stable_diffusion}/__init__.py | 0 .../stable_diffusion/test_stable_diffusion.py | 0 .../test_stable_diffusion_img2img.py | 0 .../test_stable_diffusion_inpaint.py | 0 ...st_stable_diffusion_instruction_pix2pix.py | 0 .../pipelines/stable_diffusion_2}/__init__.py | 0 .../test_stable_diffusion.py | 0 .../test_stable_diffusion_depth.py | 0 .../test_stable_diffusion_diffedit.py | 0 .../test_stable_diffusion_inpaint.py | 0 .../test_stable_diffusion_latent_upscale.py | 0 .../test_stable_diffusion_upscale.py | 0 .../test_stable_diffusion_v_pred.py | 0 .../pipelines/stable_diffusion_3}/__init__.py | 0 .../test_pipeline_stable_diffusion_3.py | 0 ...est_pipeline_stable_diffusion_3_img2img.py | 0 .../stable_diffusion_adapter}/__init__.py | 0 .../test_stable_diffusion_adapter.py | 0 .../stable_diffusion_gligen}/__init__.py | 0 .../test_stable_diffusion_gligen.py | 0 
.../__init__.py | 0 ...test_stable_diffusion_gligen_text_image.py | 0 .../__init__.py | 0 .../test_stable_diffusion_image_variation.py | 0 .../pipelines/stable_diffusion_xl/__init__.py | 0 .../test_stable_diffusion_xl.py | 0 .../test_stable_diffusion_xl_adapter.py | 0 .../test_stable_diffusion_xl_img2img.py | 0 .../test_stable_diffusion_xl_inpaint.py | 0 ...stable_diffusion_xl_instruction_pix2pix.py | 0 .../stable_video_diffusion/__init__.py | 0 .../test_stable_video_diffusion.py | 0 .../pipelines/unclip/__init__.py | 0 .../pipelines/unclip/test_unclip.py | 0 .../unclip/test_unclip_image_variation.py | 0 .../pipelines/wuerstchen/__init__.py | 0 .../wuerstchen/test_wuerstchen_combined.py | 0 .../wuerstchen/test_wuerstchen_decoder.py | 0 .../wuerstchen/test_wuerstchen_prior.py | 0 145 files changed, 3191 insertions(+), 21 deletions(-) rename tests/{diffusers => diffusers_tests}/pipelines/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/animatediff/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/animatediff/test_animatediff.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/animatediff/test_animatediff_controlnet.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/animatediff/test_animatediff_sdxl.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/animatediff/test_animatediff_sparsectrl.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/animatediff/test_animatediff_video2video.py (100%) create mode 100644 tests/diffusers_tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py rename tests/{diffusers => diffusers_tests}/pipelines/aura_flow/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/aura_flow/test_pipeline_aura_flow.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/blipdiffusion/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/blipdiffusion/test_blipdiffusion.py (100%) rename 
tests/{diffusers/pipelines/consistency_models => diffusers_tests/pipelines/cogvideo}/__init__.py (100%) create mode 100644 tests/diffusers_tests/pipelines/cogvideo/test_cogvideox.py create mode 100644 tests/diffusers_tests/pipelines/cogvideo/test_cogvideox_fun_control.py create mode 100644 tests/diffusers_tests/pipelines/cogvideo/test_cogvideox_image2video.py create mode 100644 tests/diffusers_tests/pipelines/cogvideo/test_cogvideox_video2video.py rename tests/{diffusers/pipelines/controlnet => diffusers_tests/pipelines/cogview3}/__init__.py (100%) create mode 100644 tests/diffusers_tests/pipelines/cogview3/test_cogview3plus.py rename tests/{diffusers/pipelines/controlnet_sd3 => diffusers_tests/pipelines/consistency_models}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/consistency_models/test_consistency_models.py (100%) rename tests/{diffusers/pipelines/controlnet_xs => diffusers_tests/pipelines/controlnet}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet_blip_diffusion.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet_img2img.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet_inpaint.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet_inpaint_sdxl.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet_sdxl.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet/test_controlnet_sdxl_img2img.py (100%) rename tests/{diffusers/pipelines/dance_diffusion => diffusers_tests/pipelines/controlnet_sd3}/__init__.py (100%) create mode 100644 tests/diffusers_tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py rename tests/{diffusers => diffusers_tests}/pipelines/controlnet_sd3/test_controlnet_sd3.py (100%) rename 
tests/{diffusers/pipelines/ddim => diffusers_tests/pipelines/controlnet_xs}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet_xs/test_controlnetxs.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/controlnet_xs/test_controlnetxs_sdxl.py (100%) rename tests/{diffusers/pipelines/ddpm => diffusers_tests/pipelines/dance_diffusion}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/dance_diffusion/test_dance_diffusion.py (100%) rename tests/{diffusers/pipelines/dit => diffusers_tests/pipelines/ddim}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/ddim/test_ddim.py (100%) rename tests/{diffusers/pipelines/hunyuan_dit => diffusers_tests/pipelines/ddpm}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/ddpm/test_ddpm.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/test_if.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/test_if_img2img.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/test_if_img2img_superresolution.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/test_if_inpainting.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/deepfloyd_if/test_if_superresolution.py (100%) rename tests/{diffusers/pipelines/i2vgen_xl => diffusers_tests/pipelines/dit}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/dit/test_dit.py (100%) rename tests/{diffusers/pipelines/kandinsky => diffusers_tests/pipelines/flux}/__init__.py (100%) create mode 100644 tests/diffusers_tests/pipelines/flux/test_pipeline_flux.py create mode 100644 tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet.py create mode 100644 
tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_image_to_image.py create mode 100644 tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_inpainting.py create mode 100644 tests/diffusers_tests/pipelines/flux/test_pipeline_flux_img2img.py create mode 100644 tests/diffusers_tests/pipelines/flux/test_pipeline_flux_inpaint.py rename tests/{diffusers/pipelines/kandinsky2_2 => diffusers_tests/pipelines/hunyuan_dit}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/hunyuan_dit/test_hunyuan_dit.py (100%) rename tests/{diffusers/pipelines/kandinsky3 => diffusers_tests/pipelines/i2vgen_xl}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/i2vgen_xl/test_i2vgenxl.py (100%) rename tests/{diffusers/pipelines/latent_consistency_models => diffusers_tests/pipelines/kandinsky}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky/test_kandinsky.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky/test_kandinsky_combined.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky/test_kandinsky_img2img.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky/test_kandinsky_inpaint.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky/test_kandinsky_prior.py (100%) rename tests/{diffusers/pipelines/latent_diffusion => diffusers_tests/pipelines/kandinsky2_2}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_combined.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_controlnet.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_img2img.py (100%) rename tests/{diffusers => 
diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_inpaint.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_prior.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py (100%) rename tests/{diffusers/pipelines/marigold => diffusers_tests/pipelines/kandinsky3}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky3/test_kandinsky3.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/kandinsky3/test_kandinsky3_img2img.py (100%) rename tests/{diffusers/pipelines/pixart_alpha => diffusers_tests/pipelines/latent_consistency_models}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/latent_consistency_models/test_latent_consistency_models.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py (100%) rename tests/{diffusers/pipelines/pixart_sigma => diffusers_tests/pipelines/latent_diffusion}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/latent_diffusion/test_latent_diffusion.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py (100%) rename tests/{diffusers/pipelines/shap_e => diffusers_tests/pipelines/marigold}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/marigold/test_marigold_depth.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/marigold/test_marigold_normals.py (100%) rename tests/{diffusers/pipelines/stable_cascade => diffusers_tests/pipelines/pag}/__init__.py (100%) create mode 100644 tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sd_inpaint.py create mode 100644 tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py create mode 100644 tests/diffusers_tests/pipelines/pag/test_pag_sd_img2img.py rename tests/{diffusers => 
diffusers_tests}/pipelines/pipeline_test_utils.py (97%) rename tests/{diffusers/pipelines/stable_diffusion => diffusers_tests/pipelines/pixart_alpha}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/pixart_alpha/test_pixart.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_2 => diffusers_tests/pipelines/pixart_sigma}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/pixart_sigma/test_pixart.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_3 => diffusers_tests/pipelines/shap_e}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/shap_e/test_shap_e.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/shap_e/test_shap_e_img2img.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_adapter => diffusers_tests/pipelines/stable_cascade}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_cascade/test_stable_cascade_combined.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_cascade/test_stable_cascade_decoder.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_cascade/test_stable_cascade_prior.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_gligen => diffusers_tests/pipelines/stable_diffusion}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion/test_stable_diffusion.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion/test_stable_diffusion_img2img.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_gligen_text_image => diffusers_tests/pipelines/stable_diffusion_2}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion.py 
(100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_image_variation => diffusers_tests/pipelines/stable_diffusion_3}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py (100%) rename tests/{diffusers/pipelines/stable_diffusion_xl => diffusers_tests/pipelines/stable_diffusion_adapter}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py (100%) rename tests/{diffusers/pipelines/stable_video_diffusion => diffusers_tests/pipelines/stable_diffusion_gligen}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py (100%) rename tests/{diffusers/pipelines/unclip => diffusers_tests/pipelines/stable_diffusion_gligen_text_image}/__init__.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py (100%) rename tests/{diffusers/pipelines/wuerstchen => diffusers_tests/pipelines/stable_diffusion_image_variation}/__init__.py (100%) rename tests/{diffusers => 
diffusers_tests}/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py (100%) create mode 100644 tests/diffusers_tests/pipelines/stable_diffusion_xl/__init__.py rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py (100%) create mode 100644 tests/diffusers_tests/pipelines/stable_video_diffusion/__init__.py rename tests/{diffusers => diffusers_tests}/pipelines/stable_video_diffusion/test_stable_video_diffusion.py (100%) create mode 100644 tests/diffusers_tests/pipelines/unclip/__init__.py rename tests/{diffusers => diffusers_tests}/pipelines/unclip/test_unclip.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/unclip/test_unclip_image_variation.py (100%) create mode 100644 tests/diffusers_tests/pipelines/wuerstchen/__init__.py rename tests/{diffusers => diffusers_tests}/pipelines/wuerstchen/test_wuerstchen_combined.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/wuerstchen/test_wuerstchen_decoder.py (100%) rename tests/{diffusers => diffusers_tests}/pipelines/wuerstchen/test_wuerstchen_prior.py (100%) diff --git a/mindone/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/mindone/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py index 3c1badc163..37a2725606 100644 --- a/mindone/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +++ 
b/mindone/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py @@ -27,6 +27,7 @@ from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin from ...models.autoencoders import AutoencoderKL from ...models.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel +from ...models.layers_compat import pad from ...models.transformers import SD3Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import logging, scale_lora_layers, unscale_lora_layers @@ -458,9 +459,7 @@ def encode_prompt( max_sequence_length=max_sequence_length, ) - clip_prompt_embeds = ops.pad( - clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1]) - ) + clip_prompt_embeds = pad(clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])) prompt_embeds = ops.cat([clip_prompt_embeds, t5_prompt_embed], axis=-2) pooled_prompt_embeds = ops.cat([pooled_prompt_embed, pooled_prompt_2_embed], axis=-1) @@ -511,7 +510,7 @@ def encode_prompt( max_sequence_length=max_sequence_length, ) - negative_clip_prompt_embeds = ops.pad( + negative_clip_prompt_embeds = pad( negative_clip_prompt_embeds, (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]), ) diff --git a/mindone/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/mindone/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py index 3ed64833e3..b18b6b289d 100644 --- a/mindone/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py +++ b/mindone/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py @@ -35,7 +35,6 @@ TextualInversionLoaderMixin, ) from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel -from ...models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, scale_lora_layers, unscale_lora_layers from 
...utils.mindspore_utils import randn_tensor @@ -985,21 +984,7 @@ def _get_add_time_ids( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae def upcast_vae(self): - dtype = self.vae.dtype self.vae.to(dtype=ms.float32) - use_torch_2_0_or_xformers = isinstance( - self.vae.decoder.mid_block.attentions[0].processor, - ( - AttnProcessor2_0, - XFormersAttnProcessor, - ), - ) - # if xformers or torch_2_0 is used attention block does not need - # to be in float32 which can save lots of memory - if use_torch_2_0_or_xformers: - self.vae.post_quant_conv.to(dtype) - self.vae.decoder.conv_in.to(dtype) - self.vae.decoder.mid_block.to(dtype) @property def guidance_scale(self): diff --git a/tests/diffusers/pipelines/__init__.py b/tests/diffusers_tests/pipelines/__init__.py similarity index 100% rename from tests/diffusers/pipelines/__init__.py rename to tests/diffusers_tests/pipelines/__init__.py diff --git a/tests/diffusers/pipelines/animatediff/__init__.py b/tests/diffusers_tests/pipelines/animatediff/__init__.py similarity index 100% rename from tests/diffusers/pipelines/animatediff/__init__.py rename to tests/diffusers_tests/pipelines/animatediff/__init__.py diff --git a/tests/diffusers/pipelines/animatediff/test_animatediff.py b/tests/diffusers_tests/pipelines/animatediff/test_animatediff.py similarity index 100% rename from tests/diffusers/pipelines/animatediff/test_animatediff.py rename to tests/diffusers_tests/pipelines/animatediff/test_animatediff.py diff --git a/tests/diffusers/pipelines/animatediff/test_animatediff_controlnet.py b/tests/diffusers_tests/pipelines/animatediff/test_animatediff_controlnet.py similarity index 100% rename from tests/diffusers/pipelines/animatediff/test_animatediff_controlnet.py rename to tests/diffusers_tests/pipelines/animatediff/test_animatediff_controlnet.py diff --git a/tests/diffusers/pipelines/animatediff/test_animatediff_sdxl.py 
b/tests/diffusers_tests/pipelines/animatediff/test_animatediff_sdxl.py similarity index 100% rename from tests/diffusers/pipelines/animatediff/test_animatediff_sdxl.py rename to tests/diffusers_tests/pipelines/animatediff/test_animatediff_sdxl.py diff --git a/tests/diffusers/pipelines/animatediff/test_animatediff_sparsectrl.py b/tests/diffusers_tests/pipelines/animatediff/test_animatediff_sparsectrl.py similarity index 100% rename from tests/diffusers/pipelines/animatediff/test_animatediff_sparsectrl.py rename to tests/diffusers_tests/pipelines/animatediff/test_animatediff_sparsectrl.py diff --git a/tests/diffusers/pipelines/animatediff/test_animatediff_video2video.py b/tests/diffusers_tests/pipelines/animatediff/test_animatediff_video2video.py similarity index 100% rename from tests/diffusers/pipelines/animatediff/test_animatediff_video2video.py rename to tests/diffusers_tests/pipelines/animatediff/test_animatediff_video2video.py diff --git a/tests/diffusers_tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py b/tests/diffusers_tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py new file mode 100644 index 0000000000..3f2c3d9843 --- /dev/null +++ b/tests/diffusers_tests/pipelines/animatediff/test_animatediff_video2video_controlnet.py @@ -0,0 +1,198 @@ +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from PIL import Image +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class AnimateDiffVideoToVideoControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + cross_attention_dim = 8 + block_out_channels 
= (8, 8) + + pipeline_config = [ + [ + "unet", + "diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + "mindone.diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + dict( + block_out_channels=block_out_channels, + layers_per_block=2, + sample_size=8, + in_channels=4, + out_channels=4, + down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=cross_attention_dim, + norm_num_groups=2, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_ddim.DDIMScheduler", + "mindone.diffusers.schedulers.scheduling_ddim.DDIMScheduler", + dict( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="linear", + clip_sample=False, + ), + ], + [ + "controlnet", + "diffusers.models.controlnet.ControlNetModel", + "mindone.diffusers.models.controlnet.ControlNetModel", + dict( + block_out_channels=block_out_channels, + layers_per_block=2, + in_channels=4, + down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), + cross_attention_dim=cross_attention_dim, + conditioning_embedding_out_channels=(8, 8), + norm_num_groups=1, + ), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + block_out_channels=block_out_channels, + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + norm_num_groups=2, + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=cross_attention_dim, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ), + ), + ], + [ + "tokenizer", + 
"transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + [ + "motion_adapter", + "diffusers.models.unets.unet_motion_model.MotionAdapter", + "mindone.diffusers.models.unets.unet_motion_model.MotionAdapter", + dict( + block_out_channels=block_out_channels, + motion_layers_per_block=2, + motion_norm_num_groups=2, + motion_num_attention_heads=4, + ), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "unet", + "controlnet", + "scheduler", + "vae", + "motion_adapter", + "text_encoder", + "tokenizer", + "feature_extractor", + "image_encoder", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, num_frames: int = 2): + video_height = 32 + video_width = 32 + video = [Image.new("RGB", (video_width, video_height))] * num_frames + + video_height = 32 + video_width = 32 + conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames + + inputs = { + "video": video, + "conditioning_frames": conditioning_frames, + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + @data(*test_cases) + @unpack + def test_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module( + "diffusers.pipelines.animatediff.pipeline_animatediff_video2video_controlnet.AnimateDiffVideoToVideoControlNetPipeline" + ) + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.animatediff.pipeline_animatediff_video2video_controlnet.AnimateDiffVideoToVideoControlNetPipeline" + ) + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + pt_pipe.set_progress_bar_config(disable=None) + 
ms_pipe.set_progress_bar_config(disable=None) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_frame = pt_pipe(**inputs) + torch.manual_seed(0) + ms_frame = ms_pipe(**inputs) + + pt_image_slice = pt_frame.frames[0][0, -3:, -3:, -1] + ms_image_slice = ms_frame[0][0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers/pipelines/aura_flow/__init__.py b/tests/diffusers_tests/pipelines/aura_flow/__init__.py similarity index 100% rename from tests/diffusers/pipelines/aura_flow/__init__.py rename to tests/diffusers_tests/pipelines/aura_flow/__init__.py diff --git a/tests/diffusers/pipelines/aura_flow/test_pipeline_aura_flow.py b/tests/diffusers_tests/pipelines/aura_flow/test_pipeline_aura_flow.py similarity index 100% rename from tests/diffusers/pipelines/aura_flow/test_pipeline_aura_flow.py rename to tests/diffusers_tests/pipelines/aura_flow/test_pipeline_aura_flow.py diff --git a/tests/diffusers/pipelines/blipdiffusion/__init__.py b/tests/diffusers_tests/pipelines/blipdiffusion/__init__.py similarity index 100% rename from tests/diffusers/pipelines/blipdiffusion/__init__.py rename to tests/diffusers_tests/pipelines/blipdiffusion/__init__.py diff --git a/tests/diffusers/pipelines/blipdiffusion/test_blipdiffusion.py b/tests/diffusers_tests/pipelines/blipdiffusion/test_blipdiffusion.py similarity index 100% rename from tests/diffusers/pipelines/blipdiffusion/test_blipdiffusion.py rename to tests/diffusers_tests/pipelines/blipdiffusion/test_blipdiffusion.py diff --git a/tests/diffusers/pipelines/consistency_models/__init__.py b/tests/diffusers_tests/pipelines/cogvideo/__init__.py similarity index 100% rename from 
import unittest

import numpy as np
import torch
from ddt import data, ddt, unpack

import mindspore as ms

from ..pipeline_test_utils import (
    THRESHOLD_FP16,
    THRESHOLD_FP32,
    PipelineTesterMixin,
    get_module,
    get_pipeline_components,
)

# One ddt case per (MindSpore execution mode, dtype name) combination.
test_cases = [
    {"mode": ms.PYNATIVE_MODE, "dtype": "float16"},
    {"mode": ms.PYNATIVE_MODE, "dtype": "bfloat16"},
]


@ddt
class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Parity test: diffusers ``CogVideoXPipeline`` vs. the mindone port on tiny components."""

    # Entries are [component name, PyTorch class path, MindSpore class path, init kwargs].
    pipeline_config = [
        [
            "transformer",
            "diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            "mindone.diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            dict(
                # num_attention_heads * attention_head_dim is the transformer's internal
                # dim. It must be divisible by 16 for the 3D positional embeddings and,
                # because tiny-random-t5 is the text encoder, it has to equal 32.
                num_attention_heads=4,
                attention_head_dim=8,
                in_channels=4,
                out_channels=4,
                time_embed_dim=2,
                text_embed_dim=32,  # Must match with tiny-random-t5
                num_layers=1,
                sample_width=2,  # latent width: 2 -> final width: 16
                sample_height=2,  # latent height: 2 -> final height: 16
                sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
                patch_size=2,
                temporal_compression_ratio=4,
                max_text_seq_length=16,
            ),
        ],
        [
            "vae",
            "diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            "mindone.diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            dict(
                in_channels=3,
                out_channels=3,
                down_block_types=("CogVideoXDownBlock3D",) * 4,
                up_block_types=("CogVideoXUpBlock3D",) * 4,
                block_out_channels=(8, 8, 8, 8),
                latent_channels=4,
                layers_per_block=1,
                norm_num_groups=2,
                temporal_compression_ratio=4,
            ),
        ],
        [
            "scheduler",
            "diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            "mindone.diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            dict(),
        ],
        [
            "text_encoder",
            "transformers.models.t5.modeling_t5.T5EncoderModel",
            "mindone.transformers.models.t5.modeling_t5.T5EncoderModel",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
                revision="refs/pr/1",
            ),
        ],
        [
            "tokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
            ),
        ],
    ]

    def get_dummy_components(self):
        """Return what ``get_pipeline_components`` builds for this pipeline.

        ``test_inference`` unpacks the result as a (PyTorch, MindSpore) pair.
        """
        wanted = ("transformer", "vae", "scheduler", "text_encoder", "tokenizer")
        return get_pipeline_components(dict.fromkeys(wanted), self.pipeline_config)

    def get_dummy_inputs(self):
        """Return the call kwargs shared by both pipelines."""
        # Cannot reduce because convolution kernel becomes bigger than sample
        side = 16
        return {
            "prompt": "dance monkey",
            "negative_prompt": "",
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "height": side,
            "width": side,
            "num_frames": 8,
            "max_sequence_length": 16,
            "output_type": "np",
        }

    @data(*test_cases)
    @unpack
    def test_inference(self, mode, dtype):
        # Same seed and inputs on both backends; the first generated video must
        # agree within a relative L2 tolerance.
        ms.set_context(mode=mode)

        torch_parts, mindspore_parts = self.get_dummy_components()

        torch_pipeline = get_module("diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline")(
            **torch_parts
        )
        mindspore_pipeline = get_module("mindone.diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline")(
            **mindspore_parts
        )

        torch_pipeline.set_progress_bar_config(disable=None)
        mindspore_pipeline.set_progress_bar_config(disable=None)

        torch_pipeline = torch_pipeline.to(getattr(torch, dtype))
        mindspore_pipeline = mindspore_pipeline.to(getattr(ms, dtype))

        call_kwargs = self.get_dummy_inputs()

        torch.manual_seed(0)
        reference_video = torch_pipeline(**call_kwargs).frames[0]
        torch.manual_seed(0)
        candidate_video = mindspore_pipeline(**call_kwargs)[0][0]

        # Anything that is not float32 is checked against the half-precision bound.
        if dtype == "float32":
            tolerance = THRESHOLD_FP32
        else:
            tolerance = THRESHOLD_FP16

        relative_error = np.linalg.norm(reference_video - candidate_video) / np.linalg.norm(reference_video)
        assert np.max(relative_error) < tolerance
import unittest

import numpy as np
import torch
from ddt import data, ddt, unpack
from PIL import Image

import mindspore as ms

from ..pipeline_test_utils import (
    THRESHOLD_FP16,
    THRESHOLD_FP32,
    PipelineTesterMixin,
    get_module,
    get_pipeline_components,
)

# One ddt case per (MindSpore execution mode, dtype name) combination.
test_cases = [
    {"mode": ms.PYNATIVE_MODE, "dtype": "float16"},
    {"mode": ms.PYNATIVE_MODE, "dtype": "bfloat16"},
]


@ddt
class CogVideoXFunControlPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Parity test: diffusers ``CogVideoXFunControlPipeline`` vs. the mindone port on tiny components."""

    # Entries are [component name, PyTorch class path, MindSpore class path, init kwargs].
    pipeline_config = [
        [
            "transformer",
            "diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            "mindone.diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            dict(
                num_attention_heads=4,
                attention_head_dim=8,
                in_channels=8,
                out_channels=4,
                time_embed_dim=2,
                text_embed_dim=32,  # Must match with tiny-random-t5
                num_layers=1,
                sample_width=2,  # latent width: 2 -> final width: 16
                sample_height=2,  # latent height: 2 -> final height: 16
                sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
                patch_size=2,
                temporal_compression_ratio=4,
                max_text_seq_length=16,
            ),
        ],
        [
            "vae",
            "diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            "mindone.diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            dict(
                in_channels=3,
                out_channels=3,
                down_block_types=("CogVideoXDownBlock3D",) * 4,
                up_block_types=("CogVideoXUpBlock3D",) * 4,
                block_out_channels=(8, 8, 8, 8),
                latent_channels=4,
                layers_per_block=1,
                norm_num_groups=2,
                temporal_compression_ratio=4,
            ),
        ],
        [
            "scheduler",
            "diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            "mindone.diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            dict(),
        ],
        [
            "text_encoder",
            "transformers.models.t5.modeling_t5.T5EncoderModel",
            "mindone.transformers.models.t5.modeling_t5.T5EncoderModel",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
                revision="refs/pr/1",
            ),
        ],
        [
            "tokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
            ),
        ],
    ]

    def get_dummy_components(self):
        """Return what ``get_pipeline_components`` builds for this pipeline.

        ``test_inference`` unpacks the result as a (PyTorch, MindSpore) pair.
        """
        wanted = ("transformer", "vae", "scheduler", "text_encoder", "tokenizer")
        return get_pipeline_components(dict.fromkeys(wanted), self.pipeline_config)

    def get_dummy_inputs(self, num_frames: int = 8):
        """Return the call kwargs shared by both pipelines.

        Args:
            num_frames: Length of the blank control video.
        """
        # Cannot reduce because convolution kernel becomes bigger than sample
        side = 16

        # A single blank RGB frame, repeated ``num_frames`` times.
        blank_frame = Image.new("RGB", (side, side))

        return {
            "prompt": "dance monkey",
            "negative_prompt": "",
            "control_video": [blank_frame] * num_frames,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "height": side,
            "width": side,
            "max_sequence_length": 16,
            "output_type": "np",
        }

    @data(*test_cases)
    @unpack
    def test_inference(self, mode, dtype):
        # Same seed and inputs on both backends; the first generated video must
        # agree within a relative L2 tolerance.
        ms.set_context(mode=mode)

        torch_parts, mindspore_parts = self.get_dummy_components()

        torch_pipeline = get_module(
            "diffusers.pipelines.cogvideo.pipeline_cogvideox_fun_control.CogVideoXFunControlPipeline"
        )(**torch_parts)
        mindspore_pipeline = get_module(
            "mindone.diffusers.pipelines.cogvideo.pipeline_cogvideox_fun_control.CogVideoXFunControlPipeline"
        )(**mindspore_parts)

        torch_pipeline.set_progress_bar_config(disable=None)
        mindspore_pipeline.set_progress_bar_config(disable=None)

        torch_pipeline = torch_pipeline.to(getattr(torch, dtype))
        mindspore_pipeline = mindspore_pipeline.to(getattr(ms, dtype))

        call_kwargs = self.get_dummy_inputs()

        torch.manual_seed(0)
        reference_video = torch_pipeline(**call_kwargs).frames[0]
        torch.manual_seed(0)
        candidate_video = mindspore_pipeline(**call_kwargs)[0][0]

        # Anything that is not float32 is checked against the half-precision bound.
        if dtype == "float32":
            tolerance = THRESHOLD_FP32
        else:
            tolerance = THRESHOLD_FP16

        relative_error = np.linalg.norm(reference_video - candidate_video) / np.linalg.norm(reference_video)
        assert np.max(relative_error) < tolerance
import unittest

import numpy as np
import torch
from ddt import data, ddt, unpack
from PIL import Image

import mindspore as ms

from ..pipeline_test_utils import (
    THRESHOLD_FP16,
    THRESHOLD_FP32,
    PipelineTesterMixin,
    get_module,
    get_pipeline_components,
)

# One ddt case per (MindSpore execution mode, dtype name) combination.
test_cases = [
    {"mode": ms.PYNATIVE_MODE, "dtype": "float16"},
    {"mode": ms.PYNATIVE_MODE, "dtype": "bfloat16"},
]


@ddt
class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Parity test: diffusers ``CogVideoXImageToVideoPipeline`` vs. the mindone port on tiny components."""

    # Entries are [component name, PyTorch class path, MindSpore class path, init kwargs].
    pipeline_config = [
        [
            "transformer",
            "diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            "mindone.diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            dict(
                num_attention_heads=2,
                attention_head_dim=16,
                in_channels=8,
                out_channels=4,
                time_embed_dim=2,
                text_embed_dim=32,  # Must match with tiny-random-t5
                num_layers=1,
                sample_width=2,  # latent width: 2 -> final width: 16
                sample_height=2,  # latent height: 2 -> final height: 16
                sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
                patch_size=2,
                temporal_compression_ratio=4,
                max_text_seq_length=16,
                use_rotary_positional_embeddings=True,
                use_learned_positional_embeddings=True,
            ),
        ],
        [
            "vae",
            "diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            "mindone.diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            dict(
                in_channels=3,
                out_channels=3,
                down_block_types=("CogVideoXDownBlock3D",) * 4,
                up_block_types=("CogVideoXUpBlock3D",) * 4,
                block_out_channels=(8, 8, 8, 8),
                latent_channels=4,
                layers_per_block=1,
                norm_num_groups=2,
                temporal_compression_ratio=4,
            ),
        ],
        [
            "scheduler",
            "diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            "mindone.diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            dict(),
        ],
        [
            "text_encoder",
            "transformers.models.t5.modeling_t5.T5EncoderModel",
            "mindone.transformers.models.t5.modeling_t5.T5EncoderModel",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
                revision="refs/pr/1",
            ),
        ],
        [
            "tokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
            ),
        ],
    ]

    def get_dummy_components(self):
        """Return what ``get_pipeline_components`` builds for this pipeline.

        ``test_inference`` unpacks the result as a (PyTorch, MindSpore) pair.
        """
        wanted = ("transformer", "vae", "scheduler", "text_encoder", "tokenizer")
        return get_pipeline_components(dict.fromkeys(wanted), self.pipeline_config)

    def get_dummy_inputs(self):
        """Return the call kwargs shared by both pipelines, including a blank conditioning image."""
        # Cannot go below 16: the convolution kernel becomes bigger than the sample.
        # NOTE(review): the original comment also claimed a 32-px minimum for 3D RoPE,
        # yet the value in use is 16 -- TODO confirm whether that note still applies.
        side = 16
        conditioning_image = Image.new("RGB", (side, side))

        return {
            "image": conditioning_image,
            "prompt": "dance monkey",
            "negative_prompt": "",
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "height": side,
            "width": side,
            "num_frames": 8,
            "max_sequence_length": 16,
            "output_type": "np",
        }

    @data(*test_cases)
    @unpack
    def test_inference(self, mode, dtype):
        # Same seed and inputs on both backends; the first generated video must
        # agree within a relative L2 tolerance.
        ms.set_context(mode=mode)

        torch_parts, mindspore_parts = self.get_dummy_components()

        torch_pipeline = get_module(
            "diffusers.pipelines.cogvideo.pipeline_cogvideox_image2video.CogVideoXImageToVideoPipeline"
        )(**torch_parts)
        mindspore_pipeline = get_module(
            "mindone.diffusers.pipelines.cogvideo.pipeline_cogvideox_image2video.CogVideoXImageToVideoPipeline"
        )(**mindspore_parts)

        torch_pipeline.set_progress_bar_config(disable=None)
        mindspore_pipeline.set_progress_bar_config(disable=None)

        torch_pipeline = torch_pipeline.to(getattr(torch, dtype))
        mindspore_pipeline = mindspore_pipeline.to(getattr(ms, dtype))

        call_kwargs = self.get_dummy_inputs()

        torch.manual_seed(0)
        reference_video = torch_pipeline(**call_kwargs).frames[0]
        torch.manual_seed(0)
        candidate_video = mindspore_pipeline(**call_kwargs)[0][0]

        # Anything that is not float32 is checked against the half-precision bound.
        if dtype == "float32":
            tolerance = THRESHOLD_FP32
        else:
            tolerance = THRESHOLD_FP16

        relative_error = np.linalg.norm(reference_video - candidate_video) / np.linalg.norm(reference_video)
        assert np.max(relative_error) < tolerance
import unittest

import numpy as np
import torch
from ddt import data, ddt, unpack
from PIL import Image

import mindspore as ms

from ..pipeline_test_utils import (
    THRESHOLD_FP16,
    THRESHOLD_FP32,
    PipelineTesterMixin,
    get_module,
    get_pipeline_components,
)

# One ddt case per (MindSpore execution mode, dtype name) combination.
test_cases = [
    {"mode": ms.PYNATIVE_MODE, "dtype": "float16"},
    {"mode": ms.PYNATIVE_MODE, "dtype": "bfloat16"},
]


@ddt
class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Parity test: diffusers ``CogVideoXVideoToVideoPipeline`` vs. the mindone port on tiny components."""

    # Entries are [component name, PyTorch class path, MindSpore class path, init kwargs].
    pipeline_config = [
        [
            "transformer",
            "diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            "mindone.diffusers.models.transformers.cogvideox_transformer_3d.CogVideoXTransformer3DModel",
            dict(
                # num_attention_heads * attention_head_dim is the transformer's internal
                # dim. It must be divisible by 16 for the 3D positional embeddings and,
                # because tiny-random-t5 is the text encoder, it has to equal 32.
                num_attention_heads=4,
                attention_head_dim=8,
                in_channels=4,
                out_channels=4,
                time_embed_dim=2,
                text_embed_dim=32,  # Must match with tiny-random-t5
                num_layers=1,
                sample_width=2,  # latent width: 2 -> final width: 16
                sample_height=2,  # latent height: 2 -> final height: 16
                sample_frames=9,  # latent frames: (9 - 1) / 4 + 1 = 3 -> final frames: 9
                patch_size=2,
                temporal_compression_ratio=4,
                max_text_seq_length=16,
            ),
        ],
        [
            "vae",
            "diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            "mindone.diffusers.models.autoencoders.autoencoder_kl_cogvideox.AutoencoderKLCogVideoX",
            dict(
                in_channels=3,
                out_channels=3,
                down_block_types=("CogVideoXDownBlock3D",) * 4,
                up_block_types=("CogVideoXUpBlock3D",) * 4,
                block_out_channels=(8, 8, 8, 8),
                latent_channels=4,
                layers_per_block=1,
                norm_num_groups=2,
                temporal_compression_ratio=4,
            ),
        ],
        [
            "scheduler",
            "diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            "mindone.diffusers.schedulers.scheduling_ddim.DDIMScheduler",
            dict(),
        ],
        [
            "text_encoder",
            "transformers.models.t5.modeling_t5.T5EncoderModel",
            "mindone.transformers.models.t5.modeling_t5.T5EncoderModel",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
                revision="refs/pr/1",
            ),
        ],
        [
            "tokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
            ),
        ],
    ]

    def get_dummy_components(self):
        """Return what ``get_pipeline_components`` builds for this pipeline.

        ``test_inference`` unpacks the result as a (PyTorch, MindSpore) pair.
        """
        wanted = ("transformer", "vae", "scheduler", "text_encoder", "tokenizer")
        return get_pipeline_components(dict.fromkeys(wanted), self.pipeline_config)

    def get_dummy_inputs(self, num_frames: int = 8):
        """Return the call kwargs shared by both pipelines.

        Args:
            num_frames: Length of the blank source video.
        """
        # Cannot reduce because convolution kernel becomes bigger than sample
        side = 16

        # A single blank RGB frame, repeated ``num_frames`` times.
        blank_frame = Image.new("RGB", (side, side))

        return {
            "video": [blank_frame] * num_frames,
            "prompt": "dance monkey",
            "negative_prompt": "",
            "num_inference_steps": 2,
            "strength": 0.5,
            "guidance_scale": 6.0,
            "height": side,
            "width": side,
            "max_sequence_length": 16,
            "output_type": "np",
        }

    @data(*test_cases)
    @unpack
    def test_inference(self, mode, dtype):
        # Same seed and inputs on both backends; the first generated video must
        # agree within a relative L2 tolerance.
        ms.set_context(mode=mode)

        torch_parts, mindspore_parts = self.get_dummy_components()

        torch_pipeline = get_module(
            "diffusers.pipelines.cogvideo.pipeline_cogvideox_video2video.CogVideoXVideoToVideoPipeline"
        )(**torch_parts)
        mindspore_pipeline = get_module(
            "mindone.diffusers.pipelines.cogvideo.pipeline_cogvideox_video2video.CogVideoXVideoToVideoPipeline"
        )(**mindspore_parts)

        torch_pipeline.set_progress_bar_config(disable=None)
        mindspore_pipeline.set_progress_bar_config(disable=None)

        torch_pipeline = torch_pipeline.to(getattr(torch, dtype))
        mindspore_pipeline = mindspore_pipeline.to(getattr(ms, dtype))

        call_kwargs = self.get_dummy_inputs()

        torch.manual_seed(0)
        reference_video = torch_pipeline(**call_kwargs).frames[0]
        torch.manual_seed(0)
        candidate_video = mindspore_pipeline(**call_kwargs)[0][0]

        # Anything that is not float32 is checked against the half-precision bound.
        if dtype == "float32":
            tolerance = THRESHOLD_FP32
        else:
            tolerance = THRESHOLD_FP16

        relative_error = np.linalg.norm(reference_video - candidate_video) / np.linalg.norm(reference_video)
        assert np.max(relative_error) < tolerance
import unittest

import numpy as np
import torch
from ddt import data, ddt, unpack

import mindspore as ms

from ..pipeline_test_utils import (
    THRESHOLD_FP16,
    THRESHOLD_FP32,
    PipelineTesterMixin,
    get_module,
    get_pipeline_components,
)

# One ddt case per (MindSpore execution mode, dtype name) combination.
test_cases = [
    {"mode": ms.PYNATIVE_MODE, "dtype": "float32"},
    {"mode": ms.PYNATIVE_MODE, "dtype": "bfloat16"},
    {"mode": ms.GRAPH_MODE, "dtype": "float32"},
    {"mode": ms.GRAPH_MODE, "dtype": "bfloat16"},
]


@ddt
class CogView3PlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Parity test: diffusers ``CogView3PlusPipeline`` vs. the mindone port on tiny components."""

    # Entries are [component name, PyTorch class path, MindSpore class path, init kwargs].
    pipeline_config = [
        [
            "transformer",
            "diffusers.models.transformers.transformer_cogview3plus.CogView3PlusTransformer2DModel",
            "mindone.diffusers.models.transformers.transformer_cogview3plus.CogView3PlusTransformer2DModel",
            dict(
                patch_size=2,
                in_channels=4,
                num_layers=1,
                attention_head_dim=4,
                num_attention_heads=2,
                out_channels=4,
                text_embed_dim=32,  # Must match with tiny-random-t5
                time_embed_dim=8,
                condition_dim=2,
                pos_embed_max_size=8,
                sample_size=8,
            ),
        ],
        [
            "vae",
            "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL",
            "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL",
            dict(
                block_out_channels=[32, 64],
                in_channels=3,
                out_channels=3,
                down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
                up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
                latent_channels=4,
                sample_size=128,
            ),
        ],
        [
            "scheduler",
            "diffusers.schedulers.scheduling_ddim_cogvideox.CogVideoXDDIMScheduler",
            "mindone.diffusers.schedulers.scheduling_ddim_cogvideox.CogVideoXDDIMScheduler",
            dict(),
        ],
        [
            "text_encoder",
            "transformers.models.t5.modeling_t5.T5EncoderModel",
            "mindone.transformers.models.t5.modeling_t5.T5EncoderModel",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
                revision="refs/pr/1",
            ),
        ],
        [
            "tokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            "transformers.models.auto.tokenization_auto.AutoTokenizer",
            dict(
                pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5",
            ),
        ],
    ]

    def get_dummy_components(self):
        """Return what ``get_pipeline_components`` builds for this pipeline.

        ``test_inference`` unpacks the result as a (PyTorch, MindSpore) pair.
        """
        components = {
            key: None
            for key in [
                "transformer",
                "vae",
                "scheduler",
                "text_encoder",
                "tokenizer",
            ]
        }

        return get_pipeline_components(components, self.pipeline_config)

    def get_dummy_inputs(self):
        """Return the call kwargs shared by both pipelines."""
        inputs = {
            "prompt": "dance monkey",
            "negative_prompt": "",
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "height": 16,
            "width": 16,
            "max_sequence_length": 16,
            "output_type": "np",
        }
        return inputs

    @data(*test_cases)
    @unpack
    def test_inference(self, mode, dtype):
        # Same seed and inputs on both backends; the first generated image must
        # agree within a relative L2 tolerance.
        ms.set_context(mode=mode)

        pt_components, ms_components = self.get_dummy_components()
        pt_pipe_cls = get_module("diffusers.pipelines.cogview3.pipeline_cogview3plus.CogView3PlusPipeline")
        ms_pipe_cls = get_module("mindone.diffusers.pipelines.cogview3.pipeline_cogview3plus.CogView3PlusPipeline")

        pt_pipe = pt_pipe_cls(**pt_components)
        ms_pipe = ms_pipe_cls(**ms_components)

        pt_pipe.set_progress_bar_config(disable=None)
        ms_pipe.set_progress_bar_config(disable=None)

        ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype)
        pt_pipe = pt_pipe.to(pt_dtype)
        ms_pipe = ms_pipe.to(ms_dtype)

        inputs = self.get_dummy_inputs()

        # CogView3Plus is an image pipeline: its diffusers output object exposes
        # `images`. Reading `.frames` (the video pipelines' field) raises
        # AttributeError before any comparison happens.
        torch.manual_seed(0)
        pt_image = pt_pipe(**inputs).images
        torch.manual_seed(0)
        ms_image = ms_pipe(**inputs)[0]

        pt_generated_image = pt_image[0]
        ms_generated_image = ms_image[0]

        # Anything that is not float32 is checked against the half-precision bound.
        threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16
        assert (
            np.max(np.linalg.norm(pt_generated_image - ms_generated_image) / np.linalg.norm(pt_generated_image))
            < threshold
        )
tests/diffusers_tests/pipelines/consistency_models/__init__.py diff --git a/tests/diffusers/pipelines/consistency_models/test_consistency_models.py b/tests/diffusers_tests/pipelines/consistency_models/test_consistency_models.py similarity index 100% rename from tests/diffusers/pipelines/consistency_models/test_consistency_models.py rename to tests/diffusers_tests/pipelines/consistency_models/test_consistency_models.py diff --git a/tests/diffusers/pipelines/controlnet_xs/__init__.py b/tests/diffusers_tests/pipelines/controlnet/__init__.py similarity index 100% rename from tests/diffusers/pipelines/controlnet_xs/__init__.py rename to tests/diffusers_tests/pipelines/controlnet/__init__.py diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet.py diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet_blip_diffusion.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet_blip_diffusion.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet_blip_diffusion.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet_blip_diffusion.py diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet_img2img.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet_img2img.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet_img2img.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet_img2img.py diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet_inpaint.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet_inpaint.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet_inpaint.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet_inpaint.py 
diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet_inpaint_sdxl.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet_sdxl.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet_sdxl.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet_sdxl.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet_sdxl.py diff --git a/tests/diffusers/pipelines/controlnet/test_controlnet_sdxl_img2img.py b/tests/diffusers_tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py similarity index 100% rename from tests/diffusers/pipelines/controlnet/test_controlnet_sdxl_img2img.py rename to tests/diffusers_tests/pipelines/controlnet/test_controlnet_sdxl_img2img.py diff --git a/tests/diffusers/pipelines/dance_diffusion/__init__.py b/tests/diffusers_tests/pipelines/controlnet_sd3/__init__.py similarity index 100% rename from tests/diffusers/pipelines/dance_diffusion/__init__.py rename to tests/diffusers_tests/pipelines/controlnet_sd3/__init__.py diff --git a/tests/diffusers_tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py b/tests/diffusers_tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py new file mode 100644 index 0000000000..aab5f85941 --- /dev/null +++ b/tests/diffusers_tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py @@ -0,0 +1,263 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from diffusers.utils.torch_utils import randn_tensor +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class StableDiffusion3ControlInpaintNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "transformer", + "diffusers.models.transformers.transformer_sd3.SD3Transformer2DModel", + "mindone.diffusers.models.transformers.transformer_sd3.SD3Transformer2DModel", + dict( + sample_size=32, + patch_size=1, + in_channels=8, + num_layers=4, + attention_head_dim=8, + num_attention_heads=4, + joint_attention_dim=32, + caption_projection_dim=32, + pooled_projection_dim=64, + out_channels=8, + ), + ], + [ + "controlnet", + "diffusers.models.controlnet_sd3.SD3ControlNetModel", + "mindone.diffusers.models.controlnet_sd3.SD3ControlNetModel", + dict( + sample_size=32, + patch_size=1, + in_channels=8, + num_layers=1, + attention_head_dim=8, + num_attention_heads=4, + joint_attention_dim=32, + caption_projection_dim=32, + pooled_projection_dim=64, + out_channels=8, + extra_conditioning_channels=1, + ), + ], + [ + "text_encoder", + 
"transformers.models.clip.modeling_clip.CLIPTextModelWithProjection", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModelWithProjection", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_2", + "transformers.models.clip.modeling_clip.CLIPTextModelWithProjection", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModelWithProjection", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_3", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + [ + "tokenizer_2", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + [ + "tokenizer_3", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + ), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + 
sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=8, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "text_encoder_3", + "tokenizer", + "tokenizer_2", + "tokenizer_3", + "transformer", + "vae", + "controlnet", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self): + pt_control_image = randn_tensor((1, 3, 32, 32), dtype=torch.float16) + pt_control_mask = randn_tensor((1, 1, 32, 32), dtype=torch.float16) + + ms_control_image = ms.tensor(pt_control_image.numpy()) + ms_control_mask = ms.tensor(pt_control_mask.numpy()) + + controlnet_conditioning_scale = 0.95 + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 7.0, + "output_type": "np", + "control_image": pt_control_image, + "control_mask": pt_control_mask, + "controlnet_conditioning_scale": controlnet_conditioning_scale, + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 7.0, + "output_type": "np", + "control_image": ms_control_image, + "control_mask": ms_control_mask, + "controlnet_conditioning_scale": controlnet_conditioning_scale, + } + + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_controlnet_inpaint_sd3(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module( + 
"diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet_inpainting.StableDiffusion3ControlNetInpaintingPipeline" + ) + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet_inpainting.StableDiffusion3ControlNetInpaintingPipeline" + ) + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + pt_pipe.set_progress_bar_config(disable=None) + ms_pipe.set_progress_bar_config(disable=None) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**pt_inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**ms_inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers/pipelines/controlnet_sd3/test_controlnet_sd3.py b/tests/diffusers_tests/pipelines/controlnet_sd3/test_controlnet_sd3.py similarity index 100% rename from tests/diffusers/pipelines/controlnet_sd3/test_controlnet_sd3.py rename to tests/diffusers_tests/pipelines/controlnet_sd3/test_controlnet_sd3.py diff --git a/tests/diffusers/pipelines/ddim/__init__.py b/tests/diffusers_tests/pipelines/controlnet_xs/__init__.py similarity index 100% rename from tests/diffusers/pipelines/ddim/__init__.py rename to tests/diffusers_tests/pipelines/controlnet_xs/__init__.py diff --git a/tests/diffusers/pipelines/controlnet_xs/test_controlnetxs.py b/tests/diffusers_tests/pipelines/controlnet_xs/test_controlnetxs.py similarity index 100% rename from tests/diffusers/pipelines/controlnet_xs/test_controlnetxs.py rename to tests/diffusers_tests/pipelines/controlnet_xs/test_controlnetxs.py diff --git 
a/tests/diffusers/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/diffusers_tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py similarity index 100% rename from tests/diffusers/pipelines/controlnet_xs/test_controlnetxs_sdxl.py rename to tests/diffusers_tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py diff --git a/tests/diffusers/pipelines/ddpm/__init__.py b/tests/diffusers_tests/pipelines/dance_diffusion/__init__.py similarity index 100% rename from tests/diffusers/pipelines/ddpm/__init__.py rename to tests/diffusers_tests/pipelines/dance_diffusion/__init__.py diff --git a/tests/diffusers/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/diffusers_tests/pipelines/dance_diffusion/test_dance_diffusion.py similarity index 100% rename from tests/diffusers/pipelines/dance_diffusion/test_dance_diffusion.py rename to tests/diffusers_tests/pipelines/dance_diffusion/test_dance_diffusion.py diff --git a/tests/diffusers/pipelines/dit/__init__.py b/tests/diffusers_tests/pipelines/ddim/__init__.py similarity index 100% rename from tests/diffusers/pipelines/dit/__init__.py rename to tests/diffusers_tests/pipelines/ddim/__init__.py diff --git a/tests/diffusers/pipelines/ddim/test_ddim.py b/tests/diffusers_tests/pipelines/ddim/test_ddim.py similarity index 100% rename from tests/diffusers/pipelines/ddim/test_ddim.py rename to tests/diffusers_tests/pipelines/ddim/test_ddim.py diff --git a/tests/diffusers/pipelines/hunyuan_dit/__init__.py b/tests/diffusers_tests/pipelines/ddpm/__init__.py similarity index 100% rename from tests/diffusers/pipelines/hunyuan_dit/__init__.py rename to tests/diffusers_tests/pipelines/ddpm/__init__.py diff --git a/tests/diffusers/pipelines/ddpm/test_ddpm.py b/tests/diffusers_tests/pipelines/ddpm/test_ddpm.py similarity index 100% rename from tests/diffusers/pipelines/ddpm/test_ddpm.py rename to tests/diffusers_tests/pipelines/ddpm/test_ddpm.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/__init__.py 
b/tests/diffusers_tests/pipelines/deepfloyd_if/__init__.py similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/__init__.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/__init__.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/test_if.py b/tests/diffusers_tests/pipelines/deepfloyd_if/test_if.py similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/test_if.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/test_if.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/test_if_img2img.py b/tests/diffusers_tests/pipelines/deepfloyd_if/test_if_img2img.py similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/test_if_img2img.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/test_if_img2img.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/diffusers_tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/test_if_img2img_superresolution.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/diffusers_tests/pipelines/deepfloyd_if/test_if_inpainting.py similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/test_if_inpainting.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/test_if_inpainting.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/diffusers_tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py diff --git a/tests/diffusers/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/diffusers_tests/pipelines/deepfloyd_if/test_if_superresolution.py 
similarity index 100% rename from tests/diffusers/pipelines/deepfloyd_if/test_if_superresolution.py rename to tests/diffusers_tests/pipelines/deepfloyd_if/test_if_superresolution.py diff --git a/tests/diffusers/pipelines/i2vgen_xl/__init__.py b/tests/diffusers_tests/pipelines/dit/__init__.py similarity index 100% rename from tests/diffusers/pipelines/i2vgen_xl/__init__.py rename to tests/diffusers_tests/pipelines/dit/__init__.py diff --git a/tests/diffusers/pipelines/dit/test_dit.py b/tests/diffusers_tests/pipelines/dit/test_dit.py similarity index 100% rename from tests/diffusers/pipelines/dit/test_dit.py rename to tests/diffusers_tests/pipelines/dit/test_dit.py diff --git a/tests/diffusers/pipelines/kandinsky/__init__.py b/tests/diffusers_tests/pipelines/flux/__init__.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky/__init__.py rename to tests/diffusers_tests/pipelines/flux/__init__.py diff --git a/tests/diffusers_tests/pipelines/flux/test_pipeline_flux.py b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux.py new file mode 100644 index 0000000000..2d0fe373b6 --- /dev/null +++ b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux.py @@ -0,0 +1,167 @@ +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class FluxPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "transformer", + "diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + "mindone.diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + 
dict( + patch_size=1, + in_channels=4, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_2", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip"), + ], + [ + "tokenizer_2", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5"), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=1, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + 
+ def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "tokenizer", + "tokenizer_2", + "transformer", + "vae", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self): + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 8, + "width": 8, + "max_sequence_length": 48, + "output_type": "np", + } + return inputs + + @data(*test_cases) + @unpack + def test_flux_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module("diffusers.pipelines.flux.pipeline_flux.FluxPipeline") + ms_pipe_cls = get_module("mindone.diffusers.pipelines.flux.pipeline_flux.FluxPipeline") + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet.py b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet.py new file mode 100644 index 0000000000..c105884ef6 --- /dev/null +++ b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet.py @@ -0,0 +1,201 @@ +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from diffusers.utils.torch_utils import randn_tensor +from transformers import 
CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class FluxControlInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "transformer", + "diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + "mindone.diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + dict( + patch_size=1, + in_channels=16, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "controlnet", + "diffusers.models.controlnet_flux.FluxControlNetModel", + "mindone.diffusers.models.controlnet_flux.FluxControlNetModel", + dict( + patch_size=1, + in_channels=16, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_2", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + 
"transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip"), + ], + [ + "tokenizer_2", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5"), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=4, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "tokenizer", + "tokenizer_2", + "transformer", + "vae", + "controlnet", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + pt_control_image = randn_tensor( + (1, 3, 32, 32), + dtype=torch.float16, + ) + ms_control_image = ms.tensor(pt_control_image.numpy()) + + controlnet_conditioning_scale = 0.5 + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 3.5, + "output_type": "np", + "control_image": pt_control_image, + "controlnet_conditioning_scale": controlnet_conditioning_scale, + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 3.5, + "output_type": "np", + 
"control_image": ms_control_image, + "controlnet_conditioning_scale": controlnet_conditioning_scale, + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module("diffusers.pipelines.flux.pipeline_flux_controlnet.FluxControlNetPipeline") + ms_pipe_cls = get_module("mindone.diffusers.pipelines.flux.pipeline_flux_controlnet.FluxControlNetPipeline") + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**pt_inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**ms_inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_image_to_image.py b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_image_to_image.py new file mode 100644 index 0000000000..782b868d05 --- /dev/null +++ b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_image_to_image.py @@ -0,0 +1,210 @@ +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": 
"float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class FluxControlImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "transformer", + "diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + "mindone.diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + dict( + patch_size=1, + in_channels=4, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_2", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip"), + ], + [ + "tokenizer_2", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5"), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=1, + 
norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + "controlnet", + "diffusers.models.controlnet_flux.FluxControlNetModel", + "mindone.diffusers.models.controlnet_flux.FluxControlNetModel", + dict( + in_channels=4, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "tokenizer", + "tokenizer_2", + "transformer", + "vae", + "controlnet", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self): + pt_image = torch.randn(1, 3, 32, 32) + ms_image = ms.tensor(pt_image.numpy()) + pt_control_image = torch.randn(1, 3, 32, 32) + ms_control_image = ms.tensor(pt_control_image.numpy()) + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": pt_image, + "control_image": pt_control_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "controlnet_conditioning_scale": 1.0, + "strength": 0.8, + "height": 32, + "width": 32, + "max_sequence_length": 48, + "output_type": "np", + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": ms_image, + "control_image": ms_control_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "controlnet_conditioning_scale": 1.0, + "strength": 0.8, + "height": 32, + "width": 32, + "max_sequence_length": 48, + "output_type": "np", + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_inference(self, mode, dtype): + 
ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module( + "diffusers.pipelines.flux.pipeline_flux_controlnet_image_to_image.FluxControlNetImg2ImgPipeline" + ) + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.flux.pipeline_flux_controlnet_image_to_image.FluxControlNetImg2ImgPipeline" + ) + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**pt_inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**ms_inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_inpainting.py b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_inpainting.py new file mode 100644 index 0000000000..06bdb0f4fb --- /dev/null +++ b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_controlnet_inpainting.py @@ -0,0 +1,216 @@ +import random +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + floats_tensor, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class FluxControlInpaintPipelineFastTests(PipelineTesterMixin, 
unittest.TestCase): + pipeline_config = [ + [ + "transformer", + "diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + "mindone.diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + dict( + patch_size=1, + in_channels=8, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_2", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip"), + ], + [ + "tokenizer_2", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5"), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=2, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + 
"controlnet", + "diffusers.models.controlnet_flux.FluxControlNetModel", + "mindone.diffusers.models.controlnet_flux.FluxControlNetModel", + dict( + patch_size=1, + in_channels=8, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "tokenizer", + "tokenizer_2", + "transformer", + "vae", + "controlnet", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + pt_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + pt_mask_image = torch.ones((1, 1, 32, 32)) + pt_control_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + + ms_image = ms.tensor(pt_image.numpy()) + ms_mask_image = ms.tensor(pt_mask_image.numpy()) + ms_control_image = ms.tensor(pt_control_image.numpy()) + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": pt_image, + "mask_image": pt_mask_image, + "control_image": pt_control_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 32, + "width": 32, + "max_sequence_length": 48, + "strength": 0.8, + "output_type": "np", + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": ms_image, + "mask_image": ms_mask_image, + "control_image": ms_control_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 32, + "width": 32, + "max_sequence_length": 48, + "strength": 0.8, + "output_type": "np", + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_inference(self, mode, 
dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module( + "diffusers.pipelines.flux.pipeline_flux_controlnet_inpainting.FluxControlNetInpaintPipeline" + ) + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.flux.pipeline_flux_controlnet_inpainting.FluxControlNetInpaintPipeline" + ) + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**pt_inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**ms_inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_img2img.py b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_img2img.py new file mode 100644 index 0000000000..05796acf00 --- /dev/null +++ b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_img2img.py @@ -0,0 +1,186 @@ +import random +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + floats_tensor, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class FluxImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + 
"transformer", + "diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + "mindone.diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + dict( + patch_size=1, + in_channels=4, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "text_encoder_2", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip"), + ], + [ + "tokenizer_2", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5"), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=1, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + "scheduler", + 
"diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "tokenizer", + "tokenizer_2", + "transformer", + "vae", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + pt_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + ms_image = ms.tensor(pt_image.numpy()) + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": pt_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 8, + "width": 8, + "max_sequence_length": 48, + "strength": 0.8, + "output_type": "np", + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": ms_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 8, + "width": 8, + "max_sequence_length": 48, + "strength": 0.8, + "output_type": "np", + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_flux_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module("diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline") + ms_pipe_cls = get_module("mindone.diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline") + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**pt_inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**ms_inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = 
ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_inpaint.py b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_inpaint.py new file mode 100644 index 0000000000..2181c44336 --- /dev/null +++ b/tests/diffusers_tests/pipelines/flux/test_pipeline_flux_inpaint.py @@ -0,0 +1,190 @@ +import random +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + floats_tensor, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class FluxInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "transformer", + "diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + "mindone.diffusers.models.transformers.transformer_flux.FluxTransformer2DModel", + dict( + patch_size=1, + in_channels=8, + num_layers=1, + num_single_layers=1, + attention_head_dim=16, + num_attention_heads=2, + joint_attention_dim=32, + pooled_projection_dim=32, + axes_dims_rope=[4, 4, 8], + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + 
"text_encoder_2", + "transformers.models.t5.modeling_t5.T5EncoderModel", + "mindone.transformers.models.t5.modeling_t5.T5EncoderModel", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5", + revision="refs/pr/1", + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip"), + ], + [ + "tokenizer_2", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + "transformers.models.auto.tokenization_auto.AutoTokenizer", + dict(pretrained_model_name_or_path="hf-internal-testing/tiny-random-t5"), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=2, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler", + dict(), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "scheduler", + "text_encoder", + "text_encoder_2", + "tokenizer", + "tokenizer_2", + "transformer", + "vae", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + pt_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + ms_image = ms.tensor(pt_image.numpy()) + pt_mask_image = torch.ones((1, 1, 32, 32)) + ms_mask_image = ms.tensor(pt_mask_image.numpy()) + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": pt_image, + "mask_image": pt_mask_image, + 
"num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 32, + "width": 32, + "max_sequence_length": 48, + "strength": 0.8, + "output_type": "np", + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": ms_image, + "mask_image": ms_mask_image, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "height": 32, + "width": 32, + "max_sequence_length": 48, + "strength": 0.8, + "output_type": "np", + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_flux_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module("diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline") + ms_pipe_cls = get_module("mindone.diffusers.pipelines.flux.pipeline_flux_inpaint.FluxInpaintPipeline") + + pt_pipe = pt_pipe_cls(**pt_components) + ms_pipe = ms_pipe_cls(**ms_components) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe = pt_pipe.to(pt_dtype) + ms_pipe = ms_pipe.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe(**pt_inputs) + torch.manual_seed(0) + ms_image = ms_pipe(**ms_inputs) + + pt_image_slice = pt_image.images[0, -3:, -3:, -1] + ms_image_slice = ms_image[0][0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers/pipelines/kandinsky2_2/__init__.py b/tests/diffusers_tests/pipelines/hunyuan_dit/__init__.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/__init__.py rename to tests/diffusers_tests/pipelines/hunyuan_dit/__init__.py diff --git a/tests/diffusers/pipelines/hunyuan_dit/test_hunyuan_dit.py b/tests/diffusers_tests/pipelines/hunyuan_dit/test_hunyuan_dit.py similarity index 100% rename from 
tests/diffusers/pipelines/hunyuan_dit/test_hunyuan_dit.py rename to tests/diffusers_tests/pipelines/hunyuan_dit/test_hunyuan_dit.py diff --git a/tests/diffusers/pipelines/kandinsky3/__init__.py b/tests/diffusers_tests/pipelines/i2vgen_xl/__init__.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky3/__init__.py rename to tests/diffusers_tests/pipelines/i2vgen_xl/__init__.py diff --git a/tests/diffusers/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/diffusers_tests/pipelines/i2vgen_xl/test_i2vgenxl.py similarity index 100% rename from tests/diffusers/pipelines/i2vgen_xl/test_i2vgenxl.py rename to tests/diffusers_tests/pipelines/i2vgen_xl/test_i2vgenxl.py diff --git a/tests/diffusers/pipelines/latent_consistency_models/__init__.py b/tests/diffusers_tests/pipelines/kandinsky/__init__.py similarity index 100% rename from tests/diffusers/pipelines/latent_consistency_models/__init__.py rename to tests/diffusers_tests/pipelines/kandinsky/__init__.py diff --git a/tests/diffusers/pipelines/kandinsky/test_kandinsky.py b/tests/diffusers_tests/pipelines/kandinsky/test_kandinsky.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky/test_kandinsky.py rename to tests/diffusers_tests/pipelines/kandinsky/test_kandinsky.py diff --git a/tests/diffusers/pipelines/kandinsky/test_kandinsky_combined.py b/tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_combined.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky/test_kandinsky_combined.py rename to tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_combined.py diff --git a/tests/diffusers/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_img2img.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky/test_kandinsky_img2img.py rename to tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_img2img.py diff --git a/tests/diffusers/pipelines/kandinsky/test_kandinsky_inpaint.py 
b/tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_inpaint.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky/test_kandinsky_inpaint.py rename to tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_inpaint.py diff --git a/tests/diffusers/pipelines/kandinsky/test_kandinsky_prior.py b/tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_prior.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky/test_kandinsky_prior.py rename to tests/diffusers_tests/pipelines/kandinsky/test_kandinsky_prior.py diff --git a/tests/diffusers/pipelines/latent_diffusion/__init__.py b/tests/diffusers_tests/pipelines/kandinsky2_2/__init__.py similarity index 100% rename from tests/diffusers/pipelines/latent_diffusion/__init__.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/__init__.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_combined.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_combined.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_combined.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_controlnet.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_controlnet.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py similarity 
index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_controlnet_img2img.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_img2img.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_img2img.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_img2img.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_inpaint.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_inpaint.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_prior.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_prior.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_prior.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_prior.py diff --git a/tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py b/tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py rename to tests/diffusers_tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py diff --git a/tests/diffusers/pipelines/marigold/__init__.py b/tests/diffusers_tests/pipelines/kandinsky3/__init__.py similarity index 100% rename from tests/diffusers/pipelines/marigold/__init__.py rename to tests/diffusers_tests/pipelines/kandinsky3/__init__.py diff --git a/tests/diffusers/pipelines/kandinsky3/test_kandinsky3.py b/tests/diffusers_tests/pipelines/kandinsky3/test_kandinsky3.py similarity index 100% rename from 
tests/diffusers/pipelines/kandinsky3/test_kandinsky3.py rename to tests/diffusers_tests/pipelines/kandinsky3/test_kandinsky3.py diff --git a/tests/diffusers/pipelines/kandinsky3/test_kandinsky3_img2img.py b/tests/diffusers_tests/pipelines/kandinsky3/test_kandinsky3_img2img.py similarity index 100% rename from tests/diffusers/pipelines/kandinsky3/test_kandinsky3_img2img.py rename to tests/diffusers_tests/pipelines/kandinsky3/test_kandinsky3_img2img.py diff --git a/tests/diffusers/pipelines/pixart_alpha/__init__.py b/tests/diffusers_tests/pipelines/latent_consistency_models/__init__.py similarity index 100% rename from tests/diffusers/pipelines/pixart_alpha/__init__.py rename to tests/diffusers_tests/pipelines/latent_consistency_models/__init__.py diff --git a/tests/diffusers/pipelines/latent_consistency_models/test_latent_consistency_models.py b/tests/diffusers_tests/pipelines/latent_consistency_models/test_latent_consistency_models.py similarity index 100% rename from tests/diffusers/pipelines/latent_consistency_models/test_latent_consistency_models.py rename to tests/diffusers_tests/pipelines/latent_consistency_models/test_latent_consistency_models.py diff --git a/tests/diffusers/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py b/tests/diffusers_tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py similarity index 100% rename from tests/diffusers/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py rename to tests/diffusers_tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py diff --git a/tests/diffusers/pipelines/pixart_sigma/__init__.py b/tests/diffusers_tests/pipelines/latent_diffusion/__init__.py similarity index 100% rename from tests/diffusers/pipelines/pixart_sigma/__init__.py rename to tests/diffusers_tests/pipelines/latent_diffusion/__init__.py diff --git a/tests/diffusers/pipelines/latent_diffusion/test_latent_diffusion.py 
b/tests/diffusers_tests/pipelines/latent_diffusion/test_latent_diffusion.py similarity index 100% rename from tests/diffusers/pipelines/latent_diffusion/test_latent_diffusion.py rename to tests/diffusers_tests/pipelines/latent_diffusion/test_latent_diffusion.py diff --git a/tests/diffusers/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/diffusers_tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py similarity index 100% rename from tests/diffusers/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py rename to tests/diffusers_tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py diff --git a/tests/diffusers/pipelines/shap_e/__init__.py b/tests/diffusers_tests/pipelines/marigold/__init__.py similarity index 100% rename from tests/diffusers/pipelines/shap_e/__init__.py rename to tests/diffusers_tests/pipelines/marigold/__init__.py diff --git a/tests/diffusers/pipelines/marigold/test_marigold_depth.py b/tests/diffusers_tests/pipelines/marigold/test_marigold_depth.py similarity index 100% rename from tests/diffusers/pipelines/marigold/test_marigold_depth.py rename to tests/diffusers_tests/pipelines/marigold/test_marigold_depth.py diff --git a/tests/diffusers/pipelines/marigold/test_marigold_normals.py b/tests/diffusers_tests/pipelines/marigold/test_marigold_normals.py similarity index 100% rename from tests/diffusers/pipelines/marigold/test_marigold_normals.py rename to tests/diffusers_tests/pipelines/marigold/test_marigold_normals.py diff --git a/tests/diffusers/pipelines/stable_cascade/__init__.py b/tests/diffusers_tests/pipelines/pag/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_cascade/__init__.py rename to tests/diffusers_tests/pipelines/pag/__init__.py diff --git a/tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sd_inpaint.py b/tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sd_inpaint.py new file mode 100644 index 
0000000000..25216e520c --- /dev/null +++ b/tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sd_inpaint.py @@ -0,0 +1,218 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This model implementation is heavily based on: + +import random +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from diffusers.utils.torch_utils import randn_tensor +from PIL import Image +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + floats_tensor, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class StableDiffusionControlNetPAGInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "unet", + "diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + "mindone.diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + dict( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=9, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ), + ], + [ + "controlnet", + 
"diffusers.models.controlnet.ControlNetModel", + "mindone.diffusers.models.controlnet.ControlNetModel", + dict( + block_out_channels=(32, 64), + layers_per_block=2, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + cross_attention_dim=32, + conditioning_embedding_out_channels=(16, 32), + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_ddim.DDIMScheduler", + "mindone.diffusers.schedulers.scheduling_ddim.DDIMScheduler", + dict( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ), + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "unet", + "controlnet", + "scheduler", + "vae", + "text_encoder", + "tokenizer", + "safety_checker", + "feature_extractor", + "image_encoder", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + controlnet_embedder_scale_factor = 2 + pt_control_image 
= randn_tensor( + (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), + ) + ms_control_image = ms.tensor(pt_control_image.numpy()) + init_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + init_image = init_image.cpu().permute(0, 2, 3, 1)[0] + + image = Image.fromarray(np.uint8(init_image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(init_image + 4)).convert("RGB").resize((64, 64)) + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 6.0, + "pag_scale": 3.0, + "output_type": "np", + "image": image, + "mask_image": mask_image, + "control_image": pt_control_image, + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 6.0, + "pag_scale": 3.0, + "output_type": "np", + "image": image, + "mask_image": mask_image, + "control_image": ms_control_image, + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_pag_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module( + "diffusers.pipelines.pag.pipeline_pag_controlnet_sd_inpaint.StableDiffusionControlNetPAGInpaintPipeline" + ) + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.pag.pipeline_pag_controlnet_sd_inpaint.StableDiffusionControlNetPAGInpaintPipeline" + ) + + pt_pipe_pag = pt_pipe_cls(**pt_components, pag_applied_layers=["mid", "up", "down"]) + ms_pipe_pag = ms_pipe_cls(**ms_components, pag_applied_layers=["mid", "up", "down"]) + + pt_pipe_pag.set_progress_bar_config(disable=None) + ms_pipe_pag.set_progress_bar_config(disable=None) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe_pag = pt_pipe_pag.to(pt_dtype) + ms_pipe_pag = ms_pipe_pag.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = 
pt_pipe_pag(**pt_inputs).images + torch.manual_seed(0) + ms_image = ms_pipe_pag(**ms_inputs)[0] + + pt_image_slice = pt_image[0, -3:, -3:, -1] + ms_image_slice = ms_image[0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py b/tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py new file mode 100644 index 0000000000..6072025829 --- /dev/null +++ b/tests/diffusers_tests/pipelines/pag/test_pag_controlnet_sdxl_img2img.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + floats_tensor, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class StableDiffusionXLControlNetPAGImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + skip_first_text_encoder = False + pipeline_config = [ + [ + "unet", + "diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + "mindone.diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + dict( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + cross_attention_dim=64 if not skip_first_text_encoder else 32, + ), + ], + [ + "controlnet", + "diffusers.models.controlnet.ControlNetModel", + "mindone.diffusers.models.controlnet.ControlNetModel", + dict( + block_out_channels=(32, 64), + layers_per_block=2, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + conditioning_embedding_out_channels=(16, 32), + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + 
cross_attention_dim=64, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler", + dict( + beta_start=0.00085, + beta_end=0.012, + steps_offset=1, + beta_schedule="scaled_linear", + timestep_spacing="leading", + ), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + [ + "text_encoder_2", + "transformers.models.clip.modeling_clip.CLIPTextModelWithProjection", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModelWithProjection", + dict( + config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ), + ), + ], + [ + "tokenizer_2", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + 
"transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "unet", + "controlnet", + "scheduler", + "vae", + "text_encoder", + "tokenizer", + "text_encoder_2", + "tokenizer_2", + "image_encoder", + "feature_extractor", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + controlnet_embedder_scale_factor = 2 + pt_image = floats_tensor( + (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), + rng=random.Random(seed), + ) + ms_image = ms.tensor(pt_image.numpy()) + + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 6.0, + "pag_scale": 3.0, + "output_type": "np", + "image": pt_image, + "control_image": pt_image, + } + + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 2, + "guidance_scale": 6.0, + "pag_scale": 3.0, + "output_type": "np", + "image": ms_image, + "control_image": ms_image, + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_pag_inference(self, mode, dtype): + ms.set_context(mode=mode, jit_syntax_level=ms.STRICT, pynative_synchronize=True) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module( + "diffusers.pipelines.pag.pipeline_pag_controlnet_sd_xl_img2img.StableDiffusionXLControlNetPAGImg2ImgPipeline" + ) + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.pag.pipeline_pag_controlnet_sd_xl_img2img.StableDiffusionXLControlNetPAGImg2ImgPipeline" + ) + + pt_pipe_pag = pt_pipe_cls(**pt_components, pag_applied_layers=["mid", "up", "down"]) + ms_pipe_pag = ms_pipe_cls(**ms_components, pag_applied_layers=["mid", "up", "down"]) + + pt_pipe_pag.set_progress_bar_config(disable=None) + 
ms_pipe_pag.set_progress_bar_config(disable=None) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe_pag = pt_pipe_pag.to(pt_dtype) + ms_pipe_pag = ms_pipe_pag.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe_pag(**pt_inputs).images + torch.manual_seed(0) + ms_image = ms_pipe_pag(**ms_inputs)[0] + + pt_image_slice = pt_image[0, -3:, -3:, -1] + ms_image_slice = ms_image[0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers_tests/pipelines/pag/test_pag_sd_img2img.py b/tests/diffusers_tests/pipelines/pag/test_pag_sd_img2img.py new file mode 100644 index 0000000000..1911344f18 --- /dev/null +++ b/tests/diffusers_tests/pipelines/pag/test_pag_sd_img2img.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import random +import unittest + +import numpy as np +import torch +from ddt import data, ddt, unpack +from transformers import CLIPTextConfig + +import mindspore as ms + +from ..pipeline_test_utils import ( + THRESHOLD_FP16, + THRESHOLD_FP32, + PipelineTesterMixin, + floats_tensor, + get_module, + get_pipeline_components, +) + +test_cases = [ + {"mode": ms.PYNATIVE_MODE, "dtype": "float32"}, + {"mode": ms.PYNATIVE_MODE, "dtype": "float16"}, + {"mode": ms.GRAPH_MODE, "dtype": "float32"}, + {"mode": ms.GRAPH_MODE, "dtype": "float16"}, +] + + +@ddt +class StableDiffusionPAGImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_config = [ + [ + "unet", + "diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + "mindone.diffusers.models.unets.unet_2d_condition.UNet2DConditionModel", + dict( + block_out_channels=(32, 64), + layers_per_block=2, + time_cond_proj_dim=None, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ), + ], + [ + "scheduler", + "diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler", + "mindone.diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler", + dict( + beta_start=0.00085, + beta_end=0.012, + steps_offset=1, + beta_schedule="scaled_linear", + timestep_spacing="leading", + ), + ], + [ + "vae", + "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + "mindone.diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL", + dict( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + sample_size=128, + ), + ], + [ + "text_encoder", + "transformers.models.clip.modeling_clip.CLIPTextModel", + "mindone.transformers.models.clip.modeling_clip.CLIPTextModel", + dict( + 
config=CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ), + ), + ], + [ + "tokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + "transformers.models.clip.tokenization_clip.CLIPTokenizer", + dict( + pretrained_model_name_or_path="hf-internal-testing/tiny-random-clip", + ), + ], + ] + + def get_dummy_components(self): + components = { + key: None + for key in [ + "unet", + "scheduler", + "vae", + "text_encoder", + "tokenizer", + "safety_checker", + "feature_extractor", + "image_encoder", + ] + } + + return get_pipeline_components(components, self.pipeline_config) + + def get_dummy_inputs(self, seed=0): + pt_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + pt_image = pt_image / 2 + 0.5 + ms_image = ms.tensor(pt_image.numpy()) + pt_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": pt_image, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "pag_scale": 0.9, + "output_type": "np", + } + ms_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": ms_image, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "pag_scale": 0.9, + "output_type": "np", + } + return pt_inputs, ms_inputs + + @data(*test_cases) + @unpack + def test_pag_inference(self, mode, dtype): + ms.set_context(mode=mode) + + pt_components, ms_components = self.get_dummy_components() + pt_pipe_cls = get_module("diffusers.pipelines.pag.pipeline_pag_sd_img2img.StableDiffusionPAGImg2ImgPipeline") + ms_pipe_cls = get_module( + "mindone.diffusers.pipelines.pag.pipeline_pag_sd_img2img.StableDiffusionPAGImg2ImgPipeline" + ) + + pt_pipe_pag = pt_pipe_cls(**pt_components, pag_applied_layers=["mid", "up", "down"]) + ms_pipe_pag = ms_pipe_cls(**ms_components, pag_applied_layers=["mid", "up", "down"]) + + pt_pipe_pag.set_progress_bar_config(disable=None) + 
ms_pipe_pag.set_progress_bar_config(disable=None) + + ms_dtype, pt_dtype = getattr(ms, dtype), getattr(torch, dtype) + pt_pipe_pag = pt_pipe_pag.to(pt_dtype) + ms_pipe_pag = ms_pipe_pag.to(ms_dtype) + + pt_inputs, ms_inputs = self.get_dummy_inputs() + + torch.manual_seed(0) + pt_image = pt_pipe_pag(**pt_inputs).images + torch.manual_seed(0) + ms_image = ms_pipe_pag(**ms_inputs)[0] + + pt_image_slice = pt_image[0, -3:, -3:, -1] + ms_image_slice = ms_image[0, -3:, -3:, -1] + + threshold = THRESHOLD_FP32 if dtype == "float32" else THRESHOLD_FP16 + assert np.max(np.linalg.norm(pt_image_slice - ms_image_slice) / np.linalg.norm(pt_image_slice)) < threshold diff --git a/tests/diffusers/pipelines/pipeline_test_utils.py b/tests/diffusers_tests/pipelines/pipeline_test_utils.py similarity index 97% rename from tests/diffusers/pipelines/pipeline_test_utils.py rename to tests/diffusers_tests/pipelines/pipeline_test_utils.py index a8b6e111a0..9bec8cae4c 100644 --- a/tests/diffusers/pipelines/pipeline_test_utils.py +++ b/tests/diffusers_tests/pipelines/pipeline_test_utils.py @@ -67,7 +67,13 @@ def randn_tensor( # device on which tensor is created defaults to device rand_device = device batch_size = shape[0] - dtype = torch.float32 if dtype == ms.float32 else torch.float16 + ms_dtype = dtype + if dtype == ms.float32: + dtype = torch.float32 + elif dtype == ms.bfloat16: + dtype = torch.bfloat16 + else: + dtype = torch.float16 layout = layout or torch.strided device = device or torch.device("cpu") @@ -99,7 +105,7 @@ def randn_tensor( else: latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device) - return ms.Tensor(latents.numpy()) + return ms.Tensor(latents.float().numpy(), dtype=ms_dtype) def get_module(module_path): diff --git a/tests/diffusers/pipelines/stable_diffusion/__init__.py b/tests/diffusers_tests/pipelines/pixart_alpha/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion/__init__.py 
rename to tests/diffusers_tests/pipelines/pixart_alpha/__init__.py diff --git a/tests/diffusers/pipelines/pixart_alpha/test_pixart.py b/tests/diffusers_tests/pipelines/pixart_alpha/test_pixart.py similarity index 100% rename from tests/diffusers/pipelines/pixart_alpha/test_pixart.py rename to tests/diffusers_tests/pipelines/pixart_alpha/test_pixart.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/__init__.py b/tests/diffusers_tests/pipelines/pixart_sigma/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/__init__.py rename to tests/diffusers_tests/pipelines/pixart_sigma/__init__.py diff --git a/tests/diffusers/pipelines/pixart_sigma/test_pixart.py b/tests/diffusers_tests/pipelines/pixart_sigma/test_pixart.py similarity index 100% rename from tests/diffusers/pipelines/pixart_sigma/test_pixart.py rename to tests/diffusers_tests/pipelines/pixart_sigma/test_pixart.py diff --git a/tests/diffusers/pipelines/stable_diffusion_3/__init__.py b/tests/diffusers_tests/pipelines/shap_e/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_3/__init__.py rename to tests/diffusers_tests/pipelines/shap_e/__init__.py diff --git a/tests/diffusers/pipelines/shap_e/test_shap_e.py b/tests/diffusers_tests/pipelines/shap_e/test_shap_e.py similarity index 100% rename from tests/diffusers/pipelines/shap_e/test_shap_e.py rename to tests/diffusers_tests/pipelines/shap_e/test_shap_e.py diff --git a/tests/diffusers/pipelines/shap_e/test_shap_e_img2img.py b/tests/diffusers_tests/pipelines/shap_e/test_shap_e_img2img.py similarity index 100% rename from tests/diffusers/pipelines/shap_e/test_shap_e_img2img.py rename to tests/diffusers_tests/pipelines/shap_e/test_shap_e_img2img.py diff --git a/tests/diffusers/pipelines/stable_diffusion_adapter/__init__.py b/tests/diffusers_tests/pipelines/stable_cascade/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_adapter/__init__.py 
rename to tests/diffusers_tests/pipelines/stable_cascade/__init__.py diff --git a/tests/diffusers/pipelines/stable_cascade/test_stable_cascade_combined.py b/tests/diffusers_tests/pipelines/stable_cascade/test_stable_cascade_combined.py similarity index 100% rename from tests/diffusers/pipelines/stable_cascade/test_stable_cascade_combined.py rename to tests/diffusers_tests/pipelines/stable_cascade/test_stable_cascade_combined.py diff --git a/tests/diffusers/pipelines/stable_cascade/test_stable_cascade_decoder.py b/tests/diffusers_tests/pipelines/stable_cascade/test_stable_cascade_decoder.py similarity index 100% rename from tests/diffusers/pipelines/stable_cascade/test_stable_cascade_decoder.py rename to tests/diffusers_tests/pipelines/stable_cascade/test_stable_cascade_decoder.py diff --git a/tests/diffusers/pipelines/stable_cascade/test_stable_cascade_prior.py b/tests/diffusers_tests/pipelines/stable_cascade/test_stable_cascade_prior.py similarity index 100% rename from tests/diffusers/pipelines/stable_cascade/test_stable_cascade_prior.py rename to tests/diffusers_tests/pipelines/stable_cascade/test_stable_cascade_prior.py diff --git a/tests/diffusers/pipelines/stable_diffusion_gligen/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_gligen/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion.py rename to tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion.py diff --git a/tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py similarity index 100% rename from 
tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion_img2img.py rename to tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py diff --git a/tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py rename to tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py diff --git a/tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py rename to tests/diffusers_tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py diff --git a/tests/diffusers/pipelines/stable_diffusion_gligen_text_image/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_gligen_text_image/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion_2/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion.py rename to tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py rename to 
tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py rename to tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py rename to tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py rename to tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py rename to tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py diff --git a/tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py rename to 
tests/diffusers_tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py diff --git a/tests/diffusers/pipelines/stable_diffusion_image_variation/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_3/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_image_variation/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion_3/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/tests/diffusers_tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py rename to tests/diffusers_tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py diff --git a/tests/diffusers/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py b/tests/diffusers_tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py rename to tests/diffusers_tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_img2img.py diff --git a/tests/diffusers/pipelines/stable_diffusion_xl/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_adapter/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_xl/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion_adapter/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/diffusers_tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py rename to tests/diffusers_tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py diff --git 
a/tests/diffusers/pipelines/stable_video_diffusion/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_gligen/__init__.py similarity index 100% rename from tests/diffusers/pipelines/stable_video_diffusion/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion_gligen/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py b/tests/diffusers_tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py rename to tests/diffusers_tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py diff --git a/tests/diffusers/pipelines/unclip/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_gligen_text_image/__init__.py similarity index 100% rename from tests/diffusers/pipelines/unclip/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion_gligen_text_image/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py b/tests/diffusers_tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py rename to tests/diffusers_tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py diff --git a/tests/diffusers/pipelines/wuerstchen/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_image_variation/__init__.py similarity index 100% rename from tests/diffusers/pipelines/wuerstchen/__init__.py rename to tests/diffusers_tests/pipelines/stable_diffusion_image_variation/__init__.py diff --git a/tests/diffusers/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py 
b/tests/diffusers_tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py rename to tests/diffusers_tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py diff --git a/tests/diffusers_tests/pipelines/stable_diffusion_xl/__init__.py b/tests/diffusers_tests/pipelines/stable_diffusion_xl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py rename to tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py diff --git a/tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py rename to tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py diff --git a/tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py rename to tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py diff --git a/tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py similarity index 100% rename from 
tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py rename to tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py diff --git a/tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py b/tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py similarity index 100% rename from tests/diffusers/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py rename to tests/diffusers_tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_instruction_pix2pix.py diff --git a/tests/diffusers_tests/pipelines/stable_video_diffusion/__init__.py b/tests/diffusers_tests/pipelines/stable_video_diffusion/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/diffusers/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/diffusers_tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py similarity index 100% rename from tests/diffusers/pipelines/stable_video_diffusion/test_stable_video_diffusion.py rename to tests/diffusers_tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py diff --git a/tests/diffusers_tests/pipelines/unclip/__init__.py b/tests/diffusers_tests/pipelines/unclip/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/diffusers/pipelines/unclip/test_unclip.py b/tests/diffusers_tests/pipelines/unclip/test_unclip.py similarity index 100% rename from tests/diffusers/pipelines/unclip/test_unclip.py rename to tests/diffusers_tests/pipelines/unclip/test_unclip.py diff --git a/tests/diffusers/pipelines/unclip/test_unclip_image_variation.py b/tests/diffusers_tests/pipelines/unclip/test_unclip_image_variation.py similarity index 100% rename from tests/diffusers/pipelines/unclip/test_unclip_image_variation.py rename to tests/diffusers_tests/pipelines/unclip/test_unclip_image_variation.py diff --git 
a/tests/diffusers_tests/pipelines/wuerstchen/__init__.py b/tests/diffusers_tests/pipelines/wuerstchen/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/diffusers/pipelines/wuerstchen/test_wuerstchen_combined.py b/tests/diffusers_tests/pipelines/wuerstchen/test_wuerstchen_combined.py similarity index 100% rename from tests/diffusers/pipelines/wuerstchen/test_wuerstchen_combined.py rename to tests/diffusers_tests/pipelines/wuerstchen/test_wuerstchen_combined.py diff --git a/tests/diffusers/pipelines/wuerstchen/test_wuerstchen_decoder.py b/tests/diffusers_tests/pipelines/wuerstchen/test_wuerstchen_decoder.py similarity index 100% rename from tests/diffusers/pipelines/wuerstchen/test_wuerstchen_decoder.py rename to tests/diffusers_tests/pipelines/wuerstchen/test_wuerstchen_decoder.py diff --git a/tests/diffusers/pipelines/wuerstchen/test_wuerstchen_prior.py b/tests/diffusers_tests/pipelines/wuerstchen/test_wuerstchen_prior.py similarity index 100% rename from tests/diffusers/pipelines/wuerstchen/test_wuerstchen_prior.py rename to tests/diffusers_tests/pipelines/wuerstchen/test_wuerstchen_prior.py