From ef7bf311bd79c8eb5d91f245cf2be58e38faeff8 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Thu, 7 Sep 2023 22:40:29 +0300 Subject: [PATCH 01/33] Initial Intel ARC support with IPEX --- XTI_hijack.py | 7 + fine_tune.py | 7 + gen_img_diffusers.py | 7 + gui.sh | 17 +- kohya_gui.py | 4 + library/ipex/__init__.py | 164 +++++++++++++++++ library/ipex/diffusers.py | 262 +++++++++++++++++++++++++++ library/ipex/gradscaler.py | 179 ++++++++++++++++++ library/ipex/hijacks.py | 199 ++++++++++++++++++++ requirements_linux_ipex.txt | 3 + sdxl_gen_img.py | 7 + sdxl_minimal_inference.py | 7 + sdxl_train.py | 7 + sdxl_train_control_net_lllite.py | 7 + sdxl_train_control_net_lllite_alt.py | 7 + sdxl_train_network.py | 7 + sdxl_train_textual_inversion.py | 7 + setup.sh | 5 + setup/setup_common.py | 37 +++- setup/validate_requirements.py | 35 +++- train_controlnet.py | 7 + train_db.py | 7 + train_network.py | 7 + train_textual_inversion.py | 7 + train_textual_inversion_XTI.py | 7 + 25 files changed, 993 insertions(+), 17 deletions(-) create mode 100644 library/ipex/__init__.py create mode 100644 library/ipex/diffusers.py create mode 100644 library/ipex/gradscaler.py create mode 100644 library/ipex/hijacks.py create mode 100644 requirements_linux_ipex.txt diff --git a/XTI_hijack.py b/XTI_hijack.py index 36b5d3f2b..ec0849455 100644 --- a/XTI_hijack.py +++ b/XTI_hijack.py @@ -1,4 +1,11 @@ import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from typing import Union, List, Optional, Dict, Any, Tuple from diffusers.models.unet_2d_condition import UNet2DConditionOutput diff --git a/fine_tune.py b/fine_tune.py index f89e897a8..f300d4688 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -10,6 +10,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 273c0dd86..0ea66cde2 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -65,6 +65,13 @@ import diffusers import numpy as np import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import torchvision from diffusers import ( AutoencoderKL, diff --git a/gui.sh b/gui.sh index 7f66a3eff..253d8577f 100755 --- a/gui.sh +++ b/gui.sh @@ -59,12 +59,27 @@ if [[ "$OSTYPE" == "darwin"* ]]; then fi else if [ "$RUNPOD" = false ]; then - REQUIREMENTS_FILE="$SCRIPT_DIR/requirements_linux.txt" + if [[ "$@" == *"--use-ipex"* ]]; then + REQUIREMENTS_FILE="$SCRIPT_DIR/requirements_linux_ipex.txt" + else + REQUIREMENTS_FILE="$SCRIPT_DIR/requirements_linux.txt" + fi else REQUIREMENTS_FILE="$SCRIPT_DIR/requirements_runpod.txt" fi fi +#Set OneAPI environmet if it's not set by the user +if [[ "$@" == *"--use-ipex"* ]] && [ ! 
-x "$(command -v sycl-ls)" ] +then + echo "Setting OneAPI environment" + if [[ -z "$ONEAPI_ROOT" ]] + then + ONEAPI_ROOT=/opt/intel/oneapi + fi + source $ONEAPI_ROOT/setvars.sh +fi + # Validate the requirements and run the script if successful if python "$SCRIPT_DIR/setup/validate_requirements.py" -r "$REQUIREMENTS_FILE"; then python "$SCRIPT_DIR/kohya_gui.py" "$@" diff --git a/kohya_gui.py b/kohya_gui.py index da5a04a3d..da25f2b8f 100644 --- a/kohya_gui.py +++ b/kohya_gui.py @@ -133,6 +133,10 @@ def UI(**kwargs): '--language', type=str, default=None, help='Set custom language' ) + parser.add_argument( + '--use-ipex', action='store_true', help='Use IPEX environment' + ) + args = parser.parse_args() UI( diff --git a/library/ipex/__init__.py b/library/ipex/__init__.py new file mode 100644 index 000000000..3d5f8da16 --- /dev/null +++ b/library/ipex/__init__.py @@ -0,0 +1,164 @@ +import os +import sys +import contextlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +from .hijacks import ipex_hijacks + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +def ipex_init(): # pylint: disable=too-many-statements + #Replace cuda with xpu: + torch.cuda.current_device = torch.xpu.current_device + torch.cuda.current_stream = torch.xpu.current_stream + torch.cuda.device = torch.xpu.device + torch.cuda.device_count = torch.xpu.device_count + torch.cuda.device_of = torch.xpu.device_of + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_name = torch.xpu.get_device_name + torch.cuda.get_device_properties = torch.xpu.get_device_properties + torch.cuda.init = torch.xpu.init + torch.cuda.is_available = torch.xpu.is_available + torch.cuda.is_initialized = torch.xpu.is_initialized + torch.cuda.is_current_stream_capturing = lambda: False + torch.cuda.set_device = torch.xpu.set_device + torch.cuda.stream = torch.xpu.stream + torch.cuda.synchronize = torch.xpu.synchronize + torch.cuda.Event = torch.xpu.Event + torch.cuda.Stream = torch.xpu.Stream + torch.cuda.FloatTensor = torch.xpu.FloatTensor + torch.Tensor.cuda = torch.Tensor.xpu + torch.Tensor.is_cuda = torch.Tensor.is_xpu + torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock + torch.cuda._initialized = torch.xpu.lazy_init._initialized + torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker + torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls + torch.cuda._tls = torch.xpu.lazy_init._tls + torch.cuda.threading = torch.xpu.lazy_init.threading + torch.cuda.traceback = torch.xpu.lazy_init.traceback + torch.cuda.Optional = torch.xpu.Optional + torch.cuda.__cached__ = torch.xpu.__cached__ + torch.cuda.__loader__ = torch.xpu.__loader__ + torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage + torch.cuda.Tuple = torch.xpu.Tuple + torch.cuda.streams = torch.xpu.streams + torch.cuda._lazy_new = torch.xpu._lazy_new + torch.cuda.FloatStorage = torch.xpu.FloatStorage + torch.cuda.Any = torch.xpu.Any + torch.cuda.__doc__ = torch.xpu.__doc__ + torch.cuda.default_generators = torch.xpu.default_generators + torch.cuda.HalfTensor = torch.xpu.HalfTensor + torch.cuda._get_device_index = torch.xpu._get_device_index + torch.cuda.__path__ = torch.xpu.__path__ + torch.cuda.Device = torch.xpu.Device + torch.cuda.IntTensor = torch.xpu.IntTensor + torch.cuda.ByteStorage = torch.xpu.ByteStorage + torch.cuda.set_stream = torch.xpu.set_stream + torch.cuda.BoolStorage = torch.xpu.BoolStorage + 
torch.cuda.os = torch.xpu.os + torch.cuda.torch = torch.xpu.torch + torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage + torch.cuda.Union = torch.xpu.Union + torch.cuda.DoubleTensor = torch.xpu.DoubleTensor + torch.cuda.ShortTensor = torch.xpu.ShortTensor + torch.cuda.LongTensor = torch.xpu.LongTensor + torch.cuda.IntStorage = torch.xpu.IntStorage + torch.cuda.LongStorage = torch.xpu.LongStorage + torch.cuda.__annotations__ = torch.xpu.__annotations__ + torch.cuda.__package__ = torch.xpu.__package__ + torch.cuda.__builtins__ = torch.xpu.__builtins__ + torch.cuda.CharTensor = torch.xpu.CharTensor + torch.cuda.List = torch.xpu.List + torch.cuda._lazy_init = torch.xpu._lazy_init + torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor + torch.cuda.DoubleStorage = torch.xpu.DoubleStorage + torch.cuda.ByteTensor = torch.xpu.ByteTensor + torch.cuda.StreamContext = torch.xpu.StreamContext + torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage + torch.cuda.ShortStorage = torch.xpu.ShortStorage + torch.cuda._lazy_call = torch.xpu._lazy_call + torch.cuda.HalfStorage = torch.xpu.HalfStorage + torch.cuda.random = torch.xpu.random + torch.cuda._device = torch.xpu._device + torch.cuda.classproperty = torch.xpu.classproperty + torch.cuda.__name__ = torch.xpu.__name__ + torch.cuda._device_t = torch.xpu._device_t + torch.cuda.warnings = torch.xpu.warnings + torch.cuda.__spec__ = torch.xpu.__spec__ + torch.cuda.BoolTensor = torch.xpu.BoolTensor + torch.cuda.CharStorage = torch.xpu.CharStorage + torch.cuda.__file__ = torch.xpu.__file__ + torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork + #torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing + + #Memory: + torch.cuda.memory = torch.xpu.memory + if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): + torch.xpu.empty_cache = lambda: None + torch.cuda.empty_cache = torch.xpu.empty_cache + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_summary = torch.xpu.memory_summary + torch.cuda.memory_snapshot = torch.xpu.memory_snapshot + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats + + #RNG: + torch.cuda.get_rng_state = torch.xpu.get_rng_state + torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all + torch.cuda.set_rng_state = torch.xpu.set_rng_state + torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all + torch.cuda.manual_seed = torch.xpu.manual_seed + torch.cuda.manual_seed_all = torch.xpu.manual_seed_all + torch.cuda.seed = torch.xpu.seed + torch.cuda.seed_all = torch.xpu.seed_all + torch.cuda.initial_seed = torch.xpu.initial_seed + + #AMP: + torch.cuda.amp = torch.xpu.amp + if not hasattr(torch.cuda.amp, "common"): + torch.cuda.amp.common = contextlib.nullcontext() + torch.cuda.amp.common.amp_definitely_not_available = lambda: False + try: + torch.cuda.amp.GradScaler = 
torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + try: + from .gradscaler import gradscaler_init # pylint: disable=import-outside-toplevel, import-error + gradscaler_init() + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + + #C + torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream + ipex._C._DeviceProperties.major = 2023 + ipex._C._DeviceProperties.minor = 2 + + #Fix functions with ipex: + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch._utils._get_available_device_type = lambda: "xpu" + torch.has_cuda = True + torch.cuda.has_half = True + torch.cuda.is_bf16_supported = lambda: True + torch.cuda.is_fp16_supported = lambda: True + torch.version.cuda = "11.7" + torch.cuda.get_device_capability = lambda: [11,7] + torch.cuda.get_device_properties.major = 11 + torch.cuda.get_device_properties.minor = 7 + torch.cuda.ipc_collect = lambda: None + torch.cuda.utilization = lambda: 0 + + ipex_hijacks() + try: + from .diffusers import ipex_diffusers # pylint: disable=import-outside-toplevel, import-error + ipex_diffusers() + except Exception: # pylint: disable=broad-exception-caught + pass diff --git a/library/ipex/diffusers.py b/library/ipex/diffusers.py new file mode 100644 index 000000000..18563b061 --- /dev/null +++ b/library/ipex/diffusers.py @@ -0,0 +1,262 @@ +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +import torch.nn.functional as F # pylint: disable=ungrouped-imports +import diffusers #0.20.2 # pylint: disable=import-error + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +Attention = diffusers.models.attention_processor.Attention + +class SlicedAttnProcessor: # pylint: disable=too-few-public-methods + r""" + Processor for implementing sliced attention. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. 
+ """ + + def __init__(self, slice_size): + self.slice_size = slice_size + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches + residual = hidden_states + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + dim = query.shape[-1] + query = attn.head_to_batch_dim(query) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + batch_size_attention, query_tokens, shape_three = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 + block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB + split_2_slice_size = query_tokens + if block_size >= 4000: + do_split_2 = True + #Find something divisible with the query_tokens + while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + if do_split_2: + for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + + query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2] + key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2] + attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2]) + + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice + else: + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, 
width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + +class AttnProcessor2_0: # pylint: disable=too-few-public-methods, invalid-name + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + def __call__( # pylint: disable=too-many-arguments, too-many-statements, too-many-locals, too-many-branches + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 + block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + split_slice_size = batch_size_attention + if block_size >= 4000: + do_split = True + #Find something divisible with the shape_one + while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB + split_2_slice_size = query_tokens + if split_block_size >= 4000: + do_split_2 = True + #Find something divisible with the batch_size_attention + while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(query.shape, 
device=query.device, dtype=query.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + + query_slice = query[:, start_idx:end_idx, start_idx_2:end_idx_2] + key_slice = key[:, start_idx:end_idx, start_idx_2:end_idx_2] + attn_mask_slice = attention_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None + + attn_slice = F.scaled_dot_product_attention( + query_slice, key_slice, value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask_slice, dropout_p=0.0, is_causal=False + ) + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice + else: + query_slice = query[:, start_idx:end_idx] + key_slice = key[:, start_idx:end_idx] + attn_mask_slice = attention_mask[:, start_idx:end_idx] if attention_mask is not None else None + + attn_slice = F.scaled_dot_product_attention( + query_slice, key_slice, value[:, start_idx:end_idx], + attn_mask=attn_mask_slice, dropout_p=0.0, is_causal=False + ) + hidden_states[:, start_idx:end_idx] = attn_slice + else: + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + +def ipex_diffusers(): + #ARC GPUs can't allocate more than 4GB to a single block: + diffusers.models.attention_processor.SlicedAttnProcessor = SlicedAttnProcessor + diffusers.models.attention_processor.AttnProcessor2_0 = AttnProcessor2_0 diff --git a/library/ipex/gradscaler.py b/library/ipex/gradscaler.py new file mode 100644 index 000000000..530212101 --- /dev/null +++ b/library/ipex/gradscaler.py @@ -0,0 +1,179 @@ +from collections import defaultdict +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +OptState = ipex.cpu.autocast._grad_scaler.OptState +_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator +_refresh_per_optimizer_state = ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state + +def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): # pylint: disable=unused-argument + per_device_inv_scale = _MultiDeviceReplicator(inv_scale) + per_device_found_inf = _MultiDeviceReplicator(found_inf) + + # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. + # There could be hundreds of grads, so we'd like to iterate through them just once. + # However, we don't know their devices or dtypes in advance. 
+ + # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict + # Google says mypy struggles with defaultdicts type annotations. + per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] + # sync grad to master weight + if hasattr(optimizer, "sync_grad"): + optimizer.sync_grad() + with torch.no_grad(): + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is None: + continue + if (not allow_fp16) and param.grad.dtype == torch.float16: + raise ValueError("Attempting to unscale FP16 gradients.") + if param.grad.is_sparse: + # is_coalesced() == False means the sparse grad has values with duplicate indices. + # coalesce() deduplicates indices and adds all values that have the same index. + # For scaled fp16 values, there's a good chance coalescing will cause overflow, + # so we should check the coalesced _values(). + if param.grad.dtype is torch.float16: + param.grad = param.grad.coalesce() + to_unscale = param.grad._values() + else: + to_unscale = param.grad + + # -: is there a way to split by device and dtype without appending in the inner loop? + to_unscale = to_unscale.to("cpu") + per_device_and_dtype_grads[to_unscale.device][ + to_unscale.dtype + ].append(to_unscale) + + for _, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + core._amp_foreach_non_finite_check_and_unscale_( + grads, + per_device_found_inf.get("cpu"), + per_device_inv_scale.get("cpu"), + ) + + return per_device_found_inf._per_device_tensors + +def unscale_(self, optimizer): + """ + Divides ("unscales") the optimizer's gradient tensors by the scale factor. + :meth:`unscale_` is optional, serving cases where you need to + :ref:`modify or inspect gradients` + between the backward pass(es) and :meth:`step`. + If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. + Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: + ... + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + scaler.step(optimizer) + scaler.update() + Args: + optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. + .. warning:: + :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, + and only after all gradients for that optimizer's assigned parameters have been accumulated. + Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. + .. warning:: + :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. + """ + if not self._enabled: + return + + self._check_scale_growth_tracker("unscale_") + + optimizer_state = self._per_optimizer_states[id(optimizer)] + + if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." + ) + elif optimizer_state["stage"] is OptState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + + # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. 
+ assert self._scale is not None + inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) + found_inf = torch.full( + (1,), 0.0, dtype=torch.float32, device=self._scale.device + ) + + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + optimizer, inv_scale, found_inf, False + ) + optimizer_state["stage"] = OptState.UNSCALED + +def update(self, new_scale=None): + """ + Updates the scale factor. + If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` + to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, + the scale is multiplied by ``growth_factor`` to increase it. + Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not + used directly, it's used to fill GradScaler's internal scale tensor. So if + ``new_scale`` was a tensor, later in-place changes to that tensor will not further + affect the scale GradScaler uses internally.) + Args: + new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. + .. warning:: + :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has + been invoked for all optimizers used this iteration. + """ + if not self._enabled: + return + + _scale, _growth_tracker = self._check_scale_growth_tracker("update") + + if new_scale is not None: + # Accept a new user-defined scale. + if isinstance(new_scale, float): + self._scale.fill_(new_scale) # type: ignore[union-attr] + else: + reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." + assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] + assert new_scale.numel() == 1, reason + assert new_scale.requires_grad is False, reason + self._scale.copy_(new_scale) # type: ignore[union-attr] + else: + # Consume shared inf/nan data collected from optimizers to update the scale. + # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. + found_infs = [ + found_inf.to(device="cpu", non_blocking=True) + for state in self._per_optimizer_states.values() + for found_inf in state["found_inf_per_device"].values() + ] + + assert len(found_infs) > 0, "No inf checks were recorded prior to update." + + found_inf_combined = found_infs[0] + if len(found_infs) > 1: + for i in range(1, len(found_infs)): + found_inf_combined += found_infs[i] + + to_device = _scale.device + _scale = _scale.to("cpu") + _growth_tracker = _growth_tracker.to("cpu") + + core._amp_update_scale_( + _scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval, + ) + + _scale = _scale.to(to_device) + _growth_tracker = _growth_tracker.to(to_device) + # To prepare for next iteration, clear the data collected from optimizers this iteration. 
+ self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) + +def gradscaler_init(): + torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ + torch.xpu.amp.GradScaler.unscale_ = unscale_ + torch.xpu.amp.GradScaler.update = update + return torch.xpu.amp.GradScaler diff --git a/library/ipex/hijacks.py b/library/ipex/hijacks.py new file mode 100644 index 000000000..f527e8836 --- /dev/null +++ b/library/ipex/hijacks.py @@ -0,0 +1,199 @@ +import contextlib +import importlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return + +class CondFunc: # pylint: disable=missing-class-docstring + def __new__(cls, orig_func, sub_func, cond_func): + self = super(CondFunc, cls).__new__(cls) + if isinstance(orig_func, str): + func_path = orig_func.split('.') + for i in range(len(func_path)-1, -1, -1): + try: + resolved_obj = importlib.import_module('.'.join(func_path[:i])) + break + except ImportError: + pass + for attr_name in func_path[i:-1]: + resolved_obj = getattr(resolved_obj, attr_name) + orig_func = getattr(resolved_obj, func_path[-1]) + setattr(resolved_obj, func_path[-1], lambda *args, **kwargs: self(*args, **kwargs)) + self.__init__(orig_func, sub_func, cond_func) + return lambda *args, **kwargs: self(*args, **kwargs) + def __init__(self, orig_func, sub_func, cond_func): + self.__orig_func = orig_func + self.__sub_func = sub_func + self.__cond_func = cond_func + def __call__(self, *args, **kwargs): + if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): + return self.__sub_func(self.__orig_func, *args, **kwargs) + else: + return self.__orig_func(*args, **kwargs) + +_utils = torch.utils.data._utils +def _shutdown_workers(self): + if _utils is None or _utils.python_exit_status is True or _utils.python_exit_status is None: + return + if hasattr(self, "_shutdown") and not self._shutdown: + self._shutdown = True + try: + if hasattr(self, '_pin_memory_thread'): + self._pin_memory_thread_done_event.set() + self._worker_result_queue.put((None, None)) + self._pin_memory_thread.join() + self._worker_result_queue.cancel_join_thread() + self._worker_result_queue.close() + self._workers_done_event.set() + for worker_id in range(len(self._workers)): + if self._persistent_workers or self._workers_status[worker_id]: + self._mark_worker_as_unavailable(worker_id, shutdown=True) + for w in self._workers: # pylint: disable=invalid-name + w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + for q in self._index_queues: # pylint: disable=invalid-name + q.cancel_join_thread() + q.close() + finally: + if self._worker_pids_set: + _utils.signal_handling._remove_worker_pids(id(self)) + self._worker_pids_set = False + for w in self._workers: # pylint: disable=invalid-name + if w.is_alive(): + w.terminate() + +class DummyDataParallel(torch.nn.Module): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods + def __new__(cls, module, device_ids=None, output_device=None, dim=0): # pylint: disable=unused-argument + if isinstance(device_ids, list) and len(device_ids) > 1: + print("IPEX backend doesn't support DataParallel on multiple XPU devices") + return module.to("xpu") + +def return_null_context(*args, **kwargs): # pylint: disable=unused-argument + return contextlib.nullcontext() + +def check_device(device): + return 
bool((isinstance(device, torch.device) and device.type == "cuda") or (isinstance(device, str) and "cuda" in device) or isinstance(device, int)) + +def return_xpu(device): + return f"xpu:{device[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device("xpu") if isinstance(device, torch.device) else "xpu" + +def ipex_no_cuda(orig_func, *args, **kwargs): + torch.cuda.is_available = lambda: False + orig_func(*args, **kwargs) + torch.cuda.is_available = torch.xpu.is_available + +original_autocast = torch.autocast +def ipex_autocast(*args, **kwargs): + if len(args) > 1 and args[0] == "cuda": + return original_autocast("xpu", *args[1:], **kwargs) + else: + return original_autocast(*args, **kwargs) + +original_torch_cat = torch.cat +def torch_cat(tensor, *args, **kwargs): + if len(tensor) == 3 and (tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype): + return original_torch_cat([tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], *args, **kwargs) + else: + return original_torch_cat(tensor, *args, **kwargs) + +original_interpolate = torch.nn.functional.interpolate +def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments + if antialias or align_corners is not None: + return_device = tensor.device + return_dtype = tensor.dtype + return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode, + align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias).to(return_device, dtype=return_dtype) + else: + return original_interpolate(tensor, size=size, scale_factor=scale_factor, mode=mode, + align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias) + +original_linalg_solve = torch.linalg.solve +def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name + if A.device != torch.device("cpu") or B.device != torch.device("cpu"): + return_device = A.device + return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to(return_device) + else: + return original_linalg_solve(A, B, *args, **kwargs) + +def ipex_hijacks(): + CondFunc('torch.Tensor.to', + lambda orig_func, self, device=None, *args, **kwargs: orig_func(self, return_xpu(device), *args, **kwargs), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device)) + CondFunc('torch.Tensor.cuda', + lambda orig_func, self, device=None, *args, **kwargs: orig_func(self, return_xpu(device), *args, **kwargs), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device)) + CondFunc('torch.empty', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.load', + lambda orig_func, *args, map_location=None, **kwargs: orig_func(*args, return_xpu(map_location), **kwargs), + lambda orig_func, *args, map_location=None, **kwargs: map_location is None or check_device(map_location)) + CondFunc('torch.randn', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.ones', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, 
device=None, **kwargs: check_device(device)) + CondFunc('torch.zeros', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.tensor', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + CondFunc('torch.linspace', + lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), + lambda orig_func, *args, device=None, **kwargs: check_device(device)) + + CondFunc('torch.Generator', + lambda orig_func, device=None: torch.xpu.Generator(device), + lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu") + + CondFunc('torch.batch_norm', + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, + weight if weight is not None else torch.ones(input.size()[1], device=input.device), + bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) + CondFunc('torch.instance_norm', + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, + weight if weight is not None else torch.ones(input.size()[1], device=input.device), + bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) + + #Functions with dtype errors: + CondFunc('torch.nn.modules.GroupNorm.forward', + lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype) + CondFunc('torch.nn.modules.linear.Linear.forward', + lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype) + CondFunc('torch.nn.modules.conv.Conv2d.forward', + lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype) + CondFunc('torch.bmm', + lambda orig_func, input, mat2, *args, **kwargs: orig_func(input, mat2.to(input.dtype), *args, **kwargs), + lambda orig_func, input, mat2, *args, **kwargs: input.dtype != mat2.dtype) + CondFunc('torch.nn.functional.layer_norm', + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: + orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs), + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: + weight is not None and input.dtype != weight.data.dtype) + + #Diffusers Float64 (ARC GPUs doesn't support double or Float64): + if not torch.xpu.has_fp64_dtype(): + CondFunc('torch.from_numpy', + lambda orig_func, ndarray: orig_func(ndarray.astype('float32')), + lambda orig_func, ndarray: ndarray.dtype == float) + + #Broken functions when torch.cuda.is_available is True: + CondFunc('torch.utils.data.dataloader._BaseDataLoaderIter.__init__', + lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs), + lambda orig_func, *args, **kwargs: True) + + #Functions that make compile mad with CondFunc: + torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = _shutdown_workers + torch.nn.DataParallel = DummyDataParallel + torch.autocast = 
ipex_autocast + torch.cat = torch_cat + torch.linalg.solve = linalg_solve + torch.nn.functional.interpolate = interpolate + torch.backends.cuda.sdp_kernel = return_null_context diff --git a/requirements_linux_ipex.txt b/requirements_linux_ipex.txt new file mode 100644 index 000000000..e0fdb1e55 --- /dev/null +++ b/requirements_linux_ipex.txt @@ -0,0 +1,3 @@ +torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu # no_verify leave this to specify not checking this a verification stage +tensorboard==2.12.3 tensorflow==2.12.0 intel-extension-for-tensorflow[gpu] +-r requirements.txt diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py index c506ad3fc..f0d308511 100755 --- a/sdxl_gen_img.py +++ b/sdxl_gen_img.py @@ -17,6 +17,13 @@ import diffusers import numpy as np import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import torchvision from diffusers import ( AutoencoderKL, diff --git a/sdxl_minimal_inference.py b/sdxl_minimal_inference.py index 5c8a0bd89..ff865629e 100644 --- a/sdxl_minimal_inference.py +++ b/sdxl_minimal_inference.py @@ -9,6 +9,13 @@ from einops import repeat import numpy as np import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from tqdm import tqdm from transformers import CLIPTokenizer from diffusers import EulerDiscreteScheduler diff --git a/sdxl_train.py b/sdxl_train.py index 195467b00..6b255d679 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -10,6 +10,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import sdxl_model_util diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 09cf16438..f8169bdbf 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -11,6 +11,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel diff --git a/sdxl_train_control_net_lllite_alt.py b/sdxl_train_control_net_lllite_alt.py index 757194a10..61ebfb581 100644 --- a/sdxl_train_control_net_lllite_alt.py +++ b/sdxl_train_control_net_lllite_alt.py @@ -14,6 +14,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed import accelerate diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 8d3a81c3a..2de57c0ac 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -1,5 +1,12 @@ import argparse import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from library import sdxl_model_util, sdxl_train_util, train_util import train_network 
diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index 123ca35a1..f5cca17b2 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -3,6 +3,13 @@ import regex import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import open_clip from library import sdxl_model_util, sdxl_train_util, train_util diff --git a/setup.sh b/setup.sh index 3f521f37c..5cb6a4e83 100755 --- a/setup.sh +++ b/setup.sh @@ -27,6 +27,7 @@ Options: -s, --skip-space-check Skip the 10Gb minimum storage space check. -u, --no-gui Skips launching the GUI. -v, --verbose Increase verbosity levels up to 3. + --use-ipex Use IPEX with Intel ARC GPUs. EOF } @@ -87,6 +88,7 @@ MAXVERBOSITY=6 DIR="" PARENT_DIR="" VENV_DIR="" +USE_IPEX=false # Function to get the distro name get_distro_name() { @@ -203,6 +205,8 @@ install_python_dependencies() { "lin"*) if [ "$RUNPOD" = true ]; then python "$SCRIPT_DIR/setup/setup_linux.py" --platform-requirements-file=requirements_runpod.txt + elif [ "$USE_IPEX" = true ]; then + python "$SCRIPT_DIR/setup/setup_linux.py" --platform-requirements-file=requirements_linux_ipex.txt else python "$SCRIPT_DIR/setup/setup_linux.py" --platform-requirements-file=requirements_linux.txt fi @@ -318,6 +322,7 @@ while getopts ":vb:d:g:inprus-:" opt; do s | skip-space-check) SKIP_SPACE_CHECK=true ;; u | no-gui) SKIP_GUI=true ;; v) ((VERBOSITY = VERBOSITY + 1)) ;; + use-ipex) USE_IPEX=true ;; h) display_help && exit 0 ;; *) display_help && exit 0 ;; esac diff --git a/setup/setup_common.py b/setup/setup_common.py index 8d94ca9f3..d9cacf35f 100644 --- a/setup/setup_common.py +++ b/setup/setup_common.py @@ -195,12 +195,24 @@ def check_torch(): '/opt/rocm/bin/rocminfo' ): log.info('AMD toolkit detected') + elif (shutil.which('sycl-ls') is not None + or os.environ.get('ONEAPI_ROOT') is not None + or os.path.exists('/opt/intel/oneapi')): + log.info('Intel OneAPI toolkit detected') else: log.info('Using CPU-only Torch') try: import torch - + try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() + os.environ.setdefault('NEOReadDebugKeys', '1') + os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100') + except Exception: + pass log.info(f'Torch {torch.__version__}') # Check if CUDA is available @@ -208,10 +220,14 @@ def check_torch(): log.warning('Torch reports CUDA not available') else: if torch.version.cuda: - # Log nVidia CUDA and cuDNN versions - log.info( - f'Torch backend: nVidia CUDA {torch.version.cuda} cuDNN {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else "N/A"}' - ) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + # Log Intel IPEX OneAPI version + log.info(f'Torch backend: Intel IPEX OneAPI {ipex.__version__}') + else: + # Log nVidia CUDA and cuDNN versions + log.info( + f'Torch backend: nVidia CUDA {torch.version.cuda} cuDNN {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else "N/A"}' + ) elif torch.version.hip: # Log AMD ROCm HIP version log.info(f'Torch backend: AMD ROCm HIP {torch.version.hip}') @@ -222,9 +238,14 @@ def check_torch(): for device in [ torch.cuda.device(i) for i in range(torch.cuda.device_count()) ]: - log.info( - f'Torch detected GPU: {torch.cuda.get_device_name(device)} VRAM {round(torch.cuda.get_device_properties(device).total_memory / 1024 / 1024)} Arch 
{torch.cuda.get_device_capability(device)} Cores {torch.cuda.get_device_properties(device).multi_processor_count}' - ) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + log.info( + f'Torch detected GPU: Torch detected GPU: {torch.xpu.get_device_name(device)} VRAM {round(torch.xpu.get_device_properties(device).total_memory / 1024 / 1024)} Compute Units {torch.xpu.get_device_properties(device).max_compute_units}' + ) + else: + log.info( + f'Torch detected GPU: {torch.cuda.get_device_name(device)} VRAM {round(torch.cuda.get_device_properties(device).total_memory / 1024 / 1024)} Arch {torch.cuda.get_device_capability(device)} Cores {torch.cuda.get_device_properties(device).multi_processor_count}' + ) return int(torch.__version__[0]) except Exception as e: # log.warning(f'Could not load torch: {e}') diff --git a/setup/validate_requirements.py b/setup/validate_requirements.py index 73e94eb65..664ccfd9b 100644 --- a/setup/validate_requirements.py +++ b/setup/validate_requirements.py @@ -35,12 +35,22 @@ def check_torch(): '/opt/rocm/bin/rocminfo' ): log.info('AMD toolkit detected') + elif (shutil.which('sycl-ls') is not None + or os.environ.get('ONEAPI_ROOT') is not None + or os.path.exists('/opt/intel/oneapi')): + log.info('Intel OneAPI toolkit detected') else: log.info('Using CPU-only Torch') try: import torch - + try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() + except Exception: + pass log.info(f'Torch {torch.__version__}') # Check if CUDA is available @@ -48,10 +58,14 @@ def check_torch(): log.warning('Torch reports CUDA not available') else: if torch.version.cuda: - # Log nVidia CUDA and cuDNN versions - log.info( - f'Torch backend: nVidia CUDA {torch.version.cuda} cuDNN {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else "N/A"}' - ) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + # Log Intel IPEX OneAPI version + log.info(f'Torch backend: Intel IPEX {ipex.__version__}') + else: + # Log nVidia CUDA and cuDNN versions + log.info( + f'Torch backend: nVidia CUDA {torch.version.cuda} cuDNN {torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else "N/A"}' + ) elif torch.version.hip: # Log AMD ROCm HIP version log.info(f'Torch backend: AMD ROCm HIP {torch.version.hip}') @@ -62,9 +76,14 @@ def check_torch(): for device in [ torch.cuda.device(i) for i in range(torch.cuda.device_count()) ]: - log.info( - f'Torch detected GPU: {torch.cuda.get_device_name(device)} VRAM {round(torch.cuda.get_device_properties(device).total_memory / 1024 / 1024)} Arch {torch.cuda.get_device_capability(device)} Cores {torch.cuda.get_device_properties(device).multi_processor_count}' - ) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + log.info( + f'Torch detected GPU: Torch detected GPU: {torch.xpu.get_device_name(device)} VRAM {round(torch.xpu.get_device_properties(device).total_memory / 1024 / 1024)} Compute Units {torch.xpu.get_device_properties(device).max_compute_units}' + ) + else: + log.info( + f'Torch detected GPU: {torch.cuda.get_device_name(device)} VRAM {round(torch.cuda.get_device_properties(device).total_memory / 1024 / 1024)} Arch {torch.cuda.get_device_capability(device)} Cores {torch.cuda.get_device_properties(device).multi_processor_count}' + ) return int(torch.__version__[0]) except Exception as e: log.error(f'Could not load torch: {e}') diff --git a/train_controlnet.py b/train_controlnet.py index 988304f62..42da44125 100644 --- a/train_controlnet.py 
+++ b/train_controlnet.py @@ -11,6 +11,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel diff --git a/train_db.py b/train_db.py index 6dde7e9bf..feb147787 100644 --- a/train_db.py +++ b/train_db.py @@ -11,6 +11,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler diff --git a/train_network.py b/train_network.py index f752607e9..200fc2cfe 100644 --- a/train_network.py +++ b/train_network.py @@ -12,6 +12,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import model_util diff --git a/train_textual_inversion.py b/train_textual_inversion.py index b65d524cf..1c7b7fcb2 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -7,6 +7,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from transformers import CLIPTokenizer diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 79c64cbeb..2c5673be1 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -8,6 +8,13 @@ from tqdm import tqdm import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass from accelerate.utils import set_seed import diffusers from diffusers import DDPMScheduler From f8fa3f3de6ed12049fa37e489022791c53f98621 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Fri, 8 Sep 2023 04:34:19 +0300 Subject: [PATCH 02/33] Fix SDP --- gui.sh | 13 +- library/ipex/__init__.py | 296 ++++++++++++++++++------------------ library/ipex/attention.py | 128 ++++++++++++++++ library/ipex/diffusers.py | 146 +----------------- library/ipex/hijacks.py | 11 +- requirements_linux_ipex.txt | 2 +- 6 files changed, 294 insertions(+), 302 deletions(-) create mode 100644 library/ipex/attention.py diff --git a/gui.sh b/gui.sh index 253d8577f..a7980ccb9 100755 --- a/gui.sh +++ b/gui.sh @@ -70,14 +70,19 @@ else fi #Set OneAPI environmet if it's not set by the user -if [[ "$@" == *"--use-ipex"* ]] && [ ! -x "$(command -v sycl-ls)" ] +if [[ "$@" == *"--use-ipex"* ]] then echo "Setting OneAPI environment" - if [[ -z "$ONEAPI_ROOT" ]] + if [ ! 
-x "$(command -v sycl-ls)" ] then - ONEAPI_ROOT=/opt/intel/oneapi + if [[ -z "$ONEAPI_ROOT" ]] + then + ONEAPI_ROOT=/opt/intel/oneapi + fi + source $ONEAPI_ROOT/setvars.sh fi - source $ONEAPI_ROOT/setvars.sh + export NEOReadDebugKeys=1 + export ClDeviceGlobalMemSizeAvailablePercent=100 fi # Validate the requirements and run the script if successful diff --git a/library/ipex/__init__.py b/library/ipex/__init__.py index 3d5f8da16..9ec69012f 100644 --- a/library/ipex/__init__.py +++ b/library/ipex/__init__.py @@ -4,161 +4,167 @@ import torch import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import from .hijacks import ipex_hijacks +from .attention import attention_init # pylint: disable=protected-access, missing-function-docstring, line-too-long def ipex_init(): # pylint: disable=too-many-statements - #Replace cuda with xpu: - torch.cuda.current_device = torch.xpu.current_device - torch.cuda.current_stream = torch.xpu.current_stream - torch.cuda.device = torch.xpu.device - torch.cuda.device_count = torch.xpu.device_count - torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard - torch.cuda.get_device_name = torch.xpu.get_device_name - torch.cuda.get_device_properties = torch.xpu.get_device_properties - torch.cuda.init = torch.xpu.init - torch.cuda.is_available = torch.xpu.is_available - torch.cuda.is_initialized = torch.xpu.is_initialized - torch.cuda.is_current_stream_capturing = lambda: False - torch.cuda.set_device = torch.xpu.set_device - torch.cuda.stream = torch.xpu.stream - torch.cuda.synchronize = torch.xpu.synchronize - torch.cuda.Event = torch.xpu.Event - torch.cuda.Stream = torch.xpu.Stream - torch.cuda.FloatTensor = torch.xpu.FloatTensor - torch.Tensor.cuda = torch.Tensor.xpu - torch.Tensor.is_cuda = torch.Tensor.is_xpu - torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock - torch.cuda._initialized = torch.xpu.lazy_init._initialized - torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker - torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls - torch.cuda._tls = torch.xpu.lazy_init._tls - torch.cuda.threading = torch.xpu.lazy_init.threading - torch.cuda.traceback = torch.xpu.lazy_init.traceback - torch.cuda.Optional = torch.xpu.Optional - torch.cuda.__cached__ = torch.xpu.__cached__ - torch.cuda.__loader__ = torch.xpu.__loader__ - torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage - torch.cuda.Tuple = torch.xpu.Tuple - torch.cuda.streams = torch.xpu.streams - torch.cuda._lazy_new = torch.xpu._lazy_new - torch.cuda.FloatStorage = torch.xpu.FloatStorage - torch.cuda.Any = torch.xpu.Any - torch.cuda.__doc__ = torch.xpu.__doc__ - torch.cuda.default_generators = torch.xpu.default_generators - torch.cuda.HalfTensor = torch.xpu.HalfTensor - torch.cuda._get_device_index = torch.xpu._get_device_index - torch.cuda.__path__ = torch.xpu.__path__ - torch.cuda.Device = torch.xpu.Device - torch.cuda.IntTensor = torch.xpu.IntTensor - torch.cuda.ByteStorage = torch.xpu.ByteStorage - torch.cuda.set_stream = torch.xpu.set_stream - torch.cuda.BoolStorage = torch.xpu.BoolStorage - torch.cuda.os = torch.xpu.os - torch.cuda.torch = torch.xpu.torch - torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage - torch.cuda.Union = torch.xpu.Union - torch.cuda.DoubleTensor = torch.xpu.DoubleTensor - torch.cuda.ShortTensor = torch.xpu.ShortTensor - torch.cuda.LongTensor = torch.xpu.LongTensor - torch.cuda.IntStorage = torch.xpu.IntStorage - torch.cuda.LongStorage = 
torch.xpu.LongStorage - torch.cuda.__annotations__ = torch.xpu.__annotations__ - torch.cuda.__package__ = torch.xpu.__package__ - torch.cuda.__builtins__ = torch.xpu.__builtins__ - torch.cuda.CharTensor = torch.xpu.CharTensor - torch.cuda.List = torch.xpu.List - torch.cuda._lazy_init = torch.xpu._lazy_init - torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor - torch.cuda.DoubleStorage = torch.xpu.DoubleStorage - torch.cuda.ByteTensor = torch.xpu.ByteTensor - torch.cuda.StreamContext = torch.xpu.StreamContext - torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage - torch.cuda.ShortStorage = torch.xpu.ShortStorage - torch.cuda._lazy_call = torch.xpu._lazy_call - torch.cuda.HalfStorage = torch.xpu.HalfStorage - torch.cuda.random = torch.xpu.random - torch.cuda._device = torch.xpu._device - torch.cuda.classproperty = torch.xpu.classproperty - torch.cuda.__name__ = torch.xpu.__name__ - torch.cuda._device_t = torch.xpu._device_t - torch.cuda.warnings = torch.xpu.warnings - torch.cuda.__spec__ = torch.xpu.__spec__ - torch.cuda.BoolTensor = torch.xpu.BoolTensor - torch.cuda.CharStorage = torch.xpu.CharStorage - torch.cuda.__file__ = torch.xpu.__file__ - torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork - #torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing + try: + #Replace cuda with xpu: + torch.cuda.current_device = torch.xpu.current_device + torch.cuda.current_stream = torch.xpu.current_stream + torch.cuda.device = torch.xpu.device + torch.cuda.device_count = torch.xpu.device_count + torch.cuda.device_of = torch.xpu.device_of + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_name = torch.xpu.get_device_name + torch.cuda.get_device_properties = torch.xpu.get_device_properties + torch.cuda.init = torch.xpu.init + torch.cuda.is_available = torch.xpu.is_available + torch.cuda.is_initialized = torch.xpu.is_initialized + torch.cuda.is_current_stream_capturing = lambda: False + torch.cuda.set_device = torch.xpu.set_device + torch.cuda.stream = torch.xpu.stream + torch.cuda.synchronize = torch.xpu.synchronize + torch.cuda.Event = torch.xpu.Event + torch.cuda.Stream = torch.xpu.Stream + torch.cuda.FloatTensor = torch.xpu.FloatTensor + torch.Tensor.cuda = torch.Tensor.xpu + torch.Tensor.is_cuda = torch.Tensor.is_xpu + torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock + torch.cuda._initialized = torch.xpu.lazy_init._initialized + torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker + torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls + torch.cuda._tls = torch.xpu.lazy_init._tls + torch.cuda.threading = torch.xpu.lazy_init.threading + torch.cuda.traceback = torch.xpu.lazy_init.traceback + torch.cuda.Optional = torch.xpu.Optional + torch.cuda.__cached__ = torch.xpu.__cached__ + torch.cuda.__loader__ = torch.xpu.__loader__ + torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage + torch.cuda.Tuple = torch.xpu.Tuple + torch.cuda.streams = torch.xpu.streams + torch.cuda._lazy_new = torch.xpu._lazy_new + torch.cuda.FloatStorage = torch.xpu.FloatStorage + torch.cuda.Any = torch.xpu.Any + torch.cuda.__doc__ = torch.xpu.__doc__ + torch.cuda.default_generators = torch.xpu.default_generators + torch.cuda.HalfTensor = torch.xpu.HalfTensor + torch.cuda._get_device_index = torch.xpu._get_device_index + torch.cuda.__path__ = torch.xpu.__path__ + torch.cuda.Device = torch.xpu.Device + torch.cuda.IntTensor = torch.xpu.IntTensor + torch.cuda.ByteStorage = 
torch.xpu.ByteStorage + torch.cuda.set_stream = torch.xpu.set_stream + torch.cuda.BoolStorage = torch.xpu.BoolStorage + torch.cuda.os = torch.xpu.os + torch.cuda.torch = torch.xpu.torch + torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage + torch.cuda.Union = torch.xpu.Union + torch.cuda.DoubleTensor = torch.xpu.DoubleTensor + torch.cuda.ShortTensor = torch.xpu.ShortTensor + torch.cuda.LongTensor = torch.xpu.LongTensor + torch.cuda.IntStorage = torch.xpu.IntStorage + torch.cuda.LongStorage = torch.xpu.LongStorage + torch.cuda.__annotations__ = torch.xpu.__annotations__ + torch.cuda.__package__ = torch.xpu.__package__ + torch.cuda.__builtins__ = torch.xpu.__builtins__ + torch.cuda.CharTensor = torch.xpu.CharTensor + torch.cuda.List = torch.xpu.List + torch.cuda._lazy_init = torch.xpu._lazy_init + torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor + torch.cuda.DoubleStorage = torch.xpu.DoubleStorage + torch.cuda.ByteTensor = torch.xpu.ByteTensor + torch.cuda.StreamContext = torch.xpu.StreamContext + torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage + torch.cuda.ShortStorage = torch.xpu.ShortStorage + torch.cuda._lazy_call = torch.xpu._lazy_call + torch.cuda.HalfStorage = torch.xpu.HalfStorage + torch.cuda.random = torch.xpu.random + torch.cuda._device = torch.xpu._device + torch.cuda.classproperty = torch.xpu.classproperty + torch.cuda.__name__ = torch.xpu.__name__ + torch.cuda._device_t = torch.xpu._device_t + torch.cuda.warnings = torch.xpu.warnings + torch.cuda.__spec__ = torch.xpu.__spec__ + torch.cuda.BoolTensor = torch.xpu.BoolTensor + torch.cuda.CharStorage = torch.xpu.CharStorage + torch.cuda.__file__ = torch.xpu.__file__ + torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork + #torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing - #Memory: - torch.cuda.memory = torch.xpu.memory - if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): - torch.xpu.empty_cache = lambda: None - torch.cuda.empty_cache = torch.xpu.empty_cache - torch.cuda.memory_stats = torch.xpu.memory_stats - torch.cuda.memory_summary = torch.xpu.memory_summary - torch.cuda.memory_snapshot = torch.xpu.memory_snapshot - torch.cuda.memory_allocated = torch.xpu.memory_allocated - torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated - torch.cuda.memory_reserved = torch.xpu.memory_reserved - torch.cuda.memory_cached = torch.xpu.memory_reserved - torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved - torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved - torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats - torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict - torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats + #Memory: + torch.cuda.memory = torch.xpu.memory + if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): + torch.xpu.empty_cache = lambda: None + torch.cuda.empty_cache = torch.xpu.empty_cache + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_summary = torch.xpu.memory_summary + torch.cuda.memory_snapshot = torch.xpu.memory_snapshot + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = 
torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats - #RNG: - torch.cuda.get_rng_state = torch.xpu.get_rng_state - torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all - torch.cuda.set_rng_state = torch.xpu.set_rng_state - torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all - torch.cuda.manual_seed = torch.xpu.manual_seed - torch.cuda.manual_seed_all = torch.xpu.manual_seed_all - torch.cuda.seed = torch.xpu.seed - torch.cuda.seed_all = torch.xpu.seed_all - torch.cuda.initial_seed = torch.xpu.initial_seed + #RNG: + torch.cuda.get_rng_state = torch.xpu.get_rng_state + torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all + torch.cuda.set_rng_state = torch.xpu.set_rng_state + torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all + torch.cuda.manual_seed = torch.xpu.manual_seed + torch.cuda.manual_seed_all = torch.xpu.manual_seed_all + torch.cuda.seed = torch.xpu.seed + torch.cuda.seed_all = torch.xpu.seed_all + torch.cuda.initial_seed = torch.xpu.initial_seed - #AMP: - torch.cuda.amp = torch.xpu.amp - if not hasattr(torch.cuda.amp, "common"): - torch.cuda.amp.common = contextlib.nullcontext() - torch.cuda.amp.common.amp_definitely_not_available = lambda: False - try: - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught + #AMP: + torch.cuda.amp = torch.xpu.amp + if not hasattr(torch.cuda.amp, "common"): + torch.cuda.amp.common = contextlib.nullcontext() + torch.cuda.amp.common.amp_definitely_not_available = lambda: False try: - from .gradscaler import gradscaler_init # pylint: disable=import-outside-toplevel, import-error - gradscaler_init() torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler except Exception: # pylint: disable=broad-exception-caught - torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + try: + from .gradscaler import gradscaler_init # pylint: disable=import-outside-toplevel, import-error + gradscaler_init() + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - #C - torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream - ipex._C._DeviceProperties.major = 2023 - ipex._C._DeviceProperties.minor = 2 + #C + torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream + ipex._C._DeviceProperties.major = 2023 + ipex._C._DeviceProperties.minor = 2 - #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] - torch._utils._get_available_device_type = lambda: "xpu" - torch.has_cuda = True - torch.cuda.has_half = True - torch.cuda.is_bf16_supported = lambda: True - torch.cuda.is_fp16_supported = lambda: True - torch.version.cuda = "11.7" - torch.cuda.get_device_capability = lambda: [11,7] - torch.cuda.get_device_properties.major = 11 - torch.cuda.get_device_properties.minor = 7 - 
torch.cuda.ipc_collect = lambda: None - torch.cuda.utilization = lambda: 0 + #Fix functions with ipex: + torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] + torch._utils._get_available_device_type = lambda: "xpu" + torch.has_cuda = True + torch.cuda.has_half = True + torch.cuda.is_bf16_supported = lambda *args, **kwargs: True + torch.cuda.is_fp16_supported = lambda *args, **kwargs: True + torch.version.cuda = "11.7" + torch.cuda.get_device_capability = lambda *args, **kwargs: [11,7] + torch.cuda.get_device_properties.major = 11 + torch.cuda.get_device_properties.minor = 7 + torch.cuda.ipc_collect = lambda *args, **kwargs: None + torch.cuda.utilization = lambda *args, **kwargs: 0 - ipex_hijacks() - try: - from .diffusers import ipex_diffusers # pylint: disable=import-outside-toplevel, import-error - ipex_diffusers() - except Exception: # pylint: disable=broad-exception-caught - pass + ipex_hijacks() + attention_init() + try: + from .diffusers import ipex_diffusers + ipex_diffusers() + except Exception: # pylint: disable=broad-exception-caught + pass + except Exception as e: + return False, e + return True, None diff --git a/library/ipex/attention.py b/library/ipex/attention.py new file mode 100644 index 000000000..d7335bfaf --- /dev/null +++ b/library/ipex/attention.py @@ -0,0 +1,128 @@ +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +original_torch_bmm = torch.bmm +def torch_bmm(input, mat2, *, out=None): + if input.dtype != mat2.dtype: + mat2 = mat2.to(input.dtype) + + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] + block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 + block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB + split_slice_size = batch_size_attention + if block_size >= 4000: + do_split = True + #Find something divisible with the input_tokens + while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB + split_2_slice_size = input_tokens + if split_block_size >= 4000: + do_split_2 = True + #Find something divisible with the input_tokens + while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(input.shape[0], input.shape[1], mat2.shape[2], device=input.device, dtype=input.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range(input_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_torch_bmm( + input[start_idx:end_idx, start_idx_2:end_idx_2], + mat2[start_idx:end_idx, 
start_idx_2:end_idx_2], + out=out + ) + else: + hidden_states[start_idx:end_idx] = original_torch_bmm( + input[start_idx:end_idx], + mat2[start_idx:end_idx], + out=out + ) + else: + return original_torch_bmm(input, mat2, out=out) + return hidden_states + +original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention +def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): + #ARC GPUs can't allocate more than 4GB to a single block, Slice it: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 + block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB + split_slice_size = batch_size_attention + if block_size >= 4000: + do_split = True + #Find something divisible with the shape_one + while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB + split_2_slice_size = query_tokens + if split_block_size >= 4000: + do_split_2 = True + #Find something divisible with the batch_size_attention + while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + return original_scaled_dot_product_attention( + query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal + ) + return hidden_states + +def attention_init(): + #ARC GPUs can't allocate more than 4GB to a single block: + torch.bmm = torch_bmm + torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/library/ipex/diffusers.py b/library/ipex/diffusers.py index 18563b061..3435abe14 100644 --- a/library/ipex/diffusers.py +++ b/library/ipex/diffusers.py @@ -1,12 +1,9 @@ import torch import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import torch.nn.functional as F # pylint: disable=ungrouped-imports import diffusers #0.20.2 # pylint: disable=import-error # pylint: 
disable=protected-access, missing-function-docstring, line-too-long -Attention = diffusers.models.attention_processor.Attention - class SlicedAttnProcessor: # pylint: disable=too-few-public-methods r""" Processor for implementing sliced attention. @@ -20,7 +17,7 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods def __init__(self, slice_size): self.slice_size = slice_size - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches + def __call__(self, attn: diffusers.models.attention_processor.Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches residual = hidden_states input_ndim = hidden_states.ndim @@ -116,147 +113,6 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a return hidden_states -class AttnProcessor2_0: # pylint: disable=too-few-public-methods, invalid-name - r""" - Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). - """ - - def __init__(self): - if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") - - def __call__( # pylint: disable=too-many-arguments, too-many-statements, too-many-locals, too-many-branches - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - ): - residual = hidden_states - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - - if attention_mask is not None: - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - # scaled_dot_product_attention expects attention_mask shape to be - # (batch, heads, source_length, target_length) - attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - inner_dim = key.shape[-1] - head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB - split_slice_size = batch_size_attention - if block_size >= 4000: - do_split = True - #Find something divisible with the shape_one - while 
((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB - split_2_slice_size = query_tokens - if split_block_size >= 4000: - do_split_2 = True - #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if do_split: - hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - - query_slice = query[:, start_idx:end_idx, start_idx_2:end_idx_2] - key_slice = key[:, start_idx:end_idx, start_idx_2:end_idx_2] - attn_mask_slice = attention_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None - - attn_slice = F.scaled_dot_product_attention( - query_slice, key_slice, value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=attn_mask_slice, dropout_p=0.0, is_causal=False - ) - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice - else: - query_slice = query[:, start_idx:end_idx] - key_slice = key[:, start_idx:end_idx] - attn_mask_slice = attention_mask[:, start_idx:end_idx] if attention_mask is not None else None - - attn_slice = F.scaled_dot_product_attention( - query_slice, key_slice, value[:, start_idx:end_idx], - attn_mask=attn_mask_slice, dropout_p=0.0, is_causal=False - ) - hidden_states[:, start_idx:end_idx] = attn_slice - else: - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - ) - - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - hidden_states = hidden_states.to(query.dtype) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - def ipex_diffusers(): #ARC GPUs can't allocate more than 4GB to a single block: diffusers.models.attention_processor.SlicedAttnProcessor = SlicedAttnProcessor - diffusers.models.attention_processor.AttnProcessor2_0 = AttnProcessor2_0 diff --git a/library/ipex/hijacks.py b/library/ipex/hijacks.py index f527e8836..78d7e034b 100644 --- a/library/ipex/hijacks.py +++ b/library/ipex/hijacks.py @@ -34,7 +34,7 @@ def __call__(self, *args, **kwargs): _utils = torch.utils.data._utils def _shutdown_workers(self): - if _utils is None or _utils.python_exit_status is True or _utils.python_exit_status is None: + if torch.utils.data._utils is None or torch.utils.data._utils.python_exit_status is True or torch.utils.data._utils.python_exit_status is None: return if hasattr(self, "_shutdown") and 
not self._shutdown: self._shutdown = True @@ -50,13 +50,13 @@ def _shutdown_workers(self): if self._persistent_workers or self._workers_status[worker_id]: self._mark_worker_as_unavailable(worker_id, shutdown=True) for w in self._workers: # pylint: disable=invalid-name - w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL) + w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) for q in self._index_queues: # pylint: disable=invalid-name q.cancel_join_thread() q.close() finally: if self._worker_pids_set: - _utils.signal_handling._remove_worker_pids(id(self)) + torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) self._worker_pids_set = False for w in self._workers: # pylint: disable=invalid-name if w.is_alive(): @@ -84,7 +84,7 @@ def ipex_no_cuda(orig_func, *args, **kwargs): original_autocast = torch.autocast def ipex_autocast(*args, **kwargs): - if len(args) > 1 and args[0] == "cuda": + if len(args) > 0 and args[0] == "cuda": return original_autocast("xpu", *args[1:], **kwargs) else: return original_autocast(*args, **kwargs) @@ -169,9 +169,6 @@ def ipex_hijacks(): CondFunc('torch.nn.modules.conv.Conv2d.forward', lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), lambda orig_func, self, input: input.dtype != self.weight.data.dtype) - CondFunc('torch.bmm', - lambda orig_func, input, mat2, *args, **kwargs: orig_func(input, mat2.to(input.dtype), *args, **kwargs), - lambda orig_func, input, mat2, *args, **kwargs: input.dtype != mat2.dtype) CondFunc('torch.nn.functional.layer_norm', lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs), diff --git a/requirements_linux_ipex.txt b/requirements_linux_ipex.txt index e0fdb1e55..28c0bc6b1 100644 --- a/requirements_linux_ipex.txt +++ b/requirements_linux_ipex.txt @@ -1,3 +1,3 @@ -torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu # no_verify leave this to specify not checking this a verification stage +torch==2.0.1a0+cxx11.abi torchvision==0.15.2a0+cxx11.abi intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu # no_verify leave this to specify not checking this a verification stage tensorboard==2.12.3 tensorflow==2.12.0 intel-extension-for-tensorflow[gpu] -r requirements.txt From f2b6c4603e08f1708c6369b5d0880a5cb9b0f24b Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 11 Sep 2023 23:39:24 +0300 Subject: [PATCH 03/33] Fix finetune --- library/model_util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/library/model_util.py b/library/model_util.py index 860c170b2..00a3c0495 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -4,6 +4,13 @@ import math import os import torch +try: + import intel_extension_for_pytorch as ipex + if torch.xpu.is_available(): + from library.ipex import ipex_init + ipex_init() +except Exception: + pass import diffusers from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline # , UNet2DConditionModel From 15186d5514485b6e2f34b4e7d413533e4e661308 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Thu, 14 Sep 2023 15:54:55 +0300 Subject: [PATCH 04/33] fix diffusers 0.21 lazy import --- library/ipex/diffusers.py | 5 +++-- library/ipex/hijacks.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/library/ipex/diffusers.py 
b/library/ipex/diffusers.py index 3435abe14..4c39896ed 100644 --- a/library/ipex/diffusers.py +++ b/library/ipex/diffusers.py @@ -1,6 +1,7 @@ import torch import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import diffusers #0.20.2 # pylint: disable=import-error +import diffusers #0.21.1 # pylint: disable=import-error +from diffusers.models.attention_processor import Attention # pylint: disable=protected-access, missing-function-docstring, line-too-long @@ -17,7 +18,7 @@ class SlicedAttnProcessor: # pylint: disable=too-few-public-methods def __init__(self, slice_size): self.slice_size = slice_size - def __call__(self, attn: diffusers.models.attention_processor.Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches residual = hidden_states input_ndim = hidden_states.ndim diff --git a/library/ipex/hijacks.py b/library/ipex/hijacks.py index 78d7e034b..77ed5419a 100644 --- a/library/ipex/hijacks.py +++ b/library/ipex/hijacks.py @@ -75,7 +75,7 @@ def check_device(device): return bool((isinstance(device, torch.device) and device.type == "cuda") or (isinstance(device, str) and "cuda" in device) or isinstance(device, int)) def return_xpu(device): - return f"xpu:{device[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device("xpu") if isinstance(device, torch.device) else "xpu" + return f"xpu:{device.split(':')[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device("xpu") if isinstance(device, torch.device) else "xpu" def ipex_no_cuda(orig_func, *args, **kwargs): torch.cuda.is_available = lambda: False From c1a454f95c2200db3d27dbf4b90bf86f8124b6af Mon Sep 17 00:00:00 2001 From: Disty0 Date: Sun, 17 Sep 2023 16:20:11 +0300 Subject: [PATCH 05/33] Update SDPA slice rate --- library/ipex/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/ipex/attention.py b/library/ipex/attention.py index d7335bfaf..fc4ab6e26 100644 --- a/library/ipex/attention.py +++ b/library/ipex/attention.py @@ -65,7 +65,7 @@ def torch_bmm(input, mat2, *, out=None): def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: shape_one, batch_size_attention, query_tokens, shape_four = query.shape - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 + block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB split_slice_size = batch_size_attention if block_size >= 4000: From 97d60da67b991eb130e06eb37d605b24f038a63d Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 18 Sep 2023 15:37:24 +0300 Subject: [PATCH 06/33] Fix SDPA --- library/ipex/attention.py | 52 ++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/library/ipex/attention.py b/library/ipex/attention.py index fc4ab6e26..e38689f21 100644 --- a/library/ipex/attention.py +++ b/library/ipex/attention.py @@ -64,7 +64,13 @@ def torch_bmm(input, mat2, *, out=None): original_scaled_dot_product_attention = 
torch.nn.functional.scaled_dot_product_attention def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape + if len(query.shape) == 3: + batch_size_attention, query_tokens, shape_four = query.shape + shape_one = 1 + no_shape_one = True + else: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + no_shape_one = False block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB split_slice_size = batch_size_attention @@ -101,21 +107,39 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name start_idx_2 = i2 * split_2_slice_size end_idx_2 = (i2 + 1) * split_2_slice_size - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx, start_idx_2:end_idx_2], - key[:, start_idx:end_idx, start_idx_2:end_idx_2], - value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + if no_shape_one: + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[start_idx:end_idx, start_idx_2:end_idx_2], + key[start_idx:end_idx, start_idx_2:end_idx_2], + value[start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + if no_shape_one: + hidden_states[start_idx:end_idx] = original_scaled_dot_product_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + attn_mask=attn_mask[start_idx:end_idx] if attn_mask is not None else attn_mask, + dropout_p=dropout_p, is_causal=is_causal + ) + else: + hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, dropout_p=dropout_p, is_causal=is_causal ) - else: - hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx], - key[:, start_idx:end_idx], - value[:, start_idx:end_idx], - attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) else: return original_scaled_dot_product_attention( query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal From 912c9d0456459f7510d75b07fbecdbd61bddaa64 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Sun, 24 Sep 2023 13:50:12 +0300 Subject: [PATCH 07/33] Remove non GUI changes --- XTI_hijack.py | 7 - fine_tune.py | 7 - gen_img_diffusers.py | 7 - library/ipex/__init__.py | 170 
----------------------- library/ipex/attention.py | 152 --------------------- library/ipex/diffusers.py | 119 ---------------- library/ipex/gradscaler.py | 179 ------------------------ library/ipex/hijacks.py | 196 --------------------------- library/model_util.py | 7 - sdxl_gen_img.py | 7 - sdxl_minimal_inference.py | 7 - sdxl_train.py | 7 - sdxl_train_control_net_lllite.py | 7 - sdxl_train_control_net_lllite_alt.py | 7 - sdxl_train_network.py | 7 - sdxl_train_textual_inversion.py | 7 - train_controlnet.py | 7 - train_db.py | 7 - train_network.py | 7 - train_textual_inversion.py | 7 - train_textual_inversion_XTI.py | 7 - 21 files changed, 928 deletions(-) delete mode 100644 library/ipex/__init__.py delete mode 100644 library/ipex/attention.py delete mode 100644 library/ipex/diffusers.py delete mode 100644 library/ipex/gradscaler.py delete mode 100644 library/ipex/hijacks.py diff --git a/XTI_hijack.py b/XTI_hijack.py index ec0849455..36b5d3f2b 100644 --- a/XTI_hijack.py +++ b/XTI_hijack.py @@ -1,11 +1,4 @@ import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from typing import Union, List, Optional, Dict, Any, Tuple from diffusers.models.unet_2d_condition import UNet2DConditionOutput diff --git a/fine_tune.py b/fine_tune.py index f300d4688..f89e897a8 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -10,13 +10,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from accelerate.utils import set_seed from diffusers import DDPMScheduler diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 0ea66cde2..273c0dd86 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -65,13 +65,6 @@ import diffusers import numpy as np import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass import torchvision from diffusers import ( AutoencoderKL, diff --git a/library/ipex/__init__.py b/library/ipex/__init__.py deleted file mode 100644 index 9ec69012f..000000000 --- a/library/ipex/__init__.py +++ /dev/null @@ -1,170 +0,0 @@ -import os -import sys -import contextlib -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -from .hijacks import ipex_hijacks -from .attention import attention_init - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -def ipex_init(): # pylint: disable=too-many-statements - try: - #Replace cuda with xpu: - torch.cuda.current_device = torch.xpu.current_device - torch.cuda.current_stream = torch.xpu.current_stream - torch.cuda.device = torch.xpu.device - torch.cuda.device_count = torch.xpu.device_count - torch.cuda.device_of = torch.xpu.device_of - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard - torch.cuda.get_device_name = torch.xpu.get_device_name - torch.cuda.get_device_properties = torch.xpu.get_device_properties - torch.cuda.init = torch.xpu.init - torch.cuda.is_available = torch.xpu.is_available - torch.cuda.is_initialized = torch.xpu.is_initialized - torch.cuda.is_current_stream_capturing = lambda: False - torch.cuda.set_device = torch.xpu.set_device - torch.cuda.stream = torch.xpu.stream - torch.cuda.synchronize = torch.xpu.synchronize - torch.cuda.Event = torch.xpu.Event - 
torch.cuda.Stream = torch.xpu.Stream - torch.cuda.FloatTensor = torch.xpu.FloatTensor - torch.Tensor.cuda = torch.Tensor.xpu - torch.Tensor.is_cuda = torch.Tensor.is_xpu - torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock - torch.cuda._initialized = torch.xpu.lazy_init._initialized - torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker - torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls - torch.cuda._tls = torch.xpu.lazy_init._tls - torch.cuda.threading = torch.xpu.lazy_init.threading - torch.cuda.traceback = torch.xpu.lazy_init.traceback - torch.cuda.Optional = torch.xpu.Optional - torch.cuda.__cached__ = torch.xpu.__cached__ - torch.cuda.__loader__ = torch.xpu.__loader__ - torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage - torch.cuda.Tuple = torch.xpu.Tuple - torch.cuda.streams = torch.xpu.streams - torch.cuda._lazy_new = torch.xpu._lazy_new - torch.cuda.FloatStorage = torch.xpu.FloatStorage - torch.cuda.Any = torch.xpu.Any - torch.cuda.__doc__ = torch.xpu.__doc__ - torch.cuda.default_generators = torch.xpu.default_generators - torch.cuda.HalfTensor = torch.xpu.HalfTensor - torch.cuda._get_device_index = torch.xpu._get_device_index - torch.cuda.__path__ = torch.xpu.__path__ - torch.cuda.Device = torch.xpu.Device - torch.cuda.IntTensor = torch.xpu.IntTensor - torch.cuda.ByteStorage = torch.xpu.ByteStorage - torch.cuda.set_stream = torch.xpu.set_stream - torch.cuda.BoolStorage = torch.xpu.BoolStorage - torch.cuda.os = torch.xpu.os - torch.cuda.torch = torch.xpu.torch - torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage - torch.cuda.Union = torch.xpu.Union - torch.cuda.DoubleTensor = torch.xpu.DoubleTensor - torch.cuda.ShortTensor = torch.xpu.ShortTensor - torch.cuda.LongTensor = torch.xpu.LongTensor - torch.cuda.IntStorage = torch.xpu.IntStorage - torch.cuda.LongStorage = torch.xpu.LongStorage - torch.cuda.__annotations__ = torch.xpu.__annotations__ - torch.cuda.__package__ = torch.xpu.__package__ - torch.cuda.__builtins__ = torch.xpu.__builtins__ - torch.cuda.CharTensor = torch.xpu.CharTensor - torch.cuda.List = torch.xpu.List - torch.cuda._lazy_init = torch.xpu._lazy_init - torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor - torch.cuda.DoubleStorage = torch.xpu.DoubleStorage - torch.cuda.ByteTensor = torch.xpu.ByteTensor - torch.cuda.StreamContext = torch.xpu.StreamContext - torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage - torch.cuda.ShortStorage = torch.xpu.ShortStorage - torch.cuda._lazy_call = torch.xpu._lazy_call - torch.cuda.HalfStorage = torch.xpu.HalfStorage - torch.cuda.random = torch.xpu.random - torch.cuda._device = torch.xpu._device - torch.cuda.classproperty = torch.xpu.classproperty - torch.cuda.__name__ = torch.xpu.__name__ - torch.cuda._device_t = torch.xpu._device_t - torch.cuda.warnings = torch.xpu.warnings - torch.cuda.__spec__ = torch.xpu.__spec__ - torch.cuda.BoolTensor = torch.xpu.BoolTensor - torch.cuda.CharStorage = torch.xpu.CharStorage - torch.cuda.__file__ = torch.xpu.__file__ - torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork - #torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing - - #Memory: - torch.cuda.memory = torch.xpu.memory - if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): - torch.xpu.empty_cache = lambda: None - torch.cuda.empty_cache = torch.xpu.empty_cache - torch.cuda.memory_stats = torch.xpu.memory_stats - torch.cuda.memory_summary = torch.xpu.memory_summary - torch.cuda.memory_snapshot = 
torch.xpu.memory_snapshot - torch.cuda.memory_allocated = torch.xpu.memory_allocated - torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated - torch.cuda.memory_reserved = torch.xpu.memory_reserved - torch.cuda.memory_cached = torch.xpu.memory_reserved - torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved - torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved - torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats - torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict - torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats - - #RNG: - torch.cuda.get_rng_state = torch.xpu.get_rng_state - torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all - torch.cuda.set_rng_state = torch.xpu.set_rng_state - torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all - torch.cuda.manual_seed = torch.xpu.manual_seed - torch.cuda.manual_seed_all = torch.xpu.manual_seed_all - torch.cuda.seed = torch.xpu.seed - torch.cuda.seed_all = torch.xpu.seed_all - torch.cuda.initial_seed = torch.xpu.initial_seed - - #AMP: - torch.cuda.amp = torch.xpu.amp - if not hasattr(torch.cuda.amp, "common"): - torch.cuda.amp.common = contextlib.nullcontext() - torch.cuda.amp.common.amp_definitely_not_available = lambda: False - try: - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught - try: - from .gradscaler import gradscaler_init # pylint: disable=import-outside-toplevel, import-error - gradscaler_init() - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught - torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - - #C - torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream - ipex._C._DeviceProperties.major = 2023 - ipex._C._DeviceProperties.minor = 2 - - #Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_allocated(device)), torch.xpu.get_device_properties(device).total_memory] - torch._utils._get_available_device_type = lambda: "xpu" - torch.has_cuda = True - torch.cuda.has_half = True - torch.cuda.is_bf16_supported = lambda *args, **kwargs: True - torch.cuda.is_fp16_supported = lambda *args, **kwargs: True - torch.version.cuda = "11.7" - torch.cuda.get_device_capability = lambda *args, **kwargs: [11,7] - torch.cuda.get_device_properties.major = 11 - torch.cuda.get_device_properties.minor = 7 - torch.cuda.ipc_collect = lambda *args, **kwargs: None - torch.cuda.utilization = lambda *args, **kwargs: 0 - - ipex_hijacks() - attention_init() - try: - from .diffusers import ipex_diffusers - ipex_diffusers() - except Exception: # pylint: disable=broad-exception-caught - pass - except Exception as e: - return False, e - return True, None diff --git a/library/ipex/attention.py b/library/ipex/attention.py deleted file mode 100644 index e38689f21..000000000 --- a/library/ipex/attention.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -original_torch_bmm = torch.bmm -def torch_bmm(input, mat2, *, out=None): - if input.dtype != mat2.dtype: - mat2 = mat2.to(input.dtype) - - 
#ARC GPUs can't allocate more than 4GB to a single block, Slice it: - batch_size_attention, input_tokens, mat2_shape = input.shape[0], input.shape[1], mat2.shape[2] - block_multiply = 2.4 if input.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * input_tokens * mat2_shape) / 1024 * block_multiply #MB - split_slice_size = batch_size_attention - if block_size >= 4000: - do_split = True - #Find something divisible with the input_tokens - while ((split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply) > 4000: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_block_size = (split_slice_size * input_tokens * mat2_shape) / 1024 * block_multiply #MB - split_2_slice_size = input_tokens - if split_block_size >= 4000: - do_split_2 = True - #Find something divisible with the input_tokens - while ((split_slice_size * split_2_slice_size * mat2_shape) / 1024 * block_multiply) > 4000: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if do_split: - hidden_states = torch.zeros(input.shape[0], input.shape[1], mat2.shape[2], device=input.device, dtype=input.dtype) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range(input_tokens // split_2_slice_size): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_torch_bmm( - input[start_idx:end_idx, start_idx_2:end_idx_2], - mat2[start_idx:end_idx, start_idx_2:end_idx_2], - out=out - ) - else: - hidden_states[start_idx:end_idx] = original_torch_bmm( - input[start_idx:end_idx], - mat2[start_idx:end_idx], - out=out - ) - else: - return original_torch_bmm(input, mat2, out=out) - return hidden_states - -original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention -def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False): - #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - if len(query.shape) == 3: - batch_size_attention, query_tokens, shape_four = query.shape - shape_one = 1 - no_shape_one = True - else: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape - no_shape_one = False - block_multiply = 3.6 if query.dtype == torch.float32 else 1.8 - block_size = (shape_one * batch_size_attention * query_tokens * shape_four) / 1024 * block_multiply #MB - split_slice_size = batch_size_attention - if block_size >= 4000: - do_split = True - #Find something divisible with the shape_one - while ((shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply) > 4000: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_block_size = (shape_one * split_slice_size * query_tokens * shape_four) / 1024 * block_multiply #MB - split_2_slice_size = query_tokens - if split_block_size >= 4000: - do_split_2 = True - #Find something divisible with the batch_size_attention - while ((shape_one * split_slice_size * split_2_slice_size * shape_four) / 1024 * block_multiply) > 4000: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if 
do_split: - hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - if no_shape_one: - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( - query[start_idx:end_idx, start_idx_2:end_idx_2], - key[start_idx:end_idx, start_idx_2:end_idx_2], - value[start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) - else: - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx, start_idx_2:end_idx_2], - key[:, start_idx:end_idx, start_idx_2:end_idx_2], - value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=attn_mask[:, start_idx:end_idx, start_idx_2:end_idx_2] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) - else: - if no_shape_one: - hidden_states[start_idx:end_idx] = original_scaled_dot_product_attention( - query[start_idx:end_idx], - key[start_idx:end_idx], - value[start_idx:end_idx], - attn_mask=attn_mask[start_idx:end_idx] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) - else: - hidden_states[:, start_idx:end_idx] = original_scaled_dot_product_attention( - query[:, start_idx:end_idx], - key[:, start_idx:end_idx], - value[:, start_idx:end_idx], - attn_mask=attn_mask[:, start_idx:end_idx] if attn_mask is not None else attn_mask, - dropout_p=dropout_p, is_causal=is_causal - ) - else: - return original_scaled_dot_product_attention( - query, key, value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal - ) - return hidden_states - -def attention_init(): - #ARC GPUs can't allocate more than 4GB to a single block: - torch.bmm = torch_bmm - torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/library/ipex/diffusers.py b/library/ipex/diffusers.py deleted file mode 100644 index 4c39896ed..000000000 --- a/library/ipex/diffusers.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import diffusers #0.21.1 # pylint: disable=import-error -from diffusers.models.attention_processor import Attention - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -class SlicedAttnProcessor: # pylint: disable=too-few-public-methods - r""" - Processor for implementing sliced attention. - - Args: - slice_size (`int`, *optional*): - The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and - `attention_head_dim` must be a multiple of the `slice_size`. 
- """ - - def __init__(self, slice_size): - self.slice_size = slice_size - - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): # pylint: disable=too-many-statements, too-many-locals, too-many-branches - residual = hidden_states - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) - dim = query.shape[-1] - query = attn.head_to_batch_dim(query) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - batch_size_attention, query_tokens, shape_three = query.shape - hidden_states = torch.zeros( - (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype - ) - - #ARC GPUs can't allocate more than 4GB to a single block, Slice it: - block_multiply = 2.4 if query.dtype == torch.float32 else 1.2 - block_size = (batch_size_attention * query_tokens * shape_three) / 1024 * block_multiply #MB - split_2_slice_size = query_tokens - if block_size >= 4000: - do_split_2 = True - #Find something divisible with the query_tokens - while ((self.slice_size * split_2_slice_size * shape_three) / 1024 * block_multiply) > 4000: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - for i in range(batch_size_attention // self.slice_size): - start_idx = i * self.slice_size - end_idx = (i + 1) * self.slice_size - - if do_split_2: - for i2 in range(query_tokens // split_2_slice_size): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - - query_slice = query[start_idx:end_idx, start_idx_2:end_idx_2] - key_slice = key[start_idx:end_idx, start_idx_2:end_idx_2] - attn_mask_slice = attention_mask[start_idx:end_idx, start_idx_2:end_idx_2] if attention_mask is not None else None - - attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) - attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx, start_idx_2:end_idx_2]) - - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = attn_slice - else: - query_slice = query[start_idx:end_idx] - key_slice = key[start_idx:end_idx] - attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - - attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) - - attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) - - hidden_states[start_idx:end_idx] = attn_slice - - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, 
width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - -def ipex_diffusers(): - #ARC GPUs can't allocate more than 4GB to a single block: - diffusers.models.attention_processor.SlicedAttnProcessor = SlicedAttnProcessor diff --git a/library/ipex/gradscaler.py b/library/ipex/gradscaler.py deleted file mode 100644 index 530212101..000000000 --- a/library/ipex/gradscaler.py +++ /dev/null @@ -1,179 +0,0 @@ -from collections import defaultdict -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -OptState = ipex.cpu.autocast._grad_scaler.OptState -_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator -_refresh_per_optimizer_state = ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state - -def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16): # pylint: disable=unused-argument - per_device_inv_scale = _MultiDeviceReplicator(inv_scale) - per_device_found_inf = _MultiDeviceReplicator(found_inf) - - # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. - # There could be hundreds of grads, so we'd like to iterate through them just once. - # However, we don't know their devices or dtypes in advance. - - # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict - # Google says mypy struggles with defaultdicts type annotations. - per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] - # sync grad to master weight - if hasattr(optimizer, "sync_grad"): - optimizer.sync_grad() - with torch.no_grad(): - for group in optimizer.param_groups: - for param in group["params"]: - if param.grad is None: - continue - if (not allow_fp16) and param.grad.dtype == torch.float16: - raise ValueError("Attempting to unscale FP16 gradients.") - if param.grad.is_sparse: - # is_coalesced() == False means the sparse grad has values with duplicate indices. - # coalesce() deduplicates indices and adds all values that have the same index. - # For scaled fp16 values, there's a good chance coalescing will cause overflow, - # so we should check the coalesced _values(). - if param.grad.dtype is torch.float16: - param.grad = param.grad.coalesce() - to_unscale = param.grad._values() - else: - to_unscale = param.grad - - # -: is there a way to split by device and dtype without appending in the inner loop? - to_unscale = to_unscale.to("cpu") - per_device_and_dtype_grads[to_unscale.device][ - to_unscale.dtype - ].append(to_unscale) - - for _, per_dtype_grads in per_device_and_dtype_grads.items(): - for grads in per_dtype_grads.values(): - core._amp_foreach_non_finite_check_and_unscale_( - grads, - per_device_found_inf.get("cpu"), - per_device_inv_scale.get("cpu"), - ) - - return per_device_found_inf._per_device_tensors - -def unscale_(self, optimizer): - """ - Divides ("unscales") the optimizer's gradient tensors by the scale factor. - :meth:`unscale_` is optional, serving cases where you need to - :ref:`modify or inspect gradients` - between the backward pass(es) and :meth:`step`. - If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. 
- Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: - ... - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - scaler.step(optimizer) - scaler.update() - Args: - optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. - .. warning:: - :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, - and only after all gradients for that optimizer's assigned parameters have been accumulated. - Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. - .. warning:: - :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. - """ - if not self._enabled: - return - - self._check_scale_growth_tracker("unscale_") - - optimizer_state = self._per_optimizer_states[id(optimizer)] - - if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise - raise RuntimeError( - "unscale_() has already been called on this optimizer since the last update()." - ) - elif optimizer_state["stage"] is OptState.STEPPED: - raise RuntimeError("unscale_() is being called after step().") - - # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. - assert self._scale is not None - inv_scale = self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) - found_inf = torch.full( - (1,), 0.0, dtype=torch.float32, device=self._scale.device - ) - - optimizer_state["found_inf_per_device"] = self._unscale_grads_( - optimizer, inv_scale, found_inf, False - ) - optimizer_state["stage"] = OptState.UNSCALED - -def update(self, new_scale=None): - """ - Updates the scale factor. - If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` - to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, - the scale is multiplied by ``growth_factor`` to increase it. - Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not - used directly, it's used to fill GradScaler's internal scale tensor. So if - ``new_scale`` was a tensor, later in-place changes to that tensor will not further - affect the scale GradScaler uses internally.) - Args: - new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. - .. warning:: - :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has - been invoked for all optimizers used this iteration. - """ - if not self._enabled: - return - - _scale, _growth_tracker = self._check_scale_growth_tracker("update") - - if new_scale is not None: - # Accept a new user-defined scale. - if isinstance(new_scale, float): - self._scale.fill_(new_scale) # type: ignore[union-attr] - else: - reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." - assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] - assert new_scale.numel() == 1, reason - assert new_scale.requires_grad is False, reason - self._scale.copy_(new_scale) # type: ignore[union-attr] - else: - # Consume shared inf/nan data collected from optimizers to update the scale. - # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. 
- found_infs = [ - found_inf.to(device="cpu", non_blocking=True) - for state in self._per_optimizer_states.values() - for found_inf in state["found_inf_per_device"].values() - ] - - assert len(found_infs) > 0, "No inf checks were recorded prior to update." - - found_inf_combined = found_infs[0] - if len(found_infs) > 1: - for i in range(1, len(found_infs)): - found_inf_combined += found_infs[i] - - to_device = _scale.device - _scale = _scale.to("cpu") - _growth_tracker = _growth_tracker.to("cpu") - - core._amp_update_scale_( - _scale, - _growth_tracker, - found_inf_combined, - self._growth_factor, - self._backoff_factor, - self._growth_interval, - ) - - _scale = _scale.to(to_device) - _growth_tracker = _growth_tracker.to(to_device) - # To prepare for next iteration, clear the data collected from optimizers this iteration. - self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) - -def gradscaler_init(): - torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ - torch.xpu.amp.GradScaler.unscale_ = unscale_ - torch.xpu.amp.GradScaler.update = update - return torch.xpu.amp.GradScaler diff --git a/library/ipex/hijacks.py b/library/ipex/hijacks.py deleted file mode 100644 index 77ed5419a..000000000 --- a/library/ipex/hijacks.py +++ /dev/null @@ -1,196 +0,0 @@ -import contextlib -import importlib -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return - -class CondFunc: # pylint: disable=missing-class-docstring - def __new__(cls, orig_func, sub_func, cond_func): - self = super(CondFunc, cls).__new__(cls) - if isinstance(orig_func, str): - func_path = orig_func.split('.') - for i in range(len(func_path)-1, -1, -1): - try: - resolved_obj = importlib.import_module('.'.join(func_path[:i])) - break - except ImportError: - pass - for attr_name in func_path[i:-1]: - resolved_obj = getattr(resolved_obj, attr_name) - orig_func = getattr(resolved_obj, func_path[-1]) - setattr(resolved_obj, func_path[-1], lambda *args, **kwargs: self(*args, **kwargs)) - self.__init__(orig_func, sub_func, cond_func) - return lambda *args, **kwargs: self(*args, **kwargs) - def __init__(self, orig_func, sub_func, cond_func): - self.__orig_func = orig_func - self.__sub_func = sub_func - self.__cond_func = cond_func - def __call__(self, *args, **kwargs): - if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): - return self.__sub_func(self.__orig_func, *args, **kwargs) - else: - return self.__orig_func(*args, **kwargs) - -_utils = torch.utils.data._utils -def _shutdown_workers(self): - if torch.utils.data._utils is None or torch.utils.data._utils.python_exit_status is True or torch.utils.data._utils.python_exit_status is None: - return - if hasattr(self, "_shutdown") and not self._shutdown: - self._shutdown = True - try: - if hasattr(self, '_pin_memory_thread'): - self._pin_memory_thread_done_event.set() - self._worker_result_queue.put((None, None)) - self._pin_memory_thread.join() - self._worker_result_queue.cancel_join_thread() - self._worker_result_queue.close() - self._workers_done_event.set() - for worker_id in range(len(self._workers)): - if self._persistent_workers or self._workers_status[worker_id]: - self._mark_worker_as_unavailable(worker_id, shutdown=True) - for w in self._workers: # pylint: disable=invalid-name - 
w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) - for q in self._index_queues: # pylint: disable=invalid-name - q.cancel_join_thread() - q.close() - finally: - if self._worker_pids_set: - torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) - self._worker_pids_set = False - for w in self._workers: # pylint: disable=invalid-name - if w.is_alive(): - w.terminate() - -class DummyDataParallel(torch.nn.Module): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods - def __new__(cls, module, device_ids=None, output_device=None, dim=0): # pylint: disable=unused-argument - if isinstance(device_ids, list) and len(device_ids) > 1: - print("IPEX backend doesn't support DataParallel on multiple XPU devices") - return module.to("xpu") - -def return_null_context(*args, **kwargs): # pylint: disable=unused-argument - return contextlib.nullcontext() - -def check_device(device): - return bool((isinstance(device, torch.device) and device.type == "cuda") or (isinstance(device, str) and "cuda" in device) or isinstance(device, int)) - -def return_xpu(device): - return f"xpu:{device.split(':')[-1]}" if isinstance(device, str) and ":" in device else f"xpu:{device}" if isinstance(device, int) else torch.device("xpu") if isinstance(device, torch.device) else "xpu" - -def ipex_no_cuda(orig_func, *args, **kwargs): - torch.cuda.is_available = lambda: False - orig_func(*args, **kwargs) - torch.cuda.is_available = torch.xpu.is_available - -original_autocast = torch.autocast -def ipex_autocast(*args, **kwargs): - if len(args) > 0 and args[0] == "cuda": - return original_autocast("xpu", *args[1:], **kwargs) - else: - return original_autocast(*args, **kwargs) - -original_torch_cat = torch.cat -def torch_cat(tensor, *args, **kwargs): - if len(tensor) == 3 and (tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype): - return original_torch_cat([tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], *args, **kwargs) - else: - return original_torch_cat(tensor, *args, **kwargs) - -original_interpolate = torch.nn.functional.interpolate -def interpolate(tensor, size=None, scale_factor=None, mode='nearest', align_corners=None, recompute_scale_factor=None, antialias=False): # pylint: disable=too-many-arguments - if antialias or align_corners is not None: - return_device = tensor.device - return_dtype = tensor.dtype - return original_interpolate(tensor.to("cpu", dtype=torch.float32), size=size, scale_factor=scale_factor, mode=mode, - align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias).to(return_device, dtype=return_dtype) - else: - return original_interpolate(tensor, size=size, scale_factor=scale_factor, mode=mode, - align_corners=align_corners, recompute_scale_factor=recompute_scale_factor, antialias=antialias) - -original_linalg_solve = torch.linalg.solve -def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name - if A.device != torch.device("cpu") or B.device != torch.device("cpu"): - return_device = A.device - return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to(return_device) - else: - return original_linalg_solve(A, B, *args, **kwargs) - -def ipex_hijacks(): - CondFunc('torch.Tensor.to', - lambda orig_func, self, device=None, *args, **kwargs: orig_func(self, return_xpu(device), *args, **kwargs), - lambda orig_func, self, device=None, *args, **kwargs: check_device(device)) - CondFunc('torch.Tensor.cuda', - lambda orig_func, self, device=None, 
*args, **kwargs: orig_func(self, return_xpu(device), *args, **kwargs), - lambda orig_func, self, device=None, *args, **kwargs: check_device(device)) - CondFunc('torch.empty', - lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), - lambda orig_func, *args, device=None, **kwargs: check_device(device)) - CondFunc('torch.load', - lambda orig_func, *args, map_location=None, **kwargs: orig_func(*args, return_xpu(map_location), **kwargs), - lambda orig_func, *args, map_location=None, **kwargs: map_location is None or check_device(map_location)) - CondFunc('torch.randn', - lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), - lambda orig_func, *args, device=None, **kwargs: check_device(device)) - CondFunc('torch.ones', - lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), - lambda orig_func, *args, device=None, **kwargs: check_device(device)) - CondFunc('torch.zeros', - lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), - lambda orig_func, *args, device=None, **kwargs: check_device(device)) - CondFunc('torch.tensor', - lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), - lambda orig_func, *args, device=None, **kwargs: check_device(device)) - CondFunc('torch.linspace', - lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device=return_xpu(device), **kwargs), - lambda orig_func, *args, device=None, **kwargs: check_device(device)) - - CondFunc('torch.Generator', - lambda orig_func, device=None: torch.xpu.Generator(device), - lambda orig_func, device=None: device is not None and device != torch.device("cpu") and device != "cpu") - - CondFunc('torch.batch_norm', - lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, - weight if weight is not None else torch.ones(input.size()[1], device=input.device), - bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), - lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) - CondFunc('torch.instance_norm', - lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(input, - weight if weight is not None else torch.ones(input.size()[1], device=input.device), - bias if bias is not None else torch.zeros(input.size()[1], device=input.device), *args, **kwargs), - lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu")) - - #Functions with dtype errors: - CondFunc('torch.nn.modules.GroupNorm.forward', - lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype) - CondFunc('torch.nn.modules.linear.Linear.forward', - lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype) - CondFunc('torch.nn.modules.conv.Conv2d.forward', - lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype) - CondFunc('torch.nn.functional.layer_norm', - lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: - orig_func(input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs), - lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: - weight is not None and input.dtype != 
weight.data.dtype) - - #Diffusers Float64 (ARC GPUs doesn't support double or Float64): - if not torch.xpu.has_fp64_dtype(): - CondFunc('torch.from_numpy', - lambda orig_func, ndarray: orig_func(ndarray.astype('float32')), - lambda orig_func, ndarray: ndarray.dtype == float) - - #Broken functions when torch.cuda.is_available is True: - CondFunc('torch.utils.data.dataloader._BaseDataLoaderIter.__init__', - lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs), - lambda orig_func, *args, **kwargs: True) - - #Functions that make compile mad with CondFunc: - torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = _shutdown_workers - torch.nn.DataParallel = DummyDataParallel - torch.autocast = ipex_autocast - torch.cat = torch_cat - torch.linalg.solve = linalg_solve - torch.nn.functional.interpolate = interpolate - torch.backends.cuda.sdp_kernel = return_null_context diff --git a/library/model_util.py b/library/model_util.py index 00a3c0495..860c170b2 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -4,13 +4,6 @@ import math import os import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass import diffusers from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline # , UNet2DConditionModel diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py index f0d308511..c506ad3fc 100755 --- a/sdxl_gen_img.py +++ b/sdxl_gen_img.py @@ -17,13 +17,6 @@ import diffusers import numpy as np import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass import torchvision from diffusers import ( AutoencoderKL, diff --git a/sdxl_minimal_inference.py b/sdxl_minimal_inference.py index ff865629e..5c8a0bd89 100644 --- a/sdxl_minimal_inference.py +++ b/sdxl_minimal_inference.py @@ -9,13 +9,6 @@ from einops import repeat import numpy as np import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from tqdm import tqdm from transformers import CLIPTokenizer from diffusers import EulerDiscreteScheduler diff --git a/sdxl_train.py b/sdxl_train.py index 6b255d679..195467b00 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -10,13 +10,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import sdxl_model_util diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index f8169bdbf..09cf16438 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -11,13 +11,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel diff --git a/sdxl_train_control_net_lllite_alt.py b/sdxl_train_control_net_lllite_alt.py index 61ebfb581..757194a10 100644 --- 
a/sdxl_train_control_net_lllite_alt.py +++ b/sdxl_train_control_net_lllite_alt.py @@ -14,13 +14,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed import accelerate diff --git a/sdxl_train_network.py b/sdxl_train_network.py index 2de57c0ac..8d3a81c3a 100644 --- a/sdxl_train_network.py +++ b/sdxl_train_network.py @@ -1,12 +1,5 @@ import argparse import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from library import sdxl_model_util, sdxl_train_util, train_util import train_network diff --git a/sdxl_train_textual_inversion.py b/sdxl_train_textual_inversion.py index f5cca17b2..123ca35a1 100644 --- a/sdxl_train_textual_inversion.py +++ b/sdxl_train_textual_inversion.py @@ -3,13 +3,6 @@ import regex import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass import open_clip from library import sdxl_model_util, sdxl_train_util, train_util diff --git a/train_controlnet.py b/train_controlnet.py index 42da44125..988304f62 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -11,13 +11,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel diff --git a/train_db.py b/train_db.py index feb147787..6dde7e9bf 100644 --- a/train_db.py +++ b/train_db.py @@ -11,13 +11,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from accelerate.utils import set_seed from diffusers import DDPMScheduler diff --git a/train_network.py b/train_network.py index 200fc2cfe..f752607e9 100644 --- a/train_network.py +++ b/train_network.py @@ -12,13 +12,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from library import model_util diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 1c7b7fcb2..b65d524cf 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -7,13 +7,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except Exception: - pass from accelerate.utils import set_seed from diffusers import DDPMScheduler from transformers import CLIPTokenizer diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 2c5673be1..79c64cbeb 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -8,13 +8,6 @@ from tqdm import tqdm import torch -try: - import intel_extension_for_pytorch as ipex - if torch.xpu.is_available(): - from library.ipex import ipex_init - ipex_init() -except 
Exception: - pass from accelerate.utils import set_seed import diffusers from diffusers import DDPMScheduler From c8fcfd45815e6d4698c96b35e8855d3fc9801952 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 1 Oct 2023 21:48:50 +0900 Subject: [PATCH 08/33] Add "venv" to extend-exclude --- _typos.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_typos.toml b/_typos.toml index 396ee5c57..51b55b3ca 100644 --- a/_typos.toml +++ b/_typos.toml @@ -12,4 +12,4 @@ nd="nd" [files] -extend-exclude = ["_typos.toml"] +extend-exclude = ["_typos.toml", "venv"] From 27f9b6ffeb98645ce9972acea46a9726605753b7 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 1 Oct 2023 21:51:24 +0900 Subject: [PATCH 09/33] updated typos to v1.16.15 and fix typos --- .github/workflows/typos.yml | 2 +- _typos.toml | 18 ++++++++++++++++++ fine_tune.py | 6 +++--- gen_img_diffusers.py | 6 +++--- library/original_unet.py | 2 +- library/sdxl_original_unet.py | 2 +- library/train_util.py | 2 +- sdxl_gen_img.py | 6 +++--- sdxl_minimal_inference.py | 2 +- sdxl_train.py | 6 +++--- sdxl_train_control_net_lllite.py | 6 +++--- sdxl_train_control_net_lllite_old.py | 6 +++--- tools/cache_latents.py | 6 +++--- tools/cache_text_encoder_outputs.py | 6 +++--- train_controlnet.py | 6 +++--- train_db.py | 6 +++--- train_network.py | 8 ++++---- train_textual_inversion.py | 6 +++--- train_textual_inversion_XTI.py | 6 +++--- 19 files changed, 63 insertions(+), 45 deletions(-) diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index e37838390..90132c334 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -18,4 +18,4 @@ jobs: - uses: actions/checkout@v3 - name: typos-action - uses: crate-ci/typos@v1.13.10 + uses: crate-ci/typos@v1.16.15 diff --git a/_typos.toml b/_typos.toml index 51b55b3ca..ae9e06b18 100644 --- a/_typos.toml +++ b/_typos.toml @@ -9,6 +9,24 @@ parms="parms" nin="nin" extention="extention" # Intentionally left nd="nd" +shs="shs" +sts="sts" +scs="scs" +cpc="cpc" +coc="coc" +cic="cic" +msm="msm" +usu="usu" +ici="ici" +lvl="lvl" +dii="dii" +muk="muk" +ori="ori" +hru="hru" +rik="rik" +koo="koo" +yos="yos" +wn="wn" [files] diff --git a/fine_tune.py b/fine_tune.py index f300d4688..2ecb4ff36 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -80,8 +80,8 @@ def train(args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.debug_dataset: train_util.debug_dataset(train_dataset_group) @@ -208,7 +208,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 70ca67942..0ec683a23 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -3364,7 +3364,7 @@ def setup_parser() -> argparse.ArgumentParser: ) parser.add_argument("--network_mul", type=float, default=None, nargs="*", help="additional network multiplier / 追加ネットワークの効果の倍率") parser.add_argument( - "--network_args", type=str, default=None, nargs="*", help="additional argmuments for network (key=value) / ネットワークへの追加の引数" + 
"--network_args", type=str, default=None, nargs="*", help="additional arguments for network (key=value) / ネットワークへの追加の引数" ) parser.add_argument("--network_show_meta", action="store_true", help="show metadata of network model / ネットワークモデルのメタデータを表示する") parser.add_argument("--network_merge", action="store_true", help="merge network weights to original model / ネットワークの重みをマージする") @@ -3390,7 +3390,7 @@ def setup_parser() -> argparse.ArgumentParser: "--max_embeddings_multiples", type=int, default=None, - help="max embeding multiples, max token length is 75 * multiples / トークン長をデフォルトの何倍とするか 75*この値 がトークン長となる", + help="max embedding multiples, max token length is 75 * multiples / トークン長をデフォルトの何倍とするか 75*この値 がトークン長となる", ) parser.add_argument( "--clip_guidance_scale", @@ -3449,7 +3449,7 @@ def setup_parser() -> argparse.ArgumentParser: "--highres_fix_upscaler_args", type=str, default=None, - help="additional argmuments for upscaler (key=value) / upscalerへの追加の引数", + help="additional arguments for upscaler (key=value) / upscalerへの追加の引数", ) parser.add_argument( "--highres_fix_disable_control_net", diff --git a/library/original_unet.py b/library/original_unet.py index c0028ddc2..240b85951 100644 --- a/library/original_unet.py +++ b/library/original_unet.py @@ -131,7 +131,7 @@ UP_BLOCK_TYPES = ["UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"] -# region memory effcient attention +# region memory efficient attention # FlashAttentionを使うCrossAttention # based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py diff --git a/library/sdxl_original_unet.py b/library/sdxl_original_unet.py index 586909bdb..26a0af319 100644 --- a/library/sdxl_original_unet.py +++ b/library/sdxl_original_unet.py @@ -41,7 +41,7 @@ USE_REENTRANT = True -# region memory effcient attention +# region memory efficient attention # FlashAttentionを使うCrossAttention # based on https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py diff --git a/library/train_util.py b/library/train_util.py index 35bfb5f5b..5433357ac 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4658,7 +4658,7 @@ def __getitem__(self, idx): # collate_fn用 epoch,stepはmultiprocessing.Value -class collater_class: +class collator_class: def __init__(self, epoch, step, dataset): self.current_epoch = epoch self.current_step = step diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py index ac01b76e0..ab2b6b3d6 100755 --- a/sdxl_gen_img.py +++ b/sdxl_gen_img.py @@ -2612,7 +2612,7 @@ def setup_parser() -> argparse.ArgumentParser: ) parser.add_argument("--network_mul", type=float, default=None, nargs="*", help="additional network multiplier / 追加ネットワークの効果の倍率") parser.add_argument( - "--network_args", type=str, default=None, nargs="*", help="additional argmuments for network (key=value) / ネットワークへの追加の引数" + "--network_args", type=str, default=None, nargs="*", help="additional arguments for network (key=value) / ネットワークへの追加の引数" ) parser.add_argument("--network_show_meta", action="store_true", help="show metadata of network model / ネットワークモデルのメタデータを表示する") parser.add_argument("--network_merge", action="store_true", help="merge network weights to original model / ネットワークの重みをマージする") @@ -2631,7 +2631,7 @@ def setup_parser() -> argparse.ArgumentParser: "--max_embeddings_multiples", type=int, default=None, - help="max embeding multiples, max token length is 75 * multiples / トークン長をデフォルトの何倍とするか 75*この値 がトークン長となる", + 
help="max embedding multiples, max token length is 75 * multiples / トークン長をデフォルトの何倍とするか 75*この値 がトークン長となる", ) parser.add_argument( "--guide_image_path", type=str, default=None, nargs="*", help="image to CLIP guidance / CLIP guided SDでガイドに使う画像" @@ -2666,7 +2666,7 @@ def setup_parser() -> argparse.ArgumentParser: "--highres_fix_upscaler_args", type=str, default=None, - help="additional argmuments for upscaler (key=value) / upscalerへの追加の引数", + help="additional arguments for upscaler (key=value) / upscalerへの追加の引数", ) parser.add_argument( "--highres_fix_disable_control_net", diff --git a/sdxl_minimal_inference.py b/sdxl_minimal_inference.py index ff865629e..45b9edd65 100644 --- a/sdxl_minimal_inference.py +++ b/sdxl_minimal_inference.py @@ -101,7 +101,7 @@ def get_timestep_embedding(x, outdim): type=str, nargs="*", default=[], - help="LoRA weights, only supports networks.lora, each arguement is a `path;multiplier` (semi-colon separated)", + help="LoRA weights, only supports networks.lora, each argument is a `path;multiplier` (semi-colon separated)", ) parser.add_argument("--interactive", action="store_true") args = parser.parse_args() diff --git a/sdxl_train.py b/sdxl_train.py index 6b255d679..7bde3cab7 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -172,8 +172,8 @@ def train(args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) train_dataset_group.verify_bucket_reso_steps(32) @@ -348,7 +348,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index 61ebfb581..0df61e848 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -106,8 +106,8 @@ def train(args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) train_dataset_group.verify_bucket_reso_steps(32) @@ -245,7 +245,7 @@ def train(args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index f8169bdbf..79920a972 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -102,8 +102,8 @@ def train(args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, 
ds_for_collator) train_dataset_group.verify_bucket_reso_steps(32) @@ -213,7 +213,7 @@ def train(args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/tools/cache_latents.py b/tools/cache_latents.py index b6991ac19..17916ef70 100644 --- a/tools/cache_latents.py +++ b/tools/cache_latents.py @@ -86,8 +86,8 @@ def cache_to_disk(args: argparse.Namespace) -> None: current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) # acceleratorを準備する print("prepare accelerator") @@ -120,7 +120,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/tools/cache_text_encoder_outputs.py b/tools/cache_text_encoder_outputs.py index 2110e7261..7d9b13d68 100644 --- a/tools/cache_text_encoder_outputs.py +++ b/tools/cache_text_encoder_outputs.py @@ -91,8 +91,8 @@ def cache_to_disk(args: argparse.Namespace) -> None: current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) # acceleratorを準備する print("prepare accelerator") @@ -125,7 +125,7 @@ def cache_to_disk(args: argparse.Namespace) -> None: train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/train_controlnet.py b/train_controlnet.py index 42da44125..5bc8d399c 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -98,8 +98,8 @@ def train(args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.debug_dataset: train_util.debug_dataset(train_dataset_group) @@ -245,7 +245,7 @@ def train(args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/train_db.py b/train_db.py index feb147787..a1b9cac8b 100644 --- a/train_db.py +++ b/train_db.py @@ -78,8 +78,8 @@ def train(args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = 
train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.no_token_padding: train_dataset_group.disable_token_padding() @@ -177,7 +177,7 @@ def train(args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/train_network.py b/train_network.py index 1a1713259..99179814c 100644 --- a/train_network.py +++ b/train_network.py @@ -192,8 +192,8 @@ def train(self, args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) if args.debug_dataset: train_util.debug_dataset(train_dataset_group) @@ -342,7 +342,7 @@ def train(self, args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -954,7 +954,7 @@ def setup_parser() -> argparse.ArgumentParser: help="Drops neurons out of training every step (0 or None is default behavior (no dropout), 1 would drop all neurons) / 訓練時に毎ステップでニューロンをdropする(0またはNoneはdropoutなし、1は全ニューロンをdropout)", ) parser.add_argument( - "--network_args", type=str, default=None, nargs="*", help="additional argmuments for network (key=value) / ネットワークへの追加の引数" + "--network_args", type=str, default=None, nargs="*", help="additional arguments for network (key=value) / ネットワークへの追加の引数" ) parser.add_argument("--network_train_unet_only", action="store_true", help="only training U-Net part / U-Net関連部分のみ学習する") parser.add_argument( diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 1c7b7fcb2..252add536 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -312,8 +312,8 @@ def train(self, args): current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) # make captions: tokenstring tokenstring1 tokenstring2 ...tokenstringn という文字列に書き換える超乱暴な実装 if use_template: @@ -389,7 +389,7 @@ def train(self, args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 2c5673be1..525e612f1 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -236,8 +236,8 @@ def train(args): train_dataset_group.enable_XTI(XTI_layers, token_strings=token_strings) current_epoch = Value("i", 0) current_step = Value("i", 0) - ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None - collater = train_util.collater_class(current_epoch, current_step, ds_for_collater) + ds_for_collator = train_dataset_group if args.max_data_loader_n_workers == 0 else None + collator = train_util.collator_class(current_epoch, current_step, ds_for_collator) # make captions: 
tokenstring tokenstring1 tokenstring2 ...tokenstringn という文字列に書き換える超乱暴な実装 if use_template: @@ -309,7 +309,7 @@ def train(args): train_dataset_group, batch_size=1, shuffle=True, - collate_fn=collater, + collate_fn=collator, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) From 13d8b22d25d1e9c572a2fb2ea9ce0cd744c8ddc0 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sun, 1 Oct 2023 21:52:16 +0900 Subject: [PATCH 10/33] Add dependabot --- .github/dependabot.yml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..64284b907 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +--- +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" From 6d06b215bfe27d751d1864166c54d28487f0566f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 1 Oct 2023 22:51:32 +0000 Subject: [PATCH 11/33] Bump actions/checkout from 3 to 4 Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/typos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index 90132c334..b6865dbfb 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: typos-action uses: crate-ci/typos@v1.16.15 From 59789b1d57cbaad87e4e9741dd7637b836b587e5 Mon Sep 17 00:00:00 2001 From: boombbo Date: Mon, 2 Oct 2023 18:21:09 +0800 Subject: [PATCH 12/33] "Chinese User.bat and useful readme" --- ...55\346\226\207\346\225\231\347\250\213.md" | 188 ++++++++++++++++++ ...7\345\220\257\345\212\250\345\231\250.bat" | 8 + 2 files changed, 196 insertions(+) create mode 100644 "config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" create mode 100644 "config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" diff --git "a/config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" "b/config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" new file mode 100644 index 000000000..3bb4c67fa --- /dev/null +++ "b/config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" @@ -0,0 +1,188 @@ +嗨!我把日语 README 文件的主要内容翻译成中文如下: + +## 关于这个仓库 + +这个是用于Stable Diffusion模型训练、图像生成和其他脚本的仓库。 + +[英文版 README](./README.md) <-- 更新信息在这里 + +GUI和PowerShell脚本等使其更易用的功能在[bmaltais的仓库](https://github.com/bmaltais/kohya_ss)(英语)中提供,一并参考。感谢bmaltais。 + +包含以下脚本: + +* 支持DreamBooth、U-Net和文本编码器的训练 +* fine-tuning的支持 +* 图像生成 +* 模型转换(Stable Diffusion ckpt/safetensors 和 Diffusers之间的相互转换) + +## 使用方法 (中国用户只需要按照这个安装教程操作) +- 进入kohya_ss文件夹根目录下,点击 setup.bat 启动安装程序 *(需要科学上网) +- 根据界面上给出的英文选项: +Kohya_ss GUI setup menu: + +1. Install kohya_ss gui +2. (Optional) Install cudann files (avoid unless you really need it) +3. (Optional) Install specific bitsandbytes versions +4. 
(Optional) Manually configure accelerate +5. (Optional) Start Kohya_ss GUI in browser +6. Quit + +Enter your choice: 1 + +1. Torch 1 (legacy, no longer supported. Will be removed in v21.9.x) +2. Torch 2 (recommended) +3. Cancel + +Enter your choice: 2 + +开始安装环境依赖,接着再出来的选项,按照下列选项操作: +```txt +- This machine +- No distributed training +- NO +- NO +- NO +- all +- bf16 +``` +-------------------------------------------------------------------- +这里都选择完毕,即可关闭终端窗口,直接点击 gui.bat或者 kohya中文启动器.bat 即可运行kohya + + +当仓库内和note.com有相关文章,请参考那里。(未来可能全部移到这里) + +* [关于训练,通用篇](./docs/train_README-ja.md): 数据准备和选项等 + * [数据集设置](./docs/config_README-ja.md) +* [DreamBooth训练指南](./docs/train_db_README-ja.md) +* [fine-tuning指南](./docs/fine_tune_README_ja.md) +* [LoRA训练指南](./docs/train_network_README-ja.md) +* [文本反转训练指南](./docs/train_ti_README-ja.md) +* [图像生成脚本](./docs/gen_img_README-ja.md) +* note.com [模型转换脚本](https://note.com/kohya_ss/n/n374f316fe4ad) + +## Windows环境所需程序 + +需要Python 3.10.6和Git。 + +- Python 3.10.6: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe +- git: https://git-scm.com/download/win + +如果要在PowerShell中使用venv,需要按以下步骤更改安全设置: +(不仅仅是venv,使脚本可以执行。请注意。) + +- 以管理员身份打开PowerShell +- 输入"Set-ExecutionPolicy Unrestricted",选择Y +- 关闭管理员PowerShell + +## 在Windows环境下安装 + +下例中安装的是PyTorch 1.12.1/CUDA 11.6版。如果要使用CUDA 11.3或PyTorch 1.13,请适当修改。 + +(如果只显示"python",请将下例中的"python"改为"py") + +在普通(非管理员)PowerShell中依次执行以下命令: + +```powershell +git clone https://github.com/kohya-ss/sd-scripts.git +cd sd-scripts + +python -m venv venv +.\venv\Scripts\activate + +pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install --upgrade -r requirements.txt +pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl + +cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py + +accelerate config +``` + +在命令提示符中: + +```bat +git clone https://github.com/kohya-ss/sd-scripts.git +cd sd-scripts + +python -m venv venv +.\venv\Scripts\activate + +pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install --upgrade -r requirements.txt +pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl + +copy /y .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +copy /y .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +copy /y .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py + +accelerate config +``` + +accelerate config的问题请按以下回答: +(如果要用bf16训练,最后一个问题选择bf16) + +``` +- 此计算机 +- 不进行分布式训练 +- 否 +- 否 +- 否 +- 所有 +- fp16 +``` + +### PyTorch和xformers版本注意事项 + +在其他版本中训练可能失败。如果没有特殊原因,请使用指定版本。 + + +### 可选:使用Lion8bit + +如果要使用Lion8bit,需要将`bitsandbytes`升级到0.38.0以上。首先卸载`bitsandbytes`,然后在Windows中安装适合Windows的whl文件,例如[这里的](https://github.com/jllllll/bitsandbytes-windows-webui)。例如: + +```powershell +pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl +``` + +升级时用`pip install .`更新这个仓库,并视情况升级其他包。 + +### 可选:使用PagedAdamW8bit和PagedLion8bit + 
+如果要使用PagedAdamW8bit和PagedLion8bit,需要将`bitsandbytes`升级到0.39.0以上。首先卸载`bitsandbytes`,然后在Windows中安装适合Windows的whl文件,例如[这里的](https://github.com/jllllll/bitsandbytes-windows-webui)。例如: + +```powershell +pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl +``` + +升级时用`pip install .`更新这个仓库,并视情况升级其他包。 + +## 升级 + +如果有新版本,可以用以下命令更新: + +```powershell +cd sd-scripts +git pull +.\venv\Scripts\activate +pip install --use-pep517 --upgrade -r requirements.txt +``` + +如果命令成功,就可以使用新版本了。 + +## 致谢 + +LoRA实现基于[cloneofsimo的仓库](https://github.com/cloneofsimo/lora)。表示感谢。 + +将Conv2d 3x3扩展到所有层起初由 [cloneofsimo](https://github.com/cloneofsimo/lora) 发布, [KohakuBlueleaf](https://github.com/KohakuBlueleaf/LoCon) 证明了其有效性。深深感谢 KohakuBlueleaf。 + +## 许可 + +脚本遵循 ASL 2.0 许可,但包含其他许可的代码部分(Diffusers和cloneofsimo的仓库)。 + +[Memory Efficient Attention Pytorch](https://github.com/lucidrains/memory-efficient-attention-pytorch): MIT + +[bitsandbytes](https://github.com/TimDettmers/bitsandbytes): MIT + +[BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause \ No newline at end of file diff --git "a/config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" "b/config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" new file mode 100644 index 000000000..e1f058574 --- /dev/null +++ "b/config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" @@ -0,0 +1,8 @@ +@echo off + +call gui.bat --language zh-CN --inbrowser --share --headless + +echo 完成。 + +pause +endlocal From b07f780a262e6a9773e36d1d10af8a3d3933b4b6 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 2 Oct 2023 19:29:20 +0300 Subject: [PATCH 13/33] Fix typo --- setup/setup_common.py | 2 +- setup/validate_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup/setup_common.py b/setup/setup_common.py index d9cacf35f..9a0ecdef3 100644 --- a/setup/setup_common.py +++ b/setup/setup_common.py @@ -240,7 +240,7 @@ def check_torch(): ]: if hasattr(torch, "xpu") and torch.xpu.is_available(): log.info( - f'Torch detected GPU: Torch detected GPU: {torch.xpu.get_device_name(device)} VRAM {round(torch.xpu.get_device_properties(device).total_memory / 1024 / 1024)} Compute Units {torch.xpu.get_device_properties(device).max_compute_units}' + f'Torch detected GPU: {torch.xpu.get_device_name(device)} VRAM {round(torch.xpu.get_device_properties(device).total_memory / 1024 / 1024)} Compute Units {torch.xpu.get_device_properties(device).max_compute_units}' ) else: log.info( diff --git a/setup/validate_requirements.py b/setup/validate_requirements.py index 664ccfd9b..9d17ffda2 100644 --- a/setup/validate_requirements.py +++ b/setup/validate_requirements.py @@ -78,7 +78,7 @@ def check_torch(): ]: if hasattr(torch, "xpu") and torch.xpu.is_available(): log.info( - f'Torch detected GPU: Torch detected GPU: {torch.xpu.get_device_name(device)} VRAM {round(torch.xpu.get_device_properties(device).total_memory / 1024 / 1024)} Compute Units {torch.xpu.get_device_properties(device).max_compute_units}' + f'Torch detected GPU: {torch.xpu.get_device_name(device)} VRAM {round(torch.xpu.get_device_properties(device).total_memory / 1024 / 1024)} Compute Units {torch.xpu.get_device_properties(device).max_compute_units}' ) else: log.info( From fd92e1cefdf976da5273b6c863c74eff4c433ca6 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 2 Oct 2023 19:34:09 +0300 Subject: [PATCH 14/33] Fix typo warning --- 
gui.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gui.sh b/gui.sh index a7980ccb9..debca42fa 100755 --- a/gui.sh +++ b/gui.sh @@ -69,7 +69,7 @@ else fi fi -#Set OneAPI environmet if it's not set by the user +#Set OneAPI if it's not set by the user if [[ "$@" == *"--use-ipex"* ]] then echo "Setting OneAPI environment" From 76473d0793108b983b072808402375c2ef5cb767 Mon Sep 17 00:00:00 2001 From: Disty0 Date: Mon, 2 Oct 2023 19:51:14 +0300 Subject: [PATCH 15/33] Update IPEX index --- requirements_linux_ipex.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_linux_ipex.txt b/requirements_linux_ipex.txt index 28c0bc6b1..61d8a75f4 100644 --- a/requirements_linux_ipex.txt +++ b/requirements_linux_ipex.txt @@ -1,3 +1,3 @@ -torch==2.0.1a0+cxx11.abi torchvision==0.15.2a0+cxx11.abi intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu # no_verify leave this to specify not checking this a verification stage +torch==2.0.1a0+cxx11.abi torchvision==0.15.2a0+cxx11.abi intel_extension_for_pytorch==2.0.110+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ tensorboard==2.12.3 tensorflow==2.12.0 intel-extension-for-tensorflow[gpu] -r requirements.txt From f4667e564e1c53a835664650979cf08a4e9cace0 Mon Sep 17 00:00:00 2001 From: Lucas Freire Sangoi <125471877+DevArqSangoi@users.noreply.github.com> Date: Wed, 4 Oct 2023 02:24:26 -0300 Subject: [PATCH 16/33] Update lora_gui.py Changing the algo argument from "lora" to "locon" to match the option selected in the GUI for 'LoCon/LyCORIS'. The original repository specifies that when using kohya's training script, the algo should be set to "locon", not "lora". --- lora_gui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lora_gui.py b/lora_gui.py index bfef5229a..62319efe9 100644 --- a/lora_gui.py +++ b/lora_gui.py @@ -735,7 +735,7 @@ def train_model( ) return run_cmd += f' --network_module=lycoris.kohya' - run_cmd += f' --network_args "conv_dim={conv_dim}" "conv_alpha={conv_alpha}" "algo=lora"' + run_cmd += f' --network_args "conv_dim={conv_dim}" "conv_alpha={conv_alpha}" "algo=locon"' if LoRA_type == 'LyCORIS/LoHa': try: From 592014923fdc2905588c37d7b417549f608d49c5 Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Wed, 4 Oct 2023 21:48:25 +0800 Subject: [PATCH 17/33] Support JPEG-XL on windows --- library/train_util.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index 5433357ac..51610e700 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -96,6 +96,7 @@ except: pass +# JPEG-XL on Linux try: from jxlpy import JXLImagePlugin @@ -103,6 +104,14 @@ except: pass +# JPEG-XL on Windows +try: + import pillow_jxl + + IMAGE_EXTENSIONS.extend([".jxl", ".JXL"]) +except: + pass + IMAGE_TRANSFORMS = transforms.Compose( [ transforms.ToTensor(), From 93a181f55b9fbf485df52e60da96fa04d8b8f7b3 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Wed, 4 Oct 2023 19:00:12 -0400 Subject: [PATCH 18/33] Remove torch 1 support from code --- README.md | 16 +- ...55\346\226\207\346\225\231\347\250\213.md" | 374 +++++++++--------- ...7\345\220\257\345\212\250\345\231\250.bat" | 8 - requirements.txt | 2 +- setup/setup_windows.py | 85 ++-- setup/validate_requirements.py | 5 +- 6 files changed, 235 insertions(+), 255 deletions(-) rename "config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" => 
"README_\344\270\255\346\226\207\346\225\231\347\250\213.md" (97%) delete mode 100644 "config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" diff --git a/README.md b/README.md index 93f2ced07..e4ef4cbb7 100644 --- a/README.md +++ b/README.md @@ -625,6 +625,9 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ## Change History +* 2023/10/08 (v22.1.0) + - Remove support for torch 1 to align with kohya_ss sd-scripts code base. + * 2023/10/01 (v22.0.0) - Merging main branch of sd-scripts: - [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet. @@ -648,16 +651,3 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - Update wandb module version - Add support for Chinese zh-CN localisation. You can use it with `.\gui.bat --language=zh-CN` - Add presets support to `Finetuning`. You can add your own finetuning user presets under the `/presets/finetune/user_presets` folder. - -* 2023/09/23 (v21.8.10) - - Minor point upgrade. Mostly adding a new preset. - -* 2023/08/05 (v21.8.9) - - Update sd-script to caode as of Sept 3 2023 - * ControlNet-LLLite is added. See documentation for details. - * JPEG XL is supported. #786 - * Peak memory usage is reduced. #791 - * Input perturbation noise is added. See #798 for details. - * Dataset subset now has caption_prefix and caption_suffix options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in .toml. - * Other minor changes. - - Added support for Chinese locallisation diff --git "a/config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" "b/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" similarity index 97% rename from "config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" rename to "README_\344\270\255\346\226\207\346\225\231\347\250\213.md" index 3bb4c67fa..f526317b9 100644 --- "a/config_files/accelerate/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" +++ "b/README_\344\270\255\346\226\207\346\225\231\347\250\213.md" @@ -1,188 +1,188 @@ -嗨!我把日语 README 文件的主要内容翻译成中文如下: - -## 关于这个仓库 - -这个是用于Stable Diffusion模型训练、图像生成和其他脚本的仓库。 - -[英文版 README](./README.md) <-- 更新信息在这里 - -GUI和PowerShell脚本等使其更易用的功能在[bmaltais的仓库](https://github.com/bmaltais/kohya_ss)(英语)中提供,一并参考。感谢bmaltais。 - -包含以下脚本: - -* 支持DreamBooth、U-Net和文本编码器的训练 -* fine-tuning的支持 -* 图像生成 -* 模型转换(Stable Diffusion ckpt/safetensors 和 Diffusers之间的相互转换) - -## 使用方法 (中国用户只需要按照这个安装教程操作) -- 进入kohya_ss文件夹根目录下,点击 setup.bat 启动安装程序 *(需要科学上网) -- 根据界面上给出的英文选项: -Kohya_ss GUI setup menu: - -1. Install kohya_ss gui -2. (Optional) Install cudann files (avoid unless you really need it) -3. (Optional) Install specific bitsandbytes versions -4. (Optional) Manually configure accelerate -5. (Optional) Start Kohya_ss GUI in browser -6. Quit - -Enter your choice: 1 - -1. Torch 1 (legacy, no longer supported. Will be removed in v21.9.x) -2. Torch 2 (recommended) -3. 
Cancel - -Enter your choice: 2 - -开始安装环境依赖,接着再出来的选项,按照下列选项操作: -```txt -- This machine -- No distributed training -- NO -- NO -- NO -- all -- bf16 -``` --------------------------------------------------------------------- -这里都选择完毕,即可关闭终端窗口,直接点击 gui.bat或者 kohya中文启动器.bat 即可运行kohya - - -当仓库内和note.com有相关文章,请参考那里。(未来可能全部移到这里) - -* [关于训练,通用篇](./docs/train_README-ja.md): 数据准备和选项等 - * [数据集设置](./docs/config_README-ja.md) -* [DreamBooth训练指南](./docs/train_db_README-ja.md) -* [fine-tuning指南](./docs/fine_tune_README_ja.md) -* [LoRA训练指南](./docs/train_network_README-ja.md) -* [文本反转训练指南](./docs/train_ti_README-ja.md) -* [图像生成脚本](./docs/gen_img_README-ja.md) -* note.com [模型转换脚本](https://note.com/kohya_ss/n/n374f316fe4ad) - -## Windows环境所需程序 - -需要Python 3.10.6和Git。 - -- Python 3.10.6: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe -- git: https://git-scm.com/download/win - -如果要在PowerShell中使用venv,需要按以下步骤更改安全设置: -(不仅仅是venv,使脚本可以执行。请注意。) - -- 以管理员身份打开PowerShell -- 输入"Set-ExecutionPolicy Unrestricted",选择Y -- 关闭管理员PowerShell - -## 在Windows环境下安装 - -下例中安装的是PyTorch 1.12.1/CUDA 11.6版。如果要使用CUDA 11.3或PyTorch 1.13,请适当修改。 - -(如果只显示"python",请将下例中的"python"改为"py") - -在普通(非管理员)PowerShell中依次执行以下命令: - -```powershell -git clone https://github.com/kohya-ss/sd-scripts.git -cd sd-scripts - -python -m venv venv -.\venv\Scripts\activate - -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 -pip install --upgrade -r requirements.txt -pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl - -cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ -cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py -cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py - -accelerate config -``` - -在命令提示符中: - -```bat -git clone https://github.com/kohya-ss/sd-scripts.git -cd sd-scripts - -python -m venv venv -.\venv\Scripts\activate - -pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 -pip install --upgrade -r requirements.txt -pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl - -copy /y .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ -copy /y .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py -copy /y .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py - -accelerate config -``` - -accelerate config的问题请按以下回答: -(如果要用bf16训练,最后一个问题选择bf16) - -``` -- 此计算机 -- 不进行分布式训练 -- 否 -- 否 -- 否 -- 所有 -- fp16 -``` - -### PyTorch和xformers版本注意事项 - -在其他版本中训练可能失败。如果没有特殊原因,请使用指定版本。 - - -### 可选:使用Lion8bit - -如果要使用Lion8bit,需要将`bitsandbytes`升级到0.38.0以上。首先卸载`bitsandbytes`,然后在Windows中安装适合Windows的whl文件,例如[这里的](https://github.com/jllllll/bitsandbytes-windows-webui)。例如: - -```powershell -pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl -``` - -升级时用`pip install .`更新这个仓库,并视情况升级其他包。 - -### 可选:使用PagedAdamW8bit和PagedLion8bit - -如果要使用PagedAdamW8bit和PagedLion8bit,需要将`bitsandbytes`升级到0.39.0以上。首先卸载`bitsandbytes`,然后在Windows中安装适合Windows的whl文件,例如[这里的](https://github.com/jllllll/bitsandbytes-windows-webui)。例如: - -```powershell -pip install 
https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl -``` - -升级时用`pip install .`更新这个仓库,并视情况升级其他包。 - -## 升级 - -如果有新版本,可以用以下命令更新: - -```powershell -cd sd-scripts -git pull -.\venv\Scripts\activate -pip install --use-pep517 --upgrade -r requirements.txt -``` - -如果命令成功,就可以使用新版本了。 - -## 致谢 - -LoRA实现基于[cloneofsimo的仓库](https://github.com/cloneofsimo/lora)。表示感谢。 - -将Conv2d 3x3扩展到所有层起初由 [cloneofsimo](https://github.com/cloneofsimo/lora) 发布, [KohakuBlueleaf](https://github.com/KohakuBlueleaf/LoCon) 证明了其有效性。深深感谢 KohakuBlueleaf。 - -## 许可 - -脚本遵循 ASL 2.0 许可,但包含其他许可的代码部分(Diffusers和cloneofsimo的仓库)。 - -[Memory Efficient Attention Pytorch](https://github.com/lucidrains/memory-efficient-attention-pytorch): MIT - -[bitsandbytes](https://github.com/TimDettmers/bitsandbytes): MIT - +嗨!我把日语 README 文件的主要内容翻译成中文如下: + +## 关于这个仓库 + +这个是用于Stable Diffusion模型训练、图像生成和其他脚本的仓库。 + +[英文版 README](./README.md) <-- 更新信息在这里 + +GUI和PowerShell脚本等使其更易用的功能在[bmaltais的仓库](https://github.com/bmaltais/kohya_ss)(英语)中提供,一并参考。感谢bmaltais。 + +包含以下脚本: + +* 支持DreamBooth、U-Net和文本编码器的训练 +* fine-tuning的支持 +* 图像生成 +* 模型转换(Stable Diffusion ckpt/safetensors 和 Diffusers之间的相互转换) + +## 使用方法 (中国用户只需要按照这个安装教程操作) +- 进入kohya_ss文件夹根目录下,点击 setup.bat 启动安装程序 *(需要科学上网) +- 根据界面上给出的英文选项: +Kohya_ss GUI setup menu: + +1. Install kohya_ss gui +2. (Optional) Install cudann files (avoid unless you really need it) +3. (Optional) Install specific bitsandbytes versions +4. (Optional) Manually configure accelerate +5. (Optional) Start Kohya_ss GUI in browser +6. Quit + +Enter your choice: 1 + +1. Torch 1 (legacy, no longer supported. Will be removed in v21.9.x) +2. Torch 2 (recommended) +3. Cancel + +Enter your choice: 2 + +开始安装环境依赖,接着再出来的选项,按照下列选项操作: +```txt +- This machine +- No distributed training +- NO +- NO +- NO +- all +- bf16 +``` +-------------------------------------------------------------------- +这里都选择完毕,即可关闭终端窗口,直接点击 gui.bat或者 kohya中文启动器.bat 即可运行kohya + + +当仓库内和note.com有相关文章,请参考那里。(未来可能全部移到这里) + +* [关于训练,通用篇](./docs/train_README-ja.md): 数据准备和选项等 + * [数据集设置](./docs/config_README-ja.md) +* [DreamBooth训练指南](./docs/train_db_README-ja.md) +* [fine-tuning指南](./docs/fine_tune_README_ja.md) +* [LoRA训练指南](./docs/train_network_README-ja.md) +* [文本反转训练指南](./docs/train_ti_README-ja.md) +* [图像生成脚本](./docs/gen_img_README-ja.md) +* note.com [模型转换脚本](https://note.com/kohya_ss/n/n374f316fe4ad) + +## Windows环境所需程序 + +需要Python 3.10.6和Git。 + +- Python 3.10.6: https://www.python.org/ftp/python/3.10.6/python-3.10.6-amd64.exe +- git: https://git-scm.com/download/win + +如果要在PowerShell中使用venv,需要按以下步骤更改安全设置: +(不仅仅是venv,使脚本可以执行。请注意。) + +- 以管理员身份打开PowerShell +- 输入"Set-ExecutionPolicy Unrestricted",选择Y +- 关闭管理员PowerShell + +## 在Windows环境下安装 + +下例中安装的是PyTorch 1.12.1/CUDA 11.6版。如果要使用CUDA 11.3或PyTorch 1.13,请适当修改。 + +(如果只显示"python",请将下例中的"python"改为"py") + +在普通(非管理员)PowerShell中依次执行以下命令: + +```powershell +git clone https://github.com/kohya-ss/sd-scripts.git +cd sd-scripts + +python -m venv venv +.\venv\Scripts\activate + +pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install --upgrade -r requirements.txt +pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl + +cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +cp 
.\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py + +accelerate config +``` + +在命令提示符中: + +```bat +git clone https://github.com/kohya-ss/sd-scripts.git +cd sd-scripts + +python -m venv venv +.\venv\Scripts\activate + +pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 +pip install --upgrade -r requirements.txt +pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl + +copy /y .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +copy /y .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +copy /y .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py + +accelerate config +``` + +accelerate config的问题请按以下回答: +(如果要用bf16训练,最后一个问题选择bf16) + +``` +- 此计算机 +- 不进行分布式训练 +- 否 +- 否 +- 否 +- 所有 +- fp16 +``` + +### PyTorch和xformers版本注意事项 + +在其他版本中训练可能失败。如果没有特殊原因,请使用指定版本。 + + +### 可选:使用Lion8bit + +如果要使用Lion8bit,需要将`bitsandbytes`升级到0.38.0以上。首先卸载`bitsandbytes`,然后在Windows中安装适合Windows的whl文件,例如[这里的](https://github.com/jllllll/bitsandbytes-windows-webui)。例如: + +```powershell +pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl +``` + +升级时用`pip install .`更新这个仓库,并视情况升级其他包。 + +### 可选:使用PagedAdamW8bit和PagedLion8bit + +如果要使用PagedAdamW8bit和PagedLion8bit,需要将`bitsandbytes`升级到0.39.0以上。首先卸载`bitsandbytes`,然后在Windows中安装适合Windows的whl文件,例如[这里的](https://github.com/jllllll/bitsandbytes-windows-webui)。例如: + +```powershell +pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl +``` + +升级时用`pip install .`更新这个仓库,并视情况升级其他包。 + +## 升级 + +如果有新版本,可以用以下命令更新: + +```powershell +cd sd-scripts +git pull +.\venv\Scripts\activate +pip install --use-pep517 --upgrade -r requirements.txt +``` + +如果命令成功,就可以使用新版本了。 + +## 致谢 + +LoRA实现基于[cloneofsimo的仓库](https://github.com/cloneofsimo/lora)。表示感谢。 + +将Conv2d 3x3扩展到所有层起初由 [cloneofsimo](https://github.com/cloneofsimo/lora) 发布, [KohakuBlueleaf](https://github.com/KohakuBlueleaf/LoCon) 证明了其有效性。深深感谢 KohakuBlueleaf。 + +## 许可 + +脚本遵循 ASL 2.0 许可,但包含其他许可的代码部分(Diffusers和cloneofsimo的仓库)。 + +[Memory Efficient Attention Pytorch](https://github.com/lucidrains/memory-efficient-attention-pytorch): MIT + +[bitsandbytes](https://github.com/TimDettmers/bitsandbytes): MIT + [BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause \ No newline at end of file diff --git "a/config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" "b/config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" deleted file mode 100644 index e1f058574..000000000 --- "a/config_files/accelerate/kohya\344\270\255\346\226\207\345\220\257\345\212\250\345\231\250.bat" +++ /dev/null @@ -1,8 +0,0 @@ -@echo off - -call gui.bat --language zh-CN --inbrowser --share --headless - -echo 完成。 - -pause -endlocal diff --git a/requirements.txt b/requirements.txt index ff2c1a985..52110c8ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ huggingface-hub==0.15.1 # for loading Diffusers' SDXL invisible-watermark==0.2.0 lion-pytorch==0.0.6 -lycoris_lora==1.8.3 +lycoris_lora==1.9.0 # for BLIP captioning # requests==2.28.2 # timm==0.6.12 diff --git a/setup/setup_windows.py b/setup/setup_windows.py index 411e83cd4..ec0084cfd 100644 --- a/setup/setup_windows.py 
+++ b/setup/setup_windows.py @@ -98,31 +98,31 @@ def sync_bits_and_bytes_files(): log.error(f'An unexpected error occurred: {e}') -def install_kohya_ss_torch1(): - setup_common.check_repo_version() - setup_common.check_python() - - # Upgrade pip if needed - setup_common.install('--upgrade pip') - - if setup_common.check_torch() == 2: - input( - f'{YELLOW}\nTorch 2 is already installed in the venv. To install Torch 1 delete the venv and re-run setup.bat\n\nHit enter to continue...{RESET_COLOR}' - ) - return - - # setup_common.install( - # 'torch==1.12.1+cu116 torchvision==0.13.1+cu116 --index-url https://download.pytorch.org/whl/cu116', - # 'torch torchvision' - # ) - # setup_common.install( - # 'https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl -U -I --no-deps', - # 'xformers-0.0.14' - # ) - setup_common.install_requirements('requirements_windows_torch1.txt', check_no_verify_flag=False) - sync_bits_and_bytes_files() - setup_common.configure_accelerate(run_accelerate=True) - # run_cmd(f'accelerate config') +# def install_kohya_ss_torch1(): +# setup_common.check_repo_version() +# setup_common.check_python() + +# # Upgrade pip if needed +# setup_common.install('--upgrade pip') + +# if setup_common.check_torch() == 2: +# input( +# f'{YELLOW}\nTorch 2 is already installed in the venv. To install Torch 1 delete the venv and re-run setup.bat\n\nHit enter to continue...{RESET_COLOR}' +# ) +# return + +# # setup_common.install( +# # 'torch==1.12.1+cu116 torchvision==0.13.1+cu116 --index-url https://download.pytorch.org/whl/cu116', +# # 'torch torchvision' +# # ) +# # setup_common.install( +# # 'https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl -U -I --no-deps', +# # 'xformers-0.0.14' +# # ) +# setup_common.install_requirements('requirements_windows_torch1.txt', check_no_verify_flag=False) +# sync_bits_and_bytes_files() +# setup_common.configure_accelerate(run_accelerate=True) +# # run_cmd(f'accelerate config') def install_kohya_ss_torch2(): @@ -177,23 +177,24 @@ def main_menu(): print('') if choice == '1': - while True: - print('1. Torch 1 (legacy, no longer supported. Will be removed in v21.9.x)') - print('2. Torch 2 (recommended)') - print('3. Cancel') - choice_torch = input('\nEnter your choice: ') - print('') - - if choice_torch == '1': - install_kohya_ss_torch1() - break - elif choice_torch == '2': - install_kohya_ss_torch2() - break - elif choice_torch == '3': - break - else: - print('Invalid choice. Please enter a number between 1-3.') + install_kohya_ss_torch2() + # while True: + # print('1. Torch 1 (legacy, no longer supported. Will be removed in v21.9.x)') + # print('2. Torch 2 (recommended)') + # print('3. Cancel') + # choice_torch = input('\nEnter your choice: ') + # print('') + + # if choice_torch == '1': + # install_kohya_ss_torch1() + # break + # elif choice_torch == '2': + # install_kohya_ss_torch2() + # break + # elif choice_torch == '3': + # break + # else: + # print('Invalid choice. 
Please enter a number between 1-3.') elif choice == '2': cudann_install() elif choice == '3': diff --git a/setup/validate_requirements.py b/setup/validate_requirements.py index 9d17ffda2..bac2948fe 100644 --- a/setup/validate_requirements.py +++ b/setup/validate_requirements.py @@ -110,10 +110,7 @@ def main(): if args.requirements: setup_common.install_requirements(args.requirements, check_no_verify_flag=True) else: - if torch_ver == 1: - setup_common.install_requirements('requirements_windows_torch1.txt', check_no_verify_flag=True) - else: - setup_common.install_requirements('requirements_windows_torch2.txt', check_no_verify_flag=True) + setup_common.install_requirements('requirements_windows_torch2.txt', check_no_verify_flag=True) if __name__ == '__main__': From a756f0a773063dde4063b4752cb4a17bae8717e1 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Wed, 4 Oct 2023 19:08:42 -0400 Subject: [PATCH 19/33] More torch 1 code cleanup --- README.md | 8 +++++++- setup/setup_windows.py | 10 +++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e4ef4cbb7..3c5cc0adb 100644 --- a/README.md +++ b/README.md @@ -627,7 +627,13 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum * 2023/10/08 (v22.1.0) - Remove support for torch 1 to align with kohya_ss sd-scripts code base. - + - Add Intel ARC GPU support with IPEX support on Linuix / WSL + - Users needs to set these manually: + * Mixed precision to BF16, + * Attention to SDPA, + * Optimizer to: AdamW (or any other non 8 bit one). + - Run setup with: `./setup.sh --use-ipex` + - Run the GUI with: `./gui.sh --use-ipex` * 2023/10/01 (v22.0.0) - Merging main branch of sd-scripts: - [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet. diff --git a/setup/setup_windows.py b/setup/setup_windows.py index ec0084cfd..5bb90a015 100644 --- a/setup/setup_windows.py +++ b/setup/setup_windows.py @@ -132,11 +132,11 @@ def install_kohya_ss_torch2(): # Upgrade pip if needed setup_common.install('--upgrade pip') - if setup_common.check_torch() == 1: - input( - f'{YELLOW}\nTorch 1 is already installed in the venv. To install Torch 2 delete the venv and re-run setup.bat\n\nHit any key to acknowledge.{RESET_COLOR}' - ) - return + # if setup_common.check_torch() == 1: + # input( + # f'{YELLOW}\nTorch 1 is already installed in the venv. 
To install Torch 2 delete the venv and re-run setup.bat\n\nHit any key to acknowledge.{RESET_COLOR}' + # ) + # return # setup_common.install( # 'torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorch.org/whl/cu118', From 5216e6fe00a7e9a6c87c1c724344ec8f8478e1aa Mon Sep 17 00:00:00 2001 From: bmaltais Date: Wed, 4 Oct 2023 19:30:36 -0400 Subject: [PATCH 20/33] Move localization files to the library folder --- dreambooth_gui.py | 2 +- finetune_gui.py | 2 +- kohya_gui.py | 2 +- localization.py => library/localization.py | 0 localization_ext.py => library/localization_ext.py | 4 ++-- lora_gui.py | 2 +- textual_inversion_gui.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) rename localization.py => library/localization.py (100%) rename localization_ext.py => library/localization_ext.py (93%) diff --git a/dreambooth_gui.py b/dreambooth_gui.py index 4b62a5c46..92277d933 100644 --- a/dreambooth_gui.py +++ b/dreambooth_gui.py @@ -45,7 +45,7 @@ from library.class_sample_images import SampleImages, run_cmd_sample from library.custom_logging import setup_logging -from localization_ext import add_javascript +from library.localization_ext import add_javascript # Set up logging log = setup_logging() diff --git a/finetune_gui.py b/finetune_gui.py index 892c8be1f..38c6f7556 100644 --- a/finetune_gui.py +++ b/finetune_gui.py @@ -34,7 +34,7 @@ from library.class_sample_images import SampleImages, run_cmd_sample from library.custom_logging import setup_logging -from localization_ext import add_javascript +from library.localization_ext import add_javascript # Set up logging log = setup_logging() diff --git a/kohya_gui.py b/kohya_gui.py index da25f2b8f..b6c61f8e6 100644 --- a/kohya_gui.py +++ b/kohya_gui.py @@ -10,7 +10,7 @@ import os from library.custom_logging import setup_logging -from localization_ext import add_javascript +from library.localization_ext import add_javascript # Set up logging log = setup_logging() diff --git a/localization.py b/library/localization.py similarity index 100% rename from localization.py rename to library/localization.py diff --git a/localization_ext.py b/library/localization_ext.py similarity index 93% rename from localization_ext.py rename to library/localization_ext.py index c6007e232..5c33fd0db 100644 --- a/localization_ext.py +++ b/library/localization_ext.py @@ -1,6 +1,6 @@ import os import gradio as gr -import localization +import library.localization as localization def file_path(fn): @@ -16,7 +16,7 @@ def js_html_str(language): def add_javascript(language): if language is None: - print('no language') + # print('no language') return jsStr = js_html_str(language) diff --git a/lora_gui.py b/lora_gui.py index 62319efe9..645b42eb5 100644 --- a/lora_gui.py +++ b/lora_gui.py @@ -41,7 +41,7 @@ from library.dataset_balancing_gui import gradio_dataset_balancing_tab from library.custom_logging import setup_logging -from localization_ext import add_javascript +from library.localization_ext import add_javascript # Set up logging log = setup_logging() diff --git a/textual_inversion_gui.py b/textual_inversion_gui.py index 502128022..6cfa137b2 100644 --- a/textual_inversion_gui.py +++ b/textual_inversion_gui.py @@ -45,7 +45,7 @@ from library.class_sample_images import SampleImages, run_cmd_sample from library.custom_logging import setup_logging -from localization_ext import add_javascript +from library.localization_ext import add_javascript # Set up logging log = setup_logging() From a4857fa764effdbdb099fbb6bd54c6d1b46b8238 Mon Sep 17 00:00:00 2001 
From: alexds9 Date: Thu, 5 Oct 2023 21:26:09 +0300 Subject: [PATCH 21/33] Add append_captions feature to wd14 tagger This feature allows for appending new tags to the existing content of caption files. If the caption file for an image already exists, the tags generated from the current run are appended to the existing ones. Duplicate tags are checked and avoided. --- finetune/tag_images_by_wd14_tagger.py | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index 91e4f573e..dde586c75 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -165,12 +165,35 @@ def run_batch(path_imgs): if len(character_tag_text) > 0: character_tag_text = character_tag_text[2:] + caption_file = os.path.splitext(image_path)[0] + args.caption_extension + tag_text = ", ".join(combined_tags) - with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding="utf-8") as f: + if args.append_captions: + # Check if file exists + if os.path.exists(caption_file): + + with open(caption_file, "rt", encoding="utf-8") as f: + + # Read file and remove new lines + existing_content = f.read().strip("\n") # Remove trailing comma, whitespace, and newlines + + # Split the content into tags and store them in a list + existing_tags = [tag.strip() for tag in existing_content.split(",") if tag.strip()] + + # Check and remove repeating tags in tag_text + tag_text = ", ".join([tag for tag in combined_tags if tag not in existing_tags]) + + # If the file has content, prepend a comma to tag_text + if existing_content.strip() and tag_text: + tag_text = ", ".join(existing_tags) + ", " + tag_text + + + with open(caption_file, "wt", encoding="utf-8") as f: f.write(tag_text + "\n") if args.debug: - print(f"\n{image_path}:\n Character tags: {character_tag_text}\n General tags: {general_tag_text}") + print( + f"\n{image_path}:\n Character tags: {character_tag_text}\n General tags: {general_tag_text}") # 読み込みの高速化のためにDataLoaderを使うオプション if args.max_data_loader_n_workers is not None: @@ -282,7 +305,9 @@ def setup_parser() -> argparse.ArgumentParser: default="", help="comma-separated list of undesired tags to remove from the output / 出力から除外したいタグのカンマ区切りのリスト", ) - parser.add_argument("--frequency_tags", action="store_true", help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") + parser.add_argument("--frequency_tags", action="store_true", + help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") + parser.add_argument("--append_captions", action="store_true", help="Append captions instead of overwriting") return parser From 9378da3c8266c0a87d893a2145196ec6efeb76a0 Mon Sep 17 00:00:00 2001 From: alexds9 Date: Thu, 5 Oct 2023 21:29:46 +0300 Subject: [PATCH 22/33] Fix comment --- finetune/tag_images_by_wd14_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index dde586c75..e2ac5c1df 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -176,7 +176,7 @@ def run_batch(path_imgs): with open(caption_file, "rt", encoding="utf-8") as f: # Read file and remove new lines - existing_content = f.read().strip("\n") # Remove trailing comma, whitespace, and newlines + existing_content = f.read().strip("\n") # Remove newlines # Split the content into tags and store them in a list existing_tags = [tag.strip() for tag in 
existing_content.split(",") if tag.strip()] From 70fe7e18bea63bb2ddc3c8dfdb3a2367d55cb348 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 8 Oct 2023 20:31:10 +0800 Subject: [PATCH 23/33] add onnx to wd14 tagger --- finetune/tag_images_by_wd14_tagger.py | 49 ++++++++++++++++++++++----- requirements.txt | 4 ++- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index 91e4f573e..816aaddbc 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -2,15 +2,14 @@ import csv import glob import os +from pathlib import Path -from PIL import Image import cv2 -from tqdm import tqdm import numpy as np -from tensorflow.keras.models import load_model -from huggingface_hub import hf_hub_download import torch -from pathlib import Path +from huggingface_hub import hf_hub_download +from PIL import Image +from tqdm import tqdm import library.train_util as train_util @@ -81,6 +80,8 @@ def main(args): # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22 if not os.path.exists(args.model_dir) or args.force_download: print(f"downloading wd14 tagger model from hf_hub. id: {args.repo_id}") + if args.onnx: + FILES.append("model.onnx") for file in FILES: hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file) for file in SUB_DIR_FILES: @@ -96,7 +97,35 @@ def main(args): print("using existing wd14 tagger model") # 画像を読み込む - model = load_model(args.model_dir) + if args.onnx: + import onnx + import onnxruntime as ort + + onnx_path = f"{args.model_dir}/model.onnx" + print("Running wd14 tagger with onnx") + print(f"loading onnx model: {onnx_path}") + model = onnx.load(onnx_path) + input_name = model.graph.input[0].name + try: + batch_size = model.graph.input[0].type.tensor_type.shape.dim[0].dim_value + except: + batch_size = model.graph.input[0].type.tensor_type.shape.dim[0].dim_param + if args.batch_size != batch_size and type(batch_size) != str: + # some rebatch model may use 'N' as dynamic axes + print( + f"Batch size {args.batch_size} doesn't match onnx model batch size {batch_size}, use model batch size {batch_size}" + ) + args.batch_size = batch_size + ort_sess = ort.InferenceSession( + model.SerializeToString(), + providers=["CUDAExecutionProvider"] + if "CUDAExecutionProvider" in ort.get_available_providers() + else ["CPUExecutionProvider"], + ) + else: + from tensorflow.keras.models import load_model + + model = load_model(f"{args.model_dir}") # label_names = pd.read_csv("2022_0000_0899_6549/selected_tags.csv") # 依存ライブラリを増やしたくないので自力で読むよ @@ -124,8 +153,11 @@ def main(args): def run_batch(path_imgs): imgs = np.array([im for _, im in path_imgs]) - probs = model(imgs, training=False) - probs = probs.numpy() + if args.onnx: + probs = ort_sess.run(None, {input_name: imgs}) # onnx output numpy + else: + probs = model(imgs, training=False) + probs = probs.numpy() for (image_path, _), prob in zip(path_imgs, probs): # 最初の4つはratingなので無視する @@ -283,6 +315,7 @@ def setup_parser() -> argparse.ArgumentParser: help="comma-separated list of undesired tags to remove from the output / 出力から除外したいタグのカンマ区切りのリスト", ) parser.add_argument("--frequency_tags", action="store_true", help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") + parser.add_argument("--onnx", action="store_true", help="use onnx model for inference") return parser diff --git a/requirements.txt b/requirements.txt index 4ca393f52..fa6005ac6 100644 
--- a/requirements.txt +++ b/requirements.txt @@ -19,8 +19,10 @@ huggingface-hub==0.15.1 # requests==2.28.2 # timm==0.6.12 # fairscale==0.4.13 -# for WD14 captioning +# for WD14 captioning (tensroflow or onnx) # tensorflow==2.10.1 +# onnx==1.14.1 +# onnxruntime==1.16.0 # open clip for SDXL open-clip-torch==2.20.0 # for kohya_ss library From b8b84021e54b34ed04800e21a18fc67e6e9ce1c1 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 8 Oct 2023 20:49:03 +0800 Subject: [PATCH 24/33] fix a typo --- finetune/tag_images_by_wd14_tagger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index 816aaddbc..6b33af518 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -117,7 +117,7 @@ def main(args): ) args.batch_size = batch_size ort_sess = ort.InferenceSession( - model.SerializeToString(), + onnx_path, providers=["CUDAExecutionProvider"] if "CUDAExecutionProvider" in ort.get_available_providers() else ["CPUExecutionProvider"], @@ -154,7 +154,7 @@ def run_batch(path_imgs): imgs = np.array([im for _, im in path_imgs]) if args.onnx: - probs = ort_sess.run(None, {input_name: imgs}) # onnx output numpy + probs = ort_sess.run(None, {input_name: imgs})[0] # onnx output numpy else: probs = model(imgs, training=False) probs = probs.numpy() From d6f458fcb3cda470486a9d0ea3a2dad0c72b46db Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 8 Oct 2023 23:51:18 +0800 Subject: [PATCH 25/33] fix dependency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fa6005ac6..75de48cb9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,7 @@ huggingface-hub==0.15.1 # for WD14 captioning (tensroflow or onnx) # tensorflow==2.10.1 # onnx==1.14.1 +# onnxruntime-gpu==1.16.0 # onnxruntime==1.16.0 # open clip for SDXL open-clip-torch==2.20.0 From 025368f51c31050544934a972ad77b149276bcf1 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Oct 2023 14:06:58 +0900 Subject: [PATCH 26/33] may work dropout in LyCORIS #859 --- train_network.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/train_network.py b/train_network.py index 99179814c..2232a384a 100644 --- a/train_network.py +++ b/train_network.py @@ -283,7 +283,10 @@ def train(self, args): if args.dim_from_weights: network, _ = network_module.create_network_from_weights(1, args.network_weights, vae, text_encoder, unet, **net_kwargs) else: - # LyCORIS will work with this... 
+ if "dropout" not in net_kwargs: + # workaround for LyCORIS (;^ω^) + net_kwargs["dropout"] = args.network_dropout + network = network_module.create_network( 1.0, args.network_dim, From 0d4e8b50d0ce23437a16d4735f785190a4457af3 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Oct 2023 15:09:54 +0900 Subject: [PATCH 27/33] change option to append_tags, minor update --- finetune/tag_images_by_wd14_tagger.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index e2ac5c1df..31ee93bc0 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -169,31 +169,26 @@ def run_batch(path_imgs): tag_text = ", ".join(combined_tags) - if args.append_captions: + if args.append_tags: # Check if file exists if os.path.exists(caption_file): - with open(caption_file, "rt", encoding="utf-8") as f: - # Read file and remove new lines existing_content = f.read().strip("\n") # Remove newlines - # Split the content into tags and store them in a list - existing_tags = [tag.strip() for tag in existing_content.split(",") if tag.strip()] + # Split the content into tags and store them in a list + existing_tags = [tag.strip() for tag in existing_content.split(",") if tag.strip()] # Check and remove repeating tags in tag_text - tag_text = ", ".join([tag for tag in combined_tags if tag not in existing_tags]) - - # If the file has content, prepend a comma to tag_text - if existing_content.strip() and tag_text: - tag_text = ", ".join(existing_tags) + ", " + tag_text + new_tags = [tag for tag in combined_tags if tag not in existing_tags] + # Create new tag_text + tag_text = ", ".join(existing_tags + new_tags) with open(caption_file, "wt", encoding="utf-8") as f: f.write(tag_text + "\n") if args.debug: - print( - f"\n{image_path}:\n Character tags: {character_tag_text}\n General tags: {general_tag_text}") + print(f"\n{image_path}:\n Character tags: {character_tag_text}\n General tags: {general_tag_text}") # 読み込みの高速化のためにDataLoaderを使うオプション if args.max_data_loader_n_workers is not None: @@ -305,15 +300,15 @@ def setup_parser() -> argparse.ArgumentParser: default="", help="comma-separated list of undesired tags to remove from the output / 出力から除外したいタグのカンマ区切りのリスト", ) - parser.add_argument("--frequency_tags", action="store_true", - help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") - parser.add_argument("--append_captions", action="store_true", help="Append captions instead of overwriting") + parser.add_argument("--frequency_tags", action="store_true", help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") + parser.add_argument("--append_tags", action="store_true", help="Append captions instead of overwriting / 上書きではなくキャプションを追記する") return parser + if __name__ == "__main__": parser = setup_parser() - + args = parser.parse_args() # スペルミスしていたオプションを復元する From 406511c333d99286f19e9a5bf2de55bccfd5302b Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Oct 2023 17:08:58 +0900 Subject: [PATCH 28/33] add error message if model.onnx doesn't exist --- finetune/tag_images_by_wd14_tagger.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index ffe94e7df..965edd7e2 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -1,6 +1,5 @@ import argparse import csv -import glob import os from pathlib import Path 
@@ -19,6 +18,7 @@ # wd-v1-4-swinv2-tagger-v2 / wd-v1-4-vit-tagger / wd-v1-4-vit-tagger-v2/ wd-v1-4-convnext-tagger / wd-v1-4-convnext-tagger-v2 DEFAULT_WD14_TAGGER_REPO = "SmilingWolf/wd-v1-4-convnext-tagger-v2" FILES = ["keras_metadata.pb", "saved_model.pb", "selected_tags.csv"] +FILES_ONNX = ["model.onnx"] SUB_DIR = "variables" SUB_DIR_FILES = ["variables.data-00000-of-00001", "variables.index"] CSV_FILE = FILES[-1] @@ -80,9 +80,10 @@ def main(args): # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22 if not os.path.exists(args.model_dir) or args.force_download: print(f"downloading wd14 tagger model from hf_hub. id: {args.repo_id}") + files = FILES if args.onnx: - FILES.append("model.onnx") - for file in FILES: + files += FILES_ONNX + for file in files: hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file) for file in SUB_DIR_FILES: hf_hub_download( @@ -104,18 +105,29 @@ def main(args): onnx_path = f"{args.model_dir}/model.onnx" print("Running wd14 tagger with onnx") print(f"loading onnx model: {onnx_path}") + + if not os.path.exists(onnx_path): + raise Exception( + f"onnx model not found: {onnx_path}, please redownload the model with --force_download" + + " / onnxモデルが見つかりませんでした。--force_downloadで再ダウンロードしてください" + ) + model = onnx.load(onnx_path) input_name = model.graph.input[0].name try: batch_size = model.graph.input[0].type.tensor_type.shape.dim[0].dim_value except: batch_size = model.graph.input[0].type.tensor_type.shape.dim[0].dim_param + if args.batch_size != batch_size and type(batch_size) != str: # some rebatch model may use 'N' as dynamic axes print( f"Batch size {args.batch_size} doesn't match onnx model batch size {batch_size}, use model batch size {batch_size}" ) args.batch_size = batch_size + + del model + ort_sess = ort.InferenceSession( onnx_path, providers=["CUDAExecutionProvider"] @@ -154,7 +166,10 @@ def run_batch(path_imgs): imgs = np.array([im for _, im in path_imgs]) if args.onnx: + if len(imgs) < args.batch_size: + imgs = np.concatenate([imgs, np.zeros((args.batch_size - len(imgs), IMAGE_SIZE, IMAGE_SIZE, 3))], axis=0) probs = ort_sess.run(None, {input_name: imgs})[0] # onnx output numpy + probs = probs[: len(path_imgs)] else: probs = model(imgs, training=False) probs = probs.numpy() @@ -333,7 +348,7 @@ def setup_parser() -> argparse.ArgumentParser: help="comma-separated list of undesired tags to remove from the output / 出力から除外したいタグのカンマ区切りのリスト", ) parser.add_argument("--frequency_tags", action="store_true", help="Show frequency of tags for images / 画像ごとのタグの出現頻度を表示する") - parser.add_argument("--onnx", action="store_true", help="use onnx model for inference") + parser.add_argument("--onnx", action="store_true", help="use onnx model for inference / onnxモデルを推論に使用する") parser.add_argument("--append_tags", action="store_true", help="Append captions instead of overwriting / 上書きではなくキャプションを追記する") return parser From 66741c035c0ee443399361b50414e1c1e2e8b23e Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Oct 2023 17:59:24 +0900 Subject: [PATCH 29/33] add OFT --- networks/oft.py | 430 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 networks/oft.py diff --git a/networks/oft.py b/networks/oft.py new file mode 100644 index 000000000..ba05885cb --- /dev/null +++ b/networks/oft.py @@ -0,0 +1,430 @@ +# OFT network module + +import math +import os +from typing import Dict, List, Optional, Tuple, Type, Union +from diffusers import AutoencoderKL +from 
transformers import CLIPTextModel +import numpy as np +import torch +import re + + +RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_") + + +class OFTModule(torch.nn.Module): + """ + replaces forward method of the original Linear, instead of replacing the original Linear module. + """ + + def __init__( + self, + oft_name, + org_module: torch.nn.Module, + multiplier=1.0, + dim=4, + alpha=1, + ): + """ + dim -> num blocks + alpha -> constrait + """ + super().__init__() + self.oft_name = oft_name + + self.num_blocks = dim + + if "Linear" in org_module.__class__.__name__: + out_dim = org_module.out_features + elif "Conv" in org_module.__class__.__name__: + out_dim = org_module.out_channels + + if type(alpha) == torch.Tensor: + alpha = alpha.detach().numpy() + self.constraint = alpha * out_dim + self.register_buffer("alpha", torch.tensor(alpha)) + + self.block_size = out_dim // self.num_blocks + self.oft_blocks = torch.nn.Parameter(torch.zeros(self.num_blocks, self.block_size, self.block_size)) + + self.out_dim = out_dim + self.shape = org_module.weight.shape + + self.multiplier = multiplier + self.org_module = [org_module] # moduleにならないようにlistに入れる + + def apply_to(self): + self.org_forward = self.org_module[0].forward + self.org_module[0].forward = self.forward + + def get_weight(self, multiplier=None): + if multiplier is None: + multiplier = self.multiplier + + block_Q = self.oft_blocks - self.oft_blocks.transpose(1, 2) + norm_Q = torch.norm(block_Q.flatten()) + new_norm_Q = torch.clamp(norm_Q, max=self.constraint) + block_Q = block_Q * ((new_norm_Q + 1e-8) / (norm_Q + 1e-8)) + I = torch.eye(self.block_size, device=self.oft_blocks.device).unsqueeze(0).repeat(self.num_blocks, 1, 1) + block_R = torch.matmul(I + block_Q, (I - block_Q).inverse()) + + block_R_weighted = self.multiplier * block_R + (1 - self.multiplier) * I + R = torch.block_diag(*block_R_weighted) + + return R + + def forward(self, x, scale=None): + x = self.org_forward(x) + if self.multiplier == 0.0: + return x + + R = self.get_weight().to(x.device, dtype=x.dtype) + if x.dim() == 4: + x = x.permute(0, 2, 3, 1) + x = torch.matmul(x, R) + x = x.permute(0, 3, 1, 2) + else: + x = torch.matmul(x, R) + return x + + +class OFTInfModule(OFTModule): + def __init__( + self, + oft_name, + org_module: torch.nn.Module, + multiplier=1.0, + dim=4, + alpha=1, + **kwargs, + ): + # no dropout for inference + super().__init__(oft_name, org_module, multiplier, dim, alpha) + self.enabled = True + self.network: OFTNetwork = None + + def set_network(self, network): + self.network = network + + def forward(self, x, scale=None): + if not self.enabled: + return self.org_forward(x) + return super().forward(x, scale) + + def merge_to(self, multiplier=None, sign=1): + R = self.get_weight(multiplier) * sign + + # get org weight + org_sd = self.org_module[0].state_dict() + org_weight = org_sd["weight"] + R = R.to(org_weight.device, dtype=org_weight.dtype) + + if org_weight.dim() == 4: + weight = torch.einsum("oihw, op -> pihw", org_weight, R) + else: + weight = torch.einsum("oi, op -> pi", org_weight, R) + + # set weight to org_module + org_sd["weight"] = weight + self.org_module[0].load_state_dict(org_sd) + + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], + vae: AutoencoderKL, + text_encoder: Union[CLIPTextModel, List[CLIPTextModel]], + unet, + neuron_dropout: Optional[float] = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default 
+ if network_alpha is None: + network_alpha = 1.0 + + enable_all_linear = kwargs.get("enable_all_linear", None) + enable_conv = kwargs.get("enable_conv", None) + if enable_all_linear is not None: + enable_all_linear = bool(enable_all_linear) + if enable_conv is not None: + enable_conv = bool(enable_conv) + + network = OFTNetwork( + text_encoder, + unet, + multiplier=multiplier, + dim=network_dim, + alpha=network_alpha, + enable_all_linear=enable_all_linear, + enable_conv=enable_conv, + varbose=True, + ) + return network + + +# Create network from weights for inference, weights are not loaded here (because can be merged) +def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weights_sd=None, for_inference=False, **kwargs): + if weights_sd is None: + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file, safe_open + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + # check dim, alpha and if weights have for conv2d + dim = None + alpha = None + has_conv2d = None + all_linear = None + for name, param in weights_sd.items(): + if name.endswith(".alpha"): + if alpha is None: + alpha = param.item() + else: + if dim is None: + dim = param.size()[0] + if has_conv2d is None and param.dim() == 4: + has_conv2d = True + if all_linear is None: + if param.dim() == 3 and "attn" not in name: + all_linear = True + if dim is not None and alpha is not None and has_conv2d is not None: + break + if has_conv2d is None: + has_conv2d = False + if all_linear is None: + all_linear = False + + module_class = OFTInfModule if for_inference else OFTModule + network = OFTNetwork( + text_encoder, + unet, + multiplier=multiplier, + dim=dim, + alpha=alpha, + enable_all_linear=all_linear, + enable_conv=has_conv2d, + module_class=module_class, + ) + return network, weights_sd + + +class OFTNetwork(torch.nn.Module): + UNET_TARGET_REPLACE_MODULE_ATTN_ONLY = ["CrossAttention"] + UNET_TARGET_REPLACE_MODULE_ALL_LINEAR = ["Transformer2DModel"] + UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"] + OFT_PREFIX_UNET = "oft_unet" # これ変えないほうがいいかな + + def __init__( + self, + text_encoder: Union[List[CLIPTextModel], CLIPTextModel], + unet, + multiplier: float = 1.0, + dim: int = 4, + alpha: float = 1, + enable_all_linear: Optional[bool] = False, + enable_conv: Optional[bool] = False, + module_class: Type[object] = OFTModule, + varbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.multiplier = multiplier + + self.dim = dim + self.alpha = alpha + + print( + f"create OFT network. num blocks: {self.dim}, constraint: {self.alpha}, multiplier: {self.multiplier}, enable_conv: {enable_conv}" + ) + + # create module instances + def create_modules( + root_module: torch.nn.Module, + target_replace_modules: List[torch.nn.Module], + ) -> List[OFTModule]: + prefix = self.OFT_PREFIX_UNET + ofts = [] + for name, module in root_module.named_modules(): + if module.__class__.__name__ in target_replace_modules: + for child_name, child_module in module.named_modules(): + is_linear = "Linear" in child_module.__class__.__name__ + is_conv2d = "Conv2d" in child_module.__class__.__name__ + is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) + + if is_linear or is_conv2d_1x1 or (is_conv2d and enable_conv): + oft_name = prefix + "." + name + "." 
+ child_name + oft_name = oft_name.replace(".", "_") + # print(oft_name) + + oft = module_class( + oft_name, + child_module, + self.multiplier, + dim, + alpha, + ) + ofts.append(oft) + return ofts + + # extend U-Net target modules if conv2d 3x3 is enabled, or load from weights + if enable_all_linear: + target_modules = OFTNetwork.UNET_TARGET_REPLACE_MODULE_ALL_LINEAR + else: + target_modules = OFTNetwork.UNET_TARGET_REPLACE_MODULE_ATTN_ONLY + if enable_conv: + target_modules += OFTNetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 + + self.unet_ofts: List[OFTModule] = create_modules(unet, target_modules) + print(f"create OFT for U-Net: {len(self.unet_ofts)} modules.") + + # assertion + names = set() + for oft in self.unet_ofts: + assert oft.oft_name not in names, f"duplicated oft name: {oft.oft_name}" + names.add(oft.oft_name) + + def set_multiplier(self, multiplier): + self.multiplier = multiplier + for oft in self.unet_ofts: + oft.multiplier = self.multiplier + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + + info = self.load_state_dict(weights_sd, False) + return info + + def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True): + assert apply_unet, "apply_unet must be True" + + for oft in self.unet_ofts: + oft.apply_to() + self.add_module(oft.oft_name, oft) + + # マージできるかどうかを返す + def is_mergeable(self): + return True + + # TODO refactor to common function with apply_to + def merge_to(self, text_encoder, unet, weights_sd, dtype, device): + print("enable OFT for U-Net") + + for oft in self.unet_ofts: + sd_for_lora = {} + for key in weights_sd.keys(): + if key.startswith(oft.oft_name): + sd_for_lora[key[len(oft.oft_name) + 1 :]] = weights_sd[key] + oft.load_state_dict(sd_for_lora, False) + oft.merge_to() + + print(f"weights are merged") + + # 二つのText Encoderに別々の学習率を設定できるようにするといいかも + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): + self.requires_grad_(True) + all_params = [] + + def enumerate_params(ofts): + params = [] + for oft in ofts: + params.extend(oft.parameters()) + + # print num of params + num_params = 0 + for p in params: + num_params += p.numel() + print(f"OFT params: {num_params}") + return params + + param_data = {"params": enumerate_params(self.unet_ofts)} + if unet_lr is not None: + param_data["lr"] = unet_lr + all_params.append(param_data) + + return all_params + + def enable_gradient_checkpointing(self): + # not supported + pass + + def prepare_grad_etc(self, text_encoder, unet): + self.requires_grad_(True) + + def on_epoch_start(self, text_encoder, unet): + self.train() + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + from library import train_util + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = train_util.precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + save_file(state_dict, 
file, metadata) + else: + torch.save(state_dict, file) + + def backup_weights(self): + # 重みのバックアップを行う + ofts: List[OFTInfModule] = self.unet_ofts + for oft in ofts: + org_module = oft.org_module[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + ofts: List[OFTInfModule] = self.unet_ofts + for oft in ofts: + org_module = oft.org_module[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + ofts: List[OFTInfModule] = self.unet_ofts + for oft in ofts: + org_module = oft.org_module[0] + oft.merge_to() + # sd = org_module.state_dict() + # org_weight = sd["weight"] + # lora_weight = oft.get_weight().to(org_weight.device, dtype=org_weight.dtype) + # sd["weight"] = org_weight + lora_weight + # assert sd["weight"].shape == org_weight.shape + # org_module.load_state_dict(sd) + + org_module._lora_restored = False + oft.enabled = False From cf49e912fc24a83d7bfd5b10c2831fce88756f90 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Oct 2023 17:59:31 +0900 Subject: [PATCH 30/33] update readme --- README.md | 34 ++++++++++++++++++++++++++++++++++ requirements.txt | 5 ++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dc8e25ad6..974aeaeaa 100644 --- a/README.md +++ b/README.md @@ -249,6 +249,40 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum ## Change History +### Oct 9. 2023 / 2023/10/9 + +- `tag_images_by_wd_14_tagger.py` now supports Onnx. If you use Onnx, TensorFlow is not required anymore. [#864](https://github.com/kohya-ss/sd-scripts/pull/864) Thanks to Isotr0py! + - `--onnx` option is added. If you use Onnx, specify `--onnx` option. + - Please install Onnx and other required packages. + 1. Uninstall TensorFlow. + 1. `pip install tensorboard==2.14.1` This is required for the specified version of protobuf. + 1. `pip install protobuf==3.20.3` This is required for Onnx. + 1. `pip install onnx==1.14.1` + 1. `pip install onnxruntime-gpu==1.16.0` or `pip install onnxruntime==1.16.0` +- `--append_tags` option is added to `tag_images_by_wd_14_tagger.py`. This option appends the tags to the existing tags, instead of replacing them. [#858](https://github.com/kohya-ss/sd-scripts/pull/858) Thanks to a-l-e-x-d-s-9! +- [OFT](https://oft.wyliu.com/) is now supported. + - You can use `networks.oft` for the network module in `sdxl_train_network.py`. The usage is the same as `networks.lora`. Some options are not supported. + - `sdxl_gen_img.py` also supports OFT as `--network_module`. + - OFT only supports SDXL currently. Because current OFT tweaks Q/K/V and O in the transformer, and SD1/2 have extremely fewer transformers than SDXL. + - The implementation is heavily based on laksjdjf's [OFT implementation](https://github.com/laksjdjf/sd-trainer/blob/dev/networks/lora_modules.py). Thanks to laksjdjf! +- Other bug fixes and improvements. + +- `tag_images_by_wd_14_tagger.py` が Onnx をサポートしました。Onnx を使用する場合は TensorFlow は不要です。[#864](https://github.com/kohya-ss/sd-scripts/pull/864) Isotr0py氏に感謝します。 + - Onnxを使用する場合は、`--onnx` オプションを指定してください。 + - Onnx とその他の必要なパッケージをインストールしてください。 + 1. TensorFlow をアンインストールしてください。 + 1. `pip install tensorboard==2.14.1` protobufの指定バージョンにこれが必要。 + 1. 
`pip install protobuf==3.20.3` Onnxのために必要。 + 1. `pip install onnx==1.14.1` + 1. `pip install onnxruntime-gpu==1.16.0` または `pip install onnxruntime==1.16.0` +- `tag_images_by_wd_14_tagger.py` に `--append_tags` オプションが追加されました。このオプションを指定すると、既存のタグに上書きするのではなく、新しいタグのみが既存のタグに追加されます。 [#858](https://github.com/kohya-ss/sd-scripts/pull/858) a-l-e-x-d-s-9氏に感謝します。 +- [OFT](https://oft.wyliu.com/) をサポートしました。 + - `sdxl_train_network.py` の`--network_module`に `networks.oft` を指定してください。使用方法は `networks.lora` と同様ですが一部のオプションは未サポートです。 + - `sdxl_gen_img.py` でも同様に OFT を指定できます。 + - OFT は現在 SDXL のみサポートしています。OFT は現在 transformer の Q/K/V と O を変更しますが、SD1/2 は transfomer の数が SDXL よりも極端に少ないためです。 + - 実装は laksjdjf 氏の [OFT実装](https://github.com/laksjdjf/sd-trainer/blob/dev/networks/lora_modules.py) を多くの部分で参考にしています。laksjdjf 氏に感謝します。 +- その他のバグ修正と改善。 + ### Oct 1. 2023 / 2023/10/1 - SDXL training is now available in the main branch. The sdxl branch is merged into the main branch. diff --git a/requirements.txt b/requirements.txt index 75de48cb9..c27131cd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,11 +19,14 @@ huggingface-hub==0.15.1 # requests==2.28.2 # timm==0.6.12 # fairscale==0.4.13 -# for WD14 captioning (tensroflow or onnx) +# for WD14 captioning (tensorflow) # tensorflow==2.10.1 +# for WD14 captioning (onnx) # onnx==1.14.1 # onnxruntime-gpu==1.16.0 # onnxruntime==1.16.0 +# this is for onnx: +# protobuf==3.20.3 # open clip for SDXL open-clip-torch==2.20.0 # for kohya_ss library From 8b79e3b06c1f18d353c37706667de3224bca4f1c Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 9 Oct 2023 18:00:45 +0900 Subject: [PATCH 31/33] fix typos --- README.md | 2 +- networks/oft.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 974aeaeaa..5da6181be 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. 
See [docum - [OFT](https://oft.wyliu.com/) をサポートしました。 - `sdxl_train_network.py` の`--network_module`に `networks.oft` を指定してください。使用方法は `networks.lora` と同様ですが一部のオプションは未サポートです。 - `sdxl_gen_img.py` でも同様に OFT を指定できます。 - - OFT は現在 SDXL のみサポートしています。OFT は現在 transformer の Q/K/V と O を変更しますが、SD1/2 は transfomer の数が SDXL よりも極端に少ないためです。 + - OFT は現在 SDXL のみサポートしています。OFT は現在 transformer の Q/K/V と O を変更しますが、SD1/2 は transformer の数が SDXL よりも極端に少ないためです。 - 実装は laksjdjf 氏の [OFT実装](https://github.com/laksjdjf/sd-trainer/blob/dev/networks/lora_modules.py) を多くの部分で参考にしています。laksjdjf 氏に感謝します。 - その他のバグ修正と改善。 diff --git a/networks/oft.py b/networks/oft.py index ba05885cb..1d088f877 100644 --- a/networks/oft.py +++ b/networks/oft.py @@ -28,7 +28,7 @@ def __init__( ): """ dim -> num blocks - alpha -> constrait + alpha -> constraint """ super().__init__() self.oft_name = oft_name From 72494a668755e168a402bd8c927dbbe762677a60 Mon Sep 17 00:00:00 2001 From: Sam McLeod Date: Tue, 10 Oct 2023 12:46:27 +1100 Subject: [PATCH 32/33] fix: linting, spelling --- README.md | 76 +++++++++---------- converted_markdown.md | 2 +- fine_tune_README.md | 97 +++++++++++++++++-------- library/class_source_model.py | 4 +- library/common_gui.py | 4 +- library/svd_merge_lora_gui.py | 2 +- localizations/en-GB.json | 24 ++++++ networks/extract_lora_from_models.py | 4 +- setup/setup_common.py | 2 +- test/config/finetune-AdamW.json | 4 +- tools/blip2-for-sd/README.md | 2 +- tools/blip2-for-sd/caption_processor.py | 2 +- train_db_README.md | 14 ++++ train_network_README.md | 10 +-- train_ti_README.md | 7 +- 15 files changed, 166 insertions(+), 88 deletions(-) create mode 100644 localizations/en-GB.json diff --git a/README.md b/README.md index 93f2ced07..68b30a3f9 100644 --- a/README.md +++ b/README.md @@ -83,62 +83,62 @@ The GUI allows you to set the training parameters and generate and run the requi ### About SDXL training -The feature of SDXL training is now available in sdxl branch as an experimental feature. +The feature of SDXL training is now available in sdxl branch as an experimental feature. -Sep 3, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. +Sep 3, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. - ControlNet-LLLite is added. See [documentation](./docs/train_lllite_README.md) for details. -- JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) +- JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) - Peak memory usage is reduced. [#791](https://github.com/kohya-ss/sd-scripts/pull/791) - Input perturbation noise is added. See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details. - Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`. - Other minor changes. - Thanks for contributions from Isotr0py, vvern999, lansing and others! -Aug 13, 2023: +Aug 13, 2023: - LoRA-FA is added experimentally. Specify `--network_module networks.lora_fa` option instead of `--network_module networks.lora`. The trained model can be used as a normal LoRA model. -Aug 12, 2023: +Aug 12, 2023: - The default value of noise offset when omitted has been changed to 0 from 0.0357. - The different learning rates for each U-Net block are now supported. Specify with `--block_lr` option. 
Specify 23 values separated by commas like `--block_lr 1e-3,1e-3 ... 1e-3`. - 23 values correspond to `0: time/label embed, 1-9: input blocks 0-8, 10-12: mid blocks 0-2, 13-21: output blocks 0-8, 22: out`. -Aug 6, 2023: +Aug 6, 2023: - [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet. - - The main items are set automatically. + - The main items are set automatically. - You can set title, author, description, license and tags with `--metadata_xxx` options in each training script. - Merging scripts also support minimum SAI Model Spec metadata. See the help message for the usage. - Metadata editor will be available soon. - SDXL LoRA has `sdxl_base_v1-0` now for `ss_base_model_version` metadata item, instead of `v0-9`. -Aug 4, 2023: +Aug 4, 2023: -- `bitsandbytes` is now optional. Please install it if you want to use it. The insructions are in the later section. -- `albumentations` is not required anymore. +- `bitsandbytes` is now optional. Please install it if you want to use it. The instructions are in the later section. +- `albumentations` is not required any more. - An issue for pooled output for Textual Inversion training is fixed. - `--v_pred_like_loss ratio` option is added. This option adds the loss like v-prediction loss in SDXL training. `0.1` means that the loss is added 10% of the v-prediction loss. The default value is None (disabled). - In v-prediction, the loss is higher in the early timesteps (near the noise). This option can be used to increase the loss in the early timesteps. - Arbitrary options can be used for Diffusers' schedulers. For example `--lr_scheduler_args "lr_end=1e-8"`. - `sdxl_gen_imgs.py` supports batch size > 1. -- Fix ControlNet to work with attention couple and reginal LoRA in `gen_img_diffusers.py`. +- Fix ControlNet to work with attention couple and regional LoRA in `gen_img_diffusers.py`. Summary of the feature: -- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance. +- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance. - The options are almost the same as `sdxl_train.py'. See the help message for the usage. - Please launch the script as follows: `accelerate launch --num_cpu_threads_per_process 1 tools/cache_latents.py ...` - This script should work with multi-GPU, but it is not tested in my environment. -- `tools/cache_text_encoder_outputs.py` is added. This script can be used to cache the text encoder outputs to disk in advance. +- `tools/cache_text_encoder_outputs.py` is added. This script can be used to cache the text encoder outputs to disk in advance. - The options are almost the same as `cache_latents.py' and `sdxl_train.py'. See the help message for the usage. - `sdxl_train.py` is a script for SDXL fine-tuning. The usage is almost the same as `fine_tune.py`, but it also supports DreamBooth dataset. - `--full_bf16` option is added. Thanks to KohakuBlueleaf! - - This option enables the full bfloat16 training (includes gradients). This option is useful to reduce the GPU memory usage. + - This option enables the full bfloat16 training (includes gradients). This option is useful to reduce the GPU memory usage. - However, bitsandbytes==0.35 doesn't seem to support this. Please use a newer version of bitsandbytes or another optimizer. - I cannot find bitsandbytes>0.35.0 that works correctly on Windows. - In addition, the full bfloat16 training might be unstable. 
Please use it at your own risk. @@ -159,11 +159,11 @@ Summary of the feature: 1. Training with captions. All captions must include the token string. The token string is replaced with multiple tokens. 2. Use `--use_object_template` or `--use_style_template` option. The captions are generated from the template. The existing captions are ignored. - See below for the format of the embeddings. - + - `sdxl_gen_img.py` is added. This script can be used to generate images with SDXL, including LoRA. See the help message for the usage. - Textual Inversion is supported, but the name for the embeds in the caption becomes alphabet only. For example, `neg_hand_v1.safetensors` can be activated with `neghandv`. -`requirements.txt` is updated to support SDXL training. +`requirements.txt` is updated to support SDXL training. #### Tips for SDXL training @@ -184,6 +184,7 @@ Summary of the feature: - `--bucket_reso_steps` can be set to 32 instead of the default value 64. Smaller values than 32 will not work for SDXL training. Example of the optimizer settings for Adafactor with the fixed learning rate: + ```toml optimizer_type = "adafactor" optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] @@ -204,7 +205,6 @@ I would like to express my gratitude to camendutu for their valuable contributio | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------- | | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/kohya_ss-colab/blob/main/kohya_ss_colab.ipynb) | kohya_ss_gui_colab | - ## Installation ### Windows @@ -227,17 +227,17 @@ To set up the project, follow these steps: 1. Open a terminal and navigate to the desired installation directory. 2. Clone the repository by running the following command: - ``` + ```shell git clone https://github.com/bmaltais/kohya_ss.git ``` 3. Change into the `kohya_ss` directory: - ``` + ```shell cd kohya_ss ``` 4. Run the setup script by executing the following command: - ``` + ```shell .\setup.bat ``` @@ -260,7 +260,7 @@ Please note that the CUDNN 8.6 DLLs needed for this process cannot be hosted on To install the necessary dependencies on a Linux system, ensure that you fulfill the following requirements: - Ensure that `venv` support is pre-installed. You can install it on Ubuntu 22.04 using the command: - ``` + ```shell apt install python3.10-venv ``` @@ -269,7 +269,7 @@ To install the necessary dependencies on a Linux system, ensure that you fulfill - Make sure you have Python version 3.10.6 or higher (but lower than 3.11.0) installed on your system. - If you are using WSL2, set the `LD_LIBRARY_PATH` environment variable by executing the following command: - ``` + ```shell export LD_LIBRARY_PATH=/usr/lib/wsl/lib/ ``` @@ -280,22 +280,22 @@ To set up the project on Linux or macOS, perform the following steps: 1. Open a terminal and navigate to the desired installation directory. 2. Clone the repository by running the following command: - ``` + ```shell git clone https://github.com/bmaltais/kohya_ss.git ``` 3. Change into the `kohya_ss` directory: - ``` + ```shell cd kohya_ss ``` 4. If you encounter permission issues, make the `setup.sh` script executable by running the following command: - ``` + ```shell chmod +x ./setup.sh ``` 5. 
Run the setup script by executing the following command: - ``` + ```shell ./setup.sh ``` @@ -310,6 +310,7 @@ For macOS and other non-Linux systems, the installation process will attempt to If you choose to use the interactive mode, the default values for the accelerate configuration screen will be "This machine," "None," and "No" for the remaining questions. These default answers are the same as the Windows installation. ### Runpod + #### Manual installation To install the necessary components for Runpod and run kohya_ss, follow these steps: @@ -319,25 +320,25 @@ To install the necessary components for Runpod and run kohya_ss, follow these st 2. SSH into the Runpod. 3. Clone the repository by running the following command: - ``` + ```shell cd /workspace git clone https://github.com/bmaltais/kohya_ss.git ``` 4. Run the setup script: - ``` + ```shell cd kohya_ss ./setup-runpod.sh ``` 5. Run the gui with: - ``` + ```shell ./gui.sh --share --headless ``` or with this if you expose 7860 directly via the runpod configuration - ``` + ```shell ./gui.sh --listen=0.0.0.0 --headless ``` @@ -355,6 +356,7 @@ To run from a pre-built Runpod template you can: ### Docker + #### Local docker build If you prefer to use Docker, follow the instructions below: @@ -546,7 +548,7 @@ The documentation in this section will be moved to a separate document later. - `sdxl_train.py` is a script for SDXL fine-tuning. The usage is almost the same as `fine_tune.py`, but it also supports DreamBooth dataset. - `--full_bf16` option is added. Thanks to KohakuBlueleaf! - - This option enables the full bfloat16 training (includes gradients). This option is useful to reduce the GPU memory usage. + - This option enables the full bfloat16 training (includes gradients). This option is useful to reduce the GPU memory usage. - The full bfloat16 training might be unstable. Please use it at your own risk. - The different learning rates for each U-Net block are now supported in sdxl_train.py. Specify with `--block_lr` option. Specify 23 values separated by commas like `--block_lr 1e-3,1e-3 ... 1e-3`. - 23 values correspond to `0: time/label embed, 1-9: input blocks 0-8, 10-12: mid blocks 0-2, 13-21: output blocks 0-8, 22: out`. @@ -571,13 +573,13 @@ The documentation in this section will be moved to a separate document later. ### Utility scripts for SDXL -- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance. +- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance. - The options are almost the same as `sdxl_train.py'. See the help message for the usage. - Please launch the script as follows: `accelerate launch --num_cpu_threads_per_process 1 tools/cache_latents.py ...` - This script should work with multi-GPU, but it is not tested in my environment. -- `tools/cache_text_encoder_outputs.py` is added. This script can be used to cache the text encoder outputs to disk in advance. +- `tools/cache_text_encoder_outputs.py` is added. This script can be used to cache the text encoder outputs to disk in advance. - The options are almost the same as `cache_latents.py` and `sdxl_train.py`. See the help message for the usage. - `sdxl_gen_img.py` is added. This script can be used to generate images with SDXL, including LoRA, Textual Inversion and ControlNet-LLLite. See the help message for the usage. @@ -601,6 +603,7 @@ The documentation in this section will be moved to a separate document later. 
- `--bucket_reso_steps` can be set to 32 instead of the default value 64. Smaller values than 32 will not work for SDXL training. Example of the optimizer settings for Adafactor with the fixed learning rate: + ```toml optimizer_type = "adafactor" optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ] @@ -622,13 +625,12 @@ save_file(state_dict, file) ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [documentation](./docs/train_lllite_README.md) for details. - ## Change History * 2023/10/01 (v22.0.0) - Merging main branch of sd-scripts: - [SAI Model Spec](https://github.com/Stability-AI/ModelSpec) metadata is now supported partially. `hash_sha256` is not supported yet. - - The main items are set automatically. + - The main items are set automatically. - You can set title, author, description, license and tags with `--metadata_xxx` options in each training script. - Merging scripts also support minimum SAI Model Spec metadata. See the help message for the usage. - Metadata editor will be available soon. @@ -639,7 +641,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum - Arbitrary options can be used for Diffusers' schedulers. For example `--lr_scheduler_args "lr_end=1e-8"`. - LoRA-FA is added experimentally. Specify `--network_module networks.lora_fa` option instead of `--network_module networks.lora`. The trained model can be used as a normal LoRA model. - - JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) + - JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) - Input perturbation noise is added. See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details. - Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`. - Intel ARC support with IPEX is added. [#825](https://github.com/kohya-ss/sd-scripts/pull/825) @@ -651,7 +653,7 @@ ControlNet-LLLite, a novel method for ControlNet with SDXL, is added. See [docum * 2023/09/23 (v21.8.10) - Minor point upgrade. Mostly adding a new preset. - + * 2023/08/05 (v21.8.9) - Update sd-script to caode as of Sept 3 2023 * ControlNet-LLLite is added. See documentation for details. diff --git a/converted_markdown.md b/converted_markdown.md index 23dc53753..684af19ad 100644 --- a/converted_markdown.md +++ b/converted_markdown.md @@ -903,7 +903,7 @@ US&client=webapp&u=https://d.hatena.ne.jp/keyword/%25A5%25CB%25A5%25E5%25A1%25BC このパラメータ値は常に25個の数字を指定しなければいけませんが、LoRAはAttentionブロックを学習対象としているので、Attentionブロックの存在しないIN0、IN3、IN6、IN9、IN10、IN11、OUT0、IN1、IN2に対する設定(1、4、7、11、12、14、15、16番目の数字)は学習時は無視されます。 -※上級者向け設定です。こだわりがないなら空欄のままで構いません。ここを指定しない場合は「Network Rank(Dimention)」値と「Network +※上級者向け設定です。こだわりがないなら空欄のままで構いません。ここを指定しない場合は「Network Rank(Dimension)」値と「Network Alpha」値がすべてのブロックに適応されます。 diff --git a/fine_tune_README.md b/fine_tune_README.md index 7ffd05d4a..696360a90 100644 --- a/fine_tune_README.md +++ b/fine_tune_README.md @@ -1,6 +1,9 @@ +# Fine tuning + It is a fine tuning that corresponds to NovelAI's proposed learning method, automatic captioning, tagging, Windows + VRAM 12GB (for v1.4/1.5) environment, etc. -## overview +## Overview + Fine tuning of U-Net of Stable Diffusion using Diffusers. It corresponds to the following improvements in NovelAI's article (For Aspect Ratio Bucketing, I referred to NovelAI's code, but the final code is all original). 
* Use the output of the penultimate layer instead of the last layer of CLIP (Text Encoder). @@ -14,18 +17,22 @@ Fine tuning of U-Net of Stable Diffusion using Diffusers. It corresponds to the Text Encoder is not trained by default. For fine tuning of the whole model, it seems common to learn only U-Net (NovelAI seems to be the same). Text Encoder can also be learned as an option. ## Additional features + ### Change CLIP output + CLIP (Text Encoder) converts the text into features in order to reflect the prompt in the image. Stable diffusion uses the output of the last layer of CLIP, but you can change it to use the output of the penultimate layer. According to NovelAI, this will reflect prompts more accurately. It is also possible to use the output of the last layer as is. *Stable Diffusion 2.0 uses the penultimate layer by default. Do not specify the clip_skip option. ### Training in non-square resolutions + Stable Diffusion is trained at 512\*512, but also at resolutions such as 256\*1024 and 384\*640. It is expected that this will reduce the cropped portion and learn the relationship between prompts and images more correctly. The learning resolution is adjusted vertically and horizontally in units of 64 pixels within a range that does not exceed the resolution area (= memory usage) given as a parameter. In machine learning, it is common to unify all input sizes, but there are no particular restrictions, and in fact it is okay as long as they are unified within the same batch. NovelAI's bucketing seems to refer to classifying training data in advance for each learning resolution according to the aspect ratio. And by creating a batch with the images in each bucket, the image size of the batch is unified. ### Extending token length from 75 to 225 + Stable diffusion has a maximum of 75 tokens (77 tokens including the start and end), but we will extend it to 225 tokens. However, the maximum length that CLIP accepts is 75 tokens, so in the case of 225 tokens, we simply divide it into thirds, call CLIP, and then concatenate the results. @@ -49,6 +56,7 @@ For example, store an image like this: ![Teacher data folder screenshot](https://user-images.githubusercontent.com/52813779/208907739-8e89d5fa-6ca8-4b60-8927-f484d2a9ae04.png) ## Automatic captioning + Skip if you just want to learn tags without captions. Also, when preparing captions manually, prepare them in the same directory as the teacher data image, with the same file name, extension .caption, etc. Each file should be a text file with only one line. @@ -59,13 +67,13 @@ The latest version no longer requires BLIP downloads, weight downloads, and addi Run make_captions.py in the finetune folder. -``` +```shell python finetune\make_captions.py --batch_size ``` If the batch size is 8 and the training data is placed in the parent folder train_data, it will be as follows. -``` +```shell python finetune\make_captions.py --batch_size 8 ..\train_data ``` @@ -90,11 +98,13 @@ For example, with captions like: ![captions and images](https://user-images.githubusercontent.com/52813779/208908947-af936957-5d73-4339-b6c8-945a52857373.png) ## Tagged by DeepDanbooru + If you do not want to tag the danbooru tag itself, please proceed to "Preprocessing of caption and tag information". Tagging is done with DeepDanbooru or WD14Tagger. WD14Tagger seems to be more accurate. If you want to tag with WD14Tagger, skip to the next chapter. 
### Environmental arrangement + Clone DeepDanbooru https://github.com/KichangKim/DeepDanbooru into your working folder, or download the zip and extract it. I unzipped it. Also, download deepdanbooru-v3-20211112-sgd-e28.zip from Assets of "DeepDanbooru Pretrained Model v3-20211112-sgd-e28" on the DeepDanbooru Releases page https://github.com/KichangKim/DeepDanbooru/releases and extract it to the DeepDanbooru folder. @@ -108,28 +118,29 @@ Make a directory structure like this Install the necessary libraries for the Diffusers environment. Go to the DeepDanbooru folder and install it (I think it's actually just adding tensorflow-io). -``` +```shell pip install -r requirements.txt ``` Next, install DeepDanbooru itself. -``` +```shell pip install . ``` This completes the preparation of the environment for tagging. ### Implementing tagging + Go to DeepDanbooru's folder and run deepdanbooru to tag. -``` +```shell deepdanbooru evaluate --project-path deepdanbooru-v3-20211112-sgd-e28 --allow-folder --save-txt ``` If you put the training data in the parent folder train_data, it will be as follows. -``` +```shell deepdanbooru evaluate ../train_data --project-path deepdanbooru-v3-20211112-sgd-e28 --allow-folder --save-txt ``` @@ -146,6 +157,7 @@ A tag is attached like this (great amount of information...). ![Deep Danbooru tag and image](https://user-images.githubusercontent.com/52813779/208909908-a7920174-266e-48d5-aaef-940aba709519.png) ## Tagging with WD14Tagger + This procedure uses WD14Tagger instead of DeepDanbooru. Use the tagger used in Mr. Automatic1111's WebUI. I referred to the information on this github page (https://github.com/toriato/stable-diffusion-webui-wd14-tagger#mrsmilingwolfs-model-aka-waifu-diffusion-14-tagger). @@ -153,13 +165,16 @@ Use the tagger used in Mr. Automatic1111's WebUI. I referred to the information The modules required for the initial environment maintenance have already been installed. Weights are automatically downloaded from Hugging Face. ### Implementing tagging + Run the script to do the tagging. -``` + +```shell python tag_images_by_wd14_tagger.py --batch_size ``` If you put the training data in the parent folder train_data, it will be as follows. -``` + +```shell python tag_images_by_wd14_tagger.py --batch_size 4 ..\train_data ``` @@ -188,7 +203,7 @@ Combine captions and tags into a single file as metadata for easy processing fro To put captions into the metadata, run the following in your working folder (if you don't use captions for learning, you don't need to run this) (it's actually a single line, and so on). -``` +```shell python merge_captions_to_metadata.py --in_json @@ -197,7 +212,7 @@ python merge_captions_to_metadata.py The metadata file name is an arbitrary name. If the training data is train_data, there is no metadata file to read, and the metadata file is meta_cap.json, it will be as follows. -``` +```shell python merge_captions_to_metadata.py train_data meta_cap.json ``` @@ -205,7 +220,7 @@ You can specify the caption extension with the caption_extension option. If there are multiple teacher data folders, please specify the full_path argument (metadata will have full path information). Then run it for each folder. 
-``` +```shell python merge_captions_to_metadata.py --full_path train_data1 meta_cap1.json python merge_captions_to_metadata.py --full_path --in_json meta_cap1.json @@ -219,20 +234,22 @@ __*It is safe to rewrite the in_json option and the write destination each time ### Tag preprocessing Similarly, tags are also collected in metadata (no need to do this if tags are not used for learning). -``` + +```shell python merge_dd_tags_to_metadata.py --in_json ``` With the same directory structure as above, when reading meta_cap.json and writing to meta_cap_dd.json, it will be as follows. -``` + +```shell python merge_dd_tags_to_metadata.py train_data --in_json meta_cap.json meta_cap_dd.json ``` If you have multiple teacher data folders, please specify the full_path argument. Then run it for each folder. -``` +```shell python merge_dd_tags_to_metadata.py --full_path --in_json meta_cap2.json train_data1 meta_cap_dd1.json python merge_dd_tags_to_metadata.py --full_path --in_json meta_cap_dd1.json @@ -244,6 +261,7 @@ If in_json is omitted, if there is a write destination metadata file, it will be __*It is safe to rewrite the in_json option and the write destination each time and write to a separate metadata file. __ ### Cleaning captions and tags + Up to this point, captions and DeepDanbooru tags have been put together in the metadata file. However, captions with automatic captioning are subtle due to spelling variations (*), and tags include underscores and ratings (in the case of DeepDanbooru), so the editor's replacement function etc. You should use it to clean your captions and tags. *For example, when learning a girl in an anime picture, there are variations in captions such as girl/girls/woman/women. Also, it may be more appropriate to simply use "girl" for things like "anime girl". @@ -252,13 +270,13 @@ A script for cleaning is provided, so please edit the contents of the script acc (It is no longer necessary to specify the teacher data folder. All data in the metadata will be cleaned.) -``` +```shell python clean_captions_and_tags.py ``` Please note that --in_json is not included. For example: -``` +```shell python clean_captions_and_tags.py meta_cap_dd.json meta_clean.json ``` @@ -269,7 +287,8 @@ Preprocessing of captions and tags is now complete. In order to speed up the learning, we acquire the latent representation of the image in advance and save it to disk. At the same time, bucketing (classifying the training data according to the aspect ratio) is performed. In your working folder, type: -``` + +```shell python prepare_buckets_latents.py @@ -280,7 +299,7 @@ python prepare_buckets_latents.py If the model is model.ckpt, batch size 4, training resolution is 512\*512, precision is no (float32), read metadata from meta_clean.json and write to meta_lat.json: -``` +```shell python prepare_buckets_latents.py train_data meta_clean.json meta_lat.json model.ckpt --batch_size 4 --max_resolution 512,512 --mixed_precision no @@ -294,7 +313,7 @@ You can specify the minimum resolution size with the --min_bucket_reso option an If you increase the resolution to something like 768\*768, you should specify something like 1280 for the maximum size. If you specify the --flip_aug option, it will perform horizontal flip augmentation (data augmentation). You can artificially double the amount of data, but if you specify it when the data is not left-right symmetrical (for example, character appearance, hairstyle, etc.), learning will not go well. 
-(This is a simple implementation that acquires the latents for the flipped image and saves the \*\_flip.npz file. No options are required for fline_tune.py. If there is a file with \_flip, Randomly load a file without +(This is a simple implementation that acquires the latents for the flipped image and saves the \*\_flip.npz file. No options are required for fine_tune.py. If there is a file with \_flip, Randomly load a file without The batch size may be increased a little more even with 12GB of VRAM. The resolution is a number divisible by 64, and is specified by "width, height". The resolution is directly linked to the memory size during fine tuning. 512,512 seems to be the limit with VRAM 12GB (*). 16GB may be raised to 512,704 or 512,768. Even with 256, 256, etc., it seems to be difficult with 8GB of VRAM (because parameters and optimizers require a certain amount of memory regardless of resolution). @@ -306,24 +325,26 @@ The result of bucketing is displayed as follows. ![bucketing result](https://user-images.githubusercontent.com/52813779/208911419-71c00fbb-2ce6-49d5-89b5-b78d7715e441.png) If you have multiple teacher data folders, please specify the full_path argument. Then run it for each folder. -``` + +```shell python prepare_buckets_latents.py --full_path train_data1 meta_clean.json meta_lat1.json model.ckpt --batch_size 4 --max_resolution 512,512 --mixed_precision no python prepare_buckets_latents.py --full_path train_data2 meta_lat1.json meta_lat2.json model.ckpt - --batch_size 4 --max_resolution 512,512 --mixed_precision no - + --batch_size 4 --max_resolution 512,512 --mixed_precision no\ ``` + It is possible to make the read source and write destination the same, but separate is safer. __*It is safe to rewrite the argument each time and write it to a separate metadata file. __ - ## Run training + For example: Below are the settings for saving memory. -``` + +```shell accelerate launch --num_cpu_threads_per_process 8 fine_tune.py --pretrained_model_name_or_path=model.ckpt --in_json meta_lat.json @@ -364,19 +385,22 @@ Specifies whether to use mixed precision with mixed_precision. Specifying "fp16" "fp16" and "bf16" use almost the same amount of memory, and it is said that bf16 has better learning results (I didn't feel much difference in the range I tried). If "no" is specified, it will not be used (it will be float32). -* It seems that an error will occur when reading checkpoints learned with bf16 with Mr. AUTOMATIC1111's Web UI. This seems to be because the data type bfloat16 causes an error in the Web UI model safety checker. Save in fp16 or float32 format with the save_precision option. Or it seems to be good to store it in safetytensors format. +* It seems that an error will occur when reading checkpoints learned with bf16 with Mr. AUTOMATIC1111's Web UI. This seems to be because the data type bfloat16 causes an error in the Web UI model safety checker. Save in fp16 or float32 format with the save_precision option. Or it seems to be good to store it in safetensors format. Specifying save_every_n_epochs will save the model being trained every time that many epochs have passed. ### Supports Stable Diffusion 2.0 + Specify the --v2 option when using Hugging Face's stable-diffusion-2-base, and specify both --v2 and --v_parameterization options when using stable-diffusion-2 or 768-v-ema.ckpt please. ### Increase accuracy and speed when memory is available + First, removing gradient_checkpointing will speed it up. 
However, the batch size that can be set is reduced, so please set while looking at the balance between accuracy and speed. Increasing the batch size increases speed and accuracy. Increase the speed while checking the speed per data within the range where the memory is sufficient (the speed may actually decrease when the memory is at the limit). ### Change CLIP output used + Specifying 2 for the clip_skip option uses the output of the next-to-last layer. If 1 or option is omitted, the last layer is used. The learned model should be able to be inferred by Automatic1111's web UI. @@ -387,26 +411,31 @@ If the model being trained was originally trained to use the second layer, 2 is If you were using the last layer instead, the entire model would have been trained on that assumption. Therefore, if you train again using the second layer, you may need a certain number of teacher data and longer learning to obtain the desired learning result. ### Extending Token Length + You can learn by extending the token length by specifying 150 or 225 for max_token_length. The learned model should be able to be inferred by Automatic1111's web UI. As with clip_skip, learning with a length different from the learning state of the model may require a certain amount of teacher data and a longer learning time. ### Save learning log + Specify the log save destination folder in the logging_dir option. Logs in TensorBoard format are saved. For example, if you specify --logging_dir=logs, a logs folder will be created in your working folder, and logs will be saved in the date/time folder. Also, if you specify the --log_prefix option, the specified string will be added before the date and time. Use "--logging_dir=logs --log_prefix=fine_tune_style1" for identification. To check the log with TensorBoard, open another command prompt and enter the following in the working folder (I think tensorboard is installed when Diffusers is installed, but if it is not installed, pip install Please put it in tensorboard). -``` + +```shell tensorboard --logdir=logs ``` ### Learning Hypernetworks + It will be explained in another article. ### Learning with fp16 gradient (experimental feature) + The full_fp16 option will change the gradient from normal float32 to float16 (fp16) and learn (it seems to be full fp16 learning instead of mixed precision). As a result, it seems that the SD1.x 512*512 size can be learned with a VRAM usage of less than 8GB, and the SD2.x 512*512 size can be learned with a VRAM usage of less than 12GB. Specify fp16 in advance in accelerate config and optionally set mixed_precision="fp16" (does not work with bf16). @@ -419,32 +448,39 @@ It is realized by patching the PyTorch source (confirmed with PyTorch 1.12.1 and ### Other Options #### keep_tokens + If a number is specified, the specified number of tokens (comma-separated strings) from the beginning of the caption are fixed without being shuffled. If there are both captions and tags, the prompts during learning will be concatenated like "caption, tag 1, tag 2...", so if you set "--keep_tokens=1", the caption will always be at the beginning during learning. will come. #### dataset_repeats + If the number of data sets is extremely small, the epoch will end soon (it will take some time at the epoch break), so please specify a numerical value and multiply the data by some to make the epoch longer. #### train_text_encoder + Text Encoder is also a learning target. Slightly increased memory usage. 
In normal fine tuning, the Text Encoder is not targeted for training (probably because U-Net is trained to follow the output of the Text Encoder), but if the number of training data is small, the Text Encoder is trained like DreamBooth. also seems to be valid. #### save_precision + The data format when saving checkpoints can be specified from float, fp16, and bf16 (if not specified, it is the same as the data format during learning). It saves disk space, but the model produces different results. Also, if you specify float or fp16, you should be able to read it on Mr. 1111's Web UI. *For VAE, the data format of the original checkpoint will remain, so the model size may not be reduced to a little over 2GB even with fp16. #### save_model_as + Specify the save format of the model. Specify one of ckpt, safetensors, diffusers, diffusers_safetensors. When reading Stable Diffusion format (ckpt or safetensors) and saving in Diffusers format, missing information is supplemented by dropping v1.5 or v2.1 information from Hugging Face. #### use_safetensors -This option saves checkpoints in safetyensors format. The save format will be the default (same format as loaded). + +This option saves checkpoints in safetensors format. The save format will be the default (same format as loaded). #### save_state and resume + The save_state option saves the learning state of the optimizer, etc. in addition to the checkpoint in the folder when saving midway and at the final save. This avoids a decrease in accuracy when learning is resumed after being interrupted (since the optimizer optimizes while having a state, if the state is reset, the optimization must be performed again from the initial state. not). Note that the number of steps is not saved due to Accelerate specifications. When starting the script, you can resume by specifying the folder where the state is saved with the resume option. @@ -452,14 +488,17 @@ When starting the script, you can resume by specifying the folder where the stat Please note that the learning state will be about 5 GB per save, so please be careful of the disk capacity. #### gradient_accumulation_steps + Updates the gradient in batches for the specified number of steps. Has a similar effect to increasing the batch size, but consumes slightly more memory. *The Accelerate specification does not support multiple learning models, so if you set Text Encoder as the learning target and specify a value of 2 or more for this option, an error may occur. #### lr_scheduler / lr_warmup_steps + You can choose the learning rate scheduler from linear, cosine, cosine_with_restarts, polynomial, constant, constant_with_warmup with the lr_scheduler option. Default is constant. With lr_warmup_steps, you can specify the number of steps to warm up the scheduler (gradually changing the learning rate). Please do your own research for details. #### diffusers_xformers -Uses Diffusers' xformers feature rather than the script's own xformers replacement feature. Hypernetwork learning is no longer possible. \ No newline at end of file + +Uses Diffusers' xformers feature rather than the script's own xformers replacement feature. Hypernetwork learning is no longer possible. 
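As a rough illustration of how the saving, resuming and scheduler options above might be combined, below is a sketch of restarting an interrupted fine tuning run. It is only an example: all paths, folder names and numeric values are placeholders, and the exact argument set should be checked against `fine_tune.py --help`.

```shell
# Hypothetical example combining the options described above.
# Paths, folder names and numeric values are placeholders.
accelerate launch --num_cpu_threads_per_process 8 fine_tune.py \
    --pretrained_model_name_or_path=model.ckpt \
    --in_json meta_lat.json \
    --train_data_dir=train_data \
    --output_dir=fine_tuned \
    --save_model_as=safetensors --save_precision=fp16 \
    --save_state --save_every_n_epochs=2 \
    --resume=fine_tuned/last-state \
    --gradient_accumulation_steps=2 \
    --lr_scheduler=constant_with_warmup --lr_warmup_steps=100
```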
diff --git a/library/class_source_model.py b/library/class_source_model.py index 938c61fe1..041ed647d 100644 --- a/library/class_source_model.py +++ b/library/class_source_model.py @@ -33,8 +33,8 @@ def __init__( label='Model Quick Pick', choices=[ 'custom', - # 'stabilityai/stable-diffusion-xl-base-0.9', - # 'stabilityai/stable-diffusion-xl-refiner-0.9', + 'stabilityai/stable-diffusion-xl-base-1.0', + 'stabilityai/stable-diffusion-xl-refiner-1.0', 'stabilityai/stable-diffusion-2-1-base/blob/main/v2-1_512-ema-pruned', 'stabilityai/stable-diffusion-2-1-base', 'stabilityai/stable-diffusion-2-base', diff --git a/library/common_gui.py b/library/common_gui.py index 5d9183229..8393642c3 100644 --- a/library/common_gui.py +++ b/library/common_gui.py @@ -41,8 +41,8 @@ # define a list of substrings to search for SDXL base models SDXL_MODELS = [ - 'stabilityai/stable-diffusion-xl-base-0.9', - 'stabilityai/stable-diffusion-xl-refiner-0.9', + 'stabilityai/stable-diffusion-xl-base-1.0', + 'stabilityai/stable-diffusion-xl-refiner-1.0', ] # define a list of substrings to search for diff --git a/library/svd_merge_lora_gui.py b/library/svd_merge_lora_gui.py index 781c1dcfc..27d670328 100644 --- a/library/svd_merge_lora_gui.py +++ b/library/svd_merge_lora_gui.py @@ -41,7 +41,7 @@ def svd_merge_lora( print(f"Output file '{save_to}' already exists. Aborting.") return - # Check if the ratio total is equal to one. If not mormalise to 1 + # Check if the ratio total is equal to one. If not normalise to 1 total_ratio = ratio_a + ratio_b + ratio_c + ratio_d if total_ratio != 1: ratio_a /= total_ratio diff --git a/localizations/en-GB.json b/localizations/en-GB.json new file mode 100644 index 000000000..9238bcb94 --- /dev/null +++ b/localizations/en-GB.json @@ -0,0 +1,24 @@ +{ + "analyze": "analyse", + "behavior": "behaviour", + "color": "colour", + "flavor": "flavour", + "honor": "honour", + "humor": "humour", + "localization": "localisation", + "localize": "localise", + "neighbor": "neighbour", + "offense": "offence", + "oriented": "orientated", + "practice": "practise", + "pretense": "pretence", + "program": "programme", + "recognize": "recognise", + "regularization": "regularisation", + "savior": "saviour", + "signaling": "signalling", + "specialization": "specialisation", + "stabilization": "stabilisation", + "standardization": "standardisation", + "utilize": "utilise" +} \ No newline at end of file diff --git a/networks/extract_lora_from_models.py b/networks/extract_lora_from_models.py index 7bdfceafb..c948d5b15 100644 --- a/networks/extract_lora_from_models.py +++ b/networks/extract_lora_from_models.py @@ -252,13 +252,13 @@ def setup_parser() -> argparse.ArgumentParser: "--clamp_quantile", type=float, default=1, - help="Quantile clamping value, float, (0-1). Defailt = 1", + help="Quantile clamping value, float, (0-1). Default = 1", ) parser.add_argument( "--min_diff", type=float, default=1, - help="Minimum difference betwen finetuned model and base to consider them different enough to extract, float, (0-1). Defailt = 0.01", + help="Minimum difference between finetuned model and base to consider them different enough to extract, float, (0-1). 
Default = 0.01", ) parser.add_argument( "--no_metadata", diff --git a/setup/setup_common.py b/setup/setup_common.py index 8d94ca9f3..00dd3af85 100644 --- a/setup/setup_common.py +++ b/setup/setup_common.py @@ -373,7 +373,7 @@ def process_requirements_line(line, show_stdout: bool = False): def install_requirements(requirements_file, check_no_verify_flag=False, show_stdout: bool = False): if check_no_verify_flag: - log.info(f'Verifying modules instalation status from {requirements_file}...') + log.info(f'Verifying modules installation status from {requirements_file}...') else: log.info(f'Installing modules from {requirements_file}...') with open(requirements_file, 'r', encoding='utf8') as f: diff --git a/test/config/finetune-AdamW.json b/test/config/finetune-AdamW.json index d3128ae82..c4ddbe235 100644 --- a/test/config/finetune-AdamW.json +++ b/test/config/finetune-AdamW.json @@ -37,7 +37,7 @@ "min_bucket_reso": "256", "min_snr_gamma": 0, "mixed_precision": "bf16", - "model_list": "stabilityai/stable-diffusion-xl-base-0.9", + "model_list": "stabilityai/stable-diffusion-xl-base-1.0", "multires_noise_discount": 0, "multires_noise_iterations": 0, "noise_offset": 0, @@ -48,7 +48,7 @@ "output_dir": "./test/output", "output_name": "test_ft", "persistent_data_loader_workers": false, - "pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-base-0.9", + "pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0", "random_crop": false, "resume": "", "sample_every_n_epochs": 0, diff --git a/tools/blip2-for-sd/README.md b/tools/blip2-for-sd/README.md index 0d0b074d2..286d28159 100644 --- a/tools/blip2-for-sd/README.md +++ b/tools/blip2-for-sd/README.md @@ -4,7 +4,7 @@ source: https://github.com/Talmendo/blip2-for-sd Simple script to make BLIP2 output image description in a format suitable for Stable Diffusion. -Format followd is roughly +Format followed is roughly `[STYLE OF PHOTO] photo of a [SUBJECT], [IMPORTANT FEATURE], [MORE DETAILS], [POSE OR ACTION], [FRAMING], [SETTING/BACKGROUND], [LIGHTING], [CAMERA ANGLE], [CAMERA PROPERTIES],in style of [PHOTOGRAPHER]` ## Usage diff --git a/tools/blip2-for-sd/caption_processor.py b/tools/blip2-for-sd/caption_processor.py index 7652d14c6..8de18c33b 100644 --- a/tools/blip2-for-sd/caption_processor.py +++ b/tools/blip2-for-sd/caption_processor.py @@ -89,7 +89,7 @@ def caption_me(self, initial_prompt, image): p_lighting = self.ask("What is the scene lighting like? For example: soft lighting, studio lighting, natural lighting", image) # print(p_lighting) - p_angle = self.ask("What angle is the picture taken from? Be succint, like: from the side, from below, from front", image) + p_angle = self.ask("What angle is the picture taken from? Be succinct, like: from the side, from below, from front", image) # print(p_angle) p_camera = self.ask("What kind of camera could this picture have been taken with? Be specific and guess a brand with specific camera type", image) diff --git a/train_db_README.md b/train_db_README.md index 2367d29ae..7c3be2e3b 100644 --- a/train_db_README.md +++ b/train_db_README.md @@ -164,6 +164,7 @@ Each yaml file can be found at [https://github.com/Stability-AI/stablediffusion/ # Other study options ## Supports Stable Diffusion 2.0 --v2 / --v_parameterization + Specify the v2 option when using Hugging Face's stable-diffusion-2-base, and specify both the v2 and v_parameterization options when using stable-diffusion-2 or 768-v-ema.ckpt. 
In addition, learning SD 2.0 seems to be difficult with VRAM 12GB because the Text Encoder is getting bigger. @@ -179,11 +180,13 @@ The following points have changed significantly in Stable Diffusion 2.0. Among these, 1 to 4 are adopted for base, and 1 to 5 are adopted for the one without base (768-v). Enabling 1-4 is the v2 option, and enabling 5 is the v_parameterization option. ## check training data --debug_dataset + By adding this option, you can check what kind of image data and captions will be learned in advance before learning. Press Esc to exit and return to the command line. *Please note that it seems to hang when executed in an environment where there is no screen such as Colab. ## Stop training Text Encoder --stop_text_encoder_training + If you specify a numerical value for the stop_text_encoder_training option, after that number of steps, only the U-Net will be trained without training the Text Encoder. In some cases, the accuracy may be improved. (Probably only the Text Encoder may overfit first, and I guess that it can be prevented, but the detailed impact is unknown.) @@ -202,14 +205,17 @@ Use the resume option to resume training from a saved training state. Please spe Please note that due to the specifications of Accelerator (?), the number of epochs and global step are not saved, and it will start from 1 even when you resume. ## No tokenizer padding --no_token_padding + The no_token_padding option does not pad the output of the Tokenizer (same behavior as Diffusers version of old DreamBooth). ## Training with arbitrary size images --resolution + You can study outside the square. Please specify "width, height" like "448,640" in resolution. Width and height must be divisible by 64. Match the size of the training image and the regularization image. Personally, I often generate vertically long images, so I sometimes learn with "448, 640". ## Aspect Ratio Bucketing --enable_bucket / --min_bucket_reso / --max_bucket_reso + It is enabled by specifying the enable_bucket option. Stable Diffusion is trained at 512x512, but also at resolutions such as 256x768 and 384x640. If you specify this option, you do not need to unify the training images and regularization images to a specific resolution. Choose from several resolutions (aspect ratios) and learn at that resolution. @@ -224,19 +230,23 @@ When Aspect Ratio Bucketing is enabled, it may be better to prepare regularizati (Because the images in one batch are not biased toward training images and regularization images. ## augmentation --color_aug / --flip_aug + Augmentation is a method of improving model performance by dynamically changing data during learning. Learn while subtly changing the hue with color_aug and flipping left and right with flip_aug. Since the data changes dynamically, it cannot be specified together with the cache_latents option. ## Specify data precision when saving --save_precision + Specifying float, fp16, or bf16 as the save_precision option will save the checkpoint in that format (only when saving in Stable Diffusion format). Please use it when you want to reduce the size of checkpoint. ## save in any format --save_model_as + Specify the save format of the model. Specify one of ckpt, safetensors, diffusers, diffusers_safetensors. When reading Stable Diffusion format (ckpt or safetensors) and saving in Diffusers format, missing information is supplemented by dropping v1.5 or v2.1 information from Hugging Face. 
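Putting several of the options above together, a DreamBooth run with bucketing at a non-square resolution and safetensors output might look roughly like the following sketch. The folder layout and numeric values are placeholders rather than a recommended configuration; check `train_db.py --help` for the exact arguments.

```shell
# Hypothetical DreamBooth invocation combining the options described above.
# train_data / reg_data / db_output are placeholder folder names.
accelerate launch --num_cpu_threads_per_process 1 train_db.py \
    --pretrained_model_name_or_path=model.ckpt \
    --train_data_dir=train_data --reg_data_dir=reg_data \
    --output_dir=db_output \
    --resolution="448,640" \
    --enable_bucket --min_bucket_reso=256 --max_bucket_reso=1024 \
    --stop_text_encoder_training=500 \
    --save_model_as=safetensors --save_precision=fp16
```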
## Save learning log --logging_dir / --log_prefix + Specify the log save destination folder in the logging_dir option. Logs in TensorBoard format are saved. For example, if you specify --logging_dir=logs, a logs folder will be created in your working folder, and logs will be saved in the date/time folder. @@ -251,9 +261,11 @@ tensorboard --logdir=logs Then open your browser and go to http://localhost:6006/ to see it. ## scheduler related specification of learning rate --lr_scheduler / --lr_warmup_steps + You can choose the learning rate scheduler from linear, cosine, cosine_with_restarts, polynomial, constant, constant_with_warmup with the lr_scheduler option. Default is constant. With lr_warmup_steps, you can specify the number of steps to warm up the scheduler (gradually changing the learning rate). Please do your own research for details. ## Training with fp16 gradient (experimental feature) --full_fp16 + The full_fp16 option will change the gradient from normal float32 to float16 (fp16) and learn (it seems to be full fp16 learning instead of mixed precision). As a result, it seems that the SD1.x 512x512 size can be learned with a VRAM usage of less than 8GB, and the SD2.x 512x512 size can be learned with a VRAM usage of less than 12GB. @@ -269,6 +281,7 @@ The setting of the learning rate and the number of steps seems to be severe. Ple # Other learning methods ## Learning multiple classes, multiple identifiers + The method is simple, multiple folders with ``Repetition count_ `` in the training image folder, and a folder with ``Repetition count_`` in the regularization image folder. Please prepare multiple For example, learning "sls frog" and "cpc rabbit" at the same time would look like this: @@ -286,6 +299,7 @@ If you have one class and multiple targets, you can have only one regularized im If the number of data varies, it seems that good results can be obtained by adjusting the number of repetitions to unify the number of sheets for each class and identifier. ## Use captions in DreamBooth + If you put a file with the same file name as the image and the extension .caption (you can change it in the option) in the training image and regularization image folders, the caption will be read from that file and learned as a prompt. * The folder name (identifier class) will no longer be used for training those images. diff --git a/train_network_README.md b/train_network_README.md index b0363a68b..ed62dad8b 100644 --- a/train_network_README.md +++ b/train_network_README.md @@ -1,4 +1,4 @@ -## About learning LoRA +# About learning LoRA [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) (arxiv), [LoRA](https://github.com/microsoft/LoRA) (github) to Stable Applied to Diffusion. @@ -96,7 +96,7 @@ Specify the save destination of the model after merging in the --save_to option Specify the LoRA model file learned in --models. It is possible to specify more than one, in which case they will be merged in order. -For --ratios, specify the application rate of each model (how much weight is reflected in the original model) with a numerical value from 0 to 1.0. For example, if it is close to overfitting, it may be better if the application rate is lowered. Specify as many as the number of models. +For --ratios, specify the application rate of each model (how much weight is reflected in the original model) with a numerical value from 0 to 1.0. For example, if it is close to over fitting, it may be better if the application rate is lowered. 
Specify as many as the number of models. When specifying multiple, it will be as follows. @@ -112,7 +112,7 @@ Applying multiple LoRA models one by one to the SD model and merging multiple Lo For example, a command line like: -``` +```shell python networks\merge_lora.py --save_to ..\lora_train1\model-char1-style1-merged.safetensors --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors --ratios 0.6 0.4 @@ -128,7 +128,6 @@ For --ratios, specify the ratio of each model (how much weight is reflected in t LoRA trained with v1 and LoRA trained with v2, and LoRA with different number of dimensions cannot be merged. U-Net only LoRA and U-Net+Text Encoder LoRA should be able to merge, but the result is unknown. - ### Other Options * precision @@ -151,7 +150,8 @@ LoRA approximates the difference between two models (for example, the original m ### How to run scripts Please specify as follows. -``` + +```shell python networks\extract_lora_from_models.py --model_org base-model.ckpt --model_tuned fine-tuned-model.ckpt --save_to lora-weights.safetensors --dim 4 diff --git a/train_ti_README.md b/train_ti_README.md index ba03d5558..e655f8320 100644 --- a/train_ti_README.md +++ b/train_ti_README.md @@ -1,4 +1,4 @@ -## About learning Textual Inversion +# About learning Textual Inversion [Textual Inversion](https://textual-inversion.github.io/). I heavily referenced https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion for the implementation. @@ -16,7 +16,7 @@ Data preparation is exactly the same as ``train_network.py``, so please refer to Below is an example command line (DreamBooth technique). -``` +```shell accelerate launch --num_cpu_threads_per_process 1 train_textual_inversion.py --pretrained_model_name_or_path=..\models\model.ckpt --train_data_dir=..\data\db\char1 --output_dir=..\ti_train1 @@ -30,7 +30,7 @@ accelerate launch --num_cpu_threads_per_process 1 train_textual_inversion.py ``--debug_dataset`` will display the token id after substitution, so you can check if the token string after ``49408`` exists as shown below. I can confirm. -``` +```python input ids: tensor([[49406, 49408, 49409, 49410, 49411, 49412, 49413, 49414, 49415, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407, @@ -47,7 +47,6 @@ In ``--init_word``, specify the string of the copy source token when initializin ``--num_vectors_per_token`` specifies how many tokens to use for this training. The higher the number, the more expressive it is, but it consumes more tokens. For example, if num_vectors_per_token=8, then the specified token string will consume 8 tokens (out of the 77 token limit for a typical prompt). - In addition, the following options can be specified. 
* --weights From 155fa42531400f4abb21a1a867a626b9d090b2a6 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Tue, 10 Oct 2023 20:16:01 -0400 Subject: [PATCH 33/33] Add GUI support for ONNX in WD 14 --- library/wd14_caption_gui.py | 32 ++++++++++++++++++++++++++++++++ requirements.txt | 11 ++++++----- requirements_linux.txt | 2 +- requirements_linux_docker.txt | 4 ++-- requirements_linux_ipex.txt | 2 +- requirements_macos_amd64.txt | 2 +- requirements_macos_arm64.txt | 2 +- requirements_runpod.txt | 2 +- requirements_windows_torch2.txt | 2 +- 9 files changed, 46 insertions(+), 13 deletions(-) diff --git a/library/wd14_caption_gui.py b/library/wd14_caption_gui.py index e952e7dd5..60c22b3a1 100644 --- a/library/wd14_caption_gui.py +++ b/library/wd14_caption_gui.py @@ -25,6 +25,9 @@ def caption_images( frequency_tags, prefix, postfix, + onnx, + append_tags, + force_download ): # Check for images_dir_input if train_data_dir == '': @@ -54,6 +57,12 @@ def caption_images( run_cmd += f' --remove_underscore' if frequency_tags: run_cmd += f' --frequency_tags' + if onnx: + run_cmd += f' --onnx' + if append_tags: + run_cmd += f' --append_tags' + if force_download: + run_cmd += f' --force_download' if not undesired_tags == '': run_cmd += f' --undesired_tags="{undesired_tags}"' @@ -132,6 +141,20 @@ def gradio_wd14_caption_gui_tab(headless=False): interactive=True, ) + with gr.Row(): + onnx = gr.Checkbox( + label='Use onnx', + value=False, + interactive=True, + info="https://github.com/onnx/onnx" + ) + append_tags = gr.Checkbox( + label='Append TAGs', + value=False, + interactive=True, + info="This option appends the tags to the existing tags, instead of replacing them." + ) + with gr.Row(): replace_underscores = gr.Checkbox( label='Replace underscores in filenames with spaces', @@ -168,6 +191,12 @@ def gradio_wd14_caption_gui_tab(headless=False): ], value='SmilingWolf/wd-v1-4-convnextv2-tagger-v2', ) + + force_download = gr.Checkbox( + label='Force model re-download', + value=False, + info='Usefull to force model re download when switching to onnx', + ) general_threshold = gr.Slider( value=0.35, @@ -215,6 +244,9 @@ def gradio_wd14_caption_gui_tab(headless=False): frequency_tags, prefix, postfix, + onnx, + append_tags, + force_download ], show_progress=False, ) diff --git a/requirements.txt b/requirements.txt index 87f8addcb..216e96147 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,13 +19,14 @@ lycoris_lora==1.9.0 # timm==0.6.12 # fairscale==0.4.13 # for WD14 captioning (tensorflow) -# tensorflow==2.10.1 +# tensorflow==2.14.0 # for WD14 captioning (onnx) -# onnx==1.14.1 -# onnxruntime-gpu==1.16.0 +onnx==1.14.1 +onnxruntime-gpu==1.16.0 # onnxruntime==1.16.0 -# this is for onnx: -# protobuf==3.20.3 +# this is for onnx: +# tensorboard==2.14.1 +protobuf==3.20.3 # open clip for SDXL open-clip-torch==2.20.0 opencv-python==4.7.0.68 diff --git a/requirements_linux.txt b/requirements_linux.txt index d6f93311e..6f64060d9 100644 --- a/requirements_linux.txt +++ b/requirements_linux.txt @@ -1,4 +1,4 @@ torch==2.0.1+cu118 torchvision==0.15.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118 # no_verify leave this to specify not checking this a verification stage xformers==0.0.21 bitsandbytes==0.41.1 -tensorboard==2.12.3 tensorflow==2.12.0 +tensorboard==2.14.1 tensorflow==2.14.0 -r requirements.txt diff --git a/requirements_linux_docker.txt b/requirements_linux_docker.txt index 056b3ea58..bc936c458 100644 --- a/requirements_linux_docker.txt +++ b/requirements_linux_docker.txt @@ -1,5 +1,5 @@ 
xformers==0.0.20 bitsandbytes==0.41.1 accelerate==0.19.0 -tensorboard==2.12.1 -tensorflow==2.12.0 +tensorboard==2.14.1 +tensorflow==2.14.0 diff --git a/requirements_linux_ipex.txt b/requirements_linux_ipex.txt index 61d8a75f4..20e9ed8bb 100644 --- a/requirements_linux_ipex.txt +++ b/requirements_linux_ipex.txt @@ -1,3 +1,3 @@ torch==2.0.1a0+cxx11.abi torchvision==0.15.2a0+cxx11.abi intel_extension_for_pytorch==2.0.110+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -tensorboard==2.12.3 tensorflow==2.12.0 intel-extension-for-tensorflow[gpu] +tensorboard==2.14.1 tensorflow==2.14.0 intel-extension-for-tensorflow[gpu] -r requirements.txt diff --git a/requirements_macos_amd64.txt b/requirements_macos_amd64.txt index 2abef5b8c..24e8768f5 100644 --- a/requirements_macos_amd64.txt +++ b/requirements_macos_amd64.txt @@ -1,4 +1,4 @@ torch==2.0.0 torchvision==0.15.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html xformers bitsandbytes==0.41.1 -tensorflow-macos tensorboard==2.12.1 +tensorflow-macos tensorboard==2.14.1 -r requirements.txt diff --git a/requirements_macos_arm64.txt b/requirements_macos_arm64.txt index 3419f4dcd..377949181 100644 --- a/requirements_macos_arm64.txt +++ b/requirements_macos_arm64.txt @@ -1,4 +1,4 @@ torch==2.0.0 torchvision==0.15.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html xformers bitsandbytes==0.41.1 -tensorflow-macos tensorflow-metal tensorboard==2.12.1 +tensorflow-macos tensorflow-metal tensorboard==2.14.1 -r requirements.txt diff --git a/requirements_runpod.txt b/requirements_runpod.txt index 13e0c69de..9b9703696 100644 --- a/requirements_runpod.txt +++ b/requirements_runpod.txt @@ -1,5 +1,5 @@ torch==2.0.1+cu118 torchvision==0.15.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118 # no_verify leave this to specify not checking this a verification stage xformers==0.0.21 bitsandbytes==0.41.1 -tensorboard==2.12.3 tensorflow==2.12.0 wheel +tensorboard==2.14.1 tensorflow==2.14.0 wheel tensorrt -r requirements.txt diff --git a/requirements_windows_torch2.txt b/requirements_windows_torch2.txt index fd488dfac..783c52729 100644 --- a/requirements_windows_torch2.txt +++ b/requirements_windows_torch2.txt @@ -2,5 +2,5 @@ torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://download.pytorc xformers==0.0.21 bitsandbytes==0.35.0 # no_verify # https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl # no_verify -tensorboard==2.12.3 tensorflow==2.12.0 +tensorboard==2.14.1 tensorflow==2.14.0 -r requirements.txt
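
As a usage note for the WD14 ONNX support added above: the GUI appends `--onnx`, `--append_tags` and `--force_download` to the tagger command when the corresponding checkboxes are enabled, and the ONNX runtime packages are now pulled in via `requirements.txt`. A minimal sketch of the equivalent manual invocation is shown below; it assumes the tagger script lives at `finetune/tag_images_by_wd14_tagger.py` and accepts these flags as passed by the GUI, and `train_data` is a placeholder directory.

```shell
# onnx and onnxruntime-gpu are now installed through requirements.txt
pip install -r requirements.txt

# Hypothetical manual equivalent of the GUI command with "Use onnx",
# "Append TAGs" and "Force model re-download" enabled.
python finetune/tag_images_by_wd14_tagger.py --batch_size 4 \
    --onnx --append_tags --force_download \
    train_data
```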