ComfyUI/custom_nodes/ComfyUI-Easy-Use/py/modules/layer_diffuse/model.py

import torch.nn as nn
import torch
import cv2
import numpy as np
import comfy.model_management

from comfy.model_patcher import ModelPatcher
from tqdm import tqdm
from typing import Optional, Tuple
from ...libs.utils import install_package
from packaging import version

try:
    install_package("diffusers", "0.27.2", True, "0.25.0")

    from diffusers.configuration_utils import ConfigMixin, register_to_config
    from diffusers.models.modeling_utils import ModelMixin
    from diffusers import __version__
    if __version__:
        if version.parse(__version__) < version.parse("0.26.0"):
            from diffusers.models.unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
        else:
            from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block

    import functools

    def zero_module(module):
        """
        Zero out the parameters of a module and return it.
        """
        for p in module.parameters():
            p.detach().zero_()
        return module


    class LatentTransparencyOffsetEncoder(torch.nn.Module):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.blocks = torch.nn.Sequential(
                torch.nn.Conv2d(4, 32, kernel_size=3, padding=1, stride=1),
                nn.SiLU(),
                torch.nn.Conv2d(32, 32, kernel_size=3, padding=1, stride=1),
                nn.SiLU(),
                torch.nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=2),
                nn.SiLU(),
                torch.nn.Conv2d(64, 64, kernel_size=3, padding=1, stride=1),
                nn.SiLU(),
                torch.nn.Conv2d(64, 128, kernel_size=3, padding=1, stride=2),
                nn.SiLU(),
                torch.nn.Conv2d(128, 128, kernel_size=3, padding=1, stride=1),
                nn.SiLU(),
                torch.nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2),
                nn.SiLU(),
                torch.nn.Conv2d(256, 256, kernel_size=3, padding=1, stride=1),
                nn.SiLU(),
                zero_module(torch.nn.Conv2d(256, 4, kernel_size=3, padding=1, stride=1)),
            )

        def __call__(self, x):
            return self.blocks(x)


    # 1024 * 1024 * 3 -> 16 * 16 * 512 -> 1024 * 1024 * 3
    class UNet1024(ModelMixin, ConfigMixin):
        @register_to_config
        def __init__(
                self,
                in_channels: int = 3,
                out_channels: int = 3,
                down_block_types: Tuple[str] = (
                        "DownBlock2D",
                        "DownBlock2D",
                        "DownBlock2D",
                        "DownBlock2D",
                        "AttnDownBlock2D",
                        "AttnDownBlock2D",
                        "AttnDownBlock2D",
                ),
                up_block_types: Tuple[str] = (
                        "AttnUpBlock2D",
                        "AttnUpBlock2D",
                        "AttnUpBlock2D",
                        "UpBlock2D",
                        "UpBlock2D",
                        "UpBlock2D",
                        "UpBlock2D",
                ),
                block_out_channels: Tuple[int] = (32, 32, 64, 128, 256, 512, 512),
                layers_per_block: int = 2,
                mid_block_scale_factor: float = 1,
                downsample_padding: int = 1,
                downsample_type: str = "conv",
                upsample_type: str = "conv",
                dropout: float = 0.0,
                act_fn: str = "silu",
                attention_head_dim: Optional[int] = 8,
                norm_num_groups: int = 4,
                norm_eps: float = 1e-5,
        ):
            super().__init__()

            # input
            self.conv_in = nn.Conv2d(
                in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)
            )
            self.latent_conv_in = zero_module(
                nn.Conv2d(4, block_out_channels[2], kernel_size=1)
            )

            self.down_blocks = nn.ModuleList([])
            self.mid_block = None
            self.up_blocks = nn.ModuleList([])

            # down
            output_channel = block_out_channels[0]
            for i, down_block_type in enumerate(down_block_types):
                input_channel = output_channel
                output_channel = block_out_channels[i]
                is_final_block = i == len(block_out_channels) - 1

                down_block = get_down_block(
                    down_block_type,
                    num_layers=layers_per_block,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    temb_channels=None,
                    add_downsample=not is_final_block,
                    resnet_eps=norm_eps,
                    resnet_act_fn=act_fn,
                    resnet_groups=norm_num_groups,
                    attention_head_dim=(
                        attention_head_dim
                        if attention_head_dim is not None
                        else output_channel
                    ),
                    downsample_padding=downsample_padding,
                    resnet_time_scale_shift="default",
                    downsample_type=downsample_type,
                    dropout=dropout,
                )
                self.down_blocks.append(down_block)

            # mid
            self.mid_block = UNetMidBlock2D(
                in_channels=block_out_channels[-1],
                temb_channels=None,
                dropout=dropout,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                output_scale_factor=mid_block_scale_factor,
                resnet_time_scale_shift="default",
                attention_head_dim=(
                    attention_head_dim
                    if attention_head_dim is not None
                    else block_out_channels[-1]
                ),
                resnet_groups=norm_num_groups,
                attn_groups=None,
                add_attention=True,
            )

            # up
            reversed_block_out_channels = list(reversed(block_out_channels))
            output_channel = reversed_block_out_channels[0]
            for i, up_block_type in enumerate(up_block_types):
                prev_output_channel = output_channel
                output_channel = reversed_block_out_channels[i]
                input_channel = reversed_block_out_channels[
                    min(i + 1, len(block_out_channels) - 1)
                ]

                is_final_block = i == len(block_out_channels) - 1

                up_block = get_up_block(
                    up_block_type,
                    num_layers=layers_per_block + 1,
                    in_channels=input_channel,
                    out_channels=output_channel,
                    prev_output_channel=prev_output_channel,
                    temb_channels=None,
                    add_upsample=not is_final_block,
                    resnet_eps=norm_eps,
                    resnet_act_fn=act_fn,
                    resnet_groups=norm_num_groups,
                    attention_head_dim=(
                        attention_head_dim
                        if attention_head_dim is not None
                        else output_channel
                    ),
                    resnet_time_scale_shift="default",
                    upsample_type=upsample_type,
                    dropout=dropout,
                )
                self.up_blocks.append(up_block)
                prev_output_channel = output_channel

            # out
            self.conv_norm_out = nn.GroupNorm(
                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
            )
            self.conv_act = nn.SiLU()
            self.conv_out = nn.Conv2d(
                block_out_channels[0], out_channels, kernel_size=3, padding=1
            )

        def forward(self, x, latent):
            sample_latent = self.latent_conv_in(latent)
            sample = self.conv_in(x)
            emb = None

            down_block_res_samples = (sample,)
            for i, downsample_block in enumerate(self.down_blocks):
                if i == 3:
                    sample = sample + sample_latent

                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
                down_block_res_samples += res_samples

            sample = self.mid_block(sample, emb)

            for upsample_block in self.up_blocks:
                res_samples = down_block_res_samples[-len(upsample_block.resnets):]
                down_block_res_samples = down_block_res_samples[
                                         : -len(upsample_block.resnets)
                                         ]
                sample = upsample_block(sample, res_samples, emb)

            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)
            sample = self.conv_out(sample)
            return sample


    def checkerboard(shape):
        return np.indices(shape).sum(axis=0) % 2


    def fill_checkerboard_bg(y: torch.Tensor) -> torch.Tensor:
        alpha = y[..., :1]
        fg = y[..., 1:]
        B, H, W, C = fg.shape
        cb = checkerboard(shape=(H // 64, W // 64))
        cb = cv2.resize(cb, (W, H), interpolation=cv2.INTER_NEAREST)
        cb = (0.5 + (cb - 0.5) * 0.1)[None, ..., None]
        cb = torch.from_numpy(cb).to(fg)
        vis = fg * alpha + cb * (1 - alpha)
        return vis


    class TransparentVAEDecoder:
        def __init__(self, sd, device, dtype):
            self.load_device = device
            self.dtype = dtype

            model = UNet1024(in_channels=3, out_channels=4)
            model.load_state_dict(sd, strict=True)
            model.to(self.load_device, dtype=self.dtype)
            model.eval()
            self.model = model

        @torch.no_grad()
        def estimate_single_pass(self, pixel, latent):
            y = self.model(pixel, latent)
            return y

        @torch.no_grad()
        def estimate_augmented(self, pixel, latent):
            args = [
                [False, 0],
                [False, 1],
                [False, 2],
                [False, 3],
                [True, 0],
                [True, 1],
                [True, 2],
                [True, 3],
            ]

            result = []

            for flip, rok in tqdm(args):
                feed_pixel = pixel.clone()
                feed_latent = latent.clone()

                if flip:
                    feed_pixel = torch.flip(feed_pixel, dims=(3,))
                    feed_latent = torch.flip(feed_latent, dims=(3,))

                feed_pixel = torch.rot90(feed_pixel, k=rok, dims=(2, 3))
                feed_latent = torch.rot90(feed_latent, k=rok, dims=(2, 3))

                eps = self.estimate_single_pass(feed_pixel, feed_latent).clip(0, 1)
                eps = torch.rot90(eps, k=-rok, dims=(2, 3))

                if flip:
                    eps = torch.flip(eps, dims=(3,))

                result += [eps]

            result = torch.stack(result, dim=0)
            median = torch.median(result, dim=0).values
            return median

        @torch.no_grad()
        def decode_pixel(
                self, pixel: torch.TensorType, latent: torch.TensorType
        ) -> torch.TensorType:
            # pixel.shape = [B, C=3, H, W]
            assert pixel.shape[1] == 3
            pixel_device = pixel.device
            pixel_dtype = pixel.dtype

            pixel = pixel.to(device=self.load_device, dtype=self.dtype)
            latent = latent.to(device=self.load_device, dtype=self.dtype)
            # y.shape = [B, C=4, H, W]
            y = self.estimate_augmented(pixel, latent)
            y = y.clip(0, 1)
            assert y.shape[1] == 4
            # Restore image to original device of input image.
            return y.to(pixel_device, dtype=pixel_dtype)


    def calculate_weight_adjust_channel(func):
        """Patches ComfyUI's LoRA weight application to accept multi-channel inputs."""
        @functools.wraps(func)
        def calculate_weight(
            patches, weight: torch.Tensor, key: str, intermediate_type=torch.float32
        ) -> torch.Tensor:
            weight = func(patches, weight, key, intermediate_type)

            for p in patches:
                alpha = p[0]
                v = p[1]

                # The recursion call should be handled in the main func call.
                if isinstance(v, list):
                    continue

                if len(v) == 1:
                    patch_type = "diff"
                elif len(v) == 2:
                    patch_type = v[0]
                    v = v[1]

                if patch_type == "diff":
                    w1 = v[0]
                    if all(
                            (
                                    alpha != 0.0,
                                    w1.shape != weight.shape,
                                    w1.ndim == weight.ndim == 4,
                            )
                    ):
                        new_shape = [max(n, m) for n, m in zip(weight.shape, w1.shape)]
                        print(
                            f"Merged with {key} channel changed from {weight.shape} to {new_shape}"
                        )
                        new_diff = alpha * comfy.model_management.cast_to_device(
                            w1, weight.device, weight.dtype
                        )
                        new_weight = torch.zeros(size=new_shape).to(weight)
                        new_weight[
                        : weight.shape[0],
                        : weight.shape[1],
                        : weight.shape[2],
                        : weight.shape[3],
                        ] = weight
                        new_weight[
                        : new_diff.shape[0],
                        : new_diff.shape[1],
                        : new_diff.shape[2],
                        : new_diff.shape[3],
                        ] += new_diff
                        new_weight = new_weight.contiguous().clone()
                        weight = new_weight
            return weight

        return calculate_weight


except ImportError:
    ModelMixin = None
    ConfigMixin = None
    TransparentVAEDecoder = None
    calculate_weight_adjust_channel = None
    print("\33[33mModule 'diffusers' load failed. If you don't have it installed, do it:\033[0m")
    print("\33[33mpip install diffusers\033[0m")