Kandinsky5 model support (#10988)

* Add Kandinsky5 model support

lite and pro T2V tested to work

* Update kandinsky5.py

* Fix fp8

* Fix fp8_scaled text encoder

* Add transformer_options for attention

* Code cleanup, optimizations, use fp32 for all layers originally at fp32

* ImageToVideo -node

* Fix I2V, add necessary latent post process nodes

* Support text to image model

* Support block replace patches (SLG mostly)

* Support official LoRAs

* Don't scale RoPE for lite model as that just doesn't work...

* Update supported_models.py

* Rever RoPE scaling to simpler one

* Fix typo

* Handle latent dim difference for image model in the VAE instead

* Add node to use different prompts for clip_l and qwen25_7b

* Reduce peak VRAM usage a bit

* Further reduce peak VRAM consumption by chunking ffn

* Update chunking

* Update memory_usage_factor

* Code cleanup, don't force the fp32 layers as it has minimal effect

* Allow for stronger changes with first frames normalization

Default values are too weak for any meaningful changes, these should probably be exposed as advanced node options when that's available.

* Add image model's own chat template, remove unused image2video template

* Remove hard error in ReplaceVideoLatentFrames -node

* Update kandinsky5.py

* Update supported_models.py

* Fix typos in prompt template

They were now fixed in the original repository as well

* Update ReplaceVideoLatentFrames

Add tooltips
Make source optional
Better handle negative index

* Rename NormalizeVideoLatentFrames -node

For bit better clarity what it does

* Fix NormalizeVideoLatentStart node out on non-op
This commit is contained in:
Jukka Seppänen
2025-12-06 05:20:22 +02:00
committed by GitHub
parent bed12674a1
commit fd109325db
11 changed files with 791 additions and 3 deletions

View File

@@ -47,6 +47,7 @@ import comfy.ldm.chroma_radiance.model
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.qwen_image.model
import comfy.ldm.kandinsky5.model
import comfy.model_management
import comfy.patcher_extension
@@ -1630,3 +1631,49 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo15):
out = super().extra_conds(**kwargs)
out['disable_time_r'] = comfy.conds.CONDConstant(False)
return out
class Kandinsky5(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.kandinsky5.model.Kandinsky5)
def encode_adm(self, **kwargs):
return kwargs["pooled_output"]
def concat_cond(self, **kwargs):
noise = kwargs.get("noise", None)
device = kwargs["device"]
image = torch.zeros_like(noise)
mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
if mask is None:
mask = torch.zeros_like(noise)[:, :1]
else:
mask = 1.0 - mask
mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
if mask.shape[-3] < noise.shape[-3]:
mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
mask = utils.resize_to_batch_size(mask, noise.shape[0])
return torch.cat((image, mask), dim=1)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
attention_mask = kwargs.get("attention_mask", None)
if attention_mask is not None:
out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
time_dim_replace = kwargs.get("time_dim_replace", None)
if time_dim_replace is not None:
out['time_dim_replace'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_replace))
return out
class Kandinsky5Image(Kandinsky5):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device)
def concat_cond(self, **kwargs):
return None