Basic implementation of z image fun control union 2.0 (#11304)

The inpaint part is currently missing and will be implemented later. I think they messed up this model pretty bad. They added some control_noise_refiner blocks but don't actually use them. There is a typo in their code so instead of doing control_noise_refiner -> control_layers it runs the whole control_layers twice. Unfortunately they trained with this typo so the model works but is kind of slow and would probably perform a lot better if they corrected their code and trained it again.
2025-12-12 22:39:11 -08:00
parent c5a47a1692
commit da2bfb5b0a
4 changed files with 142 additions and 44 deletions
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -536,6 +536,7 @@ class NextDiT(nn.Module):
        bsz = len(x)
        pH = pW = self.patch_size
        device = x[0].device
+        orig_x = x

        if self.pad_tokens_multiple is not None:
            pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
@@ -572,13 +573,21 @@ class NextDiT(nn.Module):

        freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)

+        patches = transformer_options.get("patches", {})
+
        # refine context
        for layer in self.context_refiner:
            cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)

        padded_img_mask = None
-        for layer in self.noise_refiner:
+        x_input = x
+        for i, layer in enumerate(self.noise_refiner):
            x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
+            if "noise_refiner" in patches:
+                for p in patches["noise_refiner"]:
+                    out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"})
+                    if "img" in out:
+                        x = out["img"]

        padded_full_embed = torch.cat((cap_feats, x), dim=1)
        mask = None
@@ -622,14 +631,15 @@ class NextDiT(nn.Module):

        patches = transformer_options.get("patches", {})
        x_is_tensor = isinstance(x, torch.Tensor)
-        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
+        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options)
        freqs_cis = freqs_cis.to(img.device)

+        img_input = img
        for i, layer in enumerate(self.layers):
            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
            if "double_block" in patches:
                for p in patches["double_block"]:
-                    out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
+                    out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
                    if "img" in out:
                        img[:, cap_size[0]:] = out["img"]
                    if "txt" in out: