"tests/vscode:/vscode.git/clone" did not exist on "7b98c4cc67b7131724f1cb5315da1c01387c6667"
Commit 391c1046 authored by comfyanonymous

More flexibility with text encoder return values.

Text encoders can now return values other than the cond and pooled output, and those
extra values are passed through to the CONDITIONING.
parent e44fa566
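
In practice this means a text encoder's encode_token_weights() may now hand back a third tuple element, a plain dict of extra values (such as an attention mask), and those values travel alongside the cond/pooled pair instead of being appended as bare tensors. A minimal sketch of that encoder-side contract, assuming an illustrative wrapper class and dummy tensor shapes (nothing below is code from the commit):

import torch

class MyTextEncoderWrapper:
    # Hypothetical stand-in that only demonstrates the allowed return shapes.
    def encode_token_weights(self, token_weight_pairs):
        cond = torch.zeros(1, 77, 768)     # per-token embeddings
        pooled = torch.zeros(1, 768)       # pooled output
        extras = {"attention_mask": torch.ones(1, 77, dtype=torch.long)}
        return cond, pooled, extras        # the dict is optional; (cond, pooled) still works
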
@@ -130,7 +130,7 @@ class CLIP:
     def tokenize(self, text, return_word_ids=False):
         return self.tokenizer.tokenize_with_weights(text, return_word_ids)

-    def encode_from_tokens(self, tokens, return_pooled=False):
+    def encode_from_tokens(self, tokens, return_pooled=False, return_dict=False):
         self.cond_stage_model.reset_clip_options()

         if self.layer_idx is not None:
@@ -140,7 +140,15 @@ class CLIP:
             self.cond_stage_model.set_clip_options({"projected_pooled": False})

         self.load_model()
-        cond, pooled = self.cond_stage_model.encode_token_weights(tokens)
+        o = self.cond_stage_model.encode_token_weights(tokens)
+        cond, pooled = o[:2]
+        if return_dict:
+            out = {"cond": cond, "pooled_output": pooled}
+            if len(o) > 2:
+                for k in o[2]:
+                    out[k] = o[2][k]
+            return out
+
         if return_pooled:
             return cond, pooled
         return cond
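
On the caller side, CLIP.encode_from_tokens() keeps the old return_pooled behaviour and adds a dict form that carries any extras along. A usage sketch, assuming a loaded clip object; the prompt string and variable names are illustrative:

tokens = clip.tokenize("a photo of a cat")

# Old call style, unchanged:
cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)

# New call style: one dict holding cond, pooled_output and any extra keys
# (for example "attention_mask") the text encoder returned.
out = clip.encode_from_tokens(tokens, return_pooled=True, return_dict=True)
cond = out["cond"]
pooled = out["pooled_output"]
extra_keys = [k for k in out if k not in ("cond", "pooled_output")]
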
@@ -62,7 +62,16 @@ class ClipTokenWeightEncoder:
             r = (out[-1:].to(model_management.intermediate_device()), first_pooled)
         else:
             r = (torch.cat(output, dim=-2).to(model_management.intermediate_device()), first_pooled)
-        r = r + tuple(map(lambda a: a[:sections].flatten().unsqueeze(dim=0).to(model_management.intermediate_device()), o[2:]))
+
+        if len(o) > 2:
+            extra = {}
+            for k in o[2]:
+                v = o[2][k]
+                if k == "attention_mask":
+                    v = v[:sections].flatten().unsqueeze(dim=0).to(model_management.intermediate_device())
+                extra[k] = v
+
+            r = r + (extra,)
         return r

 class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
@@ -206,8 +215,12 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
         elif outputs[2] is not None:
             pooled_output = outputs[2].float()

+        extra = {}
         if self.return_attention_masks:
-            return z, pooled_output, attention_mask
+            extra["attention_mask"] = attention_mask
+
+        if len(extra) > 0:
+            return z, pooled_output, extra

         return z, pooled_output
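
Taken together, the two hunks above route extras from the model to its wrapper: SDClipModel packs the attention mask into a dict, and ClipTokenWeightEncoder trims that mask to the token sections actually used, flattens it, and attaches the dict as the tuple's third element. A sketch of how a caller might unpack that tuple (the clip_model name is illustrative):

o = clip_model.encode_token_weights(token_weight_pairs)
cond, pooled = o[:2]
extras = o[2] if len(o) > 2 else {}
mask = extras.get("attention_mask")   # present only when return_attention_masks is enabled
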
@@ -547,8 +560,8 @@ class SD1ClipModel(torch.nn.Module):
     def encode_token_weights(self, token_weight_pairs):
         token_weight_pairs = token_weight_pairs[self.clip_name]
-        out, pooled = getattr(self, self.clip).encode_token_weights(token_weight_pairs)
-        return out, pooled
+        out = getattr(self, self.clip).encode_token_weights(token_weight_pairs)
+        return out

     def load_sd(self, sd):
         return getattr(self, self.clip).load_sd(sd)
@@ -55,8 +55,9 @@ class CLIPTextEncode:
     def encode(self, clip, text):
         tokens = clip.tokenize(text)
-        cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
-        return ([[cond, {"pooled_output": pooled}]], )
+        output = clip.encode_from_tokens(tokens, return_pooled=True, return_dict=True)
+        cond = output.pop("cond")
+        return ([[cond, output]], )

 class ConditioningCombine:
     @classmethod
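
With CLIPTextEncode switched to the dict form, the options dict inside each CONDITIONING entry now contains "pooled_output" plus whatever extras the encoder supplied. A sketch of reading it back, assuming a loaded clip object and an illustrative prompt:

conditioning = CLIPTextEncode().encode(clip, "a photo of a cat")[0]
cond, opts = conditioning[0]
pooled = opts["pooled_output"]
mask = opts.get("attention_mask")     # None unless the text encoder returned one
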