text_encoder.py 18.7 KB
Newer Older
wuxk1's avatar
wuxk1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
"""
This module provides nodes and utilities for loading Nunchaku text encoders within ComfyUI.
"""

import gc
import logging
import os
import types
from typing import Callable

import comfy
import folder_paths
import torch
from comfy.text_encoders.flux import FluxClipModel
from torch import nn

from nunchaku import NunchakuT5EncoderModel

# Verbosity is read from the LOG_LEVEL environment variable ("INFO" when it is unset).
log_level = os.environ.get("LOG_LEVEL", "INFO").upper()

# Names that are not attributes of the logging module fall back to logging.INFO.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=getattr(logging, log_level, logging.INFO),
)
logger = logging.getLogger(__name__)


class NunchakuTextEncoderLoaderV2:
    """
    ComfyUI node that loads a pair of text encoders for Nunchaku models.
    It also supports 16-bit and FP8 variants.

    .. note::
        When loading our 4-bit T5, a 16-bit T5 is first initialized on a meta device,
        then replaced by the Nunchaku T5.

    .. warning::
        Our 4-bit T5 currently requires a CUDA device.
        If not on CUDA, the model will be moved automatically, which may cause out-of-memory errors.
        Turing GPUs (20-series) are not supported for now.
    """

    TITLE = "Nunchaku Text Encoder Loader V2"
    CATEGORY = "Nunchaku"
    FUNCTION = "load_text_encoder"
    RETURN_TYPES = ("CLIP",)

    @classmethod
    def INPUT_TYPES(cls):
        """
        Describe the inputs this node exposes in the node interface.

        Returns
        -------
        dict
            A dictionary with a single ``"required"`` entry mapping each input name
            to its type specification (and widget options where applicable).
        """
        t5_min_length_options = {
            "default": 512,
            "min": 256,
            "max": 1024,
            "step": 128,
            "display": "number",
            "lazy": True,
            "tooltip": "Minimum sequence length for the T5 encoder.",
        }
        required = {
            "model_type": (["flux.1"],),
            # Both encoder slots list the files registered under "text_encoders".
            "text_encoder1": (folder_paths.get_filename_list("text_encoders"),),
            "text_encoder2": (folder_paths.get_filename_list("text_encoders"),),
            "t5_min_length": ("INT", t5_min_length_options),
        }
        return {"required": required}

    def load_text_encoder(self, model_type: str, text_encoder1: str, text_encoder2: str, t5_min_length: int):
        """
        Resolve both checkpoint files and build the CLIP object from them.

        Parameters
        ----------
        model_type : str
            The type of model to load; only ``"flux.1"`` is accepted.
        text_encoder1 : str
            Filename of the first text encoder checkpoint.
        text_encoder2 : str
            Filename of the second text encoder checkpoint.
        t5_min_length : int
            Minimum sequence length assigned to the T5 tokenizer.

        Returns
        -------
        tuple
            One-element tuple containing the loaded CLIP model.

        Raises
        ------
        ValueError
            If ``model_type`` is not ``"flux.1"``.
        """
        # Paths are resolved before the model type is validated, so a missing file is reported first.
        encoder_paths = [
            folder_paths.get_full_path_or_raise("text_encoders", filename)
            for filename in (text_encoder1, text_encoder2)
        ]

        if model_type != "flux.1":
            raise ValueError(f"Unknown type {model_type}")
        clip_type = comfy.sd.CLIPType.FLUX

        embedding_dirs = folder_paths.get_folder_paths("embeddings")
        clip = load_text_encoder_state_dicts(
            encoder_paths,
            embedding_directory=embedding_dirs,
            clip_type=clip_type,
            model_options={},
        )
        clip.tokenizer.t5xxl.min_length = t5_min_length
        return (clip,)


def nunchaku_t5_forward(
    self: NunchakuT5EncoderModel,
    input_ids: torch.LongTensor,
    attention_mask,
    embeds=None,
    intermediate_output=None,
    final_layer_norm_intermediate=True,
    dtype: str | torch.dtype = torch.bfloat16,
    **kwargs,
):
    """
    Forward function wrapper for
    :class:`~nunchaku.models.text_encoders.t5_encoder.NunchakuT5EncoderModel` to be compatible with ComfyUI.

    .. note::
        If the inputs are not on a CUDA device, the inputs and the encoder are moved to the
        current CUDA device for the forward pass; afterwards the result and the encoder are
        moved back to the original device and the CUDA cache is emptied.
        If the inputs already live on CUDA, the encoder is simply placed on the same device.

    Parameters
    ----------
    self : :class:`~nunchaku.models.text_encoders.t5_encoder.NunchakuT5EncoderModel`
        The T5 encoder model instance.
    input_ids : torch.LongTensor
        Input token IDs.
    attention_mask : Any
        Attention mask (must be None).
    embeds : torch.Tensor, optional
        Optional input embeddings.
    intermediate_output : Any, optional
        Not used (must be None).
    final_layer_norm_intermediate : bool, optional
        Whether to apply final layer norm (must be True).
    dtype : str or torch.dtype, optional
        Output data type.
    **kwargs
        Additional keyword arguments (ignored).

    Returns
    -------
    tuple
        Tuple of (hidden_states, None).
    """
    assert attention_mask is None
    assert intermediate_output is None
    assert final_layer_norm_intermediate

    def get_device(tensors: list[torch.Tensor | None]) -> torch.device:
        """
        Returns the device of the first non-None tensor in the list.

        Parameters
        ----------
        tensors : list of torch.Tensor or None
            List of tensors to check.

        Returns
        -------
        torch.device
            The device of the first non-None tensor, or CPU if all are None.
        """
        for t in tensors:
            if t is not None:
                return t.device
        return torch.device("cpu")

    input_device = get_device([input_ids, attention_mask, embeds])

    original_device = None
    # Compare the device *type*: a ``torch.device`` never equals the plain string "cuda",
    # so comparing the device object itself would take this branch for CUDA inputs too.
    if input_device.type != "cuda":
        original_device = input_device
        logger.warning(
            "Currently, Nunchaku T5 encoder requires CUDA for processing. "
            f"Input tensor is on {str(original_device)}, moving to CUDA for T5 encoder processing."
        )
        cuda_device = torch.cuda.current_device()
        input_ids = input_ids.to(cuda_device) if input_ids is not None else None
        embeds = embeds.to(cuda_device) if embeds is not None else None
        attention_mask = attention_mask.to(cuda_device) if attention_mask is not None else None
        self.encoder = self.encoder.to(cuda_device)
    else:
        # Inputs are already on CUDA; make sure the encoder sits on the same device.
        self.encoder = self.encoder.to(input_device)

    try:
        outputs = self.encoder(input_ids=input_ids, inputs_embeds=embeds, attention_mask=attention_mask)

        hidden_states = outputs["last_hidden_state"]
        hidden_states = hidden_states.to(dtype=dtype)
        if original_device is not None:
            hidden_states = hidden_states.to(original_device)
    finally:
        # Restore the encoder even when the forward pass fails, so it does not stay on the GPU.
        if original_device is not None:
            self.encoder = self.encoder.to(original_device)

            gc.collect()
            torch.cuda.empty_cache()

    return hidden_states, None


class WrappedEmbedding(nn.Module):
    """
    Wrapper for ``nn.Embedding`` for the compatibility with ComfyUI.

    Parameters
    ----------
    embedding : nn.Embedding
        The embedding module to wrap.
    """

    def __init__(self, embedding: nn.Embedding):
        super().__init__()
        self.embedding = embedding

    def forward(self, input: torch.Tensor, out_dtype: torch.dtype | None = None):
        """
        Forward pass through the wrapped embedding.

        Parameters
        ----------
        input : torch.Tensor
            Input tensor of indices.
        out_dtype : torch.dtype, optional
            Output data type (unused).

        Returns
        -------
        torch.Tensor
            Output embedding tensor.
        """
        return self.embedding(input)

    @property
    def weight(self):
        """
        Returns the embedding weights.

        Returns
        -------
        torch.Tensor
            The embedding weights.
        """
        return self.embedding.weight


def nunchaku_flux_clip(nunchaku_t5_path: str | os.PathLike[str], dtype_t5=None) -> Callable:
    """
    Utility function to create a Nunchaku FLUX CLIP model class using a pretrained Nunchaku T5 encoder.

    Parameters
    ----------
    nunchaku_t5_path : str or os.PathLike
        Path to the pretrained Nunchaku T5 encoder model.
    dtype_t5 : torch.dtype, optional
        Preferred data type for the T5 encoder weights. It is used whenever the returned
        class is instantiated without an explicit ``dtype_t5``.

    Returns
    -------
    Callable
        A class inheriting from ``FluxClipModel`` that uses the Nunchaku T5 encoder.

    Notes
    -----
    Adapted from:
    https://github.com/comfyanonymous/ComfyUI/blob/158419f3a0017c2ce123484b14b6c527716d6ec8/comfy/text_encoders/flux.py#L63
    """
    # The inner ``__init__`` has its own ``dtype_t5`` parameter that shadows this one, so the
    # factory-level value is kept under a separate name to serve as the default.
    factory_dtype_t5 = dtype_t5

    class NunchakuFluxClipModel(FluxClipModel):
        """
        FLUX CLIP model with a Nunchaku T5 encoder backend.

        Parameters
        ----------
        dtype_t5 : torch.dtype, optional
            Data type for the T5 encoder weights. Defaults to the ``dtype_t5`` given to
            :func:`nunchaku_flux_clip`.
        device : str, default="cpu"
            Device to load the model on.
        dtype : torch.dtype, optional
            Data type for the CLIP model.
        model_options : dict, optional
            Additional model options.
        """

        def __init__(
            self,
            dtype_t5=None,
            device="cpu",
            dtype=None,
            model_options=None,
        ):
            # Start the MRO lookup *after* FluxClipModel, i.e. FluxClipModel.__init__ is skipped
            # and the sub-models are assembled manually below.
            super(FluxClipModel, self).__init__()
            if model_options is None:
                model_options = {}
            if dtype_t5 is None:
                dtype_t5 = factory_dtype_t5
            dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
            self.clip_l = comfy.sd1_clip.SDClipModel(
                device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options
            )

            # Use meta device for T5XXL to avoid loading into memory before replacement
            with torch.device("meta"):
                self.t5xxl = comfy.text_encoders.sd3_clip.T5XXLModel(
                    device=device, dtype=dtype_t5, model_options=model_options
                )

            transformer = NunchakuT5EncoderModel.from_pretrained(nunchaku_t5_path, device=device, torch_dtype=dtype_t5)
            # Swap in the ComfyUI-compatible forward and embedding wrapper.
            transformer.forward = types.MethodType(nunchaku_t5_forward, transformer)
            transformer.shared = WrappedEmbedding(transformer.shared)
            self.t5xxl.transformer = transformer
            # Replace the logit_scale created under the meta-device context with a zero tensor on ``device``.
            self.t5xxl.logit_scale = nn.Parameter(torch.zeros_like(self.t5xxl.logit_scale, device=device))

            self.dtypes = {dtype, dtype_t5}

    return NunchakuFluxClipModel


def load_text_encoder_state_dicts(
    paths: list[str | os.PathLike[str]],
    embedding_directory: str | os.PathLike[str] | None = None,
    clip_type=comfy.sd.CLIPType.FLUX,
    model_options: dict | None = None,
):
    """
    Utility function to load and assemble text encoder state dicts for Nunchaku models.

    Parameters
    ----------
    paths : list of str or os.PathLike
        List of paths to model state dict files. Exactly two are required.
    embedding_directory : str or os.PathLike, optional
        Directory containing additional embeddings.
    clip_type : enum, default=comfy.sd.CLIPType.FLUX
        Type of CLIP model to load. Only ``comfy.sd.CLIPType.FLUX`` is supported.
    model_options : dict, optional
        Additional model options. ``None`` is treated as an empty dict.

    Returns
    -------
    comfy.sd.CLIP
        The loaded and assembled CLIP model.

    Raises
    ------
    NotImplementedError
        If the clip_type is not supported or the number of state dicts is not 2.

    Notes
    -----
    Adapted from:
    https://github.com/comfyanonymous/ComfyUI/blob/158419f3a0017c2ce123484b14b6c527716d6ec8/comfy/sd.py#L820
    """
    if model_options is None:
        model_options = {}

    # Validate up front so unsupported configurations fail before any checkpoint is read.
    if len(paths) != 2:
        raise NotImplementedError(f"Expected exactly 2 text encoder checkpoints, got {len(paths)}.")
    if clip_type != comfy.sd.CLIPType.FLUX:
        raise NotImplementedError(f"Clip type {clip_type} not implemented.")

    state_dicts, metadata_list = [], []
    for p in paths:
        sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True)
        if "transformer.resblocks.0.ln_1.weight" in sd:
            sd = comfy.utils.clip_text_transformers_convert(sd, "", "")
        elif "text_projection" in sd:
            # Old models saved with the CLIPSave node
            sd["text_projection.weight"] = sd["text_projection"].transpose(0, 1)
        state_dicts.append(sd)
        metadata_list.append(metadata)

    class EmptyClass:
        """Placeholder for CLIP target attributes."""

        pass

    # A checkpoint is a Nunchaku T5 when its metadata declares that model class.
    is_nunchaku = [
        metadata is not None and metadata.get("model_class", None) == "NunchakuT5EncoderModel"
        for metadata in metadata_list
    ]
    nunchaku_model_id = next((i for i, flagged in enumerate(is_nunchaku) if flagged), None)

    tokenizer_data = {}
    clip_target = EmptyClass()
    clip_target.params = {}
    if nunchaku_model_id is None:
        clip_target.clip = comfy.text_encoders.flux.flux_clip(**comfy.sd.t5xxl_detect(state_dicts))
    else:
        clip_target.clip = nunchaku_flux_clip(nunchaku_t5_path=paths[nunchaku_model_id], dtype_t5=torch.float16)
    clip_target.tokenizer = comfy.text_encoders.flux.FluxTokenizer

    parameters = 0
    for c in state_dicts:
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(
            c, tokenizer_data, model_options
        )
    clip = comfy.sd.CLIP(
        clip_target,
        embedding_directory=embedding_directory,
        parameters=parameters,
        tokenizer_data=tokenizer_data,
        model_options=model_options,
    )
    for state_dict, skip in zip(state_dicts, is_nunchaku):
        if skip:
            continue  # Nunchaku T5 weights are loaded from the checkpoint path by the model class itself
        m, u = clip.load_sd(state_dict)
        if len(m) > 0:
            logger.warning("clip missing: {}".format(m))

        if len(u) > 0:
            logger.debug("clip unexpected: {}".format(u))

    return clip


class NunchakuTextEncoderLoader:
    """
    Deprecated ComfyUI node for loading Nunchaku text encoders.

    .. warning::
        This node is deprecated and will be removed in December 2025. Please use
        :class:`NunchakuTextEncoderLoaderV2` instead.

    It loads a pair of text encoder checkpoints and can optionally swap the T5
    transformer for a 4-bit Nunchaku T5 model.
    """

    TITLE = "Nunchaku Text Encoder Loader (Deprecated)"
    CATEGORY = "Nunchaku"
    FUNCTION = "load_text_encoder"
    RETURN_TYPES = ("CLIP",)

    @classmethod
    def INPUT_TYPES(cls):
        """
        Describe the inputs this node exposes in the node interface.

        The choices for ``int4_model`` are ``"none"`` followed by the sorted names of all
        non-hidden sub-directories found in the registered ``text_encoders`` folders.

        Returns
        -------
        dict
            A dictionary with a single ``"required"`` entry mapping each input name
            to its type specification (and widget options where applicable).
        """
        subfolders = set()
        for root in folder_paths.folder_names_and_paths["text_encoders"][0]:
            if not os.path.isdir(root):
                continue
            subfolders.update(
                entry
                for entry in os.listdir(root)
                if not entry.startswith(".") and os.path.isdir(os.path.join(root, entry))
            )
        model_paths = ["none", *sorted(subfolders)]

        t5_min_length_options = {
            "default": 512,
            "min": 256,
            "max": 1024,
            "step": 128,
            "display": "number",
            "lazy": True,
            "tooltip": "Minimum sequence length for the T5 encoder.",
        }
        required = {
            "model_type": (["flux"],),
            "text_encoder1": (folder_paths.get_filename_list("text_encoders"),),
            "text_encoder2": (folder_paths.get_filename_list("text_encoders"),),
            "t5_min_length": ("INT", t5_min_length_options),
            "use_4bit_t5": (["disable", "enable"],),
            "int4_model": (model_paths, {"tooltip": "The name of the 4-bit T5 model."}),
        }
        return {"required": required}

    def load_text_encoder(
        self,
        model_type: str,
        text_encoder1: str,
        text_encoder2: str,
        t5_min_length: int,
        use_4bit_t5: str,
        int4_model: str,
    ):
        """
        Load the two text encoders and, if requested, replace the T5 with a 4-bit model.

        A deprecation warning is logged through the module logger on every call.

        Parameters
        ----------
        model_type : str
            The type of model to load; only ``"flux"`` is accepted.
        text_encoder1 : str
            Filename of the first text encoder checkpoint.
        text_encoder2 : str
            Filename of the second text encoder checkpoint.
        t5_min_length : int
            Minimum sequence length assigned to the T5 tokenizer.
        use_4bit_t5 : str
            ``"enable"`` to swap in a 4-bit T5 model; any other value leaves the T5 untouched.
        int4_model : str
            Name of the 4-bit T5 model folder inside a ``text_encoders`` directory; when no
            such folder exists the value is handed to ``from_pretrained`` unchanged.

        Returns
        -------
        tuple
            One-element tuple containing the loaded CLIP model.

        Raises
        ------
        ValueError
            If ``model_type`` is not ``"flux"``.
        AssertionError
            If the 4-bit T5 is enabled but ``int4_model`` is ``"none"``.
        """
        logger.warning(
            "Nunchaku Text Encoder Loader will be deprecated in v0.4. "
            "Please use the Nunchaku Text Encoder Loader V2 node instead."
        )
        encoder_paths = [
            folder_paths.get_full_path_or_raise("text_encoders", filename)
            for filename in (text_encoder1, text_encoder2)
        ]

        if model_type != "flux":
            raise ValueError(f"Unknown type {model_type}")
        clip_type = comfy.sd.CLIPType.FLUX

        clip = comfy.sd.load_clip(
            ckpt_paths=encoder_paths,
            embedding_directory=folder_paths.get_folder_paths("embeddings"),
            clip_type=clip_type,
        )

        # model_type is guaranteed to be "flux" here, so the tokenizer is always adjusted.
        clip.tokenizer.t5xxl.min_length = t5_min_length

        if use_4bit_t5 != "enable":
            return (clip,)

        assert int4_model != "none", "Please select a 4-bit T5 model."

        # Remember where the stock T5 lives so the replacement can follow it.
        reference_param = next(clip.cond_stage_model.t5xxl.transformer.parameters())
        dtype, device = reference_param.dtype, reference_param.device

        # The first text_encoders folder containing the model wins; otherwise use the raw name.
        search_roots = folder_paths.folder_names_and_paths["text_encoders"][0]
        candidates = (os.path.join(root, int4_model) for root in search_roots)
        model_path = next((path for path in candidates if os.path.exists(path)), int4_model)

        transformer = NunchakuT5EncoderModel.from_pretrained(model_path)
        transformer.forward = types.MethodType(nunchaku_t5_forward, transformer)
        transformer.shared = WrappedEmbedding(transformer.shared)

        # Only a CUDA-resident original triggers the device/dtype move.
        if device.type == "cuda":
            transformer = transformer.to(device=device, dtype=dtype)
        clip.cond_stage_model.t5xxl.transformer = transformer

        return (clip,)