"configs/vscode:/vscode.git/clone" did not exist on "c1d19ce23fe6c5647646c541efbda21fd79d5462"
Commit 727428ec authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit CI/CD

parents
import bisect
import io
import json
import random
from pathlib import Path
import ast
from itertools import chain
from collections import defaultdict
from functools import partial
from glob import glob
import numpy as np
import pyarrow as pa
from PIL import Image
from tqdm import tqdm
def get_table(arrow_file):
"""
Read an arrow file and return an arrow table.
"""
return pa.ipc.RecordBatchFileReader(pa.memory_map(f"{arrow_file}", "r")).read_all()
def assert_type(data, dtype, msg=""):
if not isinstance(data, dtype):
raise ValueError(f"Expected {msg} type {dtype}, got {type(data)}.")
def ndarray_to_list(data):
if isinstance(data, np.ndarray):
data = data.tolist()
elif isinstance(data, dict):
data = {k: ndarray_to_list(v) for k, v in data.items()}
elif isinstance(data, (list, tuple)):
        # Convert all elements to plain Python ints, because numpy integers
        # cannot be serialized to JSON.
data = [int(x) for x in data]
else:
raise ValueError(
f"Expected data type list, tuple, dict or np.ndarray, got {type(data)}."
)
return data
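# A minimal sketch (illustrative, not part of the original code) of why
# ndarray_to_list is applied before json.dump: numpy scalars and arrays are not
# JSON-serializable, so nested index structures are converted to plain Python
# types first, e.g.
#
#   payload = {"cum_length": np.array([10, 25, 40])}
#   json.dumps({k: ndarray_to_list(v) for k, v in payload.items()})
#   # -> '{"cum_length": [10, 25, 40]}'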
class ArrowIndexV2(object):
"""
ArrowIndexV2 is a new version of ArrowIndex.
Parameters
----------
index_file: str or pathlib.Path
The path of index file. Either index_file or res_dict should be provided.
res_dict: dict
The index dict. Either index_file or res_dict should be provided.
align: int
Align the length of indices to be a multiple of align. Generally align should be the batch size * world_size.
shadow_file_fn: callable or dict
A callable function to map shadow file path to a new path. If None, the shadow file path will not be
changed. If a dict is provided, the keys are the shadow names to call the function, and the values are the
callable functions to map the shadow file path to a new path. If a callable function is provided, the key
is 'default'.
Examples
--------
>>> index_file = 'data.json'
>>> indexObj = ArrowIndexV2(index_file)
>>> pil_image = indexObj.get_image(0)
>>> text = indexObj.get_attribute(0, column='text_zh')
"""
def __init__(
self, index_file=None, res_dict=None, align=1, shadow_file_fn=None, **kwargs
):
if index_file is not None:
with open(index_file, "r") as f:
res_dict = json.load(f)
elif res_dict is not None:
pass
else:
raise ValueError(f"Either index_file or res_dict should be provided.")
self.shadow_file_fn = {}
if shadow_file_fn is not None:
if not callable(shadow_file_fn) and not isinstance(shadow_file_fn, dict):
raise ValueError(
"shadow_file_fn should be a callable function or a dict."
)
if callable(shadow_file_fn):
self.shadow_file_fn["default"] = shadow_file_fn
else:
for k, v in shadow_file_fn.items():
if not callable(v):
raise ValueError(f"{k} should be a callable function.")
self.shadow_file_fn[k] = v
self._data = res_dict
self.data_type = res_dict["data_type"]
self.arrow_files = res_dict["arrow_files"]
self.cum_length = res_dict["cum_length"]
self.group_length = res_dict["group_length"]
error_msg = f"Expected group_length type list, got {type(self.group_length)}."
if isinstance(self.group_length, dict):
raise ValueError(
f"{error_msg}\nNote: You may using a multi-resolution index file. "
"Try `MultiResolutionBucketIndexV2` instead."
)
elif not isinstance(self.group_length, list):
raise ValueError(error_msg)
self.indices = res_dict["indices"]
if "indices_file" in res_dict:
self.indices_file = res_dict["indices_file"]
if self.indices_file != "":
indices_file = Path(index_file).parent / self.indices_file
if Path(indices_file).exists():
self.indices = np.load(indices_file)["x"]
else:
raise ValueError(
f"This Index file contains an extra file {indices_file} which is missed."
)
else:
self.indices_file = ""
if not isinstance(self.indices, list) and not isinstance(
self.indices, np.ndarray
):
raise ValueError(
f"Expected indices type list or np.ndarray, got {type(self.indices)}."
)
if align > 1:
if isinstance(self.indices, np.ndarray):
self.indices = self.indices.tolist()
self.align(align)
self.indices = np.asarray(self.indices, int)
if len(self.arrow_files) != len(self.cum_length):
raise ValueError(
f"Length of arrow_files and cum_length does not match. {len(self.arrow_files)} != {len(self.cum_length)}"
)
if len(self.arrow_files) != len(self.group_length):
raise ValueError(
f"Length of arrow_files and group_length does not match. {len(self.arrow_files)} != {len(self.group_length)}"
)
if len(self.indices) == 0:
raise ValueError(f"No indices found in index_dict.")
if (
isinstance(self.indices, list)
and self.indices[-1] > self.cum_length[-1] - 1
):
raise ValueError(f"Indices exceed cum_length.")
# Warning:
# Ensure that indices are an increasing array. Currently,
# no checks are performed due to the potential slowness when dealing with hundreds of millions of data points.
self.bias = self.cum_length
self._cur_arrow_file = None
self._cur_table_map = None
self._cur_table = None
self._index_bias = 0
self.last_index = -1
self._shadow_cur_arrow_file = {}
self._shadow_cur_table_map = {}
self._shadow_cur_table = {}
self._shadow_index_bias = {}
self.shadow_last_index = {}
for k in self.shadow_file_fn.keys():
self._shadow_cur_arrow_file[k] = None
self._shadow_cur_table_map[k] = None
self._shadow_cur_table[k] = None
self._shadow_index_bias[k] = 0
self.shadow_last_index[k] = -1
def __len__(self):
return len(self.indices)
def __repr__(self):
return f"""
ArrowIndexV2(
data_type {self.data_type}
indices_file {self.indices_file}
arrow_files Count={len(self.arrow_files):,} ({self.arrow_files[0]}, ...)
cum_length Count={len(self.cum_length):,} ({self.cum_length[0]}, ...)
group_length Count={len(self.group_length):,} ({self.group_length[0]}, ...)
indices Count={len(self.indices):,}
example_indices Count={len(self._data['example_indices']):,}
)
"""
def check_exists(self):
for arrow_file in tqdm(self.arrow_files):
if not Path(arrow_file).exists():
print(arrow_file)
def align(self, align):
"""
        Repeat the tail of the indices so that the total length is a multiple of align
        (typically batch_size * world_size).
"""
if len(self) % align == 0:
return
repeat_num = align - len(self) % align
if repeat_num >= len(self):
repeat_n = repeat_num // len(self)
repeat_times = [repeat_n + 1 for _ in self.indices]
group_length_new = [ll * (repeat_n + 1) for ll in self.group_length]
repeat_num -= repeat_n * len(self)
else:
repeat_times = [1 for _ in range(repeat_num)]
group_length_new = [ll for ll in self.group_length]
for i in range(repeat_num):
repeat_times[-i - 1] += 1
repeat_start_idx = len(self) - len(repeat_times)
group_id = -1
while group_length_new[group_id] == 0:
group_id -= 1
            # Allocate the remaining indices that need to be repeated, while counting
            # how many indices have been checked. When the count reaches the group's
            # length, switch to the next group. group_length is tracked here because
            # it must be updated in sync with the repeated indices.
group_acc = 0
for i in range(repeat_num):
group_length_new[group_id] += 1
group_acc += 1
if group_acc == self.group_length[group_id]:
group_id -= 1
while group_length_new[group_id] == 0:
group_id -= 1
group_acc = 0
temp = []
for i, value in enumerate(self.indices[repeat_start_idx:]):
temp.extend([value] * repeat_times[i])
self.indices = np.concatenate([self.indices[:repeat_start_idx], temp])
self.group_length = group_length_new
def shuffle(self, seed=None, fast=False):
"""
It takes about 30 seconds for an index consisting of 100_000 arrows.
"""
if fast:
return self.shuffle_fast(seed)
indices = self.indices.tolist()
if seed is not None:
state = random.getstate()
random.seed(seed)
indices_group_list = []
group_cum_len = 0
for group_len in self.group_length:
indices_group = indices[group_cum_len : group_cum_len + group_len]
random.shuffle(indices_group)
indices_group_list.append((indices_group, group_len))
group_cum_len += group_len
random.shuffle(indices_group_list)
self.group_length = [x[1] for x in indices_group_list]
self.indices = np.asarray(
list(chain.from_iterable([x[0] for x in indices_group_list]))
)
if seed is not None:
random.setstate(state)
def shuffle_fast(self, seed=None):
if seed is not None:
sampler = np.random.RandomState(seed)
sampler.shuffle(self.indices)
else:
np.random.shuffle(self.indices)
def get_table(self, arrow_file, shadow=None):
"""
Read an arrow file and return an arrow table.
"""
if shadow is None:
if self._cur_table is not None:
if self._cur_arrow_file == arrow_file:
# This is the same arrow file. Return the cached table.
return self._cur_table
else:
# This is a different arrow file. Clear the cache.
self._cur_table_map.close()
self._cur_table = None
self._cur_arrow_file = arrow_file
self._cur_table_map = pa.memory_map(f"{arrow_file}", "r")
self._cur_table = pa.ipc.RecordBatchFileReader(
self._cur_table_map
).read_all()
return self._cur_table
else:
if self._shadow_cur_table[shadow] is not None:
if self._shadow_cur_arrow_file[shadow] == arrow_file:
return self._shadow_cur_table[shadow]
else:
self._shadow_cur_table_map[shadow].close()
self._shadow_cur_table[shadow] = None
self._shadow_cur_arrow_file[shadow] = arrow_file
self._shadow_cur_table_map[shadow] = pa.memory_map(f"{arrow_file}", "r")
self._shadow_cur_table[shadow] = pa.ipc.RecordBatchFileReader(
self._shadow_cur_table_map[shadow]
).read_all()
return self._shadow_cur_table[shadow]
def get_arrow_file_by_index(self, index, return_index_bias=False, shadow=None):
i = bisect.bisect_right(self.cum_length, index)
arrow_file = self.arrow_files[i]
if return_index_bias:
if i == 0:
index_bias = 0
else:
index_bias = self.cum_length[i - 1]
return arrow_file, index_bias
return arrow_file
def get_arrow_file(self, ind, shadow=None):
"""
Get arrow file by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
shadow: str
The shadow name. If None, return the main arrow file. If not None, return the shadow arrow file.
Returns
-------
arrow_file: str
The arrow file path.
"""
index = self.indices[ind]
return self.get_arrow_file_by_index(index, shadow=shadow)
def load_table_by_index(self, index, shadow=None):
if shadow is None:
if index == self.last_index:
return self._cur_table
arrow_file, self._index_bias = self.get_arrow_file_by_index(
index, return_index_bias=True
)
self._cur_table = self.get_table(arrow_file)
self.last_index = index
return self._cur_table
else:
if index == self.shadow_last_index[shadow]:
return self._shadow_cur_table[shadow]
shadow_arrow_file, _shadow_index_bias = self.get_arrow_file_by_index(
index, return_index_bias=True, shadow=shadow
)
self._shadow_index_bias[shadow] = _shadow_index_bias
self._shadow_cur_table[shadow] = self.get_table(
shadow_arrow_file, shadow=shadow
)
self.shadow_last_index[shadow] = index
return self._shadow_cur_table[shadow]
def get_data_by_index(
self, index, columns=None, allow_missing=False, return_meta=True, shadow=None
):
table = self.load_table_by_index(index, shadow=shadow)
if isinstance(columns, str):
columns = [columns]
if columns is None:
columns = list(table.column_names)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
in_arrow_index = index - index_bias
if return_meta:
cur_arrow_file = (
self._cur_arrow_file
if shadow is None
else self._shadow_cur_arrow_file[shadow]
)
data = {
"index": index,
"in_arrow_index": in_arrow_index,
"arrow_name": cur_arrow_file,
}
else:
data = {}
if allow_missing:
for col in columns:
if col in table.column_names:
data[col] = table[col][in_arrow_index].as_py()
else:
for col in columns:
data[col] = table[col][in_arrow_index].as_py()
return data
def get_data(
self, ind, columns=None, allow_missing=False, return_meta=True, shadow=None
):
"""
Get data by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
columns: str or list
The columns to be returned. If None, return all columns.
allow_missing: bool
If True, omit missing columns. If False, raise an error if the column is missing.
return_meta: bool
If True, the resulting dict will contain some meta information:
in-json index, in-arrow index, and arrow_name.
shadow: str
The shadow name. If None, return the main data. If not None, return the shadow data.
Returns
-------
data: dict
A dict containing the data.
"""
index = self.indices[ind]
return self.get_data_by_index(
index,
columns,
allow_missing=allow_missing,
return_meta=return_meta,
shadow=shadow,
)
def get_attribute_by_index(self, index, column, shadow=None):
table = self.load_table_by_index(index, shadow=shadow)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
return table[column][index - index_bias].as_py()
def get_attribute(self, ind, column, shadow=None):
"""
Get single attribute by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
column: str
The column name.
shadow: str
The shadow name. If None, return the main data. If not None, return the shadow data.
Returns
-------
data: can be any type
"""
index = self.indices[ind]
return self.get_attribute_by_index(index, column, shadow=shadow)
def get_image_by_index(
self, index, column="image", ret_type="pil", max_size=-1, shadow=None
):
table = self.load_table_by_index(index, shadow=shadow)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
col = "image" if "image" in table.column_names else "binary"
temp = table[col][index - index_bias].as_py()
image_bytes = io.BytesIO(temp)
image_bytes.seek(0)
try:
            # convert("RGB") serves two purposes:
            # 1. Some images are in grayscale/RGBA mode, which would cause channel
            #    inconsistency in subsequent processing.
            # 2. Some images are in P (palette) mode, which forces the NEAREST resample
            #    method in resize (even if LANCZOS is specified) and produces blurry images.
pil_image = Image.open(image_bytes).convert("RGB")
except Exception as e:
print(
f"get_image_by_index | Error: {e} ({self.get_arrow_file_by_index(index), index - index_bias})"
)
pil_image = Image.new("RGB", (256, 256), (255, 255, 255))
if max_size > 0:
# Resize the image to max_size. max_size is the size of long edge
w, h = pil_image.size
if w > h:
new_w = max_size
new_h = int(h * max_size / w)
else:
new_h = max_size
new_w = int(w * max_size / h)
pil_image = pil_image.resize((new_w, new_h))
if ret_type == "numpy":
return np.array(pil_image)
return pil_image
def get_image(self, ind, column="image", ret_type="pil", max_size=-1, shadow=None):
"""
Get image by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
column: str
[Deprecated] The column name of the image. Default to 'image'.
ret_type: str
The return type. Can be 'pil' or 'numpy'. Default to 'pil'.
max_size: int
If not -1, resize the image to max_size. max_size is the size of long edge.
shadow: str
The shadow name. If None, return the main image. If not None, return the shadow image.
Returns
-------
image: PIL.Image.Image or np.ndarray
"""
index = self.indices[ind]
return self.get_image_by_index(index, column, ret_type, max_size, shadow=shadow)
def get_md5_by_index(self, index, shadow=None):
table = self.load_table_by_index(index, shadow=shadow)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
return table["md5"][index - index_bias].as_py()
def get_md5(self, ind, shadow=None):
index = self.indices[ind]
return self.get_md5_by_index(index, shadow=shadow)
def get_columns_by_index(self, index, shadow=None):
table = self.load_table_by_index(index, shadow=shadow)
return table.column_names
def get_columns(self, ind, shadow=None):
index = self.indices[ind]
return self.get_columns_by_index(index, shadow=shadow)
def source_distribution(self, save_path=None, shadow=None):
sources = defaultdict(int)
for index in tqdm(self.indices):
source = self.get_attribute_by_index(index, "source", shadow=shadow)
sources[source] += 1
sources = sorted(sources.items(), key=lambda x: x[1], reverse=True)
for k, v in sources:
print(f"{k:20s} {v:10d}")
if save_path is not None:
Path(save_path).write_text(
"\n".join([f"{k:20s} {v:10d}" for k, v in sources])
)
def save(self, save_path):
"""
Save the index to a json file.
Parameters
----------
save_path: str or pathlib.Path
The path to save the index file.
"""
builder = IndexV2Builder(
data_type=self.data_type,
arrow_files=self.arrow_files,
cum_length=self.cum_length,
indices=self.indices,
)
builder.build(save_path)
def sample_batch_indices(self, n):
return np.random.choice(self.indices, n)
def sample_batch(self, n, columns, progress=True, shadow=None):
if isinstance(n, int):
indices = self.sample_batch_indices(n)
else:
indices = n
if progress:
pbar = tqdm(indices)
else:
pbar = indices
batch_data = []
for i in pbar:
batch_data.append(self.get_data_by_index(i, columns, shadow=shadow))
return batch_data
@staticmethod
def resize_and_crop(image, target_size, resample=Image.LANCZOS, crop_type="random"):
"""
Resize image without changing aspect ratio, then crop the center/random part.
Parameters
----------
image: PIL.Image.Image
The input image to be resized and cropped.
target_size: tuple
The target size of the image.
resample:
The resample method. See PIL.Image.Image.resize for details. Default to Image.LANCZOS.
crop_type: str
'center' or 'random'. If 'center', crop the center part of the image. If 'random',
crop a random part of the image. Default to 'random'.
Returns
-------
image: PIL.Image.Image
The resized and cropped image.
crop_pos: tuple
The position of the cropped part. (crop_left, crop_top)
"""
tw, th = target_size
w, h = image.size
tr = th / tw
r = h / w
# resize
if r < tr:
resize_height = th
resize_width = int(round(th / h * w))
else:
resize_width = tw
resize_height = int(round(tw / w * h))
image = image.resize((resize_width, resize_height), resample=resample)
if crop_type == "center":
crop_top = int(round((resize_height - th) / 2.0))
crop_left = int(round((resize_width - tw) / 2.0))
elif crop_type == "random":
crop_top = random.randint(0, resize_height - th)
crop_left = random.randint(0, resize_width - tw)
else:
raise ValueError(f"crop_type must be center or random, but got {crop_type}")
image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
return image, (crop_left, crop_top)
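    # A minimal illustrative call (not part of the original code): center-crop a
    # hypothetical PIL image to 1024x1024 while preserving aspect ratio, keeping
    # the crop offset for later use.
    #
    #   cropped, (left, top) = ArrowIndexV2.resize_and_crop(
    #       pil_image, (1024, 1024), crop_type="center"
    #   )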
class IndexV2Builder(object):
def __init__(
self,
arrow_files,
indices=None,
cum_length=None,
group_length=None,
data_type=None,
max_indices=5_000_000,
example_num=1000,
config_file=None,
):
"""
Build index v2 from an index dict.
Parameters
----------
arrow_files: list
A list of arrow files.
indices: list or dict
A list of indices or a dict of indices.
If not provided, it will be specified as range(cum_length[-1]).
cum_length: list
A list of cumulative length of arrow files.
If not provided, it will be calculated from arrow files.
group_length: list
A list of group length or a dict of group length for each arrow file.
If not provided, it will be calculated.
data_type: str or list
Some custom information of this index.
max_indices: int
If the number of indices is larger than max_indices, the indices will be saved in a separate file.
Default to 5_000_000.
example_num: int
The number of examples to be saved in the index file. Default to 1000.
config_file: str
The path of config file.
Examples
--------
>>> builder = IndexV2Builder(
>>> data_type='gold',
>>> arrow_files=arrow_files,
>>> cum_length=cum_length,
>>> indices=indices,
>>> )
>>> builder.build(save_path)
"""
self.arrow_files = arrow_files
self.indices = indices
self.cum_length = cum_length
self.group_length = group_length
self.data_type = data_type
self.max_indices = max_indices
self.example_num = example_num
self.config_file = config_file
if isinstance(arrow_files, str):
if "*" in arrow_files or "?" in arrow_files:
self.arrow_files = list(glob(arrow_files))
else:
self.arrow_files = [arrow_files]
elif isinstance(self.arrow_files, tuple):
self.arrow_files = list(self.arrow_files)
if not isinstance(self.arrow_files, list):
raise ValueError(
f"Expected arrow_files to be a list, got {type(self.arrow_files)}."
)
if self.cum_length is None:
continuous = False
if self.indices is None:
self.group_length = []
continuous = True
print(f"Calculating cum_length...")
self.cum_length = []
cur_cum_length = 0
pbar = tqdm(self.arrow_files)
for arrow_file in pbar:
table_length = len(get_table(arrow_file))
cur_cum_length += table_length
self.cum_length.append(cur_cum_length)
pbar.set_description(f"{self.cum_length[-1]:>12d}")
if continuous:
self.group_length.append(table_length)
if self.indices is None:
self.indices = list(range(self.cum_length[-1]))
if self.group_length is None:
self.group_length = []
if self.data_type is None:
self.data_type = ["Made by IndexV2Builder"]
elif isinstance(self.data_type, str):
self.data_type = [self.data_type]
assert_type(self.data_type, list, "data_type")
assert_type(self.cum_length, (list, np.ndarray), "cum_length")
assert_type(self.group_length, (list, dict, np.ndarray), "group_length")
assert_type(self.indices, (list, dict, np.ndarray), "indices")
self.cum_length = ndarray_to_list(self.cum_length)
self.group_length = ndarray_to_list(self.group_length)
self.indices = ndarray_to_list(self.indices)
if isinstance(self.indices, dict):
for k, v in self.indices.items():
assert_type(v, list, f"indices[{k}]")
if len(self.arrow_files) != len(self.cum_length):
raise ValueError(
f"Length of arrow_files and cum_length does not match. {len(self.arrow_files)} != {len(self.cum_length)}"
)
if len(self.indices) == 0:
raise ValueError(f"No indices found in index_dict.")
if (
isinstance(self.indices, list)
and self.indices[-1] > self.cum_length[-1] - 1
):
raise ValueError(
f"Indices exceed cum_length. {self.indices[-1]} > {self.cum_length[-1] - 1}"
)
if len(self.group_length) > 0:
if len(self.arrow_files) != len(self.group_length):
raise ValueError(
f"Length of arrow_files and group_length does not match. {len(self.arrow_files)} != {len(self.group_length)}"
)
if sum(self.group_length) != len(self.indices):
raise ValueError(
f"Sum of group_length does not match length of indices. {sum(self.group_length)} != {len(self.indices)}"
)
def encode(self):
# Encode arrow files
print("Encoding arrow files...")
arrow_files = []
for arrow_file in tqdm(self.arrow_files):
shortname = arrow_file
arrow_files.append(shortname)
self.arrow_files = arrow_files
# Calculate group_length
print("Calculating group length...")
if isinstance(self.indices, list):
if len(self.group_length) == 0:
self.group_length = self.calc_group_length(
self.indices, self.cum_length
)
else:
print("Group length already calculated, skip.")
elif isinstance(self.indices, dict):
if not isinstance(self.group_length, dict):
self.group_length = {}
for k, v in self.indices.items():
print(f"Calculating group length for {k}...")
if k not in self.group_length or len(self.group_length[k]) == 0:
self.group_length[k] = self.calc_group_length(v, self.cum_length)
else:
print("Group length already calculated, skip.")
else:
raise ValueError(
f"Expected indices type list or dict, got {type(self.indices)}."
)
return {
"data_type": self.data_type,
"config_file": self.config_file if self.config_file is not None else "",
"indices_file": "",
"arrow_files": self.arrow_files,
"cum_length": self.cum_length,
"group_length": self.group_length,
"indices": self.indices,
"example_indices": [],
}
def to_index_v2(self):
return ArrowIndexV2(res_dict=self.encode())
def build(self, save_path):
return self.save(save_path)
def save(self, save_path):
"""
Make index v2 from an index dict.
Parameters
----------
save_path: str or pathlib.Path
The path to save the index file.
"""
index_dict = self.encode()
        # Ensure the indices are either a list or a dict.
save_path = Path(save_path)
save_path.parent.mkdir(exist_ok=True, parents=True)
if (
isinstance(index_dict["indices"], list)
and len(index_dict["indices"]) > self.max_indices
):
            index_dict["example_indices"] = index_dict["indices"][: self.example_num]
indices_to_save = {"x": index_dict["indices"]}
index_dict["indices"] = []
elif isinstance(index_dict["indices"], dict):
indices_to_save = index_dict["indices"]
index_dict["indices"] = {}
num_keys = len(indices_to_save)
example_num_per_key = max(self.example_num // num_keys, 10)
index_dict["example_indices"] = {
k: v[:example_num_per_key] for k, v in index_dict["indices"].items()
}
else:
indices_to_save = None
# save indices
if indices_to_save is not None:
indices_file = save_path.parent / f"{save_path.stem}.index"
indices_dict = {k: np.array(v) for k, v in indices_to_save.items()}
np.savez_compressed(indices_file, **indices_dict)
index_dict["indices_file"] = indices_file.name + ".npz"
with save_path.open("w") as f:
json.dump(index_dict, f, indent=4, ensure_ascii=False)
@staticmethod
def calc_group_length(indices, cum_length):
group_lengths = []
cum_ind = 0
count = 0
for index in tqdm(indices):
if index < cum_length[cum_ind]:
# index is still in the current group
count += 1
else:
# index has exceeded the current group, need to switch to the next group
group_lengths.append(count)
cum_ind += 1
# if the index exceeds the next group, continue to switch to the next group
while index >= cum_length[cum_ind]:
group_lengths.append(0)
cum_ind += 1
count = 1
# The indices array is exhausted, and the last group containing the index should also be added.
group_lengths.append(count)
assert len(group_lengths) <= len(cum_length), (
len(group_lengths),
len(cum_length),
)
# Check if the number of groups is less than the number of cum_length,
# then the last n groups are empty and need to be filled with zeros.
if len(group_lengths) < len(cum_length):
group_lengths.extend([0] * (len(cum_length) - len(group_lengths)))
return group_lengths
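# -----------------------------------------------------------------------------
# Minimal end-to-end sketch (illustrative only, not part of the original code):
# build an IndexV2 file from a set of arrow files and read samples back. The
# paths and the "text_zh" column are hypothetical placeholders that depend on
# your data.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    builder = IndexV2Builder(
        data_type="demo",
        arrow_files="data/*.arrow",  # a glob pattern is expanded by the builder
    )
    builder.build("data/index_v2.json")

    index = ArrowIndexV2("data/index_v2.json")
    print(len(index), index.get_columns(0))
    pil_image = index.get_image(0, max_size=512)
    text = index.get_attribute(0, column="text_zh")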
import math
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
class BlockDistributedSampler(DistributedSampler):
def __init__(
self,
dataset,
num_replicas=None,
rank=None,
shuffle=True,
seed=0,
drop_last=False,
batch_size=-1,
start_index=0,
):
super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
if rank >= num_replicas or rank < 0:
raise ValueError(
"Invalid rank {}, rank should be in the interval"
" [0, {}]".format(rank, num_replicas - 1)
)
if batch_size == -1:
raise ValueError("batch_size should be specified")
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.drop_last = drop_last
self.shuffle = shuffle
self.seed = seed
self.batch_size = batch_size
self._start_index = start_index
self.recompute_sizes()
@property
def start_index(self):
return self._start_index
@start_index.setter
def start_index(self, value):
self._start_index = value
self.recompute_sizes()
def recompute_sizes(self):
self.num_samples = (
len(self.dataset) // self.batch_size * self.batch_size // self.num_replicas
- self._start_index
)
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
indices = list(range(len(self.dataset))) # type: ignore[arg-type]
raw_num_samples = (
len(indices) // self.batch_size * self.batch_size // self.num_replicas
)
raw_total_size = raw_num_samples * self.num_replicas
indices = indices[:raw_total_size]
# We require that the dataset size is divisible by batch_size * num_replicas
# This is naturally satisfied when using index_kits.
# In future, we can remove this assertion.
assert len(indices) == raw_total_size, f"{len(indices)} vs {raw_total_size}"
# subsample with start_index
indices = indices[
self.rank * raw_num_samples
+ self.start_index : (self.rank + 1) * raw_num_samples
]
assert (
len(indices) + self.start_index == raw_num_samples
), f"{len(indices) + self.start_index} vs {raw_num_samples}"
# This is a sequential sampler. The shuffle operation is done by the dataset itself.
return iter(indices)
class DistributedSamplerWithStartIndex(DistributedSampler):
def __init__(
self,
dataset,
num_replicas=None,
rank=None,
shuffle=True,
seed=0,
drop_last=False,
start_index=0,
):
super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
if rank >= num_replicas or rank < 0:
raise ValueError(
"Invalid rank {}, rank should be in the interval"
" [0, {}]".format(rank, num_replicas - 1)
)
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.drop_last = drop_last
self._start_index = start_index
self.recompute_sizes()
self.shuffle = shuffle
self.seed = seed
@property
def start_index(self):
return self._start_index
@start_index.setter
def start_index(self, value):
self._start_index = value
self.recompute_sizes()
def recompute_sizes(self):
# If the dataset length is evenly divisible by # of replicas, then there
# is no need to drop any data, since the dataset will be split equally.
if self.drop_last and (len(self.dataset) - self._start_index) % self.num_replicas != 0: # type: ignore[arg-type]
# Split to nearest available length that is evenly divisible.
# This is to ensure each rank receives the same amount of data when
# using this Sampler.
self.num_samples = math.ceil(
((len(self.dataset) - self._start_index) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
)
else:
self.num_samples = math.ceil((len(self.dataset) - self._start_index) / self.num_replicas) # type: ignore[arg-type]
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
indices = list(range(self._start_index, len(self.dataset))) # type: ignore[arg-type]
if not self.drop_last:
# add extra samples to make it evenly divisible
padding_size = self.total_size - len(indices)
if padding_size <= len(indices):
indices += indices[:padding_size]
else:
indices += (indices * math.ceil(padding_size / len(indices)))[
:padding_size
]
else:
# remove tail of data to make it evenly divisible.
indices = indices[: self.total_size]
assert len(indices) == self.total_size
# subsample with start_index
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
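# -----------------------------------------------------------------------------
# Minimal sketch (illustrative only, not part of the original code): wiring
# BlockDistributedSampler into a torch DataLoader with an explicit rank and
# world size. In a real distributed run these come from dist.get_rank() and
# dist.get_world_size() after init_process_group().
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader, Dataset

    class _ToyDataset(Dataset):
        def __len__(self):
            return 1024

        def __getitem__(self, idx):
            return idx

    dataset = _ToyDataset()
    sampler = BlockDistributedSampler(
        dataset, num_replicas=2, rank=0, shuffle=False, batch_size=8, start_index=0
    )
    loader = DataLoader(dataset, batch_size=8, sampler=sampler)
    print(f"rank 0 yields {len(sampler)} samples in {len(loader)} batches")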
import re
try:
from setuptools import setup
except ImportError:
from distutils.core import setup
with open("index_kits/__init__.py", "r") as file:
regex_version = r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]'
version = re.search(regex_version, file.read(), re.MULTILINE).group(1)
setup(
name="index_kits",
version=version,
author="jarvizhang",
author_email="jarvizhang@tencent.com",
description="An index kits for streaming reading arrow data.",
packages=["index_kits", "index_kits/dataset"],
scripts=["bin/idk"],
install_requires=[
"pillow>=9.3.0",
"tqdm>=4.60.0",
"pyarrow>=10.0.1",
"torch>=1.9",
],
python_requires=">=3.8.12",
)
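# With the setup script above, a typical local install (standard setuptools
# usage; command assumed, not part of the original files) is:
#
#     pip install -e .
#
# which also places the `idk` script from bin/ on the environment's PATH.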
TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
Tencent Hunyuan DiT Release Date: 14 May 2024
THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. DEFINITIONS.
a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
i. “Tencent,” “We” or “Us” shall mean THL A29 Limited.
j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent Hunyuan DiT released at https://huggingface.co/Tencent-Hunyuan/HunyuanDiT.
k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union.
m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
n. “including” shall mean including but not limited to.
2. GRANT OF RIGHTS.
We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
3. DISTRIBUTION.
You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
b. You must cause any modified files to carry prominent notices stating that You changed the files;
c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2024 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
4. ADDITIONAL COMMERCIAL TERMS.
If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
5. RULES OF USE.
a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other large language model (other than Tencent Hunyuan or Model Derivatives thereof).
c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
6. INTELLECTUAL PROPERTY.
a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
8. SURVIVAL AND TERMINATION.
a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
9. GOVERNING LAW AND JURISDICTION.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
EXHIBIT A
ACCEPTABLE USE POLICY
Tencent reserves the right to update this Acceptable Use Policy from time to time.
Last modified: [insert date]
Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
1. Outside the Territory;
2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
3. To harm Yourself or others;
4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
5. To override or circumvent the safety guardrails and safeguards We have put in place;
6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
9. To intentionally defame, disparage or otherwise harass others;
10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
11. To generate or disseminate personal identifiable information with the purpose of harming others;
12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
13. To impersonate another individual without consent, authorization, or legal right;
14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
19. For military purposes;
20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
Usage and Legal Notices:
Tencent is pleased to support the open source community by making Tencent Hunyuan available.
Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. The below software and/or models in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) THL A29 Limited.
Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement except for the third-party components listed below. Tencent Hunyuan does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
For avoidance of doubts, Tencent Hunyuan means the large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Tencent in accordance with Tencent Hunyuan Community License Agreement.
Other dependencies and licenses:
Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. torch
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Terms of the BSD 3-Clause:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
For the license of other third party components, please refer to the following URL:
https://github.com/pytorch/pytorch/blob/v1.13.1/NOTICE
Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. pandas
Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
Copyright (c) 2011-2023, Open source contributors.
A copy of the BSD 3-Clause is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/pandas-dev/pandas/tree/v2.0.3/LICENSES
Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. numpy
Copyright (c) 2005-2022, NumPy Developers.
All rights reserved.
A copy of the BSD 3-Clause is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/numpy/numpy/blob/v1.24.4/LICENSES_bundled.txt
Open Source Software/Model Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. Megatron-LM
Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
A copy of the BSD 3-Clause is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/NVIDIA/Megatron-LM/blob/main/LICENSE
Open Source Software/Models Licensed under the Apache License Version 2.0:
The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2023 THL A29 Limited.
--------------------------------------------------------------------
1. diffusers
Copyright (c) diffusers original author and authors
Please note this software has been modified by Tencent in this distribution.
2. transformers
Copyright (c) transformers original author and authors
3. timm
Copyright 2019 Ross Wightman
4. text-to-text-transfer-transformer
Copyright (c) text-to-text-transfer-transformer original author and authors
Please note this software has been modified by Tencent in this distribution.
5. pytorch-fid
Copyright (c) pytorch-fid original author and authors
Please note this software has been modified by Tencent in this distribution.
6. Image-Quality-Assessment-Toolbox
Copyright 2021 Qunliang Xing
7. accelerate
Copyright (c) accelerate original author and authors
8. IP-Adapter
Copyright (c) IP-Adapter original author and authors
Please note this software has been modified by Tencent in this distribution.
9. mT5
Copyright (c) mT5 original author and authors
10. Mistral-7B
Copyright (c) 2024 Mistral AI, All rights reserved
11. peft
Copyright 2023 The HuggingFace Team. All rights reserved.
Terms of the Apache License Version 2.0:
--------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of this License; and
You must cause any modified files to carry prominent notices stating that You changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Open Source Software/Model Licensed under the BSD 3-Clause License:
--------------------------------------------------------------------
1. torchvision
Copyright (c) Soumith Chintala 2016,
All rights reserved.
2. flash_attn
Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
All rights reserved.
3. apex
Copyright (c) apex original author and authors
A copy of the BSD 3-Clause is included in this file.
Open Source Software Licensed under the HPND License:
--------------------------------------------------------------------
1. Pillow
Copyright © 2010-2023 by Jeffrey A. Clark (Alex) and contributors.
Terms of the HPND License:
--------------------------------------------------------------------
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2023 by Jeffrey A. Clark (Alex) and contributors.
Like PIL, Pillow is licensed under the open source HPND License:
By obtaining, using, and/or copying this software and/or its associated
documentation, you agree that you have read, understood, and will comply
with the following terms and conditions:
Permission to use, copy, modify and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appears in all copies, and that
both that copyright notice and this permission notice appear in supporting
documentation, and that the name of Secret Labs AB or the author not be
used in advertising or publicity pertaining to distribution of the software
without specific, written prior permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
Open Source Software/Model Licensed under the MIT License:
The below software in this distribution may have been modified by Tencent.
--------------------------------------------------------------------
1. einops
Copyright (c) 2018 Alex Rogozhnikov
2. loguru
Copyright (c) 2017
3. Chinese-CLIP
Copyright (c) 2012-2022 OFA-Sys Team
Copyright (c) 2012-2022 Gabriel Ilharco, Mitchell Wortsman, Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, Ludwig Schmidt
4. DeepSpeed
Copyright (c) Microsoft Corporation.
5. glid-3-xl
Copyright (c) 2021 OpenAI
6. lazysizes
Copyright (c) 2015 Alexander Farkas
7. thingsvision
Copyright (c) 2021 Vision and Computational Cognition Group
8. sd-vae-ft-ema
Copyright (c) sd-vae-ft-ema original author and authors
9. ComfyUI-Diffusers
Copyright (c) 2023 Limitex
10. glide-text2im
Copyright (c) 2021 OpenAI
11. improved-diffusion
Copyright (c) 2021 OpenAI
Terms of the MIT License:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. tqdm
Copyright (c) 2013 noamraph
A copy of the MIT is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/tqdm/tqdm/blob/v4.66.1/LICENCE
Open Source Software/Model Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
The below software in this distribution may have been modified by Tencent.
--------------------------------------------------------------------
1. generative-models
Copyright (c) 2023 Stability AI
A copy of the MIT is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/Stability-AI/generative-models/blob/main/LICENSE-CODE
https://github.com/Stability-AI/generative-models/tree/main/model_licenses
Open Source Software/Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. pyarrow
Copyright 2016-2024 The Apache Software Foundation
A copy of the Apache License Version 2.0 is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/apache/arrow/blob/main/NOTICE.txt
Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2023 THL A29 Limited.
--------------------------------------------------------------------
1. opencv-python
Copyright (c) Olli-Pekka Heinisuo
Terms of the MIT:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For the license of other third party components, please refer to the following URL:
https://github.com/opencv/opencv-python/blob/4.x/LICENSE-3RD-PARTY.txt
Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. onnxruntime
Copyright (c) Microsoft Corporation.
A copy of the MIT is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/microsoft/onnxruntime/blob/v1.16.3/ThirdPartyNotices.txt
Open Source Software/Model Licensed under the Apache License Version 2.0:
The below software in this distribution may have been modified by Tencent.
--------------------------------------------------------------------
1. dwpose
Copyright 2018-2020 Open-MMLab.
Please note this software has been modified by Tencent in this distribution.
Terms of the Apache License Version 2.0:
--------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of this License; and
You must cause any modified files to carry prominent notices stating that You changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Open Source Software Licensed under the License agreement for matplotlib and later and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. matplotlib
Copyright (c) 2012- Matplotlib Development Team; All Rights Reserved
Terms of the License agreement for matplotlib versions 1.3.0 and later:
--------------------------------------------------------------------
License agreement for matplotlib versions 1.3.0 and later
=========================================================
1. This LICENSE AGREEMENT is between the Matplotlib Development Team
("MDT"), and the Individual or Organization ("Licensee") accessing and
otherwise using matplotlib software in source or binary form and its
associated documentation.
2. Subject to the terms and conditions of this License Agreement, MDT
hereby grants Licensee a nonexclusive, royalty-free, world-wide license
to reproduce, analyze, test, perform and/or display publicly, prepare
derivative works, distribute, and otherwise use matplotlib
alone or in any derivative version, provided, however, that MDT's
License Agreement and MDT's notice of copyright, i.e., "Copyright (c)
2012- Matplotlib Development Team; All Rights Reserved" are retained in
matplotlib alone or in any derivative version prepared by
Licensee.
3. In the event Licensee prepares a derivative work that is based on or
incorporates matplotlib or any part thereof, and wants to
make the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to matplotlib .
4. MDT is making matplotlib available to Licensee on an "AS
IS" basis. MDT MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, MDT MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB
WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
5. MDT SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR
LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING
MATPLOTLIB , OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF
THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between MDT and
Licensee. This License Agreement does not grant permission to use MDT
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.
8. By copying, installing or otherwise using matplotlib ,
Licensee agrees to be bound by the terms and conditions of this License
Agreement.
License agreement for matplotlib versions prior to 1.3.0
========================================================
1. This LICENSE AGREEMENT is between John D. Hunter ("JDH"), and the
Individual or Organization ("Licensee") accessing and otherwise using
matplotlib software in source or binary form and its associated
documentation.
2. Subject to the terms and conditions of this License Agreement, JDH
hereby grants Licensee a nonexclusive, royalty-free, world-wide license
to reproduce, analyze, test, perform and/or display publicly, prepare
derivative works, distribute, and otherwise use matplotlib
alone or in any derivative version, provided, however, that JDH's
License Agreement and JDH's notice of copyright, i.e., "Copyright (c)
2002-2011 John D. Hunter; All Rights Reserved" are retained in
matplotlib alone or in any derivative version prepared by
Licensee.
3. In the event Licensee prepares a derivative work that is based on or
incorporates matplotlib or any part thereof, and wants to
make the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to matplotlib.
4. JDH is making matplotlib available to Licensee on an "AS
IS" basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB
WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR
LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING
MATPLOTLIB , OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF
THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between JDH and
Licensee. This License Agreement does not grant permission to use JDH
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.
8. By copying, installing or otherwise using matplotlib,
Licensee agrees to be bound by the terms and conditions of this License
Agreement.
For the license of other third party components, please refer to the following URL:
https://github.com/matplotlib/matplotlib/blob/v3.7.5/LICENSE
<!-- ## **HunyuanDiT** -->
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/logo.png" height=100>
</p>
# Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding
<div align="center">
<a href="https://github.com/Tencent/HunyuanDiT"><img src="https://img.shields.io/static/v1?label=Hunyuan-DiT Code&message=Github&color=blue&logo=github-pages"></a> &ensp;
<a href="https://dit.hunyuan.tencent.com"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a> &ensp;
<a href="https://arxiv.org/abs/2405.08748"><img src="https://img.shields.io/static/v1?label=Tech Report&message=Arxiv:HunYuan-DiT&color=red&logo=arxiv"></a> &ensp;
<a href="https://arxiv.org/abs/2403.08857"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv:DialogGen&color=red&logo=arxiv"></a> &ensp;
<a href="https://huggingface.co/Tencent-Hunyuan/HunyuanDiT"><img src="https://img.shields.io/static/v1?label=Hunyuan-DiT&message=HuggingFace&color=yellow"></a> &ensp;
<a href="https://hunyuan.tencent.com/bot/chat"><img src="https://img.shields.io/static/v1?label=Hunyuan Bot&message=Web&color=green"></a> &ensp;
<a href="https://huggingface.co/spaces/Tencent-Hunyuan/HunyuanDiT"><img src="https://img.shields.io/static/v1?label=Hunyuan-DiT Demo&message=HuggingFace&color=yellow"></a> &ensp;
<a href="./comfyui"><img src="https://img.shields.io/static/v1?label=ComfyUI Support&message=ComfyUI&color=purple&logo=github-pages"></a> &ensp;
</div>
-----
This repo contains PyTorch model definitions, pre-trained weights and inference/sampling code for our paper exploring Hunyuan-DiT. You can find more visualizations on our [project page](https://dit.hunyuan.tencent.com/).
> [**Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding**](https://arxiv.org/abs/2405.08748) <br>
> [**DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation**](https://arxiv.org/abs/2403.08857) <br>
## 🔥🔥🔥 News!!
* Dec 17, 2024: :tada: Optimized LoRA training with `refined grad checkpoint` and `low-bit optimizer`. Just add `--lowbit-opt` to get started.
* Sep 13, 2024: 🎉 IPAdapter is officially supported by HunYuanDiT; see the documentation in [./ipadapter](./ipadapter). Scaled attention is now used in place of flash attention on V100 GPUs.
* Aug 26, 2024: 🎉 HunYuanDiT ControlNet and LoRA are officially supported by ComfyUI; see the documentation in [./comfyui](./comfyui).
* Jul 15, 2024: 🚀 HunYuanDiT and Shakker.Ai have jointly launched a fine-tuning event based on the HunYuanDiT 1.2 model. By publishing a LoRA or fine-tuned model based on HunYuanDiT, you can earn a bonus of up to $230 from Shakker.Ai. See [Shakker.Ai](https://www.shakker.ai/activitys/shaker-the-world-hunyuan) for more details.
* Jul 15, 2024: :tada: Updated ComfyUI to support standardized workflows and compatibility with weights from the t2i module and LoRA training for versions 1.1/1.2, as well as weights trained by Kohya or the official script.
* Jul 15, 2024: :zap: We offer Docker environments for CUDA 11/12, allowing you to bypass complex installations and play with a single click! See [dockers](#installation-guide-for-linux) for details.
* Jul 08, 2024: :tada: HYDiT-v1.2 version is released. Please check [HunyuanDiT-v1.2](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2) and [Distillation-v1.2](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.2) for more details.
* Jul 03, 2024: :tada: The Kohya-hydit version is now available for v1.1 and v1.2 models, with a GUI for inference. The official Kohya version is under review. See [kohya](./kohya_ss-hydit) for details.
* Jun 27, 2024: :art: Hunyuan-Captioner is released, providing fine-grained captions for training data. See [mllm](./mllm) for details.
* Jun 27, 2024: :tada: LoRA and ControlNet are now supported in diffusers. See [diffusers](./diffusers) for details.
* Jun 27, 2024: :tada: 6GB GPU VRAM Inference scripts are released. See [lite](./lite) for details.
* Jun 19, 2024: :tada: ControlNet is released, supporting canny, pose and depth control. See [training/inference codes](#controlnet) for details.
* Jun 13, 2024: :zap: HYDiT-v1.1 version is released, which mitigates the issue of image oversaturation and alleviates the watermark issue. Please check [HunyuanDiT-v1.1](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1) and
[Distillation-v1.1](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.1) for more details.
* Jun 13, 2024: :truck: The training code is released, offering [full-parameter training](#full-parameter-training) and [LoRA training](#lora).
* Jun 06, 2024: :tada: Hunyuan-DiT is now available in ComfyUI. Please check [ComfyUI](#using-comfyui) for more details.
* Jun 06, 2024: 🚀 We introduce Distillation version for Hunyuan-DiT acceleration, which achieves **50%** acceleration on NVIDIA GPUs. Please check [Distillation](https://huggingface.co/Tencent-Hunyuan/Distillation) for more details.
* Jun 05, 2024: 🤗 Hunyuan-DiT is now available in 🤗 Diffusers! Please check the [example](#using--diffusers) below.
* Jun 04, 2024: :globe_with_meridians: Support Tencent Cloud links to download the pretrained models! Please check the [links](#-download-pretrained-models) below.
* May 22, 2024: 🚀 We introduce TensorRT version for Hunyuan-DiT acceleration, which achieves **47%** acceleration on NVIDIA GPUs. Please check [TensorRT-libs](https://huggingface.co/Tencent-Hunyuan/TensorRT-libs) for instructions.
* May 22, 2024: 💬 We support demo running multi-turn text2image generation now. Please check the [script](#using-gradio) below.
## 🤖 Try it on the web
Welcome to our web-based [**Tencent Hunyuan Bot**](https://hunyuan.tencent.com/bot/chat), where you can explore our innovative products! Just input the suggested prompts below or any other **imaginative prompts containing drawing-related keywords** to activate the Hunyuan text-to-image generation feature. Unleash your creativity and create any picture you desire, **all for free!**
You can use simple prompts similar to natural language text
> 画一只穿着西装的猪
>
> draw a pig in a suit
>
> 生成一幅画,赛博朋克风,跑车
>
> generate a painting, cyberpunk style, sports car
or multi-turn language interactions to create the picture.
> 画一个木制的鸟
>
> draw a wooden bird
>
> 变成玻璃的
>
> turn into glass
## 🤗 Community Contribution Leaderboard
1. By [@TTPlanetPig](https://github.com/TTPlanetPig)
- HunyuanDIT_v1.2 ControlNet models
- Inpaint controlnet: https://huggingface.co/TTPlanet/HunyuanDiT_Controlnet_inpainting
- Tile controlnet: https://huggingface.co/TTPlanet/HunyuanDiT_Controlnet_tile
- Lineart controlnet: https://huggingface.co/TTPlanet/HunyuanDiT_Controlnet_lineart
- HunyuanDIT_v1.2 ComfyUI nodes
- Comfyui_TTP_CN_Preprocessor: https://github.com/TTPlanetPig/Comfyui_TTP_CN_Preprocessor
- Comfyui_TTP_Toolset: https://github.com/TTPlanetPig/Comfyui_TTP_Toolset
2. By [@sdbds](https://github.com/sdbds) (bilibili up [青龙圣者](https://space.bilibili.com/219296))
- Kohya_ss-hydit train tools: https://github.com/zml-ai/HunyuanDIT-PRE/tree/main/kohya_ss-hydit
3. By [@CrazyBoyM](https://github.com/CrazyBoyM) (bilibili up [飞鸟白菜](https://space.bilibili.com/291593914))
- ComfyUI support for HunyuanDIT_v1.2 Controlnet: https://github.com/comfyanonymous/ComfyUI/pull/4245
4. By [@L_A_X](https://huggingface.co/Laxhar/Freeway_Animation_HunYuan_Demo)
- HunyuanDIT_v1.2 base model for anime
- Original hf: https://huggingface.co/Laxhar/Freeway_Animation_HunYuan_Demo
- Converted ComfyUI model: https://huggingface.co/comfyanonymous/Freeway_Animation_Hunyuan_Demo_ComfyUI_Converted
## 📑 Open-source Plan
- Hunyuan-DiT (Text-to-Image Model)
- [x] Inference
- [x] Checkpoints
- [x] Distillation Version
- [x] TensorRT Version
- [x] Training
- [x] Lora
- [x] Controlnet (Pose, Canny, Depth)
- [x] 6GB GPU VRAM Inference
- [x] IP-adapter
- [ ] Hunyuan-DiT-S checkpoints (0.7B model)
- Mllm
- Hunyuan-Captioner (Re-caption the raw image-text pairs)
- [x] Inference
- [Hunyuan-DialogGen](https://github.com/Centaurusalpha/DialogGen) (Prompt Enhancement Model)
- [x] Inference
- [X] Web Demo (Gradio)
- [x] Multi-turn T2I Demo (Gradio)
- [X] Cli Demo
- [X] ComfyUI
- [X] Diffusers
- [X] Kohya
- [ ] WebUI
## Contents
- [Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding](#hunyuan-dit--a-powerful-multi-resolution-diffusion-transformer-with-fine-grained-chinese-understanding)
- [🔥🔥🔥 News!!](#-news)
- [🤖 Try it on the web](#-try-it-on-the-web)
- [🤗 Community Contribution Leaderboard](#-community-contribution-leaderboard)
- [📑 Open-source Plan](#-open-source-plan)
- [Contents](#contents)
- [Abstract](#abstract)
- [🎉 Hunyuan-DiT Key Features](#-hunyuan-dit-key-features)
- [Chinese-English Bilingual DiT Architecture](#chinese-english-bilingual-dit-architecture)
- [Multi-turn Text2Image Generation](#multi-turn-text2image-generation)
- [📈 Comparisons](#-comparisons)
- [🎥 Visualization](#-visualization)
- [📜 Requirements](#-requirements)
- [🛠️ Dependencies and Installation](#️-dependencies-and-installation)
- [Installation Guide for Linux](#installation-guide-for-linux)
- [🧱 Download Pretrained Models](#-download-pretrained-models)
- [1. Using HF-Mirror](#1-using-hf-mirror)
- [2. Resume Download](#2-resume-download)
- [:truck: Training](#truck-training)
- [Data Preparation](#data-preparation)
- [Full-parameter Training](#full-parameter-training)
- [LoRA](#lora)
- [🔑 Inference](#-inference)
- [6GB GPU VRAM Inference](#6gb-gpu-vram-inference)
- [Using Gradio](#using-gradio)
- [Using 🤗 Diffusers](#using--diffusers)
- [Using Command Line](#using-command-line)
- [More Configurations](#more-configurations)
- [Using ComfyUI](#using-comfyui)
- [Using Kohya](#using-kohya)
- [Using Previous versions](#using-previous-versions)
- [:building\_construction: Adapter](#building_construction-adapter)
- [ControlNet](#controlnet)
- [IP-Adapter](#IP-Adapter)
- [:art: Hunyuan-Captioner](#art-hunyuan-captioner)
- [Examples](#examples)
- [Instructions](#instructions)
- [Inference](#inference)
- [Gradio](#gradio)
- [🚀 Acceleration (for Linux)](#-acceleration-for-linux)
- [🔗 BibTeX](#-bibtex)
- [Start History](#start-history)
## **Abstract**
We present Hunyuan-DiT, a text-to-image diffusion transformer with fine-grained understanding of both English and Chinese. To construct Hunyuan-DiT, we carefully designed the transformer structure, text encoder, and positional encoding. We also build from scratch a whole data pipeline to update and evaluate data for iterative model optimization. For fine-grained language understanding, we train a Multimodal Large Language Model to refine the captions of the images. Finally, Hunyuan-DiT can perform multi-round multi-modal dialogue with users, generating and refining images according to the context.
Through our carefully designed holistic human evaluation protocol with more than 50 professional human evaluators, Hunyuan-DiT sets a new state-of-the-art in Chinese-to-image generation compared with other open-source models.
## 🎉 **Hunyuan-DiT Key Features**
### **Chinese-English Bilingual DiT Architecture**
Hunyuan-DiT is a diffusion model in the latent space, as depicted in the figure below. Following the Latent Diffusion Model, we use a pre-trained Variational Autoencoder (VAE) to compress images into a low-dimensional latent space and train a diffusion model to learn the data distribution there. Our diffusion model is parameterized with a transformer. To encode the text prompts, we leverage a combination of a pre-trained bilingual (English and Chinese) CLIP encoder and a multilingual T5 encoder.
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/framework.png" height=450>
</p>
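For intuition, here is a rough, illustrative sketch (not the repository's actual code) of how two text-encoder outputs of different widths can be projected to a common width and concatenated along the sequence dimension before conditioning the transformer; all dimensions below are placeholders:
```py
import torch
import torch.nn as nn

d_model = 1024                            # hypothetical DiT hidden width
clip_states = torch.randn(1, 77, 768)     # stand-in for bilingual CLIP text features
t5_states = torch.randn(1, 256, 2048)     # stand-in for multilingual T5 text features

proj_clip = nn.Linear(768, d_model)       # project each stream to the shared width
proj_t5 = nn.Linear(2048, d_model)

# Concatenate along the sequence dimension to form the text conditioning context.
text_context = torch.cat([proj_clip(clip_states), proj_t5(t5_states)], dim=1)
print(text_context.shape)                 # torch.Size([1, 333, 1024])
```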
### Multi-turn Text2Image Generation
Understanding natural language instructions and performing multi-turn interactions with users are important for a
text-to-image system. They help build a dynamic and iterative creation process that brings the user's idea into reality
step by step. In this section, we detail how we empower Hunyuan-DiT with the ability to perform multi-round
conversations and image generation: we train an MLLM to understand the multi-round user dialogue
and output a new text prompt for image generation.
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/mllm.png" height=300>
</p>
## 📈 Comparisons
To comprehensively compare the generation capabilities of HunyuanDiT and other models, we constructed a 4-dimensional test set covering Text-Image Consistency, Excluding AI Artifacts, Subject Clarity, and Aesthetics. More than 50 professional evaluators performed the evaluation.
<p align="center">
<table>
<thead>
<tr>
<th rowspan="2">Model</th> <th rowspan="2">Open Source</th> <th>Text-Image Consistency (%)</th> <th>Excluding AI Artifacts (%)</th> <th>Subject Clarity (%)</th> <th rowspan="2">Aesthetics (%)</th> <th rowspan="2">Overall (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td>SDXL</td> <td>&#10004;</td> <td>64.3</td> <td>60.6</td> <td>91.1</td> <td>76.3</td> <td>42.7</td>
</tr>
<tr>
<td>PixArt-α</td> <td>&#10004;</td> <td>68.3</td> <td>60.9</td> <td>93.2</td> <td>77.5</td> <td>45.5</td>
</tr>
<tr>
<td>Playground 2.5</td> <td>&#10004;</td> <td>71.9</td> <td>70.8</td> <td>94.9</td> <td>83.3</td> <td>54.3</td>
</tr>
<tr>
<td>SD 3</td> <td>&#10008;</td> <td>77.1</td> <td>69.3</td> <td>94.6</td> <td>82.5</td> <td>56.7</td>
</tr>
<tr>
<td>MidJourney v6</td><td>&#10008;</td> <td>73.5</td> <td>80.2</td> <td>93.5</td> <td>87.2</td> <td>63.3</td>
</tr>
<tr>
<td>DALL-E 3</td><td>&#10008;</td> <td>83.9</td> <td>80.3</td> <td>96.5</td> <td>89.4</td> <td>71.0</td>
</tr>
<tr style="font-weight: bold; background-color: #f2f2f2;">
<td>Hunyuan-DiT</td><td>&#10004;</td> <td>74.2</td> <td>74.3</td> <td>95.4</td> <td>86.6</td> <td>59.0</td>
</tr>
</tbody>
</table>
</p>
## 🎥 Visualization
* **Chinese Elements**
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/chinese elements understanding.png" height=220>
</p>
* **Long Text Input**
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/long text understanding.png" height=310>
</p>
* **Multi-turn Text2Image Generation**
https://github.com/Tencent/tencent.github.io/assets/27557933/94b4dcc3-104d-44e1-8bb2-dc55108763d1
---
## 📜 Requirements
This repo consists of DialogGen (a prompt enhancement model) and Hunyuan-DiT (a text-to-image model).
The following table shows the requirements for running the models (batch size = 1):
| Model | --load-4bit (DialogGen) | GPU Peak Memory | GPU |
|:-----------------------:|:-----------------------:|:---------------:|:---------------:|
| DialogGen + Hunyuan-DiT | ✘ | 32G | A100 |
| DialogGen + Hunyuan-DiT | ✔ | 22G | A100 |
| Hunyuan-DiT | - | 11G | A100 |
| Hunyuan-DiT | - | 14G | RTX3090/RTX4090 |
* An NVIDIA GPU with CUDA support is required.
* We have tested V100 and A100 GPUs.
* **Minimum**: The minimum GPU memory required is 11GB.
* **Recommended**: We recommend using a GPU with 32GB of memory for better generation quality.
* Tested operating system: Linux
## 🛠️ Dependencies and Installation
Begin by cloning the repository:
```shell
git clone https://github.com/tencent/HunyuanDiT
cd HunyuanDiT
```
### Installation Guide for Linux
We provide an `environment.yml` file for setting up a Conda environment.
Conda's installation instructions are available [here](https://docs.anaconda.com/free/miniconda/index.html).
We recommend CUDA versions 11.7 and 12.0+.
```shell
# 1. Prepare conda environment
conda env create -f environment.yml
# 2. Activate the environment
conda activate HunyuanDiT
# 3. Install pip dependencies
python -m pip install -r requirements.txt
# 4. Install flash attention v2 for acceleration (requires CUDA 11.6 or above)
python -m pip install git+https://github.com/Dao-AILab/flash-attention.git@v2.1.2.post3
```
Alternatively, you can use Docker to set up the environment.
```shell
# 1. Use the following link to download the docker image tar file.
# For CUDA 12
wget https://dit.hunyuan.tencent.com/download/HunyuanDiT/hunyuan_dit_cu12.tar
# For CUDA 11
wget https://dit.hunyuan.tencent.com/download/HunyuanDiT/hunyuan_dit_cu11.tar
# 2. Import the docker tar file and show the image meta information
# For CUDA 12
docker load -i hunyuan_dit_cu12.tar
# For CUDA 11
docker load -i hunyuan_dit_cu11.tar
docker image ls
# 3. Run the container based on the image
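# Note: replace `docker_image_tag` in the command below with the image tag shown by `docker image ls`.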
docker run -dit --gpus all --init --net=host --uts=host --ipc=host --name hunyuandit --security-opt=seccomp=unconfined --ulimit=stack=67108864 --ulimit=memlock=-1 --privileged docker_image_tag
```
## 🧱 Download Pretrained Models
To download the model, first install the huggingface-cli. (Detailed instructions are available [here](https://huggingface.co/docs/huggingface_hub/guides/cli).)
```shell
python -m pip install "huggingface_hub[cli]"
```
Then download the model using the following commands:
```shell
# Create a directory named 'ckpts' where the model will be saved, fulfilling the prerequisites for running the demo.
mkdir ckpts
# Use the huggingface-cli tool to download the model.
# The download time may vary from 10 minutes to 1 hour depending on network conditions.
huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.2 --local-dir ./ckpts
```
<details>
<summary>💡Tips for using huggingface-cli (network problem)</summary>
##### 1. Using HF-Mirror
If you encounter slow download speeds in China, you can try a mirror to speed up the download process. For example,
```shell
HF_ENDPOINT=https://hf-mirror.com huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.2 --local-dir ./ckpts
```
##### 2. Resume Download
`huggingface-cli` supports resuming downloads. If the download is interrupted, you can just rerun the download
command to resume the download process.
Note: If an error like `No such file or directory: 'ckpts/.huggingface/.gitignore.lock'` occurs during the download
process, you can ignore it and rerun the download command.
</details>
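If you prefer Python over the CLI, the same checkpoints can also be fetched with the `huggingface_hub` API (a minimal sketch mirroring the command above):
```py
from huggingface_hub import snapshot_download

# Equivalent to `huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.2 --local-dir ./ckpts`.
snapshot_download(repo_id="Tencent-Hunyuan/HunyuanDiT-v1.2", local_dir="./ckpts")
```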
---
All models will be automatically downloaded. For more information about the model, visit the Hugging Face repository [here](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT).
| Model | #Params | Huggingface Download URL | Tencent Cloud Download URL |
|:-----------------:|:-------:|:------------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:|
| mT5 | 1.6B | [mT5](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/mt5) | [mT5](https://dit.hunyuan.tencent.com/download/HunyuanDiT/mt5.zip) |
| CLIP | 350M | [CLIP](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/clip_text_encoder) | [CLIP](https://dit.hunyuan.tencent.com/download/HunyuanDiT/clip_text_encoder.zip) |
| Tokenizer | - | [Tokenizer](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/tokenizer) | [Tokenizer](https://dit.hunyuan.tencent.com/download/HunyuanDiT/tokenizer.zip) |
| DialogGen | 7.0B | [DialogGen](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/dialoggen) | [DialogGen](https://dit.hunyuan.tencent.com/download/HunyuanDiT/dialoggen.zip) |
| sdxl-vae-fp16-fix | 83M | [sdxl-vae-fp16-fix](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/sdxl-vae-fp16-fix) | [sdxl-vae-fp16-fix](https://dit.hunyuan.tencent.com/download/HunyuanDiT/sdxl-vae-fp16-fix.zip) |
| Hunyuan-DiT-v1.0 | 1.5B | [Hunyuan-DiT](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/model) | [Hunyuan-DiT-v1.0](https://dit.hunyuan.tencent.com/download/HunyuanDiT/model.zip) |
| Hunyuan-DiT-v1.1 | 1.5B | [Hunyuan-DiT-v1.1](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1/tree/main/t2i/model) | [Hunyuan-DiT-v1.1](https://dit.hunyuan.tencent.com/download/HunyuanDiT/model-v1_1.zip) |
| Hunyuan-DiT-v1.2 | 1.5B | [Hunyuan-DiT-v1.2](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2/tree/main/t2i/model) | [Hunyuan-DiT-v1.2](https://dit.hunyuan.tencent.com/download/HunyuanDiT/model-v1_2.zip) |
| Data demo | - | - | [Data demo](https://dit.hunyuan.tencent.com/download/HunyuanDiT/data_demo.zip) |
## :truck: Training
### Data Preparation
Refer to the commands below to prepare the training data.
1. Install dependencies
We offer an efficient data management library named IndexKits, which supports reading hundreds of millions of data samples during training; see more in [docs](./IndexKits/README.md).
```shell
# 1 Install dependencies
cd HunyuanDiT
pip install -e ./IndexKits
```
2. Data download
Feel free to download the [data demo](https://dit.hunyuan.tencent.com/download/HunyuanDiT/data_demo.zip).
```shell
# 2 Data download
wget -O ./dataset/data_demo.zip https://dit.hunyuan.tencent.com/download/HunyuanDiT/data_demo.zip
unzip ./dataset/data_demo.zip -d ./dataset
mkdir ./dataset/porcelain/arrows ./dataset/porcelain/jsons
```
3. Data conversion
Create a CSV file for training data with the fields listed in the table below.
| Fields | Required | Description | Example |
|:---------------:| :------: |:----------------:|:-----------:|
| `image_path` | Required | image path | `./dataset/porcelain/images/0.png` |
| `text_zh` | Required | text | 青花瓷风格,一只蓝色的鸟儿站在蓝色的花瓶上,周围点缀着白色花朵,背景是白色 |
| `md5` | Optional | image md5 (Message Digest Algorithm 5) | `d41d8cd98f00b204e9800998ecf8427e` |
| `width` | Optional | image width | `1024` |
| `height` | Optional | image height | `1024` |
> ⚠️ Optional fields like MD5, width, and height can be omitted. If omitted, the script below will automatically calculate them. This process can be time-consuming when dealing with large-scale training data.
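For illustration, a minimal CSV containing only the two required fields could be written as follows (a sketch; the image path and caption are placeholders based on the demo data layout above):
```py
import csv
import os

os.makedirs("./dataset/porcelain/csvfile", exist_ok=True)

rows = [
    {"image_path": "./dataset/porcelain/images/0.png",
     "text_zh": "青花瓷风格,一只蓝色的鸟儿站在蓝色的花瓶上,周围点缀着白色花朵,背景是白色"},
]

with open("./dataset/porcelain/csvfile/image_text.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["image_path", "text_zh"])
    writer.writeheader()
    writer.writerows(rows)
```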
We use [Arrow](https://github.com/apache/arrow) as the training data format, which offers a standard and efficient in-memory data representation. A conversion script is provided to transform CSV files into Arrow format.
```shell
# 3 Data conversion
python ./hydit/data_loader/csv2arrow.py ./dataset/porcelain/csvfile/image_text.csv ./dataset/porcelain/arrows 1
```
4. Data Selection and Configuration File Creation
We configure the training data through YAML files. In these files, you can set up standard data processing strategies for the training data, such as filtering, copying, and deduplication. For more details, see [./IndexKits](IndexKits/docs/MakeDataset.md).
For a sample file, please refer to [file](./dataset/yamls/porcelain.yaml). For a full parameter configuration file, see [file](./IndexKits/docs/MakeDataset.md).
5. Create the training data index file using the YAML file.
```shell
# Single Resolution Data Preparation
idk base -c dataset/yamls/porcelain.yaml -t dataset/porcelain/jsons/porcelain.json
# Multi Resolution Data Preparation
idk multireso -c dataset/yamls/porcelain_mt.yaml -t dataset/porcelain/jsons/porcelain_mt.json
```
The directory structure of the `porcelain` dataset is:
```shell
cd ./dataset
porcelain
├──images/ (image files)
│ ├──0.png
│ ├──1.png
│ ├──......
├──csvfile/ (csv files containing text-image pairs)
│ ├──image_text.csv
├──arrows/ (arrow files containing all necessary training data)
│ ├──00000.arrow
│ ├──00001.arrow
│ ├──......
├──jsons/ (final training data index files which read data from arrow files during training)
│ ├──porcelain.json
│ ├──porcelain_mt.json
```
### Full-parameter Training
**Requirement:**
1. The minimum requirement is a single GPU with at least 20GB of memory, but we recommend using a GPU with about 30GB of memory to avoid host-memory offloading.
2. Additionally, we encourage users to leverage multiple GPUs across different nodes to speed up training on large datasets.
**Notice:**
1. Personal users can also use the lightweight Kohya to fine-tune the model with about 16GB of memory. We are currently working to further reduce the memory usage of our industry-level framework for personal users.
2. If you have enough GPU memory, try removing `--cpu-offloading` or `--gradient-checkpointing` to reduce training time.
For distributed training, you can control **single-node** / **multi-node** training by adjusting parameters such as `--hostfile` and `--master_addr`. For more details, see [link](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node).
```shell
# Single Resolution Training
PYTHONPATH=./ sh hydit/train.sh --index-file dataset/porcelain/jsons/porcelain.json
# Multi Resolution Training
PYTHONPATH=./ sh hydit/train.sh --index-file dataset/porcelain/jsons/porcelain_mt.json --multireso --reso-step 64
# Training with old version of HunyuanDiT (<= v1.1)
PYTHONPATH=./ sh hydit/train_v1.1.sh --index-file dataset/porcelain/jsons/porcelain.json
```
After checkpoints are saved, you can use the following command to evaluate the model.
```shell
# Inference
# You should replace the 'log_EXP/xxx/checkpoints/final.pt' with your actual path.
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只可爱的哈士奇" --no-enhance --dit-weight log_EXP/xxx/checkpoints/final.pt --load-key module
# Old version of HunyuanDiT (<= v1.1)
# You should replace the 'log_EXP/xxx/checkpoints/final.pt' with your actual path.
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只可爱的哈士奇" --model-root ./HunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03 --no-enhance --dit-weight log_EXP/xxx/checkpoints/final.pt --load-key module
```
### LoRA
We provide training and inference scripts for LoRA, detailed in the [./lora](./lora/README.md).
```shell
# Training for porcelain LoRA.
PYTHONPATH=./ sh lora/train_lora_with_fa.sh --index-file dataset/porcelain/jsons/porcelain.json
# Inference using trained LORA weights.
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只小狗" --no-enhance --lora-ckpt log_EXP/001-lora_porcelain_ema_rank64/checkpoints/0001000.pt
```
If you cannot install flash_attn, use the following commands instead:
```shell
# Training for porcelain LoRA.
PYTHONPATH=./ sh lora/train_lora.sh --index-file dataset/porcelain/jsons/porcelain.json
# Inference using trained LORA weights.
python sample_t2i.py --infer-mode torch --prompt "青花瓷风格,一只小狗" --no-enhance --lora-ckpt log_EXP/001-lora_porcelain_ema_rank64/checkpoints/0001000.pt
```
We offer two sets of trained LoRA weights, `porcelain` and `jade`; see details at [links](https://huggingface.co/Tencent-Hunyuan/HYDiT-LoRA).
```shell
cd HunyuanDiT
# Use the huggingface-cli tool to download the model.
huggingface-cli download Tencent-Hunyuan/HYDiT-LoRA --local-dir ./ckpts/t2i/lora
# Quick start
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只猫在追蝴蝶" --no-enhance --load-key ema --lora-ckpt ./ckpts/t2i/lora/porcelain
```
<table>
<tr>
<td colspan="4" align="center">Examples of training data</td>
</tr>
<tr>
<td align="center"><img src="lora/asset/porcelain/train/0.png" alt="Image 0" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/train/1.png" alt="Image 1" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/train/2.png" alt="Image 2" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/train/3.png" alt="Image 3" width="200"/></td>
</tr>
<tr>
<td align="center">青花瓷风格,一只蓝色的鸟儿站在蓝色的花瓶上,周围点缀着白色花朵,背景是白色 (Porcelain style, a blue bird stands on a blue vase, surrounded by white flowers, with a white background.
)</td>
<td align="center">青花瓷风格,这是一幅蓝白相间的陶瓷盘子,上面描绘着一只狐狸和它的幼崽在森林中漫步,背景是白色 (Porcelain style, this is a blue and white ceramic plate depicting a fox and its cubs strolling in the forest, with a white background.)</td>
<td align="center">青花瓷风格,在黑色背景上,一只蓝色的狼站在蓝白相间的盘子上,周围是树木和月亮 (Porcelain style, on a black background, a blue wolf stands on a blue and white plate, surrounded by trees and the moon.)</td>
<td align="center">青花瓷风格,在蓝色背景上,一只蓝色蝴蝶和白色花朵被放置在中央 (Porcelain style, on a blue background, a blue butterfly and white flowers are placed in the center.)</td>
</tr>
<tr>
<td colspan="4" align="center">Examples of inference results</td>
</tr>
<tr>
<td align="center"><img src="lora/asset/porcelain/inference/0.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/inference/1.png" alt="Image 5" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/inference/2.png" alt="Image 6" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/inference/3.png" alt="Image 7" width="200"/></td>
</tr>
<tr>
<td align="center">青花瓷风格,苏州园林 (Porcelain style, Suzhou Gardens.)</td>
<td align="center">青花瓷风格,一朵荷花 (Porcelain style, a lotus flower.)</td>
<td align="center">青花瓷风格,一只羊(Porcelain style, a sheep.)</td>
<td align="center">青花瓷风格,一个女孩在雨中跳舞(Porcelain style, a girl dancing in the rain.)</td>
</tr>
</table>
## 🔑 Inference
### 6GB GPU VRAM Inference
Running HunyuanDiT with less than 6GB of GPU VRAM is now possible, based on [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuandit). Here we provide instructions and a demo for a quick start.
> The 6GB version supports Nvidia Ampere architecture series graphics cards such as RTX 3070/3080/4080/4090, A100, and so on.
The only thing you need to do is install the following libraries:
```bash
pip install -U bitsandbytes
pip install git+https://github.com/huggingface/diffusers
pip install torch==2.0.0
```
Then you can enjoy your HunyuanDiT text-to-image journey under 6GB GPU VRAM directly!
Here is a demo for you.
```bash
cd HunyuanDiT
# Quick start
model_id=Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled
prompt=一个宇航员在骑马
infer_steps=50
guidance_scale=6
python3 lite/inference.py ${model_id} ${prompt} ${infer_steps} ${guidance_scale}
```
More details can be found in [./lite](lite/README.md).
### Using Gradio
Make sure the conda environment is activated before running the following command.
```shell
# By default, we start a Chinese UI. Using Flash Attention for acceleration.
python app/hydit_app.py --infer-mode fa
# Using special port and host
python app/hydit_app.py --infer-mode fa --server_name 0.0.0.0 --server_port 443 --load-key distill
# You can disable the enhancement model if the GPU memory is insufficient.
# The enhancement will be unavailable until you restart the app without the `--no-enhance` flag.
python app/hydit_app.py --no-enhance --infer-mode fa
# Start with English UI
python app/hydit_app.py --lang en --infer-mode fa
# Start a multi-turn T2I generation UI.
# If your GPU memory is less than 32GB, use '--load-4bit' to enable 4-bit quantization, which requires at least 22GB of memory.
python app/multiTurnT2I_app.py --infer-mode fa
```
Then the demo can be accessed at http://0.0.0.0:443. Note that 0.0.0.0 here should be replaced with X.X.X.X, the IP address of your server.
### Using 🤗 Diffusers
Please install PyTorch version 2.0 or higher in advance to satisfy the requirements of the specified version of the diffusers library.
Install 🤗 diffusers, ensuring that the version is at least 0.28.1:
```shell
pip install git+https://github.com/huggingface/diffusers.git
```
or
```shell
pip install diffusers
```
You can generate images with both Chinese and English prompts using the following Python script:
```py
import torch
from diffusers import HunyuanDiTPipeline
pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers", torch_dtype=torch.float16)
pipe.to("cuda")
# You may also use English prompt as HunyuanDiT supports both English and Chinese
# prompt = "An astronaut riding a horse"
prompt = "一个宇航员在骑马"
image = pipe(prompt).images[0]
```
You can use our distilled model to generate images even faster:
```py
import torch
from diffusers import HunyuanDiTPipeline
pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled", torch_dtype=torch.float16)
pipe.to("cuda")
# You may also use English prompt as HunyuanDiT supports both English and Chinese
# prompt = "An astronaut riding a horse"
prompt = "一个宇航员在骑马"
image = pipe(prompt, num_inference_steps=25).images[0]
```
More details can be found in [HunyuanDiT-v1.2-Diffusers-Distilled](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled)
**More functions:** For other functions like LoRA and ControlNet, please have a look at the README of [./diffusers](diffusers).
### Using Command Line
We provide several commands for a quick start:
```shell
# Only Text-to-Image. Flash Attention mode
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --no-enhance
# Generate an image with other image sizes.
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --image-size 1280 768
# Prompt Enhancement + Text-to-Image. DialogGen loads with 4-bit quantization, but it may lose some performance.
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --load-4bit
```
More example prompts can be found in [example_prompts.txt](example_prompts.txt)
### More Configurations
We list some more useful configurations for easy usage:
| Argument | Default | Description |
|:---------------:|:---------:|:---------------------------------------------------:|
| `--prompt` | None | The text prompt for image generation |
| `--image-size` | 1024 1024 | The size of the generated image |
| `--seed` | 42 | The random seed for generating images |
| `--infer-steps` | 100 | The number of steps for sampling |
| `--negative` | - | The negative prompt for image generation |
| `--infer-mode` | torch | The inference mode (torch, fa, or trt) |
| `--sampler` | ddpm | The diffusion sampler (ddpm, ddim, or dpmms) |
| `--no-enhance` | False | Disable the prompt enhancement model |
| `--model-root` | ckpts | The root directory of the model checkpoints |
| `--load-key` | ema | Load the student model or EMA model (ema or module) |
| `--load-4bit`  | False     | Load the DialogGen model with 4-bit quantization     |
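For illustration, the hedged Python sketch below combines several of these options into one `sample_t2i.py` invocation via `subprocess` (flag names come from the table above; the prompt and values are examples only, not recommended settings):
```py
# A minimal sketch: drive sample_t2i.py with several of the configuration
# options listed above. Flag names follow the table; values are examples.
import subprocess

cmd = [
    "python", "sample_t2i.py",
    "--infer-mode", "fa",           # torch, fa, or trt
    "--prompt", "渔舟唱晚",
    "--image-size", "1280", "768",  # two values, as in the CLI example above
    "--seed", "42",
    "--infer-steps", "50",
    "--sampler", "ddim",            # ddpm, ddim, or dpmms
    "--no-enhance",                 # disable the prompt enhancement model
]
subprocess.run(cmd, check=True)
```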
### Using ComfyUI
- Support two workflows: Standard ComfyUI and Diffusers Wrapper, with the former being recommended.
- Support HunyuanDiT-v1.1 and v1.2.
- Support module, LoRA, and CLIP LoRA models trained by Kohya.
- Support module and LoRA models trained by the HunyuanDiT official training scripts.
- ControlNet support.
More details can be found in [./comfyui](comfyui/README.md)
### Using Kohya
We provide custom code for the kohya_ss GUI and sd-scripts training code for HunyuanDiT.
![dreambooth](kohya_ss-hydit/img/dreambooth.png)
More details can be found in [./kohya_ss-hydit](kohya_ss-hydit/README.md)
### Using Previous versions
* **Hunyuan-DiT <= v1.1**
```shell
# ============================== v1.1 ==============================
# Download the model
huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.1 --local-dir ./HunyuanDiT-v1.1
# Inference with the model
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --model-root ./HunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03
# ============================== v1.0 ==============================
# Download the model
huggingface-cli download Tencent-Hunyuan/HunyuanDiT --local-dir ./HunyuanDiT-v1.0
# Inference with the model
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --model-root ./HunyuanDiT-v1.0 --use-style-cond --size-cond 1024 1024 --beta-end 0.03
```
## :building_construction: Adapter
### ControlNet
We provide training scripts for ControlNet, detailed in the [./controlnet](./controlnet/README.md).
```shell
# Training for canny ControlNet.
PYTHONPATH=./ sh hydit/train_controlnet.sh
```
We offer three types of trained ControlNet weights, for `canny`, `depth`, and `pose`; see details at [this link](https://huggingface.co/Tencent-Hunyuan/HYDiT-ControlNet).
```shell
cd HunyuanDiT
# Use the huggingface-cli tool to download the model.
# We recommend using distilled weights as the base model for ControlNet inference, as our provided pretrained weights are trained on them.
huggingface-cli download Tencent-Hunyuan/HYDiT-ControlNet-v1.2 --local-dir ./ckpts/t2i/controlnet
huggingface-cli download Tencent-Hunyuan/Distillation-v1.2 ./pytorch_model_distill.pt --local-dir ./ckpts/t2i/model
# Quick start
python3 sample_controlnet.py --infer-mode fa --no-enhance --load-key distill --infer-steps 50 --control-type canny --prompt "在夜晚的酒店门前,一座古老的中国风格的狮子雕像矗立着,它的眼睛闪烁着光芒,仿佛在守护着这座建筑。背景是夜晚的酒店前,构图方式是特写,平视,居中构图。这张照片呈现了真实摄影风格,蕴含了中国雕塑文化,同时展现了神秘氛围" --condition-image-path controlnet/asset/input/canny.jpg --control-weight 1.0
```
<table>
<tr>
<td colspan="3" align="center">Condition Input</td>
</tr>
<tr>
<td align="center">Canny ControlNet </td>
<td align="center">Depth ControlNet </td>
<td align="center">Pose ControlNet </td>
</tr>
<tr>
<td align="center">在夜晚的酒店门前,一座古老的中国风格的狮子雕像矗立着,它的眼睛闪烁着光芒,仿佛在守护着这座建筑。背景是夜晚的酒店前,构图方式是特写,平视,居中构图。这张照片呈现了真实摄影风格,蕴含了中国雕塑文化,同时展现了神秘氛围<br>(At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere.) </td>
<td align="center">在茂密的森林中,一只黑白相间的熊猫静静地坐在绿树红花中,周围是山川和海洋。背景是白天的森林,光线充足。照片采用特写、平视和居中构图的方式,呈现出写实的效果<br>(In the dense forest, a black and white panda sits quietly among the green trees and red flowers, surrounded by mountains and oceans. The background is a daytime forest with ample light. The photo uses a close-up, eye-level, and centered composition to create a realistic effect.) </td>
<td align="center">在白天的森林中,一位穿着绿色上衣的亚洲女性站在大象旁边。照片采用了中景、平视和居中构图的方式,呈现出写实的效果。这张照片蕴含了人物摄影文化,并展现了宁静的氛围<br>(In the daytime forest, an Asian woman wearing a green shirt stands beside an elephant. The photo uses a medium shot, eye-level, and centered composition to create a realistic effect. This picture embodies the character photography culture and conveys a serene atmosphere.) </td>
</tr>
<tr>
<td align="center"><img src="controlnet/asset/input/canny.jpg" alt="Image 0" width="200"/></td>
<td align="center"><img src="controlnet/asset/input/depth.jpg" alt="Image 1" width="200"/></td>
<td align="center"><img src="controlnet/asset/input/pose.jpg" alt="Image 2" width="200"/></td>
</tr>
<tr>
<td colspan="3" align="center">ControlNet Output</td>
</tr>
<tr>
<td align="center"><img src="controlnet/asset/output/canny.jpg" alt="Image 3" width="200"/></td>
<td align="center"><img src="controlnet/asset/output/depth.jpg" alt="Image 4" width="200"/></td>
<td align="center"><img src="controlnet/asset/output/pose.jpg" alt="Image 5" width="200"/></td>
</tr>
</table>
### IP-Adapter
We provide training scripts for IP-Adapter, detailed in the [./ipadapter](./ipadapter/README.md).
```shell
# Training for IP-Adapter.
PYTHONPATH=./ sh hydit/train_ipadapter.sh
```
We offer trained IP-Adapter weights; see details at [this link](https://huggingface.co/Tencent-Hunyuan/HYDiT-IP-Adapter).
```shell
cd HunyuanDiT
# Use the huggingface-cli tool to download the model.
# We recommend using module weights as the base model for IP-Adapter inference, as our provided pretrained weights are trained on them.
huggingface-cli download Tencent-Hunyuan/IP-Adapter ipa.pt --local-dir ./ckpts/t2i/model
huggingface-cli download Tencent-Hunyuan/IP-Adapter clip_img_encoder.pt --local-dir ./ckpts/t2i/model/clip_img_encoder
# Quick start
python3 sample_ipadapter.py --infer-mode fa --ref-image-path ipadapter/asset/input/tiger.png --i-scale 1.0 --prompt "一只老虎在海洋中游泳,背景是海洋。构图方式是居中构图,呈现了动漫风格和文化,营造了平静的氛围。" --infer-steps 100 --is-ipa True --load-key distill
```
Examples of ref input and IP-Adapter results are as follows:
<table>
<tr>
<td colspan="3" align="center">Ref Input</td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/input/tiger.png" alt="Image 0" width="200"/></td>
<td align="center"><img src="ipadapter/asset/input/beauty.png" alt="Image 1" width="200"/></td>
<td align="center"><img src="ipadapter/asset/input/xunyicao.png" alt="Image 2" width="200"/></td>
</tr>
<tr>
<td colspan="3" align="center">IP-Adapter Output</td>
</tr>
<tr>
<td align="center">一只老虎在奔跑。<br>(A tiger running.) </td>
<td align="center">一个卡通美女,抱着一只小猪。<br>(A cartoon beauty holding a little pig.) </td>
<td align="center">一片紫色薰衣草地。<br>(A purple lavender field.) </td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/output/tiger_run.png" alt="Image 3" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/beauty_pig.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/xunyicao_res.png" alt="Image 5" width="200"/></td>
</tr>
<tr>
<td align="center">一只老虎在看书。<br>(A tiger is reading a book.) </td>
<td align="center">一个卡通美女,穿着绿色衣服。<br>(A cartoon beauty wearing green clothes.) </td>
<td align="center">一片紫色薰衣草地,有一只可爱的小狗。<br>(A purple lavender field with a cute puppy.) </td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/output/tiger_book.png" alt="Image 3" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/beauty_green_cloth.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/xunyicao_dog.png" alt="Image 5" width="200"/></td>
</tr>
<tr>
<td align="center">一只老虎在咆哮。<br>(A tiger is roaring.) </td>
<td align="center">一个卡通美女,戴着墨镜。<br>(A cartoon beauty wearing sunglasses.) </td>
<td align="center">水墨风格,一片紫色薰衣草地。<br>(Ink style. A purple lavender field.) </td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/output/tiger_roar.png" alt="Image 3" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/beauty_glass.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/xunyicao_style.png" alt="Image 5" width="200"/></td>
</tr>
</table>
## :art: Hunyuan-Captioner
Hunyuan-Captioner meets the needs of text-to-image techniques by maintaining a high degree of image-text consistency. It can generate high-quality image descriptions from a variety of angles, including object description, object relationships, background information, image style, etc. Our code is based on the [LLaVA](https://github.com/haotian-liu/LLaVA) implementation.
### Examples
<td align="center"><img src="./asset/caption_demo.jpg" alt="Image 3" width="1200"/></td>
### Instructions
a. Install dependencies
The dependencies and installation are basically the same as the [**base model**](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2).
b. Model download
```shell
# Use the huggingface-cli tool to download the model.
huggingface-cli download Tencent-Hunyuan/HunyuanCaptioner --local-dir ./ckpts/captioner
```
### Inference
Our model supports three different modes: **directly generating a Chinese caption**, **generating a Chinese caption based on specific knowledge**, and **directly generating an English caption**. The injected information can be either accurate cues or noisy labels (e.g., raw descriptions crawled from the internet). The model is capable of generating reliable and accurate descriptions based on both the inserted information and the image content.
|Mode | Prompt Template |Description |
| --- | --- | --- |
|caption_zh | 描述这张图片 |Caption in Chinese |
|insert_content | 根据提示词“{}”,描述这张图片 |Caption with inserted knowledge|
|caption_en | Please describe the content of this image |Caption in English |
| | | |
a. Single picture inference in Chinese
```bash
python mllm/caption_demo.py --mode "caption_zh" --image_file "mllm/images/demo1.png" --model_path "./ckpts/captioner"
```
b. Insert specific knowledge into caption
```bash
python mllm/caption_demo.py --mode "insert_content" --content "宫保鸡丁" --image_file "mllm/images/demo2.png" --model_path "./ckpts/captioner"
```
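For clarity, in the `insert_content` mode the prompt template from the table above is presumably filled with the `--content` string before being passed to the captioner; here is a minimal sketch (the actual prompt construction lives in `mllm/caption_demo.py` and may differ):
```py
# Minimal sketch (illustration only): fill the insert_content template
# from the table above with the injected knowledge, e.g. "宫保鸡丁".
PROMPT_TEMPLATES = {
    "caption_zh": "描述这张图片",
    "insert_content": "根据提示词“{}”,描述这张图片",
    "caption_en": "Please describe the content of this image",
}

def build_caption_prompt(mode: str, content: str = "") -> str:
    template = PROMPT_TEMPLATES[mode]
    return template.format(content) if mode == "insert_content" else template

print(build_caption_prompt("insert_content", "宫保鸡丁"))
# -> 根据提示词“宫保鸡丁”,描述这张图片
```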
c. Single picture inference in English
```bash
python mllm/caption_demo.py --mode "caption_en" --image_file "mllm/images/demo3.png" --model_path "./ckpts/captioner"
```
d. Multiple pictures inference in Chinese
```bash
### Convert multiple pictures to csv file.
python mllm/make_csv.py --img_dir "mllm/images" --input_file "mllm/images/demo.csv"
### Multiple pictures inference
python mllm/caption_demo.py --mode "caption_zh" --input_file "mllm/images/demo.csv" --output_file "mllm/images/demo_res.csv" --model_path "./ckpts/captioner"
```
(Optional) To convert the output csv file to Arrow format, please refer to [Data Preparation #3](#data-preparation) for detailed instructions.
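If you want to sanity-check the captions before the Arrow conversion, a hedged sketch with pandas follows (the exact column names are whatever `mllm/make_csv.py` and `caption_demo.py` wrote to your CSV, so print them first):
```py
# Minimal sketch: inspect the caption CSV produced above before Arrow conversion.
# Column names depend on mllm/make_csv.py and caption_demo.py; print them to check.
import pandas as pd

df = pd.read_csv("mllm/images/demo_res.csv")
print(df.columns.tolist())  # e.g. an image path column and a caption column
print(df.head())
```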
### Gradio
To launch a Gradio demo locally, please run the following commands one by one. For more detailed instructions, please refer to [LLaVA](https://github.com/haotian-liu/LLaVA).
```bash
cd mllm
python -m llava.serve.controller --host 0.0.0.0 --port 10000
python -m llava.serve.gradio_web_server --controller http://0.0.0.0:10000 --model-list-mode reload --port 443
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://0.0.0.0:10000 --port 40000 --worker http://0.0.0.0:40000 --model-path "../ckpts/captioner" --model-name LlavaMistral
```
Then the demo can be accessed at http://0.0.0.0:443. Note that 0.0.0.0 here should be replaced with X.X.X.X, the IP address of your server.
## 🚀 Acceleration (for Linux)
- We provide a TensorRT version of HunyuanDiT for inference acceleration (faster than Flash Attention).
See [Tencent-Hunyuan/TensorRT-libs](https://huggingface.co/Tencent-Hunyuan/TensorRT-libs) for more details.
- We provide a Distillation version of HunyuanDiT for inference acceleration.
See [Tencent-Hunyuan/Distillation](https://huggingface.co/Tencent-Hunyuan/Distillation) for more details.
## 🔗 BibTeX
If you find [Hunyuan-DiT](https://arxiv.org/abs/2405.08748) or [DialogGen](https://arxiv.org/abs/2403.08857) useful for your research and applications, please cite using this BibTeX:
```BibTeX
@misc{li2024hunyuandit,
title={Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding},
author={Zhimin Li and Jianwei Zhang and Qin Lin and Jiangfeng Xiong and Yanxin Long and Xinchi Deng and Yingfang Zhang and Xingchao Liu and Minbin Huang and Zedong Xiao and Dayou Chen and Jiajun He and Jiahao Li and Wenyue Li and Chen Zhang and Rongwei Quan and Jianxiang Lu and Jiabin Huang and Xiaoyan Yuan and Xiaoxiao Zheng and Yixuan Li and Jihong Zhang and Chao Zhang and Meng Chen and Jie Liu and Zheng Fang and Weiyan Wang and Jinbao Xue and Yangyu Tao and Jianchen Zhu and Kai Liu and Sihuan Lin and Yifu Sun and Yun Li and Dongdong Wang and Mingtao Chen and Zhichao Hu and Xiao Xiao and Yan Chen and Yuhong Liu and Wei Liu and Di Wang and Yong Yang and Jie Jiang and Qinglin Lu},
year={2024},
eprint={2405.08748},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{huang2024dialoggen,
title={DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation},
author={Huang, Minbin and Long, Yanxin and Deng, Xinchi and Chu, Ruihang and Xiong, Jiangfeng and Liang, Xiaodan and Cheng, Hong and Lu, Qinglin and Liu, Wei},
journal={arXiv preprint arXiv:2403.08857},
year={2024}
}
```
## Star History
<a href="https://star-history.com/#Tencent/HunyuanDiT&Date">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/HunyuanDiT&type=Date&theme=dark" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/HunyuanDiT&type=Date" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/HunyuanDiT&type=Date" />
</picture>
</a>
import gradio as gr
import pandas as pd
from pathlib import Path
from PIL import Image, PngImagePlugin
import sys
import numpy as np
import torch
from torchvision import transforms as T
sys.path.insert(0, str(Path(__file__).parent.parent))
import datetime
from hydit.constants import SAMPLER_FACTORY
from sample_t2i import inferencer
import os
ROOT = Path(__file__).parent.parent
SAMPLERS = list(SAMPLER_FACTORY.keys())
norm_transform = T.Compose(
[
T.ToTensor(),
T.Normalize([0.5], [0.5]),
]
)
def get_strings(lang):
lang_file = Path(f"app/lang/{lang}.csv")
strings = pd.read_csv(lang_file, header=0)
strings = strings.set_index("key")["value"].to_dict()
return strings
def get_files_with_extension(path, extension):
return {
os.path.splitext(file)[0]: os.path.join(path, file)
for file in os.listdir(path)
if os.path.isfile(os.path.join(path, file))
and any(file.endswith(ext) for ext in extension)
}
args, gen, enhancer = inferencer()
output_dir = ROOT / f"{args.output_img_path}"
os.makedirs(output_dir, exist_ok=True)
strings = get_strings(args.lang)
controlnet_list = get_files_with_extension(
args.model_root + "/t2i/controlnet",
[".pt", ".safetensors"],
)
module_list = get_files_with_extension(
args.model_root + "/t2i/model",
[".pt", ".safetensors"],
)
lora_list = get_files_with_extension(
args.model_root + "/t2i/lora",
[".pt", ".safetensors"],
)
def upgrade_dit_model_load(model):
model_path = module_list[model]
gen.args.dit_weight = model_path
gen.load_torch_weights()
def generate_metadata(
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
controlnet_module,
control_weight,
lora_ctrls,
):
"""生成图像元数据。"""
return {
"parameters": "Power by HunYun",
"prompt": prompt,
"negative_prompt": negative_prompt,
"seed": seed,
"cfg_scale": cfg_scale,
"infer_steps": infer_steps,
"sampler": sampler,
"imgW": imgW,
"imgH": imgH,
"controlnet_module": controlnet_module,
"control_weight": control_weight,
"lora_ctrls": [
{
"lora_enabled": lora_ctrl[0],
"lora_model": lora_ctrl[1],
"lora_weight": lora_ctrl[2],
}
for lora_ctrl in zip(*[iter(lora_ctrls)] * 3)
],
"model_name": gen.model_name,
}
def infer(
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
input_image,
controlnet_module,
control_weight,
enhance,
img_crop_type,
*lora_ctrls,
):
if enhance and enhancer is not None:
success, enhanced_prompt = enhancer(prompt)
if not success:
fail_image = Image.open(ROOT / "app/fail.png")
return fail_image
else:
enhanced_prompt = None
active_loras = [
{"model": lora_ctrls[i + 1], "weight": lora_ctrls[i + 2]}
for i in range(0, len(lora_ctrls), 3)
if lora_ctrls[i]
]
if input_image is not None:
        # Convert a NumPy array input to a PIL RGB image
if isinstance(input_image, np.ndarray):
input_image = Image.fromarray(input_image).convert("RGB")
input_image = gen.pixel_perfect_resolution(
input_image, imgH, imgW, img_crop_type
)
# Apply the normalization transform
input_image = norm_transform(input_image)
# Add batch dimension and move to GPU (if available)
input_image = (
input_image.unsqueeze(0).cuda()
if torch.cuda.is_available()
else input_image.unsqueeze(0)
)
results = gen.predict(
prompt,
image=input_image,
height=imgH,
width=imgW,
seed=seed,
enhanced_prompt=enhanced_prompt,
negative_prompt=negative_prompt,
infer_steps=infer_steps,
guidance_scale=cfg_scale,
batch_size=1,
src_size_cond=None,
sampler=sampler,
control_weight=control_weight,
controlnet=controlnet_module,
lora_ctrls=active_loras,
)
image = results["images"][0]
seed = results["seed"]
metadata = generate_metadata(
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
controlnet_module,
control_weight,
active_loras,
)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = output_dir.joinpath(f"generated_image_{timestamp}_{seed}.png")
png_info = PngImagePlugin.PngInfo()
for k, v in metadata.items():
png_info.add_text(k, str(v))
image.save(
output_path,
pnginfo=png_info,
)
return image
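# Illustrative helper (a sketch, not wired into the UI below): the generation
# parameters written as PNG text chunks in infer() can be read back with PIL.
def read_generation_metadata(png_path):
    """Return the text metadata embedded in a PNG saved by infer()."""
    with Image.open(png_path) as im:
        return dict(getattr(im, "text", {}))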
def ui():
block = gr.Blocks()
description = f"""
# {strings['title']}
## {strings['desc']}
"""
with block:
with gr.Row():
gr.Markdown(description)
with gr.Row():
with gr.Column():
prompt = gr.Textbox(
label=strings["prompt"], value=strings["default prompt"], lines=3
)
with gr.Row():
imgW = gr.Slider(
label=strings["width"],
minimum=64,
maximum=4096,
value=1024,
step=64,
)
imgH = gr.Slider(
label=strings["height"],
minimum=64,
maximum=4096,
value=1024,
step=64,
)
with gr.Row():
infer_steps = gr.Slider(
label=strings["infer steps"],
minimum=1,
maximum=200,
value=100,
step=1,
)
seed = gr.Number(
label=strings["seed"],
minimum=-1,
maximum=1_000_000_000,
value=0,
step=1,
precision=0,
)
enhance = gr.Checkbox(
label=strings["enhance"],
value=enhancer is not None,
interactive=True,
)
with gr.Accordion(strings["accordion"], open=False):
with gr.Row():
negative_prompt = gr.Textbox(
label=strings["negative_prompt"],
value=gen.default_negative_prompt,
lines=2,
)
with gr.Row():
sampler = gr.Dropdown(
SAMPLERS, label=strings["sampler"], value="ddpm"
)
cfg_scale = gr.Slider(
label=strings["cfg"],
minimum=1.0,
maximum=16.0,
value=6.0,
step=1,
)
with gr.Accordion(strings["model_list"], open=False):
with gr.Row():
dit_model = gr.Dropdown(
label=strings["dit_model"],
choices=[
name
for name, path in get_files_with_extension(
args.model_root + "/t2i/model",
[".pt", ".safetensors"],
).items()
],
value=f"pytorch_model_{args.load_key}",
)
dit_model.change(
fn=upgrade_dit_model_load,
inputs=dit_model,
outputs=None,
)
with gr.Accordion(strings["lora_list"], open=False):
lora_ctrls = []
for i in range(5):
with gr.Row():
lora_enabled = gr.Checkbox(
label="Enable",
value=False,
)
lora_model = gr.Dropdown(
label=f"Lora{i+1}",
choices=["none"]
+ [name for name, path in lora_list.items()],
value="none",
)
lora_weight = gr.Slider(
label="weight",
minimum=-1,
maximum=2,
step=0.01,
value=0,
scale=5,
)
lora_ctrls += [lora_enabled, lora_model, lora_weight]
with gr.Accordion(strings["controlnet"], open=False):
with gr.Row():
controlnet_module = gr.Dropdown(
label=strings["controlnet_model"],
choices=["None"]
+ [name for name, path in controlnet_list.items()],
value="None",
)
control_weight = gr.Slider(
label=strings["Control_Weight"],
minimum=0.0,
maximum=2.0,
value=1.0,
step=0.1,
)
input_image = gr.Image(label=strings["input image"])
with gr.Row():
img_crop_type = gr.Radio(
label=strings["Crop_mode"],
choices=[
(strings["Resize"], "Resize"),
(strings["Crop_and_Resize"], "Crop_and_Resize"),
(strings["Resize_and_Fill"], "Resize_and_Fill"),
],
value="Crop_and_Resize",
)
with gr.Row():
advanced_button = gr.Button(strings["run"])
with gr.Column():
default_img = Image.open(ROOT / "app/default.png")
output_img = gr.Image(
label=strings["generated image"],
interactive=False,
format="png",
value=default_img,
)
advanced_button.click(
fn=infer,
inputs=[
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
input_image,
controlnet_module,
control_weight,
enhance,
img_crop_type,
*lora_ctrls,
],
outputs=output_img,
)
with gr.Row():
gr.Examples(
[
["一只小猫"],
[
"现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景"
],
["一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影"],
["飞流直下三千尺,疑是银河落九天"],
[
"一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。"
],
["麻婆豆腐"],
["苏州园林"],
[
"一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子"
],
["请将“杞人忧天”的样子画出来"],
["枯藤老树昏鸦,小桥流水人家"],
[
"湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。"
],
["一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头"],
["臭豆腐"],
["九寨沟"],
["俗语“鲤鱼跃龙门”"],
[
"风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景"
],
],
[prompt],
label=strings["examples"],
)
return block
if __name__ == "__main__":
interface = ui()
interface.launch(
server_name=args.server_name,
server_port=args.server_port,
share=args.gradio_share,
)
key,value
size,Size
sampler,Sampler
prompt,Prompt
default prompt,"A cute cat"
negative_prompt,Negative Prompt
seed,Seed
cfg,CFG Scale
infer steps,Sampling Steps
batch size,Batch Size
width cond,Width Cond
height cond,Height Cond
enhance,Prompt Enhancement
run,Submit
square,Square(1024x1024)
landscape,Landscape(1280x768)
portrait,Portrait(768x1280)
accordion,Advanced Options
generated image,HunYuanDiT Generated Image
examples,More Examples
title,Hunyuan-DiT
desc,A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding
controlnet,controlnet
controlnet_model,model list
Control_Weight,Control Weight
input image,input image
width,width
height,height
key,value
size,尺寸
sampler,采样器
prompt,文本描述
default prompt,"一只可爱的猫"
negative_prompt,负向词
seed,种子
cfg,CFG系数
infer steps,采样步数
batch size,批大小
width cond,宽度条件
height cond,高度条件
enhance,文本增强
run,提交生成
square,方形(1024x1024)
portrait,竖屏(1216x832)
landscape,横屏(832x1216)
accordion,高级设置
generated image,生成
examples,更多示例
title,HunYuanDiT
desc,具有细粒度中文理解的高性能多分辨率 Diffusion Transformer 模型
controlnet,条件控制网络
controlnet_model,模型列表
Control_Weight,控制网络权重
input image,输入图片
model_list,模型列表
dit_model,dit模型
width,width
height,height
Crop_mode,裁剪方式
Resize,仅缩放
Crop_and_Resize,裁剪并缩放
Resize_and_Fill,缩放并填充
lora_list,lora
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gradio as gr
from PIL import Image
import sys
import os
sys.path.append(os.getcwd())
import json
import numpy as np
from pathlib import Path
import io
import hashlib
import requests
import base64
import pandas as pd
from sample_t2i import inferencer
from mllm.dialoggen_demo import init_dialoggen_model, eval_model
SIZES = {
"正方形(square, 1024x1024)": (1024, 1024),
"风景(landscape, 1280x768)": (768, 1280),
"人像(portrait, 768x1280)": (1280, 768),
}
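# Note: the values are (height, width) tuples, consumed by image_generation() below.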
global_seed = np.random.randint(0, 10000)
# Helper Functions
def image_to_base64(image_path):
with open(image_path, "rb") as image_file:
encoded_image = base64.b64encode(image_file.read()).decode()
return encoded_image
def get_strings(lang):
lang_file = Path(f"app/lang/{lang}.csv")
strings = pd.read_csv(lang_file, header=0)
strings = strings.set_index("key")["value"].to_dict()
return strings
def get_image_md5(image):
image_data = io.BytesIO()
image.save(image_data, format="PNG")
image_data = image_data.getvalue()
md5_hash = hashlib.md5(image_data).hexdigest()
return md5_hash
# Call the MLLM (DialogGen) service
def request_dialogGen(
server_url="http://0.0.0.0:8080",
history_messages=[],
question="画一个木制的鸟",
image="",
):
if image != "":
image = base64.b64encode(open(image, "rb").read()).decode()
print("history_messages before request", history_messages)
headers = {"accept": "application/json", "Content-Type": "application/json"}
data = {
"text": question,
"image": image, # "image为空字符串,则进行文本对话"
"history": history_messages,
}
response = requests.post(server_url, headers=headers, json=data)
print("response", response)
response = response.json()
print(response)
response_text = response["result"]
history_messages = response["history"]
print("history_messages before request", history_messages)
return history_messages, response_text
# Image generation
def image_generation(prompt, infer_steps, seed, image_size):
print(
f"prompt sent to T2I model: {prompt}, infer_steps: {infer_steps}, seed: {seed}, size: {image_size}"
)
height, width = SIZES[image_size]
results = gen.predict(
prompt,
height=height,
width=width,
seed=seed,
infer_steps=infer_steps,
batch_size=1,
)
image = results["images"][0]
file_name = get_image_md5(image)
# Save images
save_dir = Path("results")
save_dir.mkdir(exist_ok=True)
save_path = f"results/multiRound_{file_name}.png"
image.save(save_path)
encoded_image = image_to_base64(save_path)
return encoded_image
# Text-and-image chat
def chat(history_messages, input_text):
history_messages, response_text = request_dialogGen(
history_messages=history_messages, question=input_text
)
return history_messages, response_text
#
def pipeline(input_text, state, infer_steps, seed, image_size):
    # Ignore empty input
if len(input_text) == 0:
return state, state[0]
conversation = state[0]
history_messages = state[1]
system_prompt = "请先判断用户的意图,若为画图则在输出前加入<画图>:"
print(f"input history:{history_messages}")
if not isinstance(history_messages, list) and len(history_messages.messages) >= 2:
response, history_messages = enhancer(
input_text, return_history=True, history=history_messages, skip_special=True
)
else:
response, history_messages = enhancer(
input_text,
return_history=True,
history=history_messages,
skip_special=False,
)
history_messages.messages[-1][-1] = response
if "<画图>" in response:
intention_draw = True
else:
intention_draw = False
print(f"response:{response}")
print("-" * 80)
print(f"history_messages:{history_messages}")
print(f"intention_draw:{intention_draw}")
if intention_draw:
prompt = response.split("<画图>")[-1]
        # Generate an image
image_url = image_generation(prompt, infer_steps, seed, image_size)
response = f'<img src="data:image/png;base64,{image_url}" style="display: inline-block;"><p style="font-size: 14px; color: #555; margin-top: 0;">{prompt}</p>'
conversation += [((input_text, response))]
return [conversation, history_messages], conversation
# Page layout
def upload_image(state, image_input):
conversation = state[0]
history_messages = state[1]
input_image = Image.open(image_input.name).resize((224, 224)).convert("RGB")
input_image.save(image_input.name) # Overwrite with smaller image.
system_prompt = "请先判断用户的意图,若为画图则在输出前加入<画图>:"
history_messages, response = request_dialogGen(
question="这张图描述了什么?",
history_messages=history_messages,
image=image_input.name,
)
conversation += [
(
f'<img src="./file={image_input.name}" style="display: inline-block;">',
response,
)
]
print("conversation", conversation)
print("history_messages after uploading image", history_messages)
return [conversation, history_messages], conversation
def reset():
global global_seed
global_seed = np.random.randint(0, 10000)
return [[], []], []
def reset_last(state):
conversation, history = state[0], state[1]
conversation = conversation[:-1]
history.messages = history.messages[:-2]
return [conversation, history], conversation
if __name__ == "__main__":
# Initialize dialoggen and HunyuanDiT model
args, gen, enhancer = inferencer()
strings = get_strings(args.lang)
css = """
#chatbot { min-height: 800px; }
#save-btn {
background-image: linear-gradient(to right bottom, rgba(130,217,244, 0.9), rgba(158,231,214, 1.0));
}
#save-btn:hover {
background-image: linear-gradient(to right bottom, rgba(110,197,224, 0.9), rgba(138,211,194, 1.0));
}
#share-btn {
background-image: linear-gradient(to right bottom, rgba(130,217,244, 0.9), rgba(158,231,214, 1.0));
}
#share-btn:hover {
background-image: linear-gradient(to right bottom, rgba(110,197,224, 0.9), rgba(138,211,194, 1.0));
}
#gallery { z-index: 999999; }
#gallery img:hover {transform: scale(2.3); z-index: 999999; position: relative; padding-right: 30%; padding-bottom: 30%;}
#gallery button img:hover {transform: none; z-index: 999999; position: relative; padding-right: 0; padding-bottom: 0;}
@media (hover: none) {
#gallery img:hover {transform: none; z-index: 999999; position: relative; padding-right: 0; padding-bottom: 0;}
}
.html2canvas-container { width: 3000px !important; height: 3000px !important; }
"""
with gr.Blocks(css=css) as demo:
DESCRIPTION = """# <a style="color: black; text-decoration: none;">多轮对话绘图 Multi-turn Text2Image Generation</a>
你可以参照[DialogGen](https://arxiv.org/abs/2403.08857),通过简单的交互式语句来进行历史图片的修改,例如:主体编辑、增加主体、删除主体、背景更换、风格转换、镜头转换、图像合并。
(You can modify historical images through simple interactive statements, referring to [DialogGen](https://arxiv.org/abs/2403.08857), such as: entity edit, add object, remove object, change background, change style, change lens, and combine images.)
例如,主体编辑 (For example, entity edit):
```none
Round1: 画一个木制的鸟
(Round1: draw a wooden bird)
Round2: 变成玻璃的
(Round2: turn into glass)
```
"""
gr.Markdown(DESCRIPTION)
gr_state = gr.State([[], []]) # conversation, chat_history
with gr.Row():
with gr.Column(scale=1, min_width=1000):
with gr.Row():
chatbot = gr.Chatbot(
elem_id="chatbot", label="DialogGen&HunyuanDiT"
)
with gr.Row():
infer_steps = gr.Slider(
label="采样步数(sampling steps)",
minimum=1,
maximum=200,
value=100,
step=1,
)
seed = gr.Number(
label="种子(seed)",
minimum=-1,
maximum=1_000_000_000,
value=666,
step=1,
precision=0,
)
size_dropdown = gr.Dropdown(
choices=[
"正方形(square, 1024x1024)",
"风景(landscape, 1280x768)",
"人像(portrait, 768x1280)",
],
value="正方形(square, 1024x1024)",
label="图片尺寸(Image Size)",
)
with gr.Row():
# image_btn = gr.UploadButton("🖼️ Upload Image", file_types=["image"])
text_input = gr.Textbox(
label="提示词(prompt)", placeholder="输入提示词(Type a prompt)"
)
with gr.Column():
submit_btn = gr.Button(
"提交(Submit)", interactive=True, variant="primary"
)
clear_last_btn = gr.Button("回退(Undo)")
clear_btn = gr.Button("全部重置(Reset All)")
with gr.Row():
gr.Examples(
[
["画一个木制的鸟"],
["一只小猫"],
[
"现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景"
],
[
"一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影"
],
["飞流直下三千尺,疑是银河落九天"],
[
"一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。"
],
["麻婆豆腐"],
["苏州园林"],
[
"一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子"
],
["枯藤老树昏鸦,小桥流水人家"],
[
"湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。"
],
[
"一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头"
],
["臭豆腐"],
["九寨沟"],
["俗语“鲤鱼跃龙门”"],
[
"风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景"
],
],
[text_input],
label=strings["examples"],
)
gr.Markdown(
"""<p style="font-size: 20px; color: #888;">powered by <a href="https://github.com/Centaurusalpha/DialogGen" target="_blank">DialogGen</a> and <a href="https://github.com/Tencent/HunyuanDiT" target="_blank">HunyuanDiT</a></p>"""
)
text_input.submit(
pipeline,
[text_input, gr_state, infer_steps, seed, size_dropdown],
[gr_state, chatbot],
)
text_input.submit(lambda: "", None, text_input) # Reset chatbox.
submit_btn.click(
pipeline,
[text_input, gr_state, infer_steps, seed, size_dropdown],
[gr_state, chatbot],
)
submit_btn.click(lambda: "", None, text_input) # Reset chatbox.
# image_btn.upload(upload_image, [gr_state, image_btn], [gr_state, chatbot])
clear_last_btn.click(reset_last, [gr_state], [gr_state, chatbot])
clear_btn.click(reset, [], [gr_state, chatbot])
interface = demo
interface.launch(server_name="0.0.0.0", server_port=443, share=False)