"configs/vscode:/vscode.git/clone" did not exist on "c1d19ce23fe6c5647646c541efbda21fd79d5462"
Commit 727428ec authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit CI/CD

parents
import bisect
import io
import json
import random
from pathlib import Path
import ast
from itertools import chain
from collections import defaultdict
from functools import partial
from glob import glob
import numpy as np
import pyarrow as pa
from PIL import Image
from tqdm import tqdm
def get_table(arrow_file):
"""
Read an arrow file and return an arrow table.
"""
return pa.ipc.RecordBatchFileReader(pa.memory_map(f"{arrow_file}", "r")).read_all()
def assert_type(data, dtype, msg=""):
if not isinstance(data, dtype):
raise ValueError(f"Expected {msg} type {dtype}, got {type(data)}.")
def ndarray_to_list(data):
if isinstance(data, np.ndarray):
data = data.tolist()
elif isinstance(data, dict):
data = {k: ndarray_to_list(v) for k, v in data.items()}
elif isinstance(data, (list, tuple)):
        # Convert all elements to plain Python ints, because numpy integers
        # cannot be serialized to JSON.
data = [int(x) for x in data]
else:
raise ValueError(
f"Expected data type list, tuple, dict or np.ndarray, got {type(data)}."
)
return data
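# A minimal sketch (illustrative, not part of the original code) of why
# ndarray_to_list is applied before json.dump: numpy scalars and arrays are not
# JSON-serializable, so nested index structures are converted to plain Python
# types first, e.g.
#
#   payload = {"cum_length": np.array([10, 25, 40])}
#   json.dumps({k: ndarray_to_list(v) for k, v in payload.items()})
#   # -> '{"cum_length": [10, 25, 40]}'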
class ArrowIndexV2(object):
"""
ArrowIndexV2 is a new version of ArrowIndex.
Parameters
----------
index_file: str or pathlib.Path
The path of index file. Either index_file or res_dict should be provided.
res_dict: dict
The index dict. Either index_file or res_dict should be provided.
align: int
Align the length of indices to be a multiple of align. Generally align should be the batch size * world_size.
shadow_file_fn: callable or dict
A callable function to map shadow file path to a new path. If None, the shadow file path will not be
changed. If a dict is provided, the keys are the shadow names to call the function, and the values are the
callable functions to map the shadow file path to a new path. If a callable function is provided, the key
is 'default'.
Examples
--------
>>> index_file = 'data.json'
>>> indexObj = ArrowIndexV2(index_file)
>>> pil_image = indexObj.get_image(0)
>>> text = indexObj.get_attribute(0, column='text_zh')
"""
def __init__(
self, index_file=None, res_dict=None, align=1, shadow_file_fn=None, **kwargs
):
if index_file is not None:
with open(index_file, "r") as f:
res_dict = json.load(f)
elif res_dict is not None:
pass
else:
raise ValueError(f"Either index_file or res_dict should be provided.")
self.shadow_file_fn = {}
if shadow_file_fn is not None:
if not callable(shadow_file_fn) and not isinstance(shadow_file_fn, dict):
raise ValueError(
"shadow_file_fn should be a callable function or a dict."
)
if callable(shadow_file_fn):
self.shadow_file_fn["default"] = shadow_file_fn
else:
for k, v in shadow_file_fn.items():
if not callable(v):
raise ValueError(f"{k} should be a callable function.")
self.shadow_file_fn[k] = v
self._data = res_dict
self.data_type = res_dict["data_type"]
self.arrow_files = res_dict["arrow_files"]
self.cum_length = res_dict["cum_length"]
self.group_length = res_dict["group_length"]
error_msg = f"Expected group_length type list, got {type(self.group_length)}."
if isinstance(self.group_length, dict):
raise ValueError(
f"{error_msg}\nNote: You may using a multi-resolution index file. "
"Try `MultiResolutionBucketIndexV2` instead."
)
elif not isinstance(self.group_length, list):
raise ValueError(error_msg)
self.indices = res_dict["indices"]
if "indices_file" in res_dict:
self.indices_file = res_dict["indices_file"]
if self.indices_file != "":
indices_file = Path(index_file).parent / self.indices_file
if Path(indices_file).exists():
self.indices = np.load(indices_file)["x"]
else:
raise ValueError(
f"This Index file contains an extra file {indices_file} which is missed."
)
else:
self.indices_file = ""
if not isinstance(self.indices, list) and not isinstance(
self.indices, np.ndarray
):
raise ValueError(
f"Expected indices type list or np.ndarray, got {type(self.indices)}."
)
if align > 1:
if isinstance(self.indices, np.ndarray):
self.indices = self.indices.tolist()
self.align(align)
self.indices = np.asarray(self.indices, int)
if len(self.arrow_files) != len(self.cum_length):
raise ValueError(
f"Length of arrow_files and cum_length does not match. {len(self.arrow_files)} != {len(self.cum_length)}"
)
if len(self.arrow_files) != len(self.group_length):
raise ValueError(
f"Length of arrow_files and group_length does not match. {len(self.arrow_files)} != {len(self.group_length)}"
)
if len(self.indices) == 0:
raise ValueError(f"No indices found in index_dict.")
if (
isinstance(self.indices, list)
and self.indices[-1] > self.cum_length[-1] - 1
):
raise ValueError(f"Indices exceed cum_length.")
# Warning:
# Ensure that indices are an increasing array. Currently,
# no checks are performed due to the potential slowness when dealing with hundreds of millions of data points.
self.bias = self.cum_length
self._cur_arrow_file = None
self._cur_table_map = None
self._cur_table = None
self._index_bias = 0
self.last_index = -1
self._shadow_cur_arrow_file = {}
self._shadow_cur_table_map = {}
self._shadow_cur_table = {}
self._shadow_index_bias = {}
self.shadow_last_index = {}
for k in self.shadow_file_fn.keys():
self._shadow_cur_arrow_file[k] = None
self._shadow_cur_table_map[k] = None
self._shadow_cur_table[k] = None
self._shadow_index_bias[k] = 0
self.shadow_last_index[k] = -1
def __len__(self):
return len(self.indices)
def __repr__(self):
return f"""
ArrowIndexV2(
data_type {self.data_type}
indices_file {self.indices_file}
arrow_files Count={len(self.arrow_files):,} ({self.arrow_files[0]}, ...)
cum_length Count={len(self.cum_length):,} ({self.cum_length[0]}, ...)
group_length Count={len(self.group_length):,} ({self.group_length[0]}, ...)
indices Count={len(self.indices):,}
example_indices Count={len(self._data['example_indices']):,}
)
"""
def check_exists(self):
for arrow_file in tqdm(self.arrow_files):
if not Path(arrow_file).exists():
print(arrow_file)
def align(self, align):
"""
        Repeat the tail of the indices so that the total length is a multiple of align
        (typically batch_size * world_size).
"""
if len(self) % align == 0:
return
repeat_num = align - len(self) % align
if repeat_num >= len(self):
repeat_n = repeat_num // len(self)
repeat_times = [repeat_n + 1 for _ in self.indices]
group_length_new = [ll * (repeat_n + 1) for ll in self.group_length]
repeat_num -= repeat_n * len(self)
else:
repeat_times = [1 for _ in range(repeat_num)]
group_length_new = [ll for ll in self.group_length]
for i in range(repeat_num):
repeat_times[-i - 1] += 1
repeat_start_idx = len(self) - len(repeat_times)
group_id = -1
while group_length_new[group_id] == 0:
group_id -= 1
            # Allocate the remaining indices that need to be repeated, while counting
            # how many indices have been checked. When the count reaches the group's
            # length, switch to the next group. group_length is tracked here because
            # it must be updated in sync with the repeated indices.
group_acc = 0
for i in range(repeat_num):
group_length_new[group_id] += 1
group_acc += 1
if group_acc == self.group_length[group_id]:
group_id -= 1
while group_length_new[group_id] == 0:
group_id -= 1
group_acc = 0
temp = []
for i, value in enumerate(self.indices[repeat_start_idx:]):
temp.extend([value] * repeat_times[i])
self.indices = np.concatenate([self.indices[:repeat_start_idx], temp])
self.group_length = group_length_new
def shuffle(self, seed=None, fast=False):
"""
It takes about 30 seconds for an index consisting of 100_000 arrows.
"""
if fast:
return self.shuffle_fast(seed)
indices = self.indices.tolist()
if seed is not None:
state = random.getstate()
random.seed(seed)
indices_group_list = []
group_cum_len = 0
for group_len in self.group_length:
indices_group = indices[group_cum_len : group_cum_len + group_len]
random.shuffle(indices_group)
indices_group_list.append((indices_group, group_len))
group_cum_len += group_len
random.shuffle(indices_group_list)
self.group_length = [x[1] for x in indices_group_list]
self.indices = np.asarray(
list(chain.from_iterable([x[0] for x in indices_group_list]))
)
if seed is not None:
random.setstate(state)
def shuffle_fast(self, seed=None):
if seed is not None:
sampler = np.random.RandomState(seed)
sampler.shuffle(self.indices)
else:
np.random.shuffle(self.indices)
def get_table(self, arrow_file, shadow=None):
"""
Read an arrow file and return an arrow table.
"""
if shadow is None:
if self._cur_table is not None:
if self._cur_arrow_file == arrow_file:
# This is the same arrow file. Return the cached table.
return self._cur_table
else:
# This is a different arrow file. Clear the cache.
self._cur_table_map.close()
self._cur_table = None
self._cur_arrow_file = arrow_file
self._cur_table_map = pa.memory_map(f"{arrow_file}", "r")
self._cur_table = pa.ipc.RecordBatchFileReader(
self._cur_table_map
).read_all()
return self._cur_table
else:
if self._shadow_cur_table[shadow] is not None:
if self._shadow_cur_arrow_file[shadow] == arrow_file:
return self._shadow_cur_table[shadow]
else:
self._shadow_cur_table_map[shadow].close()
self._shadow_cur_table[shadow] = None
self._shadow_cur_arrow_file[shadow] = arrow_file
self._shadow_cur_table_map[shadow] = pa.memory_map(f"{arrow_file}", "r")
self._shadow_cur_table[shadow] = pa.ipc.RecordBatchFileReader(
self._shadow_cur_table_map[shadow]
).read_all()
return self._shadow_cur_table[shadow]
def get_arrow_file_by_index(self, index, return_index_bias=False, shadow=None):
i = bisect.bisect_right(self.cum_length, index)
arrow_file = self.arrow_files[i]
if return_index_bias:
if i == 0:
index_bias = 0
else:
index_bias = self.cum_length[i - 1]
return arrow_file, index_bias
return arrow_file
def get_arrow_file(self, ind, shadow=None):
"""
Get arrow file by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
shadow: str
The shadow name. If None, return the main arrow file. If not None, return the shadow arrow file.
Returns
-------
arrow_file: str
The arrow file path.
"""
index = self.indices[ind]
return self.get_arrow_file_by_index(index, shadow=shadow)
def load_table_by_index(self, index, shadow=None):
if shadow is None:
if index == self.last_index:
return self._cur_table
arrow_file, self._index_bias = self.get_arrow_file_by_index(
index, return_index_bias=True
)
self._cur_table = self.get_table(arrow_file)
self.last_index = index
return self._cur_table
else:
if index == self.shadow_last_index[shadow]:
return self._shadow_cur_table[shadow]
shadow_arrow_file, _shadow_index_bias = self.get_arrow_file_by_index(
index, return_index_bias=True, shadow=shadow
)
self._shadow_index_bias[shadow] = _shadow_index_bias
self._shadow_cur_table[shadow] = self.get_table(
shadow_arrow_file, shadow=shadow
)
self.shadow_last_index[shadow] = index
return self._shadow_cur_table[shadow]
def get_data_by_index(
self, index, columns=None, allow_missing=False, return_meta=True, shadow=None
):
table = self.load_table_by_index(index, shadow=shadow)
if isinstance(columns, str):
columns = [columns]
if columns is None:
columns = list(table.column_names)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
in_arrow_index = index - index_bias
if return_meta:
cur_arrow_file = (
self._cur_arrow_file
if shadow is None
else self._shadow_cur_arrow_file[shadow]
)
data = {
"index": index,
"in_arrow_index": in_arrow_index,
"arrow_name": cur_arrow_file,
}
else:
data = {}
if allow_missing:
for col in columns:
if col in table.column_names:
data[col] = table[col][in_arrow_index].as_py()
else:
for col in columns:
data[col] = table[col][in_arrow_index].as_py()
return data
def get_data(
self, ind, columns=None, allow_missing=False, return_meta=True, shadow=None
):
"""
Get data by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
columns: str or list
The columns to be returned. If None, return all columns.
allow_missing: bool
If True, omit missing columns. If False, raise an error if the column is missing.
return_meta: bool
If True, the resulting dict will contain some meta information:
in-json index, in-arrow index, and arrow_name.
shadow: str
The shadow name. If None, return the main data. If not None, return the shadow data.
Returns
-------
data: dict
A dict containing the data.
"""
index = self.indices[ind]
return self.get_data_by_index(
index,
columns,
allow_missing=allow_missing,
return_meta=return_meta,
shadow=shadow,
)
def get_attribute_by_index(self, index, column, shadow=None):
table = self.load_table_by_index(index, shadow=shadow)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
return table[column][index - index_bias].as_py()
def get_attribute(self, ind, column, shadow=None):
"""
Get single attribute by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
column: str
The column name.
shadow: str
The shadow name. If None, return the main data. If not None, return the shadow data.
Returns
-------
data: can be any type
"""
index = self.indices[ind]
return self.get_attribute_by_index(index, column, shadow=shadow)
def get_image_by_index(
self, index, column="image", ret_type="pil", max_size=-1, shadow=None
):
table = self.load_table_by_index(index, shadow=shadow)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
col = "image" if "image" in table.column_names else "binary"
temp = table[col][index - index_bias].as_py()
image_bytes = io.BytesIO(temp)
image_bytes.seek(0)
try:
            # convert("RGB") serves two purposes:
            # 1. Some images are in grayscale/RGBA mode, which would cause channel
            #    inconsistency in subsequent processing.
            # 2. Some images are in P (palette) mode, which forces the NEAREST resample
            #    method in resize (even if LANCZOS is specified) and produces blurry images.
pil_image = Image.open(image_bytes).convert("RGB")
except Exception as e:
print(
f"get_image_by_index | Error: {e} ({self.get_arrow_file_by_index(index), index - index_bias})"
)
pil_image = Image.new("RGB", (256, 256), (255, 255, 255))
if max_size > 0:
# Resize the image to max_size. max_size is the size of long edge
w, h = pil_image.size
if w > h:
new_w = max_size
new_h = int(h * max_size / w)
else:
new_h = max_size
new_w = int(w * max_size / h)
pil_image = pil_image.resize((new_w, new_h))
if ret_type == "numpy":
return np.array(pil_image)
return pil_image
def get_image(self, ind, column="image", ret_type="pil", max_size=-1, shadow=None):
"""
Get image by in-dataset index.
Parameters
----------
ind: int
The in-dataset index.
column: str
[Deprecated] The column name of the image. Default to 'image'.
ret_type: str
The return type. Can be 'pil' or 'numpy'. Default to 'pil'.
max_size: int
If not -1, resize the image to max_size. max_size is the size of long edge.
shadow: str
The shadow name. If None, return the main image. If not None, return the shadow image.
Returns
-------
image: PIL.Image.Image or np.ndarray
"""
index = self.indices[ind]
return self.get_image_by_index(index, column, ret_type, max_size, shadow=shadow)
def get_md5_by_index(self, index, shadow=None):
table = self.load_table_by_index(index, shadow=shadow)
index_bias = (
self._index_bias if shadow is None else self._shadow_index_bias[shadow]
)
return table["md5"][index - index_bias].as_py()
def get_md5(self, ind, shadow=None):
index = self.indices[ind]
return self.get_md5_by_index(index, shadow=shadow)
def get_columns_by_index(self, index, shadow=None):
table = self.load_table_by_index(index, shadow=shadow)
return table.column_names
def get_columns(self, ind, shadow=None):
index = self.indices[ind]
return self.get_columns_by_index(index, shadow=shadow)
def source_distribution(self, save_path=None, shadow=None):
sources = defaultdict(int)
for index in tqdm(self.indices):
source = self.get_attribute_by_index(index, "source", shadow=shadow)
sources[source] += 1
sources = sorted(sources.items(), key=lambda x: x[1], reverse=True)
for k, v in sources:
print(f"{k:20s} {v:10d}")
if save_path is not None:
Path(save_path).write_text(
"\n".join([f"{k:20s} {v:10d}" for k, v in sources])
)
def save(self, save_path):
"""
Save the index to a json file.
Parameters
----------
save_path: str or pathlib.Path
The path to save the index file.
"""
builder = IndexV2Builder(
data_type=self.data_type,
arrow_files=self.arrow_files,
cum_length=self.cum_length,
indices=self.indices,
)
builder.build(save_path)
def sample_batch_indices(self, n):
return np.random.choice(self.indices, n)
def sample_batch(self, n, columns, progress=True, shadow=None):
if isinstance(n, int):
indices = self.sample_batch_indices(n)
else:
indices = n
if progress:
pbar = tqdm(indices)
else:
pbar = indices
batch_data = []
for i in pbar:
batch_data.append(self.get_data_by_index(i, columns, shadow=shadow))
return batch_data
@staticmethod
def resize_and_crop(image, target_size, resample=Image.LANCZOS, crop_type="random"):
"""
Resize image without changing aspect ratio, then crop the center/random part.
Parameters
----------
image: PIL.Image.Image
The input image to be resized and cropped.
target_size: tuple
The target size of the image.
resample:
The resample method. See PIL.Image.Image.resize for details. Default to Image.LANCZOS.
crop_type: str
'center' or 'random'. If 'center', crop the center part of the image. If 'random',
crop a random part of the image. Default to 'random'.
Returns
-------
image: PIL.Image.Image
The resized and cropped image.
crop_pos: tuple
The position of the cropped part. (crop_left, crop_top)
"""
tw, th = target_size
w, h = image.size
tr = th / tw
r = h / w
# resize
if r < tr:
resize_height = th
resize_width = int(round(th / h * w))
else:
resize_width = tw
resize_height = int(round(tw / w * h))
image = image.resize((resize_width, resize_height), resample=resample)
if crop_type == "center":
crop_top = int(round((resize_height - th) / 2.0))
crop_left = int(round((resize_width - tw) / 2.0))
elif crop_type == "random":
crop_top = random.randint(0, resize_height - th)
crop_left = random.randint(0, resize_width - tw)
else:
raise ValueError(f"crop_type must be center or random, but got {crop_type}")
image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
return image, (crop_left, crop_top)
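    # A minimal illustrative call (not part of the original code): center-crop a
    # hypothetical PIL image to 1024x1024 while preserving aspect ratio, keeping
    # the crop offset for later use.
    #
    #   cropped, (left, top) = ArrowIndexV2.resize_and_crop(
    #       pil_image, (1024, 1024), crop_type="center"
    #   )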
class IndexV2Builder(object):
def __init__(
self,
arrow_files,
indices=None,
cum_length=None,
group_length=None,
data_type=None,
max_indices=5_000_000,
example_num=1000,
config_file=None,
):
"""
Build index v2 from an index dict.
Parameters
----------
arrow_files: list
A list of arrow files.
indices: list or dict
A list of indices or a dict of indices.
If not provided, it will be specified as range(cum_length[-1]).
cum_length: list
A list of cumulative length of arrow files.
If not provided, it will be calculated from arrow files.
group_length: list
A list of group length or a dict of group length for each arrow file.
If not provided, it will be calculated.
data_type: str or list
Some custom information of this index.
max_indices: int
If the number of indices is larger than max_indices, the indices will be saved in a separate file.
Default to 5_000_000.
example_num: int
The number of examples to be saved in the index file. Default to 1000.
config_file: str
The path of config file.
Examples
--------
>>> builder = IndexV2Builder(
>>> data_type='gold',
>>> arrow_files=arrow_files,
>>> cum_length=cum_length,
>>> indices=indices,
>>> )
>>> builder.build(save_path)
"""
self.arrow_files = arrow_files
self.indices = indices
self.cum_length = cum_length
self.group_length = group_length
self.data_type = data_type
self.max_indices = max_indices
self.example_num = example_num
self.config_file = config_file
if isinstance(arrow_files, str):
if "*" in arrow_files or "?" in arrow_files:
self.arrow_files = list(glob(arrow_files))
else:
self.arrow_files = [arrow_files]
elif isinstance(self.arrow_files, tuple):
self.arrow_files = list(self.arrow_files)
if not isinstance(self.arrow_files, list):
raise ValueError(
f"Expected arrow_files to be a list, got {type(self.arrow_files)}."
)
if self.cum_length is None:
continuous = False
if self.indices is None:
self.group_length = []
continuous = True
print(f"Calculating cum_length...")
self.cum_length = []
cur_cum_length = 0
pbar = tqdm(self.arrow_files)
for arrow_file in pbar:
table_length = len(get_table(arrow_file))
cur_cum_length += table_length
self.cum_length.append(cur_cum_length)
pbar.set_description(f"{self.cum_length[-1]:>12d}")
if continuous:
self.group_length.append(table_length)
if self.indices is None:
self.indices = list(range(self.cum_length[-1]))
if self.group_length is None:
self.group_length = []
if self.data_type is None:
self.data_type = ["Made by IndexV2Builder"]
elif isinstance(self.data_type, str):
self.data_type = [self.data_type]
assert_type(self.data_type, list, "data_type")
assert_type(self.cum_length, (list, np.ndarray), "cum_length")
assert_type(self.group_length, (list, dict, np.ndarray), "group_length")
assert_type(self.indices, (list, dict, np.ndarray), "indices")
self.cum_length = ndarray_to_list(self.cum_length)
self.group_length = ndarray_to_list(self.group_length)
self.indices = ndarray_to_list(self.indices)
if isinstance(self.indices, dict):
for k, v in self.indices.items():
assert_type(v, list, f"indices[{k}]")
if len(self.arrow_files) != len(self.cum_length):
raise ValueError(
f"Length of arrow_files and cum_length does not match. {len(self.arrow_files)} != {len(self.cum_length)}"
)
if len(self.indices) == 0:
raise ValueError(f"No indices found in index_dict.")
if (
isinstance(self.indices, list)
and self.indices[-1] > self.cum_length[-1] - 1
):
raise ValueError(
f"Indices exceed cum_length. {self.indices[-1]} > {self.cum_length[-1] - 1}"
)
if len(self.group_length) > 0:
if len(self.arrow_files) != len(self.group_length):
raise ValueError(
f"Length of arrow_files and group_length does not match. {len(self.arrow_files)} != {len(self.group_length)}"
)
if sum(self.group_length) != len(self.indices):
raise ValueError(
f"Sum of group_length does not match length of indices. {sum(self.group_length)} != {len(self.indices)}"
)
def encode(self):
# Encode arrow files
print("Encoding arrow files...")
arrow_files = []
for arrow_file in tqdm(self.arrow_files):
shortname = arrow_file
arrow_files.append(shortname)
self.arrow_files = arrow_files
# Calculate group_length
print("Calculating group length...")
if isinstance(self.indices, list):
if len(self.group_length) == 0:
self.group_length = self.calc_group_length(
self.indices, self.cum_length
)
else:
print("Group length already calculated, skip.")
elif isinstance(self.indices, dict):
if not isinstance(self.group_length, dict):
self.group_length = {}
for k, v in self.indices.items():
print(f"Calculating group length for {k}...")
if k not in self.group_length or len(self.group_length[k]) == 0:
self.group_length[k] = self.calc_group_length(v, self.cum_length)
else:
print("Group length already calculated, skip.")
else:
raise ValueError(
f"Expected indices type list or dict, got {type(self.indices)}."
)
return {
"data_type": self.data_type,
"config_file": self.config_file if self.config_file is not None else "",
"indices_file": "",
"arrow_files": self.arrow_files,
"cum_length": self.cum_length,
"group_length": self.group_length,
"indices": self.indices,
"example_indices": [],
}
def to_index_v2(self):
return ArrowIndexV2(res_dict=self.encode())
def build(self, save_path):
return self.save(save_path)
def save(self, save_path):
"""
Make index v2 from an index dict.
Parameters
----------
save_path: str or pathlib.Path
The path to save the index file.
"""
index_dict = self.encode()
        # Ensure the indices are either a list or a dict.
save_path = Path(save_path)
save_path.parent.mkdir(exist_ok=True, parents=True)
if (
isinstance(index_dict["indices"], list)
and len(index_dict["indices"]) > self.max_indices
):
            index_dict["example_indices"] = index_dict["indices"][: self.example_num]
indices_to_save = {"x": index_dict["indices"]}
index_dict["indices"] = []
elif isinstance(index_dict["indices"], dict):
indices_to_save = index_dict["indices"]
index_dict["indices"] = {}
num_keys = len(indices_to_save)
example_num_per_key = max(self.example_num // num_keys, 10)
index_dict["example_indices"] = {
k: v[:example_num_per_key] for k, v in index_dict["indices"].items()
}
else:
indices_to_save = None
# save indices
if indices_to_save is not None:
indices_file = save_path.parent / f"{save_path.stem}.index"
indices_dict = {k: np.array(v) for k, v in indices_to_save.items()}
np.savez_compressed(indices_file, **indices_dict)
index_dict["indices_file"] = indices_file.name + ".npz"
with save_path.open("w") as f:
json.dump(index_dict, f, indent=4, ensure_ascii=False)
@staticmethod
def calc_group_length(indices, cum_length):
group_lengths = []
cum_ind = 0
count = 0
for index in tqdm(indices):
if index < cum_length[cum_ind]:
# index is still in the current group
count += 1
else:
# index has exceeded the current group, need to switch to the next group
group_lengths.append(count)
cum_ind += 1
# if the index exceeds the next group, continue to switch to the next group
while index >= cum_length[cum_ind]:
group_lengths.append(0)
cum_ind += 1
count = 1
# The indices array is exhausted, and the last group containing the index should also be added.
group_lengths.append(count)
assert len(group_lengths) <= len(cum_length), (
len(group_lengths),
len(cum_length),
)
# Check if the number of groups is less than the number of cum_length,
# then the last n groups are empty and need to be filled with zeros.
if len(group_lengths) < len(cum_length):
group_lengths.extend([0] * (len(cum_length) - len(group_lengths)))
return group_lengths
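# -----------------------------------------------------------------------------
# Minimal end-to-end sketch (illustrative only, not part of the original code):
# build an IndexV2 file from a set of arrow files and read samples back. The
# paths and the "text_zh" column are hypothetical placeholders that depend on
# your data.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    builder = IndexV2Builder(
        data_type="demo",
        arrow_files="data/*.arrow",  # a glob pattern is expanded by the builder
    )
    builder.build("data/index_v2.json")

    index = ArrowIndexV2("data/index_v2.json")
    print(len(index), index.get_columns(0))
    pil_image = index.get_image(0, max_size=512)
    text = index.get_attribute(0, column="text_zh")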
import math
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
class BlockDistributedSampler(DistributedSampler):
def __init__(
self,
dataset,
num_replicas=None,
rank=None,
shuffle=True,
seed=0,
drop_last=False,
batch_size=-1,
start_index=0,
):
super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
if rank >= num_replicas or rank < 0:
raise ValueError(
"Invalid rank {}, rank should be in the interval"
" [0, {}]".format(rank, num_replicas - 1)
)
if batch_size == -1:
raise ValueError("batch_size should be specified")
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.drop_last = drop_last
self.shuffle = shuffle
self.seed = seed
self.batch_size = batch_size
self._start_index = start_index
self.recompute_sizes()
@property
def start_index(self):
return self._start_index
@start_index.setter
def start_index(self, value):
self._start_index = value
self.recompute_sizes()
def recompute_sizes(self):
self.num_samples = (
len(self.dataset) // self.batch_size * self.batch_size // self.num_replicas
- self._start_index
)
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
indices = list(range(len(self.dataset))) # type: ignore[arg-type]
raw_num_samples = (
len(indices) // self.batch_size * self.batch_size // self.num_replicas
)
raw_total_size = raw_num_samples * self.num_replicas
indices = indices[:raw_total_size]
# We require that the dataset size is divisible by batch_size * num_replicas
# This is naturally satisfied when using index_kits.
# In future, we can remove this assertion.
assert len(indices) == raw_total_size, f"{len(indices)} vs {raw_total_size}"
# subsample with start_index
indices = indices[
self.rank * raw_num_samples
+ self.start_index : (self.rank + 1) * raw_num_samples
]
assert (
len(indices) + self.start_index == raw_num_samples
), f"{len(indices) + self.start_index} vs {raw_num_samples}"
# This is a sequential sampler. The shuffle operation is done by the dataset itself.
return iter(indices)
class DistributedSamplerWithStartIndex(DistributedSampler):
def __init__(
self,
dataset,
num_replicas=None,
rank=None,
shuffle=True,
seed=0,
drop_last=False,
start_index=0,
):
super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
if rank >= num_replicas or rank < 0:
raise ValueError(
"Invalid rank {}, rank should be in the interval"
" [0, {}]".format(rank, num_replicas - 1)
)
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.drop_last = drop_last
self._start_index = start_index
self.recompute_sizes()
self.shuffle = shuffle
self.seed = seed
@property
def start_index(self):
return self._start_index
@start_index.setter
def start_index(self, value):
self._start_index = value
self.recompute_sizes()
def recompute_sizes(self):
# If the dataset length is evenly divisible by # of replicas, then there
# is no need to drop any data, since the dataset will be split equally.
if self.drop_last and (len(self.dataset) - self._start_index) % self.num_replicas != 0: # type: ignore[arg-type]
# Split to nearest available length that is evenly divisible.
# This is to ensure each rank receives the same amount of data when
# using this Sampler.
self.num_samples = math.ceil(
((len(self.dataset) - self._start_index) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
)
else:
self.num_samples = math.ceil((len(self.dataset) - self._start_index) / self.num_replicas) # type: ignore[arg-type]
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
indices = list(range(self._start_index, len(self.dataset))) # type: ignore[arg-type]
if not self.drop_last:
# add extra samples to make it evenly divisible
padding_size = self.total_size - len(indices)
if padding_size <= len(indices):
indices += indices[:padding_size]
else:
indices += (indices * math.ceil(padding_size / len(indices)))[
:padding_size
]
else:
# remove tail of data to make it evenly divisible.
indices = indices[: self.total_size]
assert len(indices) == self.total_size
# subsample with start_index
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
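# -----------------------------------------------------------------------------
# Minimal sketch (illustrative only, not part of the original code): wiring
# BlockDistributedSampler into a torch DataLoader with an explicit rank and
# world size. In a real distributed run these come from dist.get_rank() and
# dist.get_world_size() after init_process_group().
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader, Dataset

    class _ToyDataset(Dataset):
        def __len__(self):
            return 1024

        def __getitem__(self, idx):
            return idx

    dataset = _ToyDataset()
    sampler = BlockDistributedSampler(
        dataset, num_replicas=2, rank=0, shuffle=False, batch_size=8, start_index=0
    )
    loader = DataLoader(dataset, batch_size=8, sampler=sampler)
    print(f"rank 0 yields {len(sampler)} samples in {len(loader)} batches")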
import re
try:
from setuptools import setup
except ImportError:
from distutils.core import setup
with open("index_kits/__init__.py", "r") as file:
regex_version = r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]'
version = re.search(regex_version, file.read(), re.MULTILINE).group(1)
setup(
name="index_kits",
version=version,
author="jarvizhang",
author_email="jarvizhang@tencent.com",
description="An index kits for streaming reading arrow data.",
packages=["index_kits", "index_kits/dataset"],
scripts=["bin/idk"],
install_requires=[
"pillow>=9.3.0",
"tqdm>=4.60.0",
"pyarrow>=10.0.1",
"torch>=1.9",
],
python_requires=">=3.8.12",
)
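# With the setup script above, a typical local install (standard setuptools
# usage; command assumed, not part of the original files) is:
#
#     pip install -e .
#
# which also places the `idk` script from bin/ on the environment's PATH.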
TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
Tencent Hunyuan DiT Release Date: 14 May 2024
THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
1. DEFINITIONS.
a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
i. “Tencent,” “We” or “Us” shall mean THL A29 Limited.
j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent Hunyuan DiT released at https://huggingface.co/Tencent-Hunyuan/HunyuanDiT.
k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union.
m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
n. “including” shall mean including but not limited to.
2. GRANT OF RIGHTS.
We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
3. DISTRIBUTION.
You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
b. You must cause any modified files to carry prominent notices stating that You changed the files;
c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2024 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
4. ADDITIONAL COMMERCIAL TERMS.
If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
5. RULES OF USE.
a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other large language model (other than Tencent Hunyuan or Model Derivatives thereof).
c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
6. INTELLECTUAL PROPERTY.
a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
8. SURVIVAL AND TERMINATION.
a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
9. GOVERNING LAW AND JURISDICTION.
a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
EXHIBIT A
ACCEPTABLE USE POLICY
Tencent reserves the right to update this Acceptable Use Policy from time to time.
Last modified: [insert date]
Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
1. Outside the Territory;
2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
3. To harm Yourself or others;
4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
5. To override or circumvent the safety guardrails and safeguards We have put in place;
6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
9. To intentionally defame, disparage or otherwise harass others;
10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
11. To generate or disseminate personal identifiable information with the purpose of harming others;
12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
13. To impersonate another individual without consent, authorization, or legal right;
14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
19. For military purposes;
20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
Usage and Legal Notices:
Tencent is pleased to support the open source community by making Tencent Hunyuan available.
Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. The below software and/or models in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) THL A29 Limited.
Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement except for the third-party components listed below. Tencent Hunyuan does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
For avoidance of doubts, Tencent Hunyuan means the large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Tencent in accordance with Tencent Hunyuan Community License Agreement.
Other dependencies and licenses:
Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. torch
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Terms of the BSD 3-Clause:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
For the license of other third party components, please refer to the following URL:
https://github.com/pytorch/pytorch/blob/v1.13.1/NOTICE
Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. pandas
Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.
Copyright (c) 2011-2023, Open source contributors.
A copy of the BSD 3-Clause is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/pandas-dev/pandas/tree/v2.0.3/LICENSES
Open Source Software Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. numpy
Copyright (c) 2005-2022, NumPy Developers.
All rights reserved.
A copy of the BSD 3-Clause is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/numpy/numpy/blob/v1.24.4/LICENSES_bundled.txt
Open Source Software/Model Licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. Megatron-LM
Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
A copy of the BSD 3-Clause is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/NVIDIA/Megatron-LM/blob/main/LICENSE
Open Source Software/Models Licensed under the Apache License Version 2.0:
The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2023 THL A29 Limited.
--------------------------------------------------------------------
1. diffusers
Copyright (c) diffusers original author and authors
Please note this software has been modified by Tencent in this distribution.
2. transformers
Copyright (c) transformers original author and authors
3. timm
Copyright 2019 Ross Wightman
4. text-to-text-transfer-transformer
Copyright (c) text-to-text-transfer-transformer original author and authors
Please note this software has been modified by Tencent in this distribution.
5. pytorch-fid
Copyright (c) pytorch-fid original author and authors
Please note this software has been modified by Tencent in this distribution.
6. Image-Quality-Assessment-Toolbox
Copyright 2021 Qunliang Xing
7. accelerate
Copyright (c) accelerate original author and authors
8. IP-Adapter
Copyright (c) IP-Adapter original author and authors
Please note this software has been modified by Tencent in this distribution.
9. mT5
Copyright (c) mT5 original author and authors
10. Mistral-7B
Copyright (c) 2024 Mistral AI, All rights reserved
11. peft
Copyright 2023 The HuggingFace Team. All rights reserved.
Terms of the Apache License Version 2.0:
--------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of this License; and
You must cause any modified files to carry prominent notices stating that You changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Open Source Software/Model Licensed under the BSD 3-Clause License:
--------------------------------------------------------------------
1. torchvision
Copyright (c) Soumith Chintala 2016,
All rights reserved.
2. flash_attn
Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
All rights reserved.
3. apex
Copyright (c) apex original author and authors
A copy of the BSD 3-Clause is included in this file.
Open Source Software Licensed under the HPND License:
--------------------------------------------------------------------
1. Pillow
Copyright © 2010-2023 by Jeffrey A. Clark (Alex) and contributors.
Terms of the HPND License:
--------------------------------------------------------------------
The Python Imaging Library (PIL) is
Copyright © 1997-2011 by Secret Labs AB
Copyright © 1995-2011 by Fredrik Lundh
Pillow is the friendly PIL fork. It is
Copyright © 2010-2023 by Jeffrey A. Clark (Alex) and contributors.
Like PIL, Pillow is licensed under the open source HPND License:
By obtaining, using, and/or copying this software and/or its associated
documentation, you agree that you have read, understood, and will comply
with the following terms and conditions:
Permission to use, copy, modify and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appears in all copies, and that
both that copyright notice and this permission notice appear in supporting
documentation, and that the name of Secret Labs AB or the author not be
used in advertising or publicity pertaining to distribution of the software
without specific, written prior permission.
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
Open Source Software/Model Licensed under the MIT License:
The below software in this distribution may have been modified by Tencent.
--------------------------------------------------------------------
1. einops
Copyright (c) 2018 Alex Rogozhnikov
2. loguru
Copyright (c) 2017
3. Chinese-CLIP
Copyright (c) 2012-2022 OFA-Sys Team
Copyright (c) 2012-2022 Gabriel Ilharco, Mitchell Wortsman, Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, Ludwig Schmidt
4. DeepSpeed
Copyright (c) Microsoft Corporation.
5. glid-3-xl
Copyright (c) 2021 OpenAI
6. lazysizes
Copyright (c) 2015 Alexander Farkas
7. thingsvision
Copyright (c) 2021 Vision and Computational Cognition Group
8. sd-vae-ft-ema
Copyright (c) sd-vae-ft-ema original author and authors
9. ComfyUI-Diffusers
Copyright (c) 2023 Limitex
10. glide-text2im
Copyright (c) 2021 OpenAI
11. improved-diffusion
Copyright (c) 2021 OpenAI
Terms of the MIT License:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. tqdm
Copyright (c) 2013 noamraph
A copy of the MIT is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/tqdm/tqdm/blob/v4.66.1/LICENCE
Open Source Software/Model Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
The below software in this distribution may have been modified by Tencent.
--------------------------------------------------------------------
1. generative-models
Copyright (c) 2023 Stability AI
A copy of the MIT is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/Stability-AI/generative-models/blob/main/LICENSE-CODE
https://github.com/Stability-AI/generative-models/tree/main/model_licenses
Open Source Software/Model Licensed under the Apache License Version 2.0 and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. pyarrow
Copyright 2016-2024 The Apache Software Foundation
A copy of the Apache License Version 2.0 is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/apache/arrow/blob/main/NOTICE.txt
Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2023 THL A29 Limited.
--------------------------------------------------------------------
1. opencv-python
Copyright (c) Olli-Pekka Heinisuo
Terms of the MIT:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For the license of other third party components, please refer to the following URL:
https://github.com/opencv/opencv-python/blob/4.x/LICENSE-3RD-PARTY.txt
Open Source Software Licensed under the MIT License and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. onnxruntime
Copyright (c) Microsoft Corporation.
A copy of the MIT is included in this file.
For the license of other third party components, please refer to the following URL:
https://github.com/microsoft/onnxruntime/blob/v1.16.3/ThirdPartyNotices.txt
Open Source Software/Model Licensed under the Apache License Version 2.0:
The below software in this distribution may have been modified by Tencent.
--------------------------------------------------------------------
1. dwpose
Copyright 2018-2020 Open-MMLab.
Please note this software has been modified by Tencent in this distribution.
Terms of the Apache License Version 2.0:
--------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of this License; and
You must cause any modified files to carry prominent notices stating that You changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Open Source Software Licensed under the License agreement for matplotlib and later and Other Licenses of the Third-Party Components therein:
--------------------------------------------------------------------
1. matplotlib
Copyright (c) 2012- Matplotlib Development Team; All Rights Reserved
Terms of the License agreement for matplotlib versions 1.3.0 and later:
--------------------------------------------------------------------
License agreement for matplotlib versions 1.3.0 and later
=========================================================
1. This LICENSE AGREEMENT is between the Matplotlib Development Team
("MDT"), and the Individual or Organization ("Licensee") accessing and
otherwise using matplotlib software in source or binary form and its
associated documentation.
2. Subject to the terms and conditions of this License Agreement, MDT
hereby grants Licensee a nonexclusive, royalty-free, world-wide license
to reproduce, analyze, test, perform and/or display publicly, prepare
derivative works, distribute, and otherwise use matplotlib
alone or in any derivative version, provided, however, that MDT's
License Agreement and MDT's notice of copyright, i.e., "Copyright (c)
2012- Matplotlib Development Team; All Rights Reserved" are retained in
matplotlib alone or in any derivative version prepared by
Licensee.
3. In the event Licensee prepares a derivative work that is based on or
incorporates matplotlib or any part thereof, and wants to
make the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to matplotlib .
4. MDT is making matplotlib available to Licensee on an "AS
IS" basis. MDT MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, MDT MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB
WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
5. MDT SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR
LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING
MATPLOTLIB , OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF
THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between MDT and
Licensee. This License Agreement does not grant permission to use MDT
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.
8. By copying, installing or otherwise using matplotlib ,
Licensee agrees to be bound by the terms and conditions of this License
Agreement.
License agreement for matplotlib versions prior to 1.3.0
========================================================
1. This LICENSE AGREEMENT is between John D. Hunter ("JDH"), and the
Individual or Organization ("Licensee") accessing and otherwise using
matplotlib software in source or binary form and its associated
documentation.
2. Subject to the terms and conditions of this License Agreement, JDH
hereby grants Licensee a nonexclusive, royalty-free, world-wide license
to reproduce, analyze, test, perform and/or display publicly, prepare
derivative works, distribute, and otherwise use matplotlib
alone or in any derivative version, provided, however, that JDH's
License Agreement and JDH's notice of copyright, i.e., "Copyright (c)
2002-2011 John D. Hunter; All Rights Reserved" are retained in
matplotlib alone or in any derivative version prepared by
Licensee.
3. In the event Licensee prepares a derivative work that is based on or
incorporates matplotlib or any part thereof, and wants to
make the derivative work available to others as provided herein, then
Licensee hereby agrees to include in any such work a brief summary of
the changes made to matplotlib.
4. JDH is making matplotlib available to Licensee on an "AS
IS" basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND
DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB
WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB
FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR
LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING
MATPLOTLIB , OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF
THE POSSIBILITY THEREOF.
6. This License Agreement will automatically terminate upon a material
breach of its terms and conditions.
7. Nothing in this License Agreement shall be deemed to create any
relationship of agency, partnership, or joint venture between JDH and
Licensee. This License Agreement does not grant permission to use JDH
trademarks or trade name in a trademark sense to endorse or promote
products or services of Licensee, or any third party.
8. By copying, installing or otherwise using matplotlib,
Licensee agrees to be bound by the terms and conditions of this License
Agreement.
For the license of other third party components, please refer to the following URL:
https://github.com/matplotlib/matplotlib/blob/v3.7.5/LICENSE
<!-- ## **HunyuanDiT** -->
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/logo.png" height=100>
</p>
# Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding
<div align="center">
<a href="https://github.com/Tencent/HunyuanDiT"><img src="https://img.shields.io/static/v1?label=Hunyuan-DiT Code&message=Github&color=blue&logo=github-pages"></a> &ensp;
<a href="https://dit.hunyuan.tencent.com"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a> &ensp;
<a href="https://arxiv.org/abs/2405.08748"><img src="https://img.shields.io/static/v1?label=Tech Report&message=Arxiv:HunYuan-DiT&color=red&logo=arxiv"></a> &ensp;
<a href="https://arxiv.org/abs/2403.08857"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv:DialogGen&color=red&logo=arxiv"></a> &ensp;
<a href="https://huggingface.co/Tencent-Hunyuan/HunyuanDiT"><img src="https://img.shields.io/static/v1?label=Hunyuan-DiT&message=HuggingFace&color=yellow"></a> &ensp;
<a href="https://hunyuan.tencent.com/bot/chat"><img src="https://img.shields.io/static/v1?label=Hunyuan Bot&message=Web&color=green"></a> &ensp;
<a href="https://huggingface.co/spaces/Tencent-Hunyuan/HunyuanDiT"><img src="https://img.shields.io/static/v1?label=Hunyuan-DiT Demo&message=HuggingFace&color=yellow"></a> &ensp;
<a href="./comfyui"><img src="https://img.shields.io/static/v1?label=ComfyUI Support&message=ComfyUI&color=purple&logo=github-pages"></a> &ensp;
</div>
-----
This repo contains PyTorch model definitions, pre-trained weights and inference/sampling code for our paper exploring Hunyuan-DiT. You can find more visualizations on our [project page](https://dit.hunyuan.tencent.com/).
> [**Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding**](https://arxiv.org/abs/2405.08748) <br>
> [**DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation**](https://arxiv.org/abs/2403.08857) <br>
## 🔥🔥🔥 News!!
* Dec 17, 2024: :tada: Optimized LoRA training with `refined grad checkpoint` and `low-bit optimizer`. Just add `--lowbit-opt` to get started.
* Sep 13, 2024: 🎉 IPAdapter is officially supported by HunYuanDiT; see the documentation in [./ipadapter](./ipadapter). Scaled attention is now used in place of flash attention on V100 GPUs.
* Aug 26, 2024: 🎉 HunYuanDiT ControlNet and LoRA are officially supported by ComfyUI; see the documentation in [./comfyui](./comfyui).
* Jul 15, 2024: 🚀 HunYuanDiT and Shakker.Ai have jointly launched a fine-tuning event based on the HunYuanDiT 1.2 model. By publishing a LoRA or fine-tuned model based on HunYuanDiT, you can earn a bonus of up to $230 from Shakker.Ai. See [Shakker.Ai](https://www.shakker.ai/activitys/shaker-the-world-hunyuan) for more details.
* Jul 15, 2024: :tada: Updated ComfyUI to support standardized workflows and compatibility with weights from the t2i module and LoRA training for versions 1.1/1.2, as well as weights trained by Kohya or the official script.
* Jul 15, 2024: :zap: We offer Docker environments for CUDA 11/12, allowing you to bypass complex installations and play with a single click! See [dockers](#installation-guide-for-linux) for details.
* Jul 08, 2024: :tada: HYDiT-v1.2 version is released. Please check [HunyuanDiT-v1.2](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2) and [Distillation-v1.2](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.2) for more details.
* Jul 03, 2024: :tada: The Kohya-hydit version is now available for v1.1 and v1.2 models, with a GUI for inference. The official Kohya version is under review. See [kohya](./kohya_ss-hydit) for details.
* Jun 27, 2024: :art: Hunyuan-Captioner is released, providing fine-grained captions for training data. See [mllm](./mllm) for details.
* Jun 27, 2024: :tada: LoRA and ControlNet are now supported in diffusers. See [diffusers](./diffusers) for details.
* Jun 27, 2024: :tada: 6GB GPU VRAM Inference scripts are released. See [lite](./lite) for details.
* Jun 19, 2024: :tada: ControlNet is released, supporting canny, pose and depth control. See [training/inference codes](#controlnet) for details.
* Jun 13, 2024: :zap: HYDiT-v1.1 version is released, which mitigates the issue of image oversaturation and alleviates the watermark issue. Please check [HunyuanDiT-v1.1](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1) and
[Distillation-v1.1](https://huggingface.co/Tencent-Hunyuan/Distillation-v1.1) for more details.
* Jun 13, 2024: :truck: The training code is released, offering [full-parameter training](#full-parameter-training) and [LoRA training](#lora).
* Jun 06, 2024: :tada: Hunyuan-DiT is now available in ComfyUI. Please check [ComfyUI](#using-comfyui) for more details.
* Jun 06, 2024: 🚀 We introduce Distillation version for Hunyuan-DiT acceleration, which achieves **50%** acceleration on NVIDIA GPUs. Please check [Distillation](https://huggingface.co/Tencent-Hunyuan/Distillation) for more details.
* Jun 05, 2024: 🤗 Hunyuan-DiT is now available in 🤗 Diffusers! Please check the [example](#using--diffusers) below.
* Jun 04, 2024: :globe_with_meridians: Support Tencent Cloud links to download the pretrained models! Please check the [links](#-download-pretrained-models) below.
* May 22, 2024: 🚀 We introduce TensorRT version for Hunyuan-DiT acceleration, which achieves **47%** acceleration on NVIDIA GPUs. Please check [TensorRT-libs](https://huggingface.co/Tencent-Hunyuan/TensorRT-libs) for instructions.
* May 22, 2024: 💬 We support demo running multi-turn text2image generation now. Please check the [script](#using-gradio) below.
## 🤖 Try it on the web
Welcome to our web-based [**Tencent Hunyuan Bot**](https://hunyuan.tencent.com/bot/chat), where you can explore our innovative products! Just input the suggested prompts below or any other **imaginative prompts containing drawing-related keywords** to activate the Hunyuan text-to-image generation feature. Unleash your creativity and create any picture you desire, **all for free!**
You can use simple prompts similar to natural language text
> 画一只穿着西装的猪
>
> draw a pig in a suit
>
> 生成一幅画,赛博朋克风,跑车
>
> generate a painting, cyberpunk style, sports car
or multi-turn language interactions to create the picture.
> 画一个木制的鸟
>
> draw a wooden bird
>
> 变成玻璃的
>
> turn into glass
## 🤗 Community Contribution Leaderboard
1. By [@TTPlanetPig](https://github.com/TTPlanetPig)
- HunyuanDIT_v1.2 ControlNet models
- Inpaint controlnet: https://huggingface.co/TTPlanet/HunyuanDiT_Controlnet_inpainting
- Tile controlnet: https://huggingface.co/TTPlanet/HunyuanDiT_Controlnet_tile
- Lineart controlnet: https://huggingface.co/TTPlanet/HunyuanDiT_Controlnet_lineart
- HunyuanDIT_v1.2 ComfyUI nodes
- Comfyui_TTP_CN_Preprocessor: https://github.com/TTPlanetPig/Comfyui_TTP_CN_Preprocessor
- Comfyui_TTP_Toolset: https://github.com/TTPlanetPig/Comfyui_TTP_Toolset
2. By [@sdbds](https://github.com/sdbds) (bilibili up [青龙圣者](https://space.bilibili.com/219296))
- Kohya_ss-hydit train tools: https://github.com/zml-ai/HunyuanDIT-PRE/tree/main/kohya_ss-hydit
3. By [@CrazyBoyM](https://github.com/CrazyBoyM) (bilibili up [飞鸟白菜](https://space.bilibili.com/291593914))
- ComfyUI support for HunyuanDIT_v1.2 Controlnet: https://github.com/comfyanonymous/ComfyUI/pull/4245
4. By [@L_A_X](https://huggingface.co/Laxhar/Freeway_Animation_HunYuan_Demo)
- HunyuanDIT_v1.2 base model for anime
- Original hf: https://huggingface.co/Laxhar/Freeway_Animation_HunYuan_Demo
- Converted ComfyUI model: https://huggingface.co/comfyanonymous/Freeway_Animation_Hunyuan_Demo_ComfyUI_Converted
## 📑 Open-source Plan
- Hunyuan-DiT (Text-to-Image Model)
- [x] Inference
- [x] Checkpoints
- [x] Distillation Version
- [x] TensorRT Version
- [x] Training
- [x] Lora
- [x] Controlnet (Pose, Canny, Depth)
- [x] 6GB GPU VRAM Inference
- [x] IP-adapter
- [ ] Hunyuan-DiT-S checkpoints (0.7B model)
- Mllm
- Hunyuan-Captioner (Re-caption the raw image-text pairs)
- [x] Inference
- [Hunyuan-DialogGen](https://github.com/Centaurusalpha/DialogGen) (Prompt Enhancement Model)
- [x] Inference
- [X] Web Demo (Gradio)
- [x] Multi-turn T2I Demo (Gradio)
- [X] Cli Demo
- [X] ComfyUI
- [X] Diffusers
- [X] Kohya
- [ ] WebUI
## Contents
- [Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding](#hunyuan-dit--a-powerful-multi-resolution-diffusion-transformer-with-fine-grained-chinese-understanding)
- [🔥🔥🔥 News!!](#-news)
- [🤖 Try it on the web](#-try-it-on-the-web)
- [🤗 Community Contribution Leaderboard](#-community-contribution-leaderboard)
- [📑 Open-source Plan](#-open-source-plan)
- [Contents](#contents)
- [Abstract](#abstract)
- [🎉 Hunyuan-DiT Key Features](#-hunyuan-dit-key-features)
- [Chinese-English Bilingual DiT Architecture](#chinese-english-bilingual-dit-architecture)
- [Multi-turn Text2Image Generation](#multi-turn-text2image-generation)
- [📈 Comparisons](#-comparisons)
- [🎥 Visualization](#-visualization)
- [📜 Requirements](#-requirements)
- [🛠️ Dependencies and Installation](#️-dependencies-and-installation)
- [Installation Guide for Linux](#installation-guide-for-linux)
- [🧱 Download Pretrained Models](#-download-pretrained-models)
- [1. Using HF-Mirror](#1-using-hf-mirror)
- [2. Resume Download](#2-resume-download)
- [:truck: Training](#truck-training)
- [Data Preparation](#data-preparation)
- [Full-parameter Training](#full-parameter-training)
- [LoRA](#lora)
- [🔑 Inference](#-inference)
- [6GB GPU VRAM Inference](#6gb-gpu-vram-inference)
- [Using Gradio](#using-gradio)
- [Using 🤗 Diffusers](#using--diffusers)
- [Using Command Line](#using-command-line)
- [More Configurations](#more-configurations)
- [Using ComfyUI](#using-comfyui)
- [Using Kohya](#using-kohya)
- [Using Previous versions](#using-previous-versions)
- [:building\_construction: Adapter](#building_construction-adapter)
- [ControlNet](#controlnet)
- [IP-Adapter](#IP-Adapter)
- [:art: Hunyuan-Captioner](#art-hunyuan-captioner)
- [Examples](#examples)
- [Instructions](#instructions)
- [Inference](#inference)
- [Gradio](#gradio)
- [🚀 Acceleration (for Linux)](#-acceleration-for-linux)
- [🔗 BibTeX](#-bibtex)
- [Start History](#start-history)
## **Abstract**
We present Hunyuan-DiT, a text-to-image diffusion transformer with fine-grained understanding of both English and Chinese. To construct Hunyuan-DiT, we carefully designed the transformer structure, text encoder, and positional encoding. We also build from scratch a whole data pipeline to update and evaluate data for iterative model optimization. For fine-grained language understanding, we train a Multimodal Large Language Model to refine the captions of the images. Finally, Hunyuan-DiT can perform multi-round multi-modal dialogue with users, generating and refining images according to the context.
Through our carefully designed holistic human evaluation protocol with more than 50 professional human evaluators, Hunyuan-DiT sets a new state-of-the-art in Chinese-to-image generation compared with other open-source models.
## 🎉 **Hunyuan-DiT Key Features**
### **Chinese-English Bilingual DiT Architecture**
Hunyuan-DiT is a diffusion model in the latent space, as depicted in the figure below. Following the Latent Diffusion Model, we use a pre-trained Variational Autoencoder (VAE) to compress images into a low-dimensional latent space and train a diffusion model to learn the data distribution there. Our diffusion model is parameterized with a transformer. To encode the text prompts, we leverage a combination of a pre-trained bilingual (English and Chinese) CLIP encoder and a multilingual T5 encoder.
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/framework.png" height=450>
</p>
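For intuition, here is a rough, illustrative sketch (not the repository's actual code) of how two text-encoder outputs of different widths can be projected to a common width and concatenated along the sequence dimension before conditioning the transformer; all dimensions below are placeholders:
```py
import torch
import torch.nn as nn

d_model = 1024                            # hypothetical DiT hidden width
clip_states = torch.randn(1, 77, 768)     # stand-in for bilingual CLIP text features
t5_states = torch.randn(1, 256, 2048)     # stand-in for multilingual T5 text features

proj_clip = nn.Linear(768, d_model)       # project each stream to the shared width
proj_t5 = nn.Linear(2048, d_model)

# Concatenate along the sequence dimension to form the text conditioning context.
text_context = torch.cat([proj_clip(clip_states), proj_t5(t5_states)], dim=1)
print(text_context.shape)                 # torch.Size([1, 333, 1024])
```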
### Multi-turn Text2Image Generation
Understanding natural language instructions and performing multi-turn interactions with users are important for a
text-to-image system. They help build a dynamic and iterative creation process that brings the user's idea into reality
step by step. In this section, we detail how we empower Hunyuan-DiT with the ability to perform multi-round
conversations and image generation: we train an MLLM to understand the multi-round user dialogue
and output a new text prompt for image generation.
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/mllm.png" height=300>
</p>
## 📈 Comparisons
To comprehensively compare the generation capabilities of HunyuanDiT and other models, we constructed a 4-dimensional test set covering Text-Image Consistency, Excluding AI Artifacts, Subject Clarity, and Aesthetics. More than 50 professional evaluators performed the evaluation.
<p align="center">
<table>
<thead>
<tr>
<th rowspan="2">Model</th> <th rowspan="2">Open Source</th> <th>Text-Image Consistency (%)</th> <th>Excluding AI Artifacts (%)</th> <th>Subject Clarity (%)</th> <th rowspan="2">Aesthetics (%)</th> <th rowspan="2">Overall (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td>SDXL</td> <td>&#10004;</td> <td>64.3</td> <td>60.6</td> <td>91.1</td> <td>76.3</td> <td>42.7</td>
</tr>
<tr>
<td>PixArt-α</td> <td>&#10004;</td> <td>68.3</td> <td>60.9</td> <td>93.2</td> <td>77.5</td> <td>45.5</td>
</tr>
<tr>
<td>Playground 2.5</td> <td>&#10004;</td> <td>71.9</td> <td>70.8</td> <td>94.9</td> <td>83.3</td> <td>54.3</td>
</tr>
<tr>
<td>SD 3</td> <td>&#10008;</td> <td>77.1</td> <td>69.3</td> <td>94.6</td> <td>82.5</td> <td>56.7</td>
</tr>
<tr>
<td>MidJourney v6</td><td>&#10008;</td> <td>73.5</td> <td>80.2</td> <td>93.5</td> <td>87.2</td> <td>63.3</td>
</tr>
<tr>
<td>DALL-E 3</td><td>&#10008;</td> <td>83.9</td> <td>80.3</td> <td>96.5</td> <td>89.4</td> <td>71.0</td>
</tr>
<tr style="font-weight: bold; background-color: #f2f2f2;">
<td>Hunyuan-DiT</td><td>&#10004;</td> <td>74.2</td> <td>74.3</td> <td>95.4</td> <td>86.6</td> <td>59.0</td>
</tr>
</tbody>
</table>
</p>
## 🎥 Visualization
* **Chinese Elements**
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/chinese elements understanding.png" height=220>
</p>
* **Long Text Input**
<p align="center">
<img src="https://raw.githubusercontent.com/Tencent/HunyuanDiT/main/asset/long text understanding.png" height=310>
</p>
* **Multi-turn Text2Image Generation**
https://github.com/Tencent/tencent.github.io/assets/27557933/94b4dcc3-104d-44e1-8bb2-dc55108763d1
---
## 📜 Requirements
This repo consists of DialogGen (a prompt enhancement model) and Hunyuan-DiT (a text-to-image model).
The following table shows the requirements for running the models (batch size = 1):
| Model | --load-4bit (DialogGen) | GPU Peak Memory | GPU |
|:-----------------------:|:-----------------------:|:---------------:|:---------------:|
| DialogGen + Hunyuan-DiT | ✘ | 32G | A100 |
| DialogGen + Hunyuan-DiT | ✔ | 22G | A100 |
| Hunyuan-DiT | - | 11G | A100 |
| Hunyuan-DiT | - | 14G | RTX3090/RTX4090 |
* An NVIDIA GPU with CUDA support is required.
* We have tested V100 and A100 GPUs.
* **Minimum**: The minimum GPU memory required is 11GB.
* **Recommended**: We recommend using a GPU with 32GB of memory for better generation quality.
* Tested operating system: Linux
## 🛠️ Dependencies and Installation
Begin by cloning the repository:
```shell
git clone https://github.com/tencent/HunyuanDiT
cd HunyuanDiT
```
### Installation Guide for Linux
We provide an `environment.yml` file for setting up a Conda environment.
Conda's installation instructions are available [here](https://docs.anaconda.com/free/miniconda/index.html).
We recommend CUDA versions 11.7 and 12.0+.
```shell
# 1. Prepare conda environment
conda env create -f environment.yml
# 2. Activate the environment
conda activate HunyuanDiT
# 3. Install pip dependencies
python -m pip install -r requirements.txt
# 4. Install flash attention v2 for acceleration (requires CUDA 11.6 or above)
python -m pip install git+https://github.com/Dao-AILab/flash-attention.git@v2.1.2.post3
```
Alternatively, you can use Docker to set up the environment.
```shell
# 1. Use the following link to download the docker image tar file.
# For CUDA 12
wget https://dit.hunyuan.tencent.com/download/HunyuanDiT/hunyuan_dit_cu12.tar
# For CUDA 11
wget https://dit.hunyuan.tencent.com/download/HunyuanDiT/hunyuan_dit_cu11.tar
# 2. Import the docker tar file and show the image meta information
# For CUDA 12
docker load -i hunyuan_dit_cu12.tar
# For CUDA 11
docker load -i hunyuan_dit_cu11.tar
docker image ls
# 3. Run the container based on the image
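# Note: replace `docker_image_tag` in the command below with the image tag shown by `docker image ls`.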
docker run -dit --gpus all --init --net=host --uts=host --ipc=host --name hunyuandit --security-opt=seccomp=unconfined --ulimit=stack=67108864 --ulimit=memlock=-1 --privileged docker_image_tag
```
## 🧱 Download Pretrained Models
To download the model, first install the huggingface-cli. (Detailed instructions are available [here](https://huggingface.co/docs/huggingface_hub/guides/cli).)
```shell
python -m pip install "huggingface_hub[cli]"
```
Then download the model using the following commands:
```shell
# Create a directory named 'ckpts' where the model will be saved, fulfilling the prerequisites for running the demo.
mkdir ckpts
# Use the huggingface-cli tool to download the model.
# The download time may vary from 10 minutes to 1 hour depending on network conditions.
huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.2 --local-dir ./ckpts
```
<details>
<summary>💡Tips for using huggingface-cli (network problem)</summary>
##### 1. Using HF-Mirror
If you encounter slow download speeds in China, you can try a mirror to speed up the download process. For example,
```shell
HF_ENDPOINT=https://hf-mirror.com huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.2 --local-dir ./ckpts
```
##### 2. Resume Download
`huggingface-cli` supports resuming downloads. If the download is interrupted, you can just rerun the download
command to resume the download process.
Note: If an error like `No such file or directory: 'ckpts/.huggingface/.gitignore.lock'` occurs during the download
process, you can ignore it and rerun the download command.
</details>
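If you prefer Python over the CLI, the same checkpoints can also be fetched with the `huggingface_hub` API (a minimal sketch mirroring the command above):
```py
from huggingface_hub import snapshot_download

# Equivalent to `huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.2 --local-dir ./ckpts`.
snapshot_download(repo_id="Tencent-Hunyuan/HunyuanDiT-v1.2", local_dir="./ckpts")
```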
---
All models will be automatically downloaded. For more information about the model, visit the Hugging Face repository [here](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT).
| Model | #Params | Huggingface Download URL | Tencent Cloud Download URL |
|:-----------------:|:-------:|:------------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:|
| mT5 | 1.6B | [mT5](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/mt5) | [mT5](https://dit.hunyuan.tencent.com/download/HunyuanDiT/mt5.zip) |
| CLIP | 350M | [CLIP](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/clip_text_encoder) | [CLIP](https://dit.hunyuan.tencent.com/download/HunyuanDiT/clip_text_encoder.zip) |
| Tokenizer | - | [Tokenizer](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/tokenizer) | [Tokenizer](https://dit.hunyuan.tencent.com/download/HunyuanDiT/tokenizer.zip) |
| DialogGen | 7.0B | [DialogGen](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/dialoggen) | [DialogGen](https://dit.hunyuan.tencent.com/download/HunyuanDiT/dialoggen.zip) |
| sdxl-vae-fp16-fix | 83M | [sdxl-vae-fp16-fix](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/sdxl-vae-fp16-fix) | [sdxl-vae-fp16-fix](https://dit.hunyuan.tencent.com/download/HunyuanDiT/sdxl-vae-fp16-fix.zip) |
| Hunyuan-DiT-v1.0 | 1.5B | [Hunyuan-DiT](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT/tree/main/t2i/model) | [Hunyuan-DiT-v1.0](https://dit.hunyuan.tencent.com/download/HunyuanDiT/model.zip) |
| Hunyuan-DiT-v1.1 | 1.5B | [Hunyuan-DiT-v1.1](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.1/tree/main/t2i/model) | [Hunyuan-DiT-v1.1](https://dit.hunyuan.tencent.com/download/HunyuanDiT/model-v1_1.zip) |
| Hunyuan-DiT-v1.2 | 1.5B | [Hunyuan-DiT-v1.2](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2/tree/main/t2i/model) | [Hunyuan-DiT-v1.2](https://dit.hunyuan.tencent.com/download/HunyuanDiT/model-v1_2.zip) |
| Data demo | - | - | [Data demo](https://dit.hunyuan.tencent.com/download/HunyuanDiT/data_demo.zip) |
## :truck: Training
### Data Preparation
Refer to the commands below to prepare the training data.
1. Install dependencies
We offer an efficient data management library named IndexKits, which supports reading hundreds of millions of data samples during training; see more in [docs](./IndexKits/README.md).
```shell
# 1 Install dependencies
cd HunyuanDiT
pip install -e ./IndexKits
```
2. Data download
Feel free to download the [data demo](https://dit.hunyuan.tencent.com/download/HunyuanDiT/data_demo.zip).
```shell
# 2 Data download
wget -O ./dataset/data_demo.zip https://dit.hunyuan.tencent.com/download/HunyuanDiT/data_demo.zip
unzip ./dataset/data_demo.zip -d ./dataset
mkdir ./dataset/porcelain/arrows ./dataset/porcelain/jsons
```
3. Data conversion
Create a CSV file for training data with the fields listed in the table below.
| Fields | Required | Description | Example |
|:---------------:| :------: |:----------------:|:-----------:|
| `image_path` | Required | image path | `./dataset/porcelain/images/0.png` |
| `text_zh` | Required | text | 青花瓷风格,一只蓝色的鸟儿站在蓝色的花瓶上,周围点缀着白色花朵,背景是白色 |
| `md5` | Optional | image md5 (Message Digest Algorithm 5) | `d41d8cd98f00b204e9800998ecf8427e` |
| `width` | Optional | image width | `1024` |
| `height` | Optional | image height | `1024` |
> ⚠️ Optional fields like MD5, width, and height can be omitted. If omitted, the script below will automatically calculate them. This process can be time-consuming when dealing with large-scale training data.
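For illustration, a minimal CSV containing only the two required fields could be written as follows (a sketch; the image path and caption are placeholders based on the demo data layout above):
```py
import csv
import os

os.makedirs("./dataset/porcelain/csvfile", exist_ok=True)

rows = [
    {"image_path": "./dataset/porcelain/images/0.png",
     "text_zh": "青花瓷风格,一只蓝色的鸟儿站在蓝色的花瓶上,周围点缀着白色花朵,背景是白色"},
]

with open("./dataset/porcelain/csvfile/image_text.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["image_path", "text_zh"])
    writer.writeheader()
    writer.writerows(rows)
```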
We use [Arrow](https://github.com/apache/arrow) as the training data format, which offers a standard and efficient in-memory data representation. A conversion script is provided to transform CSV files into Arrow format.
```shell
# 3 Data conversion
python ./hydit/data_loader/csv2arrow.py ./dataset/porcelain/csvfile/image_text.csv ./dataset/porcelain/arrows 1
```
4. Data Selection and Configuration File Creation
We configure the training data through YAML files. In these files, you can set up standard data processing strategies for the training data, such as filtering, copying, and deduplication. For more details, see [./IndexKits](IndexKits/docs/MakeDataset.md).
For a sample file, please refer to [file](./dataset/yamls/porcelain.yaml). For a full parameter configuration file, see [file](./IndexKits/docs/MakeDataset.md).
5. Create the training data index file using the YAML file.
```shell
# Single Resolution Data Preparation
idk base -c dataset/yamls/porcelain.yaml -t dataset/porcelain/jsons/porcelain.json
# Multi Resolution Data Preparation
idk multireso -c dataset/yamls/porcelain_mt.yaml -t dataset/porcelain/jsons/porcelain_mt.json
```
The directory structure of the `porcelain` dataset is:
```shell
cd ./dataset
porcelain
├──images/ (image files)
│ ├──0.png
│ ├──1.png
│ ├──......
├──csvfile/ (csv files containing text-image pairs)
│ ├──image_text.csv
├──arrows/ (arrow files containing all necessary training data)
│ ├──00000.arrow
│ ├──00001.arrow
│ ├──......
├──jsons/ (final training data index files which read data from arrow files during training)
│ ├──porcelain.json
│ ├──porcelain_mt.json
```
### Full-parameter Training
**Requirement:**
1. The minimum requirement is a single GPU with at least 20GB of memory, but we recommend using a GPU with about 30GB of memory to avoid host-memory offloading.
2. Additionally, we encourage users to leverage multiple GPUs across different nodes to speed up training on large datasets.
**Notice:**
1. Personal users can also use the lightweight Kohya to fine-tune the model with about 16GB of memory. We are currently working to further reduce the memory usage of our industry-level framework for personal users.
2. If you have enough GPU memory, try removing `--cpu-offloading` or `--gradient-checkpointing` to reduce training time.
For distributed training, you can control **single-node** / **multi-node** training by adjusting parameters such as `--hostfile` and `--master_addr`. For more details, see [link](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node).
```shell
# Single Resolution Training
PYTHONPATH=./ sh hydit/train.sh --index-file dataset/porcelain/jsons/porcelain.json
# Multi Resolution Training
PYTHONPATH=./ sh hydit/train.sh --index-file dataset/porcelain/jsons/porcelain_mt.json --multireso --reso-step 64
# Training with old version of HunyuanDiT (<= v1.1)
PYTHONPATH=./ sh hydit/train_v1.1.sh --index-file dataset/porcelain/jsons/porcelain.json
```
After checkpoints are saved, you can use the following command to evaluate the model.
```shell
# Inference
# You should replace the 'log_EXP/xxx/checkpoints/final.pt' with your actual path.
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只可爱的哈士奇" --no-enhance --dit-weight log_EXP/xxx/checkpoints/final.pt --load-key module
# Old version of HunyuanDiT (<= v1.1)
# You should replace the 'log_EXP/xxx/checkpoints/final.pt' with your actual path.
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只可爱的哈士奇" --model-root ./HunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03 --no-enhance --dit-weight log_EXP/xxx/checkpoints/final.pt --load-key module
```
### LoRA
We provide training and inference scripts for LoRA, detailed in the [./lora](./lora/README.md).
```shell
# Training for porcelain LoRA.
PYTHONPATH=./ sh lora/train_lora_with_fa.sh --index-file dataset/porcelain/jsons/porcelain.json
# Inference using trained LORA weights.
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只小狗" --no-enhance --lora-ckpt log_EXP/001-lora_porcelain_ema_rank64/checkpoints/0001000.pt
```
If you cannot install flash_attn, use the following commands instead:
```shell
# Training for porcelain LoRA.
PYTHONPATH=./ sh lora/train_lora.sh --index-file dataset/porcelain/jsons/porcelain.json
# Inference using trained LORA weights.
python sample_t2i.py --infer-mode torch --prompt "青花瓷风格,一只小狗" --no-enhance --lora-ckpt log_EXP/001-lora_porcelain_ema_rank64/checkpoints/0001000.pt
```
We offer two sets of trained LoRA weights, `porcelain` and `jade`; see details at [links](https://huggingface.co/Tencent-Hunyuan/HYDiT-LoRA).
```shell
cd HunyuanDiT
# Use the huggingface-cli tool to download the model.
huggingface-cli download Tencent-Hunyuan/HYDiT-LoRA --local-dir ./ckpts/t2i/lora
# Quick start
python sample_t2i.py --infer-mode fa --prompt "青花瓷风格,一只猫在追蝴蝶" --no-enhance --load-key ema --lora-ckpt ./ckpts/t2i/lora/porcelain
```
<table>
<tr>
<td colspan="4" align="center">Examples of training data</td>
</tr>
<tr>
<td align="center"><img src="lora/asset/porcelain/train/0.png" alt="Image 0" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/train/1.png" alt="Image 1" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/train/2.png" alt="Image 2" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/train/3.png" alt="Image 3" width="200"/></td>
</tr>
<tr>
<td align="center">青花瓷风格,一只蓝色的鸟儿站在蓝色的花瓶上,周围点缀着白色花朵,背景是白色 (Porcelain style, a blue bird stands on a blue vase, surrounded by white flowers, with a white background.
)</td>
<td align="center">青花瓷风格,这是一幅蓝白相间的陶瓷盘子,上面描绘着一只狐狸和它的幼崽在森林中漫步,背景是白色 (Porcelain style, this is a blue and white ceramic plate depicting a fox and its cubs strolling in the forest, with a white background.)</td>
<td align="center">青花瓷风格,在黑色背景上,一只蓝色的狼站在蓝白相间的盘子上,周围是树木和月亮 (Porcelain style, on a black background, a blue wolf stands on a blue and white plate, surrounded by trees and the moon.)</td>
<td align="center">青花瓷风格,在蓝色背景上,一只蓝色蝴蝶和白色花朵被放置在中央 (Porcelain style, on a blue background, a blue butterfly and white flowers are placed in the center.)</td>
</tr>
<tr>
<td colspan="4" align="center">Examples of inference results</td>
</tr>
<tr>
<td align="center"><img src="lora/asset/porcelain/inference/0.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/inference/1.png" alt="Image 5" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/inference/2.png" alt="Image 6" width="200"/></td>
<td align="center"><img src="lora/asset/porcelain/inference/3.png" alt="Image 7" width="200"/></td>
</tr>
<tr>
<td align="center">青花瓷风格,苏州园林 (Porcelain style, Suzhou Gardens.)</td>
<td align="center">青花瓷风格,一朵荷花 (Porcelain style, a lotus flower.)</td>
<td align="center">青花瓷风格,一只羊(Porcelain style, a sheep.)</td>
<td align="center">青花瓷风格,一个女孩在雨中跳舞(Porcelain style, a girl dancing in the rain.)</td>
</tr>
</table>
## 🔑 Inference
### 6GB GPU VRAM Inference
Running HunyuanDiT with less than 6GB of GPU VRAM is now possible, based on [diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/hunyuandit). Here we provide instructions and a demo for a quick start.
> The 6GB version supports Nvidia Ampere architecture series graphics cards such as RTX 3070/3080/4080/4090, A100, and so on.
The only thing you need to do is install the following libraries:
```bash
pip install -U bitsandbytes
pip install git+https://github.com/huggingface/diffusers
pip install torch==2.0.0
```
Then you can enjoy your HunyuanDiT text-to-image journey under 6GB GPU VRAM directly!
Here is a demo for you.
```bash
cd HunyuanDiT
# Quick start
model_id=Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled
prompt=一个宇航员在骑马
infer_steps=50
guidance_scale=6
python3 lite/inference.py ${model_id} ${prompt} ${infer_steps} ${guidance_scale}
```
More details can be found in [./lite](lite/README.md).
### Using Gradio
Make sure the conda environment is activated before running the following command.
```shell
# By default, we start a Chinese UI. Using Flash Attention for acceleration.
python app/hydit_app.py --infer-mode fa
# Using special port and host
python app/hydit_app.py --infer-mode fa --server_name 0.0.0.0 --server_port 443 --load-key distill
# You can disable the enhancement model if the GPU memory is insufficient.
# The enhancement will be unavailable until you restart the app without the `--no-enhance` flag.
python app/hydit_app.py --no-enhance --infer-mode fa
# Start with English UI
python app/hydit_app.py --lang en --infer-mode fa
# Start a multi-turn T2I generation UI.
# If your GPU memory is less than 32GB, use '--load-4bit' to enable 4-bit quantization, which requires at least 22GB of memory.
python app/multiTurnT2I_app.py --infer-mode fa
```
Then the demo can be accessed at http://0.0.0.0:443. Note that 0.0.0.0 here should be replaced with X.X.X.X, the IP address of your server.
### Using 🤗 Diffusers
Please install PyTorch version 2.0 or higher in advance to satisfy the requirements of the specified version of the diffusers library.
Install 🤗 diffusers, ensuring that the version is at least 0.28.1:
```shell
pip install git+https://github.com/huggingface/diffusers.git
```
or
```shell
pip install diffusers
```
You can generate images with both Chinese and English prompts using the following Python script:
```py
import torch
from diffusers import HunyuanDiTPipeline
pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers", torch_dtype=torch.float16)
pipe.to("cuda")
# You may also use English prompt as HunyuanDiT supports both English and Chinese
# prompt = "An astronaut riding a horse"
prompt = "一个宇航员在骑马"
image = pipe(prompt).images[0]
```
You can use our distilled model to generate images even faster:
```py
import torch
from diffusers import HunyuanDiTPipeline
pipe = HunyuanDiTPipeline.from_pretrained("Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled", torch_dtype=torch.float16)
pipe.to("cuda")
# You may also use English prompt as HunyuanDiT supports both English and Chinese
# prompt = "An astronaut riding a horse"
prompt = "一个宇航员在骑马"
image = pipe(prompt, num_inference_steps=25).images[0]
```
More details can be found in [HunyuanDiT-v1.2-Diffusers-Distilled](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers-Distilled)
**More functions:** For other functions like LoRA and ControlNet, please have a look at the README of [./diffusers](diffusers).
### Using Command Line
We provide several commands for a quick start:
```shell
# Only Text-to-Image. Flash Attention mode
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --no-enhance
# Generate an image with other image sizes.
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --image-size 1280 768
# Prompt Enhancement + Text-to-Image. DialogGen loads with 4-bit quantization, but it may lose some performance.
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --load-4bit
```
More example prompts can be found in [example_prompts.txt](example_prompts.txt)
### More Configurations
We list some more useful configurations for easy usage:
| Argument | Default | Description |
|:---------------:|:---------:|:---------------------------------------------------:|
| `--prompt` | None | The text prompt for image generation |
| `--image-size` | 1024 1024 | The size of the generated image |
| `--seed` | 42 | The random seed for generating images |
| `--infer-steps` | 100 | The number of steps for sampling |
| `--negative` | - | The negative prompt for image generation |
| `--infer-mode` | torch | The inference mode (torch, fa, or trt) |
| `--sampler` | ddpm | The diffusion sampler (ddpm, ddim, or dpmms) |
| `--no-enhance` | False | Disable the prompt enhancement model |
| `--model-root` | ckpts | The root directory of the model checkpoints |
| `--load-key` | ema | Load the student model or EMA model (ema or module) |
| `--load-4bit`  | False     | Load the DialogGen model with 4-bit quantization     |
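For illustration, the hedged Python sketch below combines several of these options into one `sample_t2i.py` invocation via `subprocess` (flag names come from the table above; the prompt and values are examples only, not recommended settings):
```py
# A minimal sketch: drive sample_t2i.py with several of the configuration
# options listed above. Flag names follow the table; values are examples.
import subprocess

cmd = [
    "python", "sample_t2i.py",
    "--infer-mode", "fa",           # torch, fa, or trt
    "--prompt", "渔舟唱晚",
    "--image-size", "1280", "768",  # two values, as in the CLI example above
    "--seed", "42",
    "--infer-steps", "50",
    "--sampler", "ddim",            # ddpm, ddim, or dpmms
    "--no-enhance",                 # disable the prompt enhancement model
]
subprocess.run(cmd, check=True)
```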
### Using ComfyUI
- Support two workflows: Standard ComfyUI and Diffusers Wrapper, with the former being recommended.
- Support HunyuanDiT-v1.1 and v1.2.
- Support module, LoRA, and CLIP LoRA models trained by Kohya.
- Support module and LoRA models trained by the HunyuanDiT official training scripts.
- ControlNet support.
More details can be found in [./comfyui](comfyui/README.md)
### Using Kohya
We provide custom code for the kohya_ss GUI and sd-scripts training code for HunyuanDiT.
![dreambooth](kohya_ss-hydit/img/dreambooth.png)
More details can be found in [./kohya_ss-hydit](kohya_ss-hydit/README.md)
### Using Previous versions
* **Hunyuan-DiT <= v1.1**
```shell
# ============================== v1.1 ==============================
# Download the model
huggingface-cli download Tencent-Hunyuan/HunyuanDiT-v1.1 --local-dir ./HunyuanDiT-v1.1
# Inference with the model
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --model-root ./HunyuanDiT-v1.1 --use-style-cond --size-cond 1024 1024 --beta-end 0.03
# ============================== v1.0 ==============================
# Download the model
huggingface-cli download Tencent-Hunyuan/HunyuanDiT --local-dir ./HunyuanDiT-v1.0
# Inference with the model
python sample_t2i.py --infer-mode fa --prompt "渔舟唱晚" --model-root ./HunyuanDiT-v1.0 --use-style-cond --size-cond 1024 1024 --beta-end 0.03
```
## :building_construction: Adapter
### ControlNet
We provide training scripts for ControlNet, detailed in the [./controlnet](./controlnet/README.md).
```shell
# Training for canny ControlNet.
PYTHONPATH=./ sh hydit/train_controlnet.sh
```
We offer three types of trained ControlNet weights, for `canny`, `depth`, and `pose`; see details at [this link](https://huggingface.co/Tencent-Hunyuan/HYDiT-ControlNet).
```shell
cd HunyuanDiT
# Use the huggingface-cli tool to download the model.
# We recommend using distilled weights as the base model for ControlNet inference, as our provided pretrained weights are trained on them.
huggingface-cli download Tencent-Hunyuan/HYDiT-ControlNet-v1.2 --local-dir ./ckpts/t2i/controlnet
huggingface-cli download Tencent-Hunyuan/Distillation-v1.2 ./pytorch_model_distill.pt --local-dir ./ckpts/t2i/model
# Quick start
python3 sample_controlnet.py --infer-mode fa --no-enhance --load-key distill --infer-steps 50 --control-type canny --prompt "在夜晚的酒店门前,一座古老的中国风格的狮子雕像矗立着,它的眼睛闪烁着光芒,仿佛在守护着这座建筑。背景是夜晚的酒店前,构图方式是特写,平视,居中构图。这张照片呈现了真实摄影风格,蕴含了中国雕塑文化,同时展现了神秘氛围" --condition-image-path controlnet/asset/input/canny.jpg --control-weight 1.0
```
<table>
<tr>
<td colspan="3" align="center">Condition Input</td>
</tr>
<tr>
<td align="center">Canny ControlNet </td>
<td align="center">Depth ControlNet </td>
<td align="center">Pose ControlNet </td>
</tr>
<tr>
<td align="center">在夜晚的酒店门前,一座古老的中国风格的狮子雕像矗立着,它的眼睛闪烁着光芒,仿佛在守护着这座建筑。背景是夜晚的酒店前,构图方式是特写,平视,居中构图。这张照片呈现了真实摄影风格,蕴含了中国雕塑文化,同时展现了神秘氛围<br>(At night, an ancient Chinese-style lion statue stands in front of the hotel, its eyes gleaming as if guarding the building. The background is the hotel entrance at night, with a close-up, eye-level, and centered composition. This photo presents a realistic photographic style, embodies Chinese sculpture culture, and reveals a mysterious atmosphere.) </td>
<td align="center">在茂密的森林中,一只黑白相间的熊猫静静地坐在绿树红花中,周围是山川和海洋。背景是白天的森林,光线充足。照片采用特写、平视和居中构图的方式,呈现出写实的效果<br>(In the dense forest, a black and white panda sits quietly among the green trees and red flowers, surrounded by mountains and oceans. The background is a daytime forest with ample light. The photo uses a close-up, eye-level, and centered composition to create a realistic effect.) </td>
<td align="center">在白天的森林中,一位穿着绿色上衣的亚洲女性站在大象旁边。照片采用了中景、平视和居中构图的方式,呈现出写实的效果。这张照片蕴含了人物摄影文化,并展现了宁静的氛围<br>(In the daytime forest, an Asian woman wearing a green shirt stands beside an elephant. The photo uses a medium shot, eye-level, and centered composition to create a realistic effect. This picture embodies the character photography culture and conveys a serene atmosphere.) </td>
</tr>
<tr>
<td align="center"><img src="controlnet/asset/input/canny.jpg" alt="Image 0" width="200"/></td>
<td align="center"><img src="controlnet/asset/input/depth.jpg" alt="Image 1" width="200"/></td>
<td align="center"><img src="controlnet/asset/input/pose.jpg" alt="Image 2" width="200"/></td>
</tr>
<tr>
<td colspan="3" align="center">ControlNet Output</td>
</tr>
<tr>
<td align="center"><img src="controlnet/asset/output/canny.jpg" alt="Image 3" width="200"/></td>
<td align="center"><img src="controlnet/asset/output/depth.jpg" alt="Image 4" width="200"/></td>
<td align="center"><img src="controlnet/asset/output/pose.jpg" alt="Image 5" width="200"/></td>
</tr>
</table>
### IP-Adapter
We provide training scripts for IP-Adapter, detailed in the [./ipadapter](./ipadapter/README.md).
```shell
# Training for IP-Adapter.
PYTHONPATH=./ sh hydit/train_ipadapter.sh
```
We offer trained IP-Adapter weights; see details at [this link](https://huggingface.co/Tencent-Hunyuan/HYDiT-IP-Adapter).
```shell
cd HunyuanDiT
# Use the huggingface-cli tool to download the model.
# We recommend using module weights as the base model for IP-Adapter inference, as our provided pretrained weights are trained on them.
huggingface-cli download Tencent-Hunyuan/IP-Adapter ipa.pt --local-dir ./ckpts/t2i/model
huggingface-cli download Tencent-Hunyuan/IP-Adapter clip_img_encoder.pt --local-dir ./ckpts/t2i/model/clip_img_encoder
# Quick start
python3 sample_ipadapter.py --infer-mode fa --ref-image-path ipadapter/asset/input/tiger.png --i-scale 1.0 --prompt "一只老虎在海洋中游泳,背景是海洋。构图方式是居中构图,呈现了动漫风格和文化,营造了平静的氛围。" --infer-steps 100 --is-ipa True --load-key distill
```
Examples of ref input and IP-Adapter results are as follows:
<table>
<tr>
<td colspan="3" align="center">Ref Input</td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/input/tiger.png" alt="Image 0" width="200"/></td>
<td align="center"><img src="ipadapter/asset/input/beauty.png" alt="Image 1" width="200"/></td>
<td align="center"><img src="ipadapter/asset/input/xunyicao.png" alt="Image 2" width="200"/></td>
</tr>
<tr>
<td colspan="3" align="center">IP-Adapter Output</td>
</tr>
<tr>
<td align="center">一只老虎在奔跑。<br>(A tiger running.) </td>
<td align="center">一个卡通美女,抱着一只小猪。<br>(A cartoon beauty holding a little pig.) </td>
<td align="center">一片紫色薰衣草地。<br>(A purple lavender field.) </td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/output/tiger_run.png" alt="Image 3" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/beauty_pig.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/xunyicao_res.png" alt="Image 5" width="200"/></td>
</tr>
<tr>
<td align="center">一只老虎在看书。<br>(A tiger is reading a book.) </td>
<td align="center">一个卡通美女,穿着绿色衣服。<br>(A cartoon beauty wearing green clothes.) </td>
<td align="center">一片紫色薰衣草地,有一只可爱的小狗。<br>(A purple lavender field with a cute puppy.) </td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/output/tiger_book.png" alt="Image 3" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/beauty_green_cloth.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/xunyicao_dog.png" alt="Image 5" width="200"/></td>
</tr>
<tr>
<td align="center">一只老虎在咆哮。<br>(A tiger is roaring.) </td>
<td align="center">一个卡通美女,戴着墨镜。<br>(A cartoon beauty wearing sunglasses.) </td>
<td align="center">水墨风格,一片紫色薰衣草地。<br>(Ink style. A purple lavender field.) </td>
</tr>
<tr>
<td align="center"><img src="ipadapter/asset/output/tiger_roar.png" alt="Image 3" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/beauty_glass.png" alt="Image 4" width="200"/></td>
<td align="center"><img src="ipadapter/asset/output/xunyicao_style.png" alt="Image 5" width="200"/></td>
</tr>
</table>
## :art: Hunyuan-Captioner
Hunyuan-Captioner meets the needs of text-to-image techniques by maintaining a high degree of image-text consistency. It can generate high-quality image descriptions from a variety of angles, including object description, object relationships, background information, image style, etc. Our code is based on the [LLaVA](https://github.com/haotian-liu/LLaVA) implementation.
### Examples
<td align="center"><img src="./asset/caption_demo.jpg" alt="Image 3" width="1200"/></td>
### Instructions
a. Install dependencies
The dependencies and installation are basically the same as the [**base model**](https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2).
b. Model download
```shell
# Use the huggingface-cli tool to download the model.
huggingface-cli download Tencent-Hunyuan/HunyuanCaptioner --local-dir ./ckpts/captioner
```
### Inference
Our model supports three different modes: **directly generating a Chinese caption**, **generating a Chinese caption based on specific knowledge**, and **directly generating an English caption**. The injected information can be either accurate cues or noisy labels (e.g., raw descriptions crawled from the internet). The model is capable of generating reliable and accurate descriptions based on both the inserted information and the image content.
|Mode | Prompt Template |Description |
| --- | --- | --- |
|caption_zh | 描述这张图片 |Caption in Chinese |
|insert_content | 根据提示词“{}”,描述这张图片 |Caption with inserted knowledge|
|caption_en | Please describe the content of this image |Caption in English |
| | | |
a. Single picture inference in Chinese
```bash
python mllm/caption_demo.py --mode "caption_zh" --image_file "mllm/images/demo1.png" --model_path "./ckpts/captioner"
```
b. Insert specific knowledge into caption
```bash
python mllm/caption_demo.py --mode "insert_content" --content "宫保鸡丁" --image_file "mllm/images/demo2.png" --model_path "./ckpts/captioner"
```
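For clarity, in the `insert_content` mode the prompt template from the table above is presumably filled with the `--content` string before being passed to the captioner; here is a minimal sketch (the actual prompt construction lives in `mllm/caption_demo.py` and may differ):
```py
# Minimal sketch (illustration only): fill the insert_content template
# from the table above with the injected knowledge, e.g. "宫保鸡丁".
PROMPT_TEMPLATES = {
    "caption_zh": "描述这张图片",
    "insert_content": "根据提示词“{}”,描述这张图片",
    "caption_en": "Please describe the content of this image",
}

def build_caption_prompt(mode: str, content: str = "") -> str:
    template = PROMPT_TEMPLATES[mode]
    return template.format(content) if mode == "insert_content" else template

print(build_caption_prompt("insert_content", "宫保鸡丁"))
# -> 根据提示词“宫保鸡丁”,描述这张图片
```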
c. Single picture inference in English
```bash
python mllm/caption_demo.py --mode "caption_en" --image_file "mllm/images/demo3.png" --model_path "./ckpts/captioner"
```
d. Multiple pictures inference in Chinese
```bash
### Convert multiple pictures to csv file.
python mllm/make_csv.py --img_dir "mllm/images" --input_file "mllm/images/demo.csv"
### Multiple pictures inference
python mllm/caption_demo.py --mode "caption_zh" --input_file "mllm/images/demo.csv" --output_file "mllm/images/demo_res.csv" --model_path "./ckpts/captioner"
```
(Optional) To convert the output csv file to Arrow format, please refer to [Data Preparation #3](#data-preparation) for detailed instructions.
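If you want to sanity-check the captions before the Arrow conversion, a hedged sketch with pandas follows (the exact column names are whatever `mllm/make_csv.py` and `caption_demo.py` wrote to your CSV, so print them first):
```py
# Minimal sketch: inspect the caption CSV produced above before Arrow conversion.
# Column names depend on mllm/make_csv.py and caption_demo.py; print them to check.
import pandas as pd

df = pd.read_csv("mllm/images/demo_res.csv")
print(df.columns.tolist())  # e.g. an image path column and a caption column
print(df.head())
```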
### Gradio
To launch a Gradio demo locally, please run the following commands one by one. For more detailed instructions, please refer to [LLaVA](https://github.com/haotian-liu/LLaVA).
```bash
cd mllm
python -m llava.serve.controller --host 0.0.0.0 --port 10000
python -m llava.serve.gradio_web_server --controller http://0.0.0.0:10000 --model-list-mode reload --port 443
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://0.0.0.0:10000 --port 40000 --worker http://0.0.0.0:40000 --model-path "../ckpts/captioner" --model-name LlavaMistral
```
Then the demo can be accessed at http://0.0.0.0:443. Note that 0.0.0.0 here should be replaced with X.X.X.X, the IP address of your server.
## 🚀 Acceleration (for Linux)
- We provide a TensorRT version of HunyuanDiT for inference acceleration (faster than Flash Attention).
See [Tencent-Hunyuan/TensorRT-libs](https://huggingface.co/Tencent-Hunyuan/TensorRT-libs) for more details.
- We provide a Distillation version of HunyuanDiT for inference acceleration.
See [Tencent-Hunyuan/Distillation](https://huggingface.co/Tencent-Hunyuan/Distillation) for more details.
## 🔗 BibTeX
If you find [Hunyuan-DiT](https://arxiv.org/abs/2405.08748) or [DialogGen](https://arxiv.org/abs/2403.08857) useful for your research and applications, please cite using this BibTeX:
```BibTeX
@misc{li2024hunyuandit,
title={Hunyuan-DiT: A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding},
author={Zhimin Li and Jianwei Zhang and Qin Lin and Jiangfeng Xiong and Yanxin Long and Xinchi Deng and Yingfang Zhang and Xingchao Liu and Minbin Huang and Zedong Xiao and Dayou Chen and Jiajun He and Jiahao Li and Wenyue Li and Chen Zhang and Rongwei Quan and Jianxiang Lu and Jiabin Huang and Xiaoyan Yuan and Xiaoxiao Zheng and Yixuan Li and Jihong Zhang and Chao Zhang and Meng Chen and Jie Liu and Zheng Fang and Weiyan Wang and Jinbao Xue and Yangyu Tao and Jianchen Zhu and Kai Liu and Sihuan Lin and Yifu Sun and Yun Li and Dongdong Wang and Mingtao Chen and Zhichao Hu and Xiao Xiao and Yan Chen and Yuhong Liu and Wei Liu and Di Wang and Yong Yang and Jie Jiang and Qinglin Lu},
year={2024},
eprint={2405.08748},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{huang2024dialoggen,
title={DialogGen: Multi-modal Interactive Dialogue System for Multi-turn Text-to-Image Generation},
author={Huang, Minbin and Long, Yanxin and Deng, Xinchi and Chu, Ruihang and Xiong, Jiangfeng and Liang, Xiaodan and Cheng, Hong and Lu, Qinglin and Liu, Wei},
journal={arXiv preprint arXiv:2403.08857},
year={2024}
}
```
## Star History
<a href="https://star-history.com/#Tencent/HunyuanDiT&Date">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/HunyuanDiT&type=Date&theme=dark" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/HunyuanDiT&type=Date" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/HunyuanDiT&type=Date" />
</picture>
</a>
import gradio as gr
import pandas as pd
from pathlib import Path
from PIL import Image, PngImagePlugin
import sys
import numpy as np
import torch
from torchvision import transforms as T
sys.path.insert(0, str(Path(__file__).parent.parent))
import datetime
from hydit.constants import SAMPLER_FACTORY
from sample_t2i import inferencer
import os
ROOT = Path(__file__).parent.parent
SAMPLERS = list(SAMPLER_FACTORY.keys())
norm_transform = T.Compose(
[
T.ToTensor(),
T.Normalize([0.5], [0.5]),
]
)
def get_strings(lang):
lang_file = Path(f"app/lang/{lang}.csv")
strings = pd.read_csv(lang_file, header=0)
strings = strings.set_index("key")["value"].to_dict()
return strings
def get_files_with_extension(path, extension):
return {
os.path.splitext(file)[0]: os.path.join(path, file)
for file in os.listdir(path)
if os.path.isfile(os.path.join(path, file))
and any(file.endswith(ext) for ext in extension)
}
args, gen, enhancer = inferencer()
output_dir = ROOT / f"{args.output_img_path}"
os.makedirs(output_dir, exist_ok=True)
strings = get_strings(args.lang)
controlnet_list = get_files_with_extension(
args.model_root + "/t2i/controlnet",
[".pt", ".safetensors"],
)
module_list = get_files_with_extension(
args.model_root + "/t2i/model",
[".pt", ".safetensors"],
)
lora_list = get_files_with_extension(
args.model_root + "/t2i/lora",
[".pt", ".safetensors"],
)
def upgrade_dit_model_load(model):
model_path = module_list[model]
gen.args.dit_weight = model_path
gen.load_torch_weights()
def generate_metadata(
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
controlnet_module,
control_weight,
lora_ctrls,
):
"""生成图像元数据。"""
return {
"parameters": "Power by HunYun",
"prompt": prompt,
"negative_prompt": negative_prompt,
"seed": seed,
"cfg_scale": cfg_scale,
"infer_steps": infer_steps,
"sampler": sampler,
"imgW": imgW,
"imgH": imgH,
"controlnet_module": controlnet_module,
"control_weight": control_weight,
"lora_ctrls": [
{
"lora_enabled": lora_ctrl[0],
"lora_model": lora_ctrl[1],
"lora_weight": lora_ctrl[2],
}
for lora_ctrl in zip(*[iter(lora_ctrls)] * 3)
],
"model_name": gen.model_name,
}
def infer(
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
input_image,
controlnet_module,
control_weight,
enhance,
img_crop_type,
*lora_ctrls,
):
if enhance and enhancer is not None:
success, enhanced_prompt = enhancer(prompt)
if not success:
fail_image = Image.open(ROOT / "app/fail.png")
return fail_image
else:
enhanced_prompt = None
active_loras = [
{"model": lora_ctrls[i + 1], "weight": lora_ctrls[i + 2]}
for i in range(0, len(lora_ctrls), 3)
if lora_ctrls[i]
]
if input_image is not None:
        # Convert a NumPy array input to a PIL RGB image
if isinstance(input_image, np.ndarray):
input_image = Image.fromarray(input_image).convert("RGB")
input_image = gen.pixel_perfect_resolution(
input_image, imgH, imgW, img_crop_type
)
# Apply the normalization transform
input_image = norm_transform(input_image)
# Add batch dimension and move to GPU (if available)
input_image = (
input_image.unsqueeze(0).cuda()
if torch.cuda.is_available()
else input_image.unsqueeze(0)
)
results = gen.predict(
prompt,
image=input_image,
height=imgH,
width=imgW,
seed=seed,
enhanced_prompt=enhanced_prompt,
negative_prompt=negative_prompt,
infer_steps=infer_steps,
guidance_scale=cfg_scale,
batch_size=1,
src_size_cond=None,
sampler=sampler,
control_weight=control_weight,
controlnet=controlnet_module,
lora_ctrls=active_loras,
)
image = results["images"][0]
seed = results["seed"]
metadata = generate_metadata(
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
controlnet_module,
control_weight,
active_loras,
)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = output_dir.joinpath(f"generated_image_{timestamp}_{seed}.png")
png_info = PngImagePlugin.PngInfo()
for k, v in metadata.items():
png_info.add_text(k, str(v))
image.save(
output_path,
pnginfo=png_info,
)
return image
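# Illustrative helper (a sketch, not wired into the UI below): the generation
# parameters written as PNG text chunks in infer() can be read back with PIL.
def read_generation_metadata(png_path):
    """Return the text metadata embedded in a PNG saved by infer()."""
    with Image.open(png_path) as im:
        return dict(getattr(im, "text", {}))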
def ui():
block = gr.Blocks()
description = f"""
# {strings['title']}
## {strings['desc']}
"""
with block:
with gr.Row():
gr.Markdown(description)
with gr.Row():
with gr.Column():
prompt = gr.Textbox(
label=strings["prompt"], value=strings["default prompt"], lines=3
)
with gr.Row():
imgW = gr.Slider(
label=strings["width"],
minimum=64,
maximum=4096,
value=1024,
step=64,
)
imgH = gr.Slider(
label=strings["height"],
minimum=64,
maximum=4096,
value=1024,
step=64,
)
with gr.Row():
infer_steps = gr.Slider(
label=strings["infer steps"],
minimum=1,
maximum=200,
value=100,
step=1,
)
seed = gr.Number(
label=strings["seed"],
minimum=-1,
maximum=1_000_000_000,
value=0,
step=1,
precision=0,
)
enhance = gr.Checkbox(
label=strings["enhance"],
value=enhancer is not None,
interactive=True,
)
with gr.Accordion(strings["accordion"], open=False):
with gr.Row():
negative_prompt = gr.Textbox(
label=strings["negative_prompt"],
value=gen.default_negative_prompt,
lines=2,
)
with gr.Row():
sampler = gr.Dropdown(
SAMPLERS, label=strings["sampler"], value="ddpm"
)
cfg_scale = gr.Slider(
label=strings["cfg"],
minimum=1.0,
maximum=16.0,
value=6.0,
step=1,
)
with gr.Accordion(strings["model_list"], open=False):
with gr.Row():
dit_model = gr.Dropdown(
label=strings["dit_model"],
choices=[
name
for name, path in get_files_with_extension(
args.model_root + "/t2i/model",
[".pt", ".safetensors"],
).items()
],
value=f"pytorch_model_{args.load_key}",
)
dit_model.change(
fn=upgrade_dit_model_load,
inputs=dit_model,
outputs=None,
)
with gr.Accordion(strings["lora_list"], open=False):
lora_ctrls = []
for i in range(5):
with gr.Row():
lora_enabled = gr.Checkbox(
label="Enable",
value=False,
)
lora_model = gr.Dropdown(
label=f"Lora{i+1}",
choices=["none"]
+ [name for name, path in lora_list.items()],
value="none",
)
lora_weight = gr.Slider(
label="weight",
minimum=-1,
maximum=2,
step=0.01,
value=0,
scale=5,
)
lora_ctrls += [lora_enabled, lora_model, lora_weight]
with gr.Accordion(strings["controlnet"], open=False):
with gr.Row():
controlnet_module = gr.Dropdown(
label=strings["controlnet_model"],
choices=["None"]
+ [name for name, path in controlnet_list.items()],
value="None",
)
control_weight = gr.Slider(
label=strings["Control_Weight"],
minimum=0.0,
maximum=2.0,
value=1.0,
step=0.1,
)
input_image = gr.Image(label=strings["input image"])
with gr.Row():
img_crop_type = gr.Radio(
label=strings["Crop_mode"],
choices=[
(strings["Resize"], "Resize"),
(strings["Crop_and_Resize"], "Crop_and_Resize"),
(strings["Resize_and_Fill"], "Resize_and_Fill"),
],
value="Crop_and_Resize",
)
with gr.Row():
advanced_button = gr.Button(strings["run"])
with gr.Column():
default_img = Image.open(ROOT / "app/default.png")
output_img = gr.Image(
label=strings["generated image"],
interactive=False,
format="png",
value=default_img,
)
advanced_button.click(
fn=infer,
inputs=[
prompt,
negative_prompt,
seed,
cfg_scale,
infer_steps,
sampler,
imgW,
imgH,
input_image,
controlnet_module,
control_weight,
enhance,
img_crop_type,
*lora_ctrls,
],
outputs=output_img,
)
with gr.Row():
gr.Examples(
[
["一只小猫"],
[
"现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景"
],
["一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影"],
["飞流直下三千尺,疑是银河落九天"],
[
"一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。"
],
["麻婆豆腐"],
["苏州园林"],
[
"一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子"
],
["请将“杞人忧天”的样子画出来"],
["枯藤老树昏鸦,小桥流水人家"],
[
"湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。"
],
["一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头"],
["臭豆腐"],
["九寨沟"],
["俗语“鲤鱼跃龙门”"],
[
"风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景"
],
],
[prompt],
label=strings["examples"],
)
return block
if __name__ == "__main__":
interface = ui()
interface.launch(
server_name=args.server_name,
server_port=args.server_port,
share=args.gradio_share,
)
key,value
size,Size
sampler,Sampler
prompt,Prompt
default prompt,"A cute cat"
negative_prompt,Negative Prompt
seed,Seed
cfg,CFG Scale
infer steps,Sampling Steps
batch size,Batch Size
width cond,Width Cond
height cond,Height Cond
enhance,Prompt Enhancement
run,Submit
square,Square(1024x1024)
landscape,Landscape(1280x768)
portrait,Portrait(768x1280)
accordion,Advanced Options
generated image,HunYuanDiT Generated Image
examples,More Examples
title,Hunyuan-DiT
desc,A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding
controlnet,controlnet
controlnet_model,model list
Control_Weight,Control Weight
input image,input image
width,width
height,height
key,value
size,尺寸
sampler,采样器
prompt,文本描述
default prompt,"一只可爱的猫"
negative_prompt,负向词
seed,种子
cfg,CFG系数
infer steps,采样步数
batch size,批大小
width cond,宽度条件
height cond,高度条件
enhance,文本增强
run,提交生成
square,方形(1024x1024)
portrait,竖屏(1216x832)
landscape,横屏(832x1216)
accordion,高级设置
generated image,生成
examples,更多示例
title,HunYuanDiT
desc,具有细粒度中文理解的高性能多分辨率 Diffusion Transformer 模型
controlnet,条件控制网络
controlnet_model,模型列表
Control_Weight,控制网络权重
input image,输入图片
model_list,模型列表
dit_model,dit模型
width,width
height,height
Crop_mode,裁剪方式
Resize,仅缩放
Crop_and_Resize,裁剪并缩放
Resize_and_Fill,缩放并填充
lora_list,lora
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gradio as gr
from PIL import Image
import sys
import os
sys.path.append(os.getcwd())
import json
import numpy as np
from pathlib import Path
import io
import hashlib
import requests
import base64
import pandas as pd
from sample_t2i import inferencer
from mllm.dialoggen_demo import init_dialoggen_model, eval_model
SIZES = {
"正方形(square, 1024x1024)": (1024, 1024),
"风景(landscape, 1280x768)": (768, 1280),
"人像(portrait, 768x1280)": (1280, 768),
}
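# Note: the values are (height, width) tuples, consumed by image_generation() below.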
global_seed = np.random.randint(0, 10000)
# Helper Functions
def image_to_base64(image_path):
with open(image_path, "rb") as image_file:
encoded_image = base64.b64encode(image_file.read()).decode()
return encoded_image
def get_strings(lang):
lang_file = Path(f"app/lang/{lang}.csv")
strings = pd.read_csv(lang_file, header=0)
strings = strings.set_index("key")["value"].to_dict()
return strings
def get_image_md5(image):
image_data = io.BytesIO()
image.save(image_data, format="PNG")
image_data = image_data.getvalue()
md5_hash = hashlib.md5(image_data).hexdigest()
return md5_hash
# Call the MLLM (DialogGen) service
def request_dialogGen(
server_url="http://0.0.0.0:8080",
history_messages=[],
question="画一个木制的鸟",
image="",
):
if image != "":
image = base64.b64encode(open(image, "rb").read()).decode()
print("history_messages before request", history_messages)
headers = {"accept": "application/json", "Content-Type": "application/json"}
data = {
"text": question,
"image": image, # "image为空字符串,则进行文本对话"
"history": history_messages,
}
response = requests.post(server_url, headers=headers, json=data)
print("response", response)
response = response.json()
print(response)
response_text = response["result"]
history_messages = response["history"]
print("history_messages before request", history_messages)
return history_messages, response_text
# Image generation
def image_generation(prompt, infer_steps, seed, image_size):
print(
f"prompt sent to T2I model: {prompt}, infer_steps: {infer_steps}, seed: {seed}, size: {image_size}"
)
height, width = SIZES[image_size]
results = gen.predict(
prompt,
height=height,
width=width,
seed=seed,
infer_steps=infer_steps,
batch_size=1,
)
image = results["images"][0]
file_name = get_image_md5(image)
# Save images
save_dir = Path("results")
save_dir.mkdir(exist_ok=True)
save_path = f"results/multiRound_{file_name}.png"
image.save(save_path)
encoded_image = image_to_base64(save_path)
return encoded_image
# Text-and-image chat
def chat(history_messages, input_text):
history_messages, response_text = request_dialogGen(
history_messages=history_messages, question=input_text
)
return history_messages, response_text
#
def pipeline(input_text, state, infer_steps, seed, image_size):
    # Ignore empty input
if len(input_text) == 0:
return state, state[0]
conversation = state[0]
history_messages = state[1]
system_prompt = "请先判断用户的意图,若为画图则在输出前加入<画图>:"
print(f"input history:{history_messages}")
if not isinstance(history_messages, list) and len(history_messages.messages) >= 2:
response, history_messages = enhancer(
input_text, return_history=True, history=history_messages, skip_special=True
)
else:
response, history_messages = enhancer(
input_text,
return_history=True,
history=history_messages,
skip_special=False,
)
history_messages.messages[-1][-1] = response
if "<画图>" in response:
intention_draw = True
else:
intention_draw = False
print(f"response:{response}")
print("-" * 80)
print(f"history_messages:{history_messages}")
print(f"intention_draw:{intention_draw}")
if intention_draw:
prompt = response.split("<画图>")[-1]
        # Generate an image
image_url = image_generation(prompt, infer_steps, seed, image_size)
response = f'<img src="data:image/png;base64,{image_url}" style="display: inline-block;"><p style="font-size: 14px; color: #555; margin-top: 0;">{prompt}</p>'
conversation += [((input_text, response))]
return [conversation, history_messages], conversation
# Page layout
def upload_image(state, image_input):
conversation = state[0]
history_messages = state[1]
input_image = Image.open(image_input.name).resize((224, 224)).convert("RGB")
input_image.save(image_input.name) # Overwrite with smaller image.
system_prompt = "请先判断用户的意图,若为画图则在输出前加入<画图>:"
history_messages, response = request_dialogGen(
question="这张图描述了什么?",
history_messages=history_messages,
image=image_input.name,
)
conversation += [
(
f'<img src="./file={image_input.name}" style="display: inline-block;">',
response,
)
]
print("conversation", conversation)
print("history_messages after uploading image", history_messages)
return [conversation, history_messages], conversation
def reset():
global global_seed
global_seed = np.random.randint(0, 10000)
return [[], []], []
def reset_last(state):
conversation, history = state[0], state[1]
conversation = conversation[:-1]
history.messages = history.messages[:-2]
return [conversation, history], conversation
if __name__ == "__main__":
# Initialize dialoggen and HunyuanDiT model
args, gen, enhancer = inferencer()
strings = get_strings(args.lang)
css = """
#chatbot { min-height: 800px; }
#save-btn {
background-image: linear-gradient(to right bottom, rgba(130,217,244, 0.9), rgba(158,231,214, 1.0));
}
#save-btn:hover {
background-image: linear-gradient(to right bottom, rgba(110,197,224, 0.9), rgba(138,211,194, 1.0));
}
#share-btn {
background-image: linear-gradient(to right bottom, rgba(130,217,244, 0.9), rgba(158,231,214, 1.0));
}
#share-btn:hover {
background-image: linear-gradient(to right bottom, rgba(110,197,224, 0.9), rgba(138,211,194, 1.0));
}
#gallery { z-index: 999999; }
#gallery img:hover {transform: scale(2.3); z-index: 999999; position: relative; padding-right: 30%; padding-bottom: 30%;}
#gallery button img:hover {transform: none; z-index: 999999; position: relative; padding-right: 0; padding-bottom: 0;}
@media (hover: none) {
#gallery img:hover {transform: none; z-index: 999999; position: relative; padding-right: 0; padding-bottom: 0;}
}
.html2canvas-container { width: 3000px !important; height: 3000px !important; }
"""
with gr.Blocks(css=css) as demo:
DESCRIPTION = """# <a style="color: black; text-decoration: none;">多轮对话绘图 Multi-turn Text2Image Generation</a>
你可以参照[DialogGen](https://arxiv.org/abs/2403.08857),通过简单的交互式语句来进行历史图片的修改,例如:主体编辑、增加主体、删除主体、背景更换、风格转换、镜头转换、图像合并。
(You can modify historical images through simple interactive statements, referring to [DialogGen](https://arxiv.org/abs/2403.08857), such as: entity edit, add object, remove object, change background, change style, change lens, and combine images.)
例如,主体编辑 (For example, entity edit):
```none
Round1: 画一个木制的鸟
(Round1: draw a wooden bird)
Round2: 变成玻璃的
(Round2: turn into glass)
```
"""
gr.Markdown(DESCRIPTION)
gr_state = gr.State([[], []]) # conversation, chat_history
with gr.Row():
with gr.Column(scale=1, min_width=1000):
with gr.Row():
chatbot = gr.Chatbot(
elem_id="chatbot", label="DialogGen&HunyuanDiT"
)
with gr.Row():
infer_steps = gr.Slider(
label="采样步数(sampling steps)",
minimum=1,
maximum=200,
value=100,
step=1,
)
seed = gr.Number(
label="种子(seed)",
minimum=-1,
maximum=1_000_000_000,
value=666,
step=1,
precision=0,
)
size_dropdown = gr.Dropdown(
choices=[
"正方形(square, 1024x1024)",
"风景(landscape, 1280x768)",
"人像(portrait, 768x1280)",
],
value="正方形(square, 1024x1024)",
label="图片尺寸(Image Size)",
)
with gr.Row():
# image_btn = gr.UploadButton("🖼️ Upload Image", file_types=["image"])
text_input = gr.Textbox(
label="提示词(prompt)", placeholder="输入提示词(Type a prompt)"
)
with gr.Column():
submit_btn = gr.Button(
"提交(Submit)", interactive=True, variant="primary"
)
clear_last_btn = gr.Button("回退(Undo)")
clear_btn = gr.Button("全部重置(Reset All)")
with gr.Row():
gr.Examples(
[
["画一个木制的鸟"],
["一只小猫"],
[
"现实主义风格,画面主要描述一个巴洛克风格的花瓶,带有金色的装饰边框,花瓶上盛开着各种色彩鲜艳的花,白色背景"
],
[
"一只聪明的狐狸走在阔叶树林里, 旁边是一条小溪, 细节真实, 摄影"
],
["飞流直下三千尺,疑是银河落九天"],
[
"一只长靴猫手持亮银色的宝剑,身着铠甲,眼神坚毅,站在一堆金币上,背景是暗色调的洞穴,图像上有金币的光影点缀。"
],
["麻婆豆腐"],
["苏州园林"],
[
"一颗新鲜的草莓特写,红色的外表,表面布满许多种子,背景是淡绿色的叶子"
],
["枯藤老树昏鸦,小桥流水人家"],
[
"湖水清澈,天空湛蓝,阳光灿烂。一只优雅的白天鹅在湖边游泳。它周围有几只小鸭子,看起来非常可爱,整个画面给人一种宁静祥和的感觉。"
],
[
"一朵鲜艳的红色玫瑰花,花瓣撒有一些水珠,晶莹剔透,特写镜头"
],
["臭豆腐"],
["九寨沟"],
["俗语“鲤鱼跃龙门”"],
[
"风格是写实,画面主要描述一个亚洲戏曲艺术家正在表演,她穿着华丽的戏服,脸上戴着精致的面具,身姿优雅,背景是古色古香的舞台,镜头是近景"
],
],
[text_input],
label=strings["examples"],
)
gr.Markdown(
"""<p style="font-size: 20px; color: #888;">powered by <a href="https://github.com/Centaurusalpha/DialogGen" target="_blank">DialogGen</a> and <a href="https://github.com/Tencent/HunyuanDiT" target="_blank">HunyuanDiT</a></p>"""
)
text_input.submit(
pipeline,
[text_input, gr_state, infer_steps, seed, size_dropdown],
[gr_state, chatbot],
)
text_input.submit(lambda: "", None, text_input) # Reset chatbox.
submit_btn.click(
pipeline,
[text_input, gr_state, infer_steps, seed, size_dropdown],
[gr_state, chatbot],
)
submit_btn.click(lambda: "", None, text_input) # Reset chatbox.
# image_btn.upload(upload_image, [gr_state, image_btn], [gr_state, chatbot])
clear_last_btn.click(reset_last, [gr_state], [gr_state, chatbot])
clear_btn.click(reset, [], [gr_state, chatbot])
interface = demo
interface.launch(server_name="0.0.0.0", server_port=443, share=False)