Unverified Commit 0213179b authored by Tolga Cangöz, committed by GitHub

Update README and example code for AnyText usage (#11028)

* [Documentation] Update README and example code with additional usage instructions for AnyText

* [Documentation] Update README for AnyTextPipeline and improve logging in code

* Remove wget command for font file from example docstring in anytext.py
parent a7d53a59
# AnyTextPipeline
Project page: https://aigcdesigngroup.github.io/homepage_anytext
"AnyText comprises a diffusion pipeline with two primary elements: an auxiliary latent module and a text embedding module. The former uses inputs like text glyph, position, and masked image to generate latent features for text generation or editing. The latter employs an OCR model for encoding stroke data as embeddings, which blend with image caption embeddings from the tokenizer to generate texts that seamlessly integrate with the background. We employed text-control diffusion loss and text perceptual loss for training to further enhance writing accuracy." "AnyText comprises a diffusion pipeline with two primary elements: an auxiliary latent module and a text embedding module. The former uses inputs like text glyph, position, and masked image to generate latent features for text generation or editing. The latter employs an OCR model for encoding stroke data as embeddings, which blend with image caption embeddings from the tokenizer to generate texts that seamlessly integrate with the background. We employed text-control diffusion loss and text perceptual loss for training to further enhance writing accuracy."
> **Note:** Each text line that needs to be generated should be enclosed in double quotes.
For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054).
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/b87ec9d2f265b448dd947c9d4a0da389/anytext.ipynb)
```py
# This example requires the `anytext_controlnet.py` file:
# !git clone --depth 1 https://github.com/huggingface/diffusers.git
# %cd diffusers/examples/research_projects/anytext
# Let's choose a font file shared by an HF staff member:
# !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf
import torch
from diffusers import DiffusionPipeline
from anytext_controlnet import AnyTextControlNetModel
from diffusers.utils import load_image
anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
                                                            variant="fp16",)
@@ -26,6 +33,7 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial
# generate image
prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream'
draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png")
# There are two modes: "generate" and "edit". "edit" mode requires the `ori_image` parameter for the image to be edited; see the sketch after this example.
image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos,
             ).images[0]
image
```
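In addition to the "generate" mode used above, the pipeline exposes an "edit" mode that takes the image to be edited through `ori_image`. A minimal sketch of that call, reusing the `pipe` object from the example; the file paths and the prompt here are placeholders, not tested assets:

```py
# Hedged sketch of "edit" mode; the paths below are placeholders.
ori_image = load_image("path/to/image_to_edit.png")  # the image whose text will be edited
draw_pos = load_image("path/to/position_mask.png")   # mask marking where the text lines go
image = pipe('a poster that reads "Any" "Text"', num_inference_steps=20, mode="edit",
             draw_pos=draw_pos, ori_image=ori_image,
             ).images[0]
```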
@@ -146,14 +146,17 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> # This example requires the `anytext_controlnet.py` file:
>>> # !git clone --depth 1 https://github.com/huggingface/diffusers.git
>>> # %cd diffusers/examples/research_projects/anytext
>>> # Let's choose a font file shared by an HF staff member:
>>> # !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf
>>> import torch
>>> from diffusers import DiffusionPipeline
>>> from anytext_controlnet import AnyTextControlNetModel
>>> from diffusers.utils import load_image
>>> anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
...                                                             variant="fp16",)
>>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial-unicode-ms.ttf",
@@ -165,6 +168,7 @@ EXAMPLE_DOC_STRING = """
>>> # generate image
>>> prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream'
>>> draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png")
>>> # There are two modes: "generate" and "edit". "edit" mode requires the `ori_image` parameter for the image to be edited.
>>> image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos,
...              ).images[0]
>>> image
@@ -257,11 +261,11 @@ class EmbeddingManager(ModelMixin, ConfigMixin):
            idx = tokenized_text[i] == self.placeholder_token.to(device)
            if sum(idx) > 0:
                if i >= len(self.text_embs_all):
                    logger.warning("truncation for log images...")
                    break
                text_emb = torch.cat(self.text_embs_all[i], dim=0)
                if sum(idx) != len(text_emb):
                    logger.warning("truncation for long caption...")
                text_emb = text_emb.to(embedded_text.device)
                embedded_text[i][idx] = text_emb[: sum(idx)]
        return embedded_text
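For context on the hunk above: `idx` is a boolean mask over the tokenized caption marking occurrences of the placeholder token, and the per-line text embeddings are scattered into exactly those positions. A toy illustration of this masked-replacement pattern; all names, values, and shapes here are hypothetical, not the pipeline's API:

```py
import torch

# Hypothetical shapes for illustration only.
seq_len, dim = 6, 4
embedded_text = torch.zeros(seq_len, dim)            # caption embeddings for one sample
tokenized_text = torch.tensor([7, 93, 5, 93, 8, 2])  # token ids; 93 plays the placeholder here
placeholder_token = torch.tensor(93)

idx = tokenized_text == placeholder_token         # boolean mask of placeholder positions
text_emb = torch.ones(int(idx.sum()), dim)        # stands in for the OCR-derived glyph embeddings
embedded_text[idx] = text_emb[: int(idx.sum())]   # overwrite only the masked positions
print(embedded_text)
```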
@@ -1058,6 +1062,8 @@ class AuxiliaryLatentModule(ModelMixin, ConfigMixin):
raise ValueError(f"Can't read ori_image image from {ori_image}!") raise ValueError(f"Can't read ori_image image from {ori_image}!")
elif isinstance(ori_image, torch.Tensor): elif isinstance(ori_image, torch.Tensor):
ori_image = ori_image.cpu().numpy() ori_image = ori_image.cpu().numpy()
elif isinstance(ori_image, PIL.Image.Image):
ori_image = np.array(ori_image.convert("RGB"))
else: else:
if not isinstance(ori_image, np.ndarray): if not isinstance(ori_image, np.ndarray):
raise ValueError(f"Unknown format of ori_image: {type(ori_image)}") raise ValueError(f"Unknown format of ori_image: {type(ori_image)}")