chenpangpang / ComfyUI
"doc/git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "97d728125657618948218b23fc2c9e7f59324e7b"
Commit 8489cba1 authored Apr 13, 2023 by BlenderNeko

add unique ID per word/embedding for tokenizer

parent f5f70138
Showing 1 changed file with 70 additions and 45 deletions
comfy/sd1_clip.py (view file @ 8489cba1)

@@ -224,60 +224,85 @@ class SD1Tokenizer:
         self.inv_vocab = {v: k for k, v in vocab.items()}
         self.embedding_directory = embedding_directory
         self.max_word_length = 8
+        self.embedding_identifier = "embedding:"

-    def tokenize_with_weights(self, text):
+    def _try_get_embedding(self, name:str):
+        '''
+        Takes a potential embedding name and tries to retrieve it.
+        Returns a Tuple consisting of the embedding and any leftover string, embedding can be None.
+        '''
+        embedding_name = name[len(self.embedding_identifier):].strip('\n')
+        embed = load_embed(embedding_name, self.embedding_directory)
+        if embed is None:
+            stripped = embedding_name.strip(',')
+            if len(stripped) < len(embedding_name):
+                embed = load_embed(stripped, self.embedding_directory)
+                return (embed, embedding_name[len(stripped):])
+        return (embed, "")
+
+    def tokenize_with_weights(self, text:str):
+        '''
+        Takes a prompt and converts it to a list of (token, weight, word id) elements.
+        Tokens can both be integer tokens and pre computed CLIP tensors.
+        Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens.
+        Returned list has the dimensions NxM where M is the input size of CLIP
+        '''
         text = escape_important(text)
         parsed_weights = token_weights(text, 1.0)

+        #tokenize words
         tokens = []
-        for t in parsed_weights:
-            to_tokenize = unescape_important(t[0]).replace("\n", " ").split(' ')
-            while len(to_tokenize) > 0:
-                word = to_tokenize.pop(0)
-                temp_tokens = []
-                embedding_identifier = "embedding:"
-                if word.startswith(embedding_identifier) and self.embedding_directory is not None:
-                    embedding_name = word[len(embedding_identifier):].strip('\n')
-                    embed = load_embed(embedding_name, self.embedding_directory)
-                    if embed is None:
-                        stripped = embedding_name.strip(',')
-                        if len(stripped) < len(embedding_name):
-                            embed = load_embed(stripped, self.embedding_directory)
-                            if embed is not None:
-                                to_tokenize.insert(0, embedding_name[len(stripped):])
-                    if embed is not None:
-                        if len(embed.shape) == 1:
-                            temp_tokens += [(embed, t[1])]
-                        else:
-                            for x in range(embed.shape[0]):
-                                temp_tokens += [(embed[x], t[1])]
-                    else:
-                        print("warning, embedding:{} does not exist, ignoring".format(embedding_name))
-                elif len(word) > 0:
-                    tt = self.tokenizer(word)["input_ids"][1:-1]
-                    for x in tt:
-                        temp_tokens += [(x, t[1])]
-                tokens_left = self.max_tokens_per_section - (len(tokens) % self.max_tokens_per_section)
-
-                #try not to split words in different sections
-                if tokens_left < len(temp_tokens) and len(temp_tokens) < (self.max_word_length):
-                    for x in range(tokens_left):
-                        tokens += [(self.end_token, 1.0)]
-                tokens += temp_tokens
-
-        out_tokens = []
-        for x in range(0, len(tokens), self.max_tokens_per_section):
-            o_token = [(self.start_token, 1.0)] + tokens[x:min(self.max_tokens_per_section + x, len(tokens))]
-            o_token += [(self.end_token, 1.0)]
-            if self.pad_with_end:
-                o_token += [(self.end_token, 1.0)] * (self.max_length - len(o_token))
-            else:
-                o_token += [(0, 1.0)] * (self.max_length - len(o_token))
-
-            out_tokens += [o_token]
-
-        return out_tokens
+        for weighted_segment, weight in parsed_weights:
+            to_tokenize = unescape_important(weighted_segment).replace("\n", " ").split(' ')
+            to_tokenize = [x for x in to_tokenize if x != ""]
+            for word in to_tokenize:
+                #if we find an embedding, deal with the embedding
+                if word.startswith(self.embedding_identifier) and self.embedding_directory is not None:
+                    embed, leftover = self._try_get_embedding(word)
+                    if embed is None:
+                        print(f"warning, embedding:{word} does not exist, ignoring")
+                    else:
+                        if len(embed.shape) == 1:
+                            tokens.append([(embed, weight)])
+                        else:
+                            tokens.append([(embed[x], weight) for x in range(embed.shape[0])])
+                    #if we accidentally have leftover text, continue parsing using leftover, else move on to next word
+                    if leftover != "":
+                        word = leftover
+                    else:
+                        continue
+                #parse word
+                tokens.append([(t, weight) for t in self.tokenizer(word)["input_ids"][1:-1]])
+
+        #reshape token array to CLIP input size
+        batched_tokens = []
+        batch = []
+        batched_tokens.append(batch)
+        for i, t_group in enumerate(tokens):
+            #start a new batch if there is not enough room
+            if len(t_group) + len(batch) > self.max_tokens_per_section:
+                remaining_length = self.max_tokens_per_section - len(batch)
+                #fill remaining space depending on length of tokens
+                if len(t_group) > self.max_word_length:
+                    #put part of group of tokens in the batch
+                    batch.extend([(t, w, i + 1) for t, w in t_group[:remaining_length]])
+                    t_group = t_group[remaining_length:]
+                else:
+                    #filler tokens
+                    batch.extend([(self.end_token, 1.0, 0)] * remaining_length)
+                batch = []
+                batched_tokens.append(batch)
+
+            #put current group of tokens in the batch
+            batch.extend([(t, w, i + 1) for t, w in t_group])
+
+        #fill last batch
+        batch.extend([(self.end_token, 1.0, 0)] * (self.max_tokens_per_section - len(batch)))
+
+        #add start and end tokens
+        batched_tokens = [[(self.start_token, 1.0, 0)] + x + [(self.end_token, 1.0, 0)] for x in batched_tokens]
+        return batched_tokens

     def untokenize(self, token_weight_pair):
         return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair))
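For readers skimming the diff: below is a minimal standalone sketch of the leftover-string contract introduced by the new _try_get_embedding helper. It substitutes a plain dictionary for ComfyUI's load_embed and the on-disk embedding files, so KNOWN_EMBEDDINGS and try_get_embedding here are illustrative stand-ins, not part of the commit.

# Standalone sketch (not from the commit): mimics _try_get_embedding's handling
# of trailing punctuation, using a dict in place of load_embed / embedding files.
EMBEDDING_IDENTIFIER = "embedding:"
KNOWN_EMBEDDINGS = {"mystyle": "<tensor for mystyle>"}  # stand-in for files on disk

def try_get_embedding(word):
    name = word[len(EMBEDDING_IDENTIFIER):].strip('\n')
    embed = KNOWN_EMBEDDINGS.get(name)
    if embed is None:
        stripped = name.strip(',')
        if len(stripped) < len(name):
            # retry without trailing commas and hand back whatever was stripped
            return KNOWN_EMBEDDINGS.get(stripped), name[len(stripped):]
    return embed, ""

print(try_get_embedding("embedding:mystyle"))   # ('<tensor for mystyle>', '')
print(try_get_embedding("embedding:mystyle,"))  # ('<tensor for mystyle>', ',')
print(try_get_embedding("embedding:unknown"))   # (None, '')

The leftover string is what lets the new tokenize_with_weights keep parsing text that was glued to an embedding name instead of silently dropping it.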
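Similarly, a standalone sketch of the word-id batching scheme at the end of the new tokenize_with_weights: the same reshaping logic run on plain integer tokens so the (token, weight, word_id) layout is easy to inspect. The constants and batch_word_groups are assumed stand-ins for the tokenizer's instance attributes (start_token, end_token, max_tokens_per_section, max_word_length), not ComfyUI API.

# Standalone sketch (not from the commit): word groups in, fixed-size CLIP
# sections of (token, weight, word_id) out; id 0 marks start/end/padding tokens.
START_TOKEN, END_TOKEN = 49406, 49407
MAX_TOKENS_PER_SECTION = 75  # illustrative: CLIP context minus start/end tokens
MAX_WORD_LENGTH = 8

def batch_word_groups(word_groups):
    """word_groups: list of [(token, weight), ...], one inner list per word or embedding."""
    batched, batch = [], []
    batched.append(batch)
    for i, group in enumerate(word_groups):
        if len(group) + len(batch) > MAX_TOKENS_PER_SECTION:
            remaining = MAX_TOKENS_PER_SECTION - len(batch)
            if len(group) > MAX_WORD_LENGTH:
                # very long "words" (e.g. multi-vector embeddings) may be split across sections
                batch.extend([(t, w, i + 1) for t, w in group[:remaining]])
                group = group[remaining:]
            else:
                # short words stay whole; pad the current section instead
                batch.extend([(END_TOKEN, 1.0, 0)] * remaining)
            batch = []
            batched.append(batch)
        batch.extend([(t, w, i + 1) for t, w in group])
    batch.extend([(END_TOKEN, 1.0, 0)] * (MAX_TOKENS_PER_SECTION - len(batch)))
    return [[(START_TOKEN, 1.0, 0)] + b + [(END_TOKEN, 1.0, 0)] for b in batched]

groups = [[(320, 1.0)], [(1125, 1.2), (539, 1.2)]]  # two "words", the second weighted
sections = batch_word_groups(groups)
print(len(sections), len(sections[0]))  # 1 77  (start + 75 + end per section)
print(sections[0][:4])                  # word ids: 0 (start), 1, 2, 2

Every token produced from the same prompt word or embedding shares an id, and id 0 is reserved for non-word tokens, matching the docstring in the diff above.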