OpenDAS / text-generation-inference · Commits · 47954b81

Commit 47954b81 (unverified), authored Sep 27, 2023 by OlivierDehaene, committed by GitHub on Sep 27, 2023

feat: format code (#1070)

Parent: b32e9ce9 · Changes: 28

Showing 20 changed files on this page, with 636 additions and 231 deletions (+636 / -231).
clients/python/text_generation/client.py                                           +1   -1
clients/python/text_generation/types.py                                            +3   -1
integration-tests/models/test_flash_awq.py                                        +18   -9
integration-tests/models/test_flash_awq_sharded.py                                +20   -3
integration-tests/models/test_idefics.py                                           +1   -3
server/tests/utils/test_tokens.py                                                  +5   -2
server/text_generation_server/cli.py                                               +9   -4
server/text_generation_server/models/__init__.py                                  +14  -14
server/text_generation_server/models/causal_lm.py                                  +6   -3
server/text_generation_server/models/custom_modeling/bloom_modeling.py             +4   -1
server/text_generation_server/models/custom_modeling/flash_llama_modeling.py       +5   -1
server/text_generation_server/models/custom_modeling/idefics_image_processing.py  +22   -8
server/text_generation_server/models/custom_modeling/idefics_modeling.py         +274  -86
server/text_generation_server/models/custom_modeling/idefics_perceiver.py         +60  -29
server/text_generation_server/models/custom_modeling/idefics_processing.py        +35   -9
server/text_generation_server/models/custom_modeling/idefics_vision.py            +81  -24
server/text_generation_server/models/custom_modeling/neox_modeling.py              +4   -1
server/text_generation_server/models/flash_causal_lm.py                            +6   -3
server/text_generation_server/models/idefics_causal_lm.py                         +66  -28
server/text_generation_server/models/model.py                                      +2   -1
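The hunks below are mechanical re-formatting in the style of an opinionated Python formatter such as Black (the tool itself is not named in the commit message): long call sites and conditions are wrapped one element per line with a trailing comma, and wrapped calls that fit the line-length budget are collapsed back onto a single line. A minimal, self-contained sketch of the dominant pattern; the launch function and its arguments here are made up for illustration and are not part of the repository:

from typing import Optional

def launch(model_id: str, num_shard: int = 1, quantize: Optional[str] = None) -> dict:
    # Toy stand-in used only to show the formatting change.
    return {"model_id": model_id, "num_shard": num_shard, "quantize": quantize}

# Before: everything on one line, no trailing comma.
handle = launch("example/model-awq", num_shard=1, quantize="awq")

# After: the call exceeds the line-length budget, so each argument moves to its
# own line and gains a trailing ("magic") comma.
handle = launch(
    "example/model-awq",
    num_shard=1,
    quantize="awq",
)
print(handle)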
clients/python/text_generation/client.py

@@ -137,7 +137,7 @@ class Client:
             typical_p=typical_p,
             watermark=watermark,
             decoder_input_details=decoder_input_details,
-            top_n_tokens=top_n_tokens
+            top_n_tokens=top_n_tokens,
         )
         request = Request(inputs=prompt, stream=False, parameters=parameters)
clients/python/text_generation/types.py

@@ -133,7 +133,9 @@ class Request(BaseModel):
             and parameters.best_of > 1
             and field_value
         ):
-            raise ValidationError("`best_of` != 1 is not supported when `stream` == True")
+            raise ValidationError(
+                "`best_of` != 1 is not supported when `stream` == True"
+            )
         return field_value
integration-tests/models/test_flash_awq.py

@@ -3,7 +3,11 @@ import pytest
 @pytest.fixture(scope="module")
 def flash_llama_awq_handle(launcher):
-    with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=1, quantize="awq") as handle:
+    with launcher(
+        "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
+        num_shard=1,
+        quantize="awq",
+    ) as handle:
         yield handle

@@ -12,6 +16,7 @@ async def flash_llama_awq(flash_llama_awq_handle):
     await flash_llama_awq_handle.health(300)
     return flash_llama_awq_handle.client

+
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_awq(flash_llama_awq, response_snapshot):

@@ -20,11 +25,13 @@ async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
     )

     assert response.details.generated_tokens == 10
-    assert response.generated_text == "\nWhat is the difference between Deep Learning and Machine"
+    assert (
+        response.generated_text
+        == "\nWhat is the difference between Deep Learning and Machine"
+    )
     assert response == response_snapshot

 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):

@@ -49,16 +56,18 @@ async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot):
+async def test_flash_llama_awq_load(
+    flash_llama_awq, generate_load, response_snapshot
+):
     responses = await generate_load(
         flash_llama_awq, "What is Deep Learning?", max_new_tokens=10, n=4
     )

     assert len(responses) == 4
-    assert all([r.generated_text == "\nWhat is the difference between Deep Learning and Machine" for r in responses])
+    assert all(
+        [
+            r.generated_text
+            == "\nWhat is the difference between Deep Learning and Machine"
+            for r in responses
+        ]
+    )

     assert responses == response_snapshot
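For context, the handle/client pair above follows pytest's module-scoped fixture pattern: the launcher starts the server once per test module and every test in the module reuses the same client. A reduced sketch of that shape, with a stand-in context manager instead of the repository's real launcher fixture:

import contextlib

import pytest


@contextlib.contextmanager
def launcher(model_id, num_shard=1, quantize=None):
    # Stand-in: the real fixture would start the server here and shut it down
    # when the context exits.
    yield {"model_id": model_id, "num_shard": num_shard, "quantize": quantize}


@pytest.fixture(scope="module")
def model_handle():
    with launcher("example/model", num_shard=1, quantize="awq") as handle:
        yield handle


def test_quantize_mode(model_handle):
    assert model_handle["quantize"] == "awq"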
integration-tests/models/test_flash_awq_sharded.py

 import pytest


 @pytest.fixture(scope="module")
 def flash_llama_awq_handle_sharded(launcher):
-    with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle:
+    with launcher(
+        "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
+        num_shard=2,
+        quantize="awq",
+    ) as handle:
         yield handle


 @pytest.fixture(scope="module")
 async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
     await flash_llama_awq_handle_sharded.health(300)
     return flash_llama_awq_handle_sharded.client


 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot):

@@ -18,9 +25,13 @@ async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapsho
     )

     assert response.details.generated_tokens == 10
-    assert response.generated_text == "\nWhat is the difference between Deep Learning and Machine"
+    assert (
+        response.generated_text
+        == "\nWhat is the difference between Deep Learning and Machine"
+    )
     assert response == response_snapshot

 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_awq_load_sharded(

@@ -31,6 +42,12 @@ async def test_flash_llama_awq_load_sharded(
     )

     assert len(responses) == 4
-    assert all([r.generated_text == "\nWhat is the difference between Deep Learning and Machine" for r in responses])
+    assert all(
+        [
+            r.generated_text
+            == "\nWhat is the difference between Deep Learning and Machine"
+            for r in responses
+        ]
+    )

     assert responses == response_snapshot
integration-tests/models/test_idefics.py

@@ -3,9 +3,7 @@ import pytest
 @pytest.fixture(scope="module")
 def idefics_handle(launcher):
-    with launcher(
-        "HuggingFaceM4/idefics-9b-instruct", num_shard=2
-    ) as handle:
+    with launcher("HuggingFaceM4/idefics-9b-instruct", num_shard=2) as handle:
         yield handle
server/tests/utils/test_tokens.py

@@ -45,12 +45,15 @@ def test_stopping_criteria_max():
     assert criteria(1, "") == (False, None)
     assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH)

+
 def test_batch_top_tokens():
     top_n_tokens = [0, 2, 3, 4, 5]
     top_n_tokens_tensor = torch.tensor(top_n_tokens)
-    inp_logprobs = torch.tensor([[-1., -3., -4., -2., -3.]] * 5)
+    inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5)

-    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(top_n_tokens, top_n_tokens_tensor, inp_logprobs)
+    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
+        top_n_tokens, top_n_tokens_tensor, inp_logprobs
+    )

     assert topn_tok_ids[0] == []
     assert topn_tok_ids[1] == [0, 3]
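As a sanity check on the expected values in this test: with per-token log-probs [-1.0, -3.0, -4.0, -2.0, -3.0], the two largest entries sit at indices 0 and 3, which is why the row with top_n_tokens = 2 yields [0, 3]. A minimal illustration of that selection (not the server's batch_top_tokens implementation):

import torch

logprobs = torch.tensor([-1.0, -3.0, -4.0, -2.0, -3.0])
values, indices = torch.topk(logprobs, k=2)  # two highest log-probs
print(indices.tolist())  # [0, 3]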
server/text_generation_server/cli.py

@@ -125,8 +125,12 @@ def download_weights(
     if not is_local_model:
         try:
-            adapter_config_filename = hf_hub_download(model_id, revision=revision, filename="adapter_config.json")
-            utils.download_and_unload_peft(model_id, revision, trust_remote_code=trust_remote_code)
+            adapter_config_filename = hf_hub_download(
+                model_id, revision=revision, filename="adapter_config.json"
+            )
+            utils.download_and_unload_peft(
+                model_id, revision, trust_remote_code=trust_remote_code
+            )
             is_local_model = True
             utils.weight_files(model_id, revision, extension)
             return

@@ -179,11 +183,12 @@ def download_weights(
         import transformers
         import json

         if is_local_model:
             config_filename = os.path.join(model_id, "config.json")
         else:
-            config_filename = hf_hub_download(model_id, revision=revision, filename="config.json")
+            config_filename = hf_hub_download(
+                model_id, revision=revision, filename="config.json"
+            )
         with open(config_filename, "r") as f:
             config = json.load(f)
         architecture = config["architectures"][0]
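The second hunk above resolves config.json differently for local checkouts and Hub repositories. A condensed sketch of that resolution logic; it is simplified from the function above (it tests os.path.isdir directly where the CLI carries its own is_local_model flag, and error handling is omitted):

import json
import os
from typing import Optional

from huggingface_hub import hf_hub_download


def load_config(model_id: str, revision: Optional[str] = None) -> dict:
    # Local directory: read config.json straight from disk; otherwise fetch it
    # from the Hugging Face Hub.
    if os.path.isdir(model_id):
        config_filename = os.path.join(model_id, "config.json")
    else:
        config_filename = hf_hub_download(
            model_id, revision=revision, filename="config.json"
        )
    with open(config_filename, "r") as f:
        return json.load(f)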
server/text_generation_server/models/__init__.py

@@ -153,7 +153,11 @@ def get_model(
         )
     elif model_type == "mpt":
         return MPTSharded(
-            model_id, revision, quantize=quantize, dtype=dtype, trust_remote_code=trust_remote_code
+            model_id,
+            revision,
+            quantize=quantize,
+            dtype=dtype,
+            trust_remote_code=trust_remote_code,
         )
     elif model_type == "gpt_neox":

@@ -269,13 +273,9 @@ def get_model(
             "gptq quantization is not supported for AutoModel, you can try to quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
         )
     if quantize == "awq":
-        raise ValueError(
-            "awq quantization is not supported for AutoModel"
-        )
+        raise ValueError("awq quantization is not supported for AutoModel")
     elif (quantize == "bitsandbytes-fp4") or (quantize == "bitsandbytes-nf4"):
-        raise ValueError(
-            "4bit quantization is not supported for AutoModel"
-        )
+        raise ValueError("4bit quantization is not supported for AutoModel")
     if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
         return CausalLM(
             model_id,
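Only the formatting of the guard rails changes in the second hunk; the rule they enforce for the AutoModel fallback path stays the same. Condensed into a stand-alone sketch (not the full get_model dispatch):

def check_quantize_for_automodel(quantize):
    # Modes rejected for the AutoModel fallback, taken from the checks above.
    if quantize == "gptq":
        raise ValueError(
            "gptq quantization is not supported for AutoModel, you can try to quantize "
            "it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
        )
    if quantize == "awq":
        raise ValueError("awq quantization is not supported for AutoModel")
    if quantize in ("bitsandbytes-fp4", "bitsandbytes-nf4"):
        raise ValueError("4bit quantization is not supported for AutoModel")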
server/text_generation_server/models/causal_lm.py

@@ -643,9 +643,12 @@ class CausalLM(Model):
                 # Decode generated tokens
                 output_text, _, _ = self.decode_token(
                     all_input_ids[:, 0],
-                    prefix_offset=len(all_input_ids) - stopping_criteria.current_tokens - 1,
-                    read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
-                    skip_special_tokens=True
+                    prefix_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens
+                    - 1,
+                    read_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens,
+                    skip_special_tokens=True,
                 )
                 # Get seed
                 if isinstance(next_token_chooser.choice, Sampling):
server/text_generation_server/models/custom_modeling/bloom_modeling.py

@@ -40,7 +40,10 @@ from text_generation_server.utils.layers import (
 )

 CUSTOM_KERNELS_ENABLED = False
-if torch.cuda.is_available() and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if (
+    torch.cuda.is_available()
+    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
+):
     try:
         from custom_kernels import fused_bloom_attention_cuda
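The only change here is wrapping the condition in parentheses; the behaviour is unchanged: the fused kernels are attempted only when CUDA is available and the DISABLE_CUSTOM_KERNELS environment variable is not set to "True". The gate in isolation, as a tiny sketch:

import os


def custom_kernels_allowed(cuda_available: bool) -> bool:
    # Mirrors the condition above: CUDA present and the kill-switch not set.
    return (
        cuda_available
        and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
    )


print(custom_kernels_allowed(True))  # False only when DISABLE_CUSTOM_KERNELS=True is exported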
server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

@@ -169,6 +169,7 @@ def load_attention(config, prefix, weights):
         bias=False,
     )

+
 def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
     assert config.num_attention_heads % weights.process_group.size() == 0

@@ -211,7 +212,10 @@ class FlashLlamaAttention(torch.nn.Module):
         # config=config, prefix=f"{prefix}.rotary_emb", weights=weights
         # )
         self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=config.rope_theta, device=weights.device
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
         )

         self.softmax_scale = self.head_size**-0.5
server/text_generation_server/models/custom_modeling/idefics_image_processing.py

@@ -20,7 +20,12 @@ import numpy as np
 from PIL import Image

 from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
-from transformers.image_transforms import resize, to_channel_dimension_format, rescale, normalize
+from transformers.image_transforms import (
+    resize,
+    to_channel_dimension_format,
+    rescale,
+    normalize,
+)
 from transformers.image_utils import (
     ChannelDimension,
     ImageInput,

@@ -121,7 +126,11 @@ class IdeficsImageProcessor(BaseImageProcessor):
             a PyTorch tensor of the processed images
         """

         image_size = image_size if image_size is not None else self.image_size
-        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+        image_num_channels = (
+            image_num_channels
+            if image_num_channels is not None
+            else self.image_num_channels
+        )
         image_mean = image_mean if image_mean is not None else self.image_mean
         image_std = image_std if image_std is not None else self.image_std
         size = (image_size, image_size)

@@ -160,9 +169,13 @@ class IdeficsImageProcessor(BaseImageProcessor):
         images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
         images = [self.rescale(image=image, scale=1 / 255) for image in images]
         images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
-        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+        images = [
+            to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images
+        ]
         # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
-        images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
+        images = BatchFeature(
+            data={"pixel_values": images}, tensor_type=TensorType.PYTORCH
+        )["pixel_values"]

         return images

@@ -185,7 +198,9 @@ class IdeficsImageProcessor(BaseImageProcessor):
                 response.raise_for_status()
                 return Image.open(BytesIO(response.content))
             else:
-                raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
+                raise ValueError(
+                    f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}"
+                )

     def rescale(
         self,

@@ -255,10 +270,9 @@ class IdeficsImageProcessor(BaseImageProcessor):
             `np.ndarray`: The normalized image.
         """
         # TODO 4.32
-        return normalize(
-            image, mean=mean, std=std, data_format=data_format, **kwargs
-        )
+        return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)


 import transformers

 transformers.IdeficsImageProcessor = IdeficsImageProcessor
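The preprocessing order visible in the third hunk is: resize, rescale by 1/255, normalize, then convert to channels-first before the results are batched into PyTorch tensors. A NumPy-only illustration of the last three steps on a dummy image; the mean/std values below are placeholders and the resize step is skipped:

import numpy as np

image = np.random.randint(0, 256, size=(224, 224, 3)).astype(np.float32)

image = image * (1 / 255)                  # rescale to [0, 1]
mean = np.array([0.5, 0.5, 0.5])           # placeholder statistics
std = np.array([0.5, 0.5, 0.5])
image = (image - mean) / std               # normalize per channel
image = np.transpose(image, (2, 0, 1))     # channels-first: (C, H, W)

print(image.shape)  # (3, 224, 224)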
server/text_generation_server/models/custom_modeling/idefics_modeling.py

(This diff is collapsed in the web view and is not reproduced here; per the summary above it carries +274 additions and -86 deletions.)
server/text_generation_server/models/custom_modeling/idefics_perceiver.py

@@ -46,7 +46,8 @@ from text_generation_server.utils.layers import (
     TensorParallelRowLinear,
 )

-EPS = 1e-5
+EPS = 1e-5
+
 class IdeficsPerceiverResampler(nn.Module):
     def __init__(

@@ -78,7 +79,12 @@ class IdeficsPerceiverResampler(nn.Module):
         """
         super().__init__()
-        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
+        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = (
+            embed_dim,
+            n_heads,
+            head_dim,
+            n_latents,
+        )
         self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

         # Create Latents for Perceiver

@@ -107,14 +113,16 @@ class IdeficsPerceiverResampler(nn.Module):
                             prefix=f"{prefix}.blocks.{layer_id}.1",
                             intermediate_size=self.intermediate_dim,
                             config=config,
-                            weights=weights
+                            weights=weights,
                         ),
                     ]
                 )
                 for layer_id in range(depth)
             ]
         )

-        self.layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS)
+        self.layer_norm = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS
+        )

     def forward(self, context: torch.Tensor) -> torch.Tensor:
         """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""

@@ -130,25 +138,34 @@ class IdeficsPerceiverResampler(nn.Module):
 class IdeficsPerceiverAttention(nn.Module):
-    def __init__(self,
+    def __init__(
+        self,
         prefix,
         config,
         embed_dim: int,
         n_heads: int,
         head_dim: int,
         qk_layer_norms: bool,
-        weights
+        weights,
     ) -> None:
         """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
         super().__init__()
         self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
         self.qk_layer_norms = qk_layer_norms
         # Normalization & Scaling
-        self.context_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS)
-        self.latents_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS)
+        self.context_layer_norm = nn.LayerNorm.load(
+            prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS
+        )
+        self.latents_layer_norm = nn.LayerNorm.load(
+            prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS
+        )
         if self.qk_layer_norms:
-            self.q_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS)
-            self.k_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS)
+            self.q_layer_norm = nn.LayerNorm.load(
+                prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS
+            )
+            self.k_layer_norm = nn.LayerNorm.load(
+                prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS
+            )

         self.qk_scale = self.head_dim**-0.5

@@ -202,7 +219,12 @@ class IdeficsPerceiverAttention(nn.Module):
         # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
         # =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
         # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
-        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
+        q, k, v = [
+            x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(
+                1, 2
+            )
+            for x in (q, k, v)
+        ]

         if self.qk_layer_norms:
             q = self.q_layer_norm(q)

@@ -219,7 +241,8 @@ class IdeficsPerceiverAttention(nn.Module):
 class IdeficsMLP(nn.Module):
-    def __init__(self,
+    def __init__(
+        self,
         prefix,
         intermediate_size,
         config,

@@ -230,14 +253,22 @@ class IdeficsMLP(nn.Module):
         self.embed_dim = config.vision_config.embed_dim
         self.ln = nn.LayerNorm.load(prefix=f"{prefix}.ln", weights=weights, eps=EPS)
         self.fc = TensorParallelColumnLinear.load(
-            config=config, prefix=f"{prefix}.fc", weights=weights, bias=False,
+            config=config,
+            prefix=f"{prefix}.fc",
+            weights=weights,
+            bias=False,
         )
         self.act = nn.ReLU()
         self.c_proj = TensorParallelRowLinear.load(
-            config=config, prefix=f"{prefix}.c_proj", weights=weights, bias=False,
+            config=config,
+            prefix=f"{prefix}.c_proj",
+            weights=weights,
+            bias=False,
         )

-    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+    def forward(
+        self, hidden_states: Optional[Tuple[torch.FloatTensor]]
+    ) -> torch.FloatTensor:
         hidden_states = self.ln(hidden_states)
         hidden_states = self.fc(hidden_states)
         hidden_states = self.act(hidden_states)
server/text_generation_server/models/custom_modeling/idefics_processing.py

@@ -21,9 +21,16 @@ from urllib.parse import urlparse
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from transformers.tokenization_utils_base import (
+    BatchEncoding,
+    PaddingStrategy,
+    TextInput,
+    TruncationStrategy,
+)
 from transformers.utils import TensorType, is_torch_available
-from text_generation_server.models.custom_modeling.idefics_image_processing import IdeficsImageProcessor
+from text_generation_server.models.custom_modeling.idefics_image_processing import (
+    IdeficsImageProcessor,
+)


 if is_torch_available():

@@ -124,7 +131,14 @@ class IdeficsProcessor(ProcessorMixin):
     image_processor_class = "IdeficsImageProcessor"
     tokenizer_class = "LlamaTokenizerFast"

-    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
+    def __init__(
+        self,
+        image_processor,
+        tokenizer=None,
+        image_size=224,
+        add_end_of_utterance_token=None,
+        **kwargs,
+    ):
         if image_processor is None:
             raise ValueError("You need to specify an `image_processor`.")
         if tokenizer is None:

@@ -142,7 +156,8 @@ class IdeficsProcessor(ProcessorMixin):
         self.tokenizer_was_trained_with_end_of_utterance_token = (
             True
-            if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+            if "<end_of_utterance>"
+            in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
             else False
         )

@@ -265,7 +280,9 @@ class IdeficsProcessor(ProcessorMixin):
         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
-            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
+            add_end_of_utterance_token = (
+                self.tokenizer_was_trained_with_end_of_utterance_token
+            )

         # turn non-batched prompts into batched
         if not any(isinstance(i, list) for i in prompts):

@@ -358,10 +375,14 @@ class IdeficsProcessor(ProcessorMixin):
             current_images = images[:local_max_num_images]

             if len(current_images) > 0:
-                padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
+                padded_image_tensor = torch.zeros(
+                    max_num_images, *current_images.size()[1:]
+                )
                 padded_image_tensor[: current_images.size(0)] = current_images
             else:
-                padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
+                padded_image_tensor = torch.zeros(
+                    max_num_images, *self.default_image_dims
+                )

             output_images.append(padded_image_tensor)
             output_input_ids.append(torch.tensor(padded_input_ids))

@@ -373,14 +394,19 @@ class IdeficsProcessor(ProcessorMixin):
         output_attention_masks = torch.stack(output_attention_masks)

         if at_least_one_image:
-            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(output_input_ids, self.tokenizer)
+            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
+                output_input_ids, self.tokenizer
+            )
             image_attention_mask = incremental_to_binary_attention_mask(
                 image_attention_mask, num_classes=max_num_images
             )
         else:
             # in full language mode we set the image mask to all-0s
             image_attention_mask = torch.zeros(
-                output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
+                output_input_ids.shape[0],
+                output_input_ids.shape[1],
+                1,
+                dtype=torch.bool,
             )

         return BatchFeature(
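The fifth hunk above pads each sample's images up to the batch-wide maximum number of images. In isolation the step looks like this; the shapes are chosen to match the 224x224 pixel values used elsewhere in the file:

import torch

max_num_images = 4
current_images = torch.ones(2, 3, 224, 224)  # this sample carries two images

# Copy the real images into a zero tensor sized for the batch-wide maximum.
padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
padded_image_tensor[: current_images.size(0)] = current_images

print(padded_image_tensor.shape)  # torch.Size([4, 3, 224, 224])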
server/text_generation_server/models/custom_modeling/idefics_vision.py

@@ -75,7 +75,9 @@ class IdeficsVisionEmbeddings(nn.Module):
         self.image_size = config.image_size
         self.patch_size = config.patch_size

-        self.class_embedding = nn.Parameter(weights.get_tensor(f"{prefix}.class_embedding"))
+        self.class_embedding = nn.Parameter(
+            weights.get_tensor(f"{prefix}.class_embedding")
+        )

         self.patch_embedding = nn.Conv2d.load_no_bias(
             prefix=f"{prefix}.patch_embedding",

@@ -91,12 +93,16 @@ class IdeficsVisionEmbeddings(nn.Module):
         self.position_embedding = TensorParallelEmbedding(
             prefix="model.vision_model.embeddings.position_embedding", weights=weights
         )
-        self.position_ids = torch.arange(self.num_positions).expand((1, -1)).to(device=weights.device)
+        self.position_ids = (
+            torch.arange(self.num_positions).expand((1, -1)).to(device=weights.device)
+        )

     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]

         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

         class_embeds = self.class_embedding.expand(batch_size, 1, -1)

@@ -132,7 +138,6 @@ class IdeficsVisionAttention(nn.Module):
         self.num_heads = self.num_heads // weights.process_group.size()
         self.embed_dim = self.embed_dim // weights.process_group.size()
-
         self.k_proj = TensorParallelColumnLinear.load(
             config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
         )

@@ -147,7 +152,11 @@ class IdeficsVisionAttention(nn.Module):
         )

     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )

     def forward(
         self,
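_shape above performs the usual multi-head split: a tensor of shape (bsz, seq_len, num_heads * head_dim) becomes (bsz, num_heads, seq_len, head_dim). The same reshape stand-alone, with made-up sizes:

import torch

bsz, seq_len, num_heads, head_dim = 2, 5, 4, 8
tensor = torch.randn(bsz, seq_len, num_heads * head_dim)

# Split the last dimension into heads, then move the head axis forward.
shaped = tensor.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(shaped.shape)  # torch.Size([2, 4, 5, 8])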
@@ -186,7 +195,10 @@ class IdeficsVisionAttention(nn.Module):
                 f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                 f" {causal_attention_mask.size()}"
             )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + causal_attention_mask
+            )
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

         if attention_mask is not None:

@@ -194,7 +206,10 @@ class IdeficsVisionAttention(nn.Module):
                 raise ValueError(
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + attention_mask
+            )
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

         attn_weights = nn.functional.softmax(attn_weights, dim=-1)

@@ -204,12 +219,18 @@ class IdeficsVisionAttention(nn.Module):
             # make sure that attn_weights keeps its gradient.
             # In order to do so, attn_weights have to reshaped
             # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights_reshaped = attn_weights.view(
+                bsz, self.num_heads, tgt_len, src_len
+            )
+            attn_weights = attn_weights_reshaped.view(
+                bsz * self.num_heads, tgt_len, src_len
+            )
         else:
             attn_weights_reshaped = None

-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )

         attn_output = torch.bmm(attn_probs, value_states)

@@ -253,11 +274,15 @@ class IdeficsVisionEncoderLayer(nn.Module):
     def __init__(self, prefix, config, weights):
         super().__init__()
         self.embed_dim = config.hidden_size
-        self.self_attn = IdeficsVisionAttention(prefix=f"{prefix}.self_attn", config=config, weights=weights)
+        self.self_attn = IdeficsVisionAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
         self.layer_norm1 = nn.LayerNorm.load(
             prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
         )
-        self.mlp = IdeficsVisionMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+        self.mlp = IdeficsVisionMLP(
+            prefix=f"{prefix}.mlp", config=config, weights=weights
+        )
         self.layer_norm2 = nn.LayerNorm.load(
             prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
         )

@@ -318,7 +343,11 @@ class IdeficsVisionEncoder(nn.Module):
         self.config = config
         self.layers = nn.ModuleList(
             [
-                IdeficsVisionEncoderLayer(prefix=f"{prefix}.encoder.layers.{layer_id}", config=config, weights=weights)
+                IdeficsVisionEncoderLayer(
+                    prefix=f"{prefix}.encoder.layers.{layer_id}",
+                    config=config,
+                    weights=weights,
+                )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )

@@ -362,11 +391,19 @@ class IdeficsVisionEncoder(nn.Module):
             return_dict (`bool`, *optional*):
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )

         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

@@ -406,9 +443,15 @@ class IdeficsVisionEncoder(nn.Module):
             encoder_states = encoder_states + (hidden_states,)
         if not return_dict:
-            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+            return tuple(
+                v
+                for v in [hidden_states, encoder_states, all_attentions]
+                if v is not None
+            )
         return BaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
         )

@@ -419,13 +462,19 @@ class IdeficsVisionTransformer(nn.Module):
         self.config = config
         embed_dim = config.hidden_size

-        self.embeddings = IdeficsVisionEmbeddings(prefix=f"{prefix}.embeddings", config=config, weights=weights)
+        self.embeddings = IdeficsVisionEmbeddings(
+            prefix=f"{prefix}.embeddings", config=config, weights=weights
+        )
         self.pre_layrnorm = nn.LayerNorm.load(
             prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
         )
-        self.encoder = IdeficsVisionEncoder(prefix=prefix, config=config, weights=weights)
+        self.encoder = IdeficsVisionEncoder(
+            prefix=prefix, config=config, weights=weights
+        )
         self.post_layernorm = nn.LayerNorm.load(
-            prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps
+            prefix=f"{prefix}.post_layernorm",
+            weights=weights,
+            eps=config.layer_norm_eps,
         )

     # copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward

@@ -440,11 +489,19 @@ class IdeficsVisionTransformer(nn.Module):
         Returns:

         """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )

         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
server/text_generation_server/models/custom_modeling/neox_modeling.py

@@ -49,7 +49,10 @@ from text_generation_server.utils.layers import (

 CUSTOM_KERNELS_ENABLED = False
-if torch.cuda.is_available() and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if (
+    torch.cuda.is_available()
+    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
+):
     try:
         from custom_kernels import fused_attention_cuda
server/text_generation_server/models/flash_causal_lm.py

@@ -1005,9 +1005,12 @@ class FlashCausalLM(Model):
                 # Decode generated tokens
                 output_text, _, _ = self.decode_token(
                     all_input_ids,
-                    prefix_offset=len(all_input_ids) - stopping_criteria.current_tokens - 1,
-                    read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
-                    skip_special_tokens=True
+                    prefix_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens
+                    - 1,
+                    read_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens,
+                    skip_special_tokens=True,
                 )
                 generated_text = GeneratedText(
                     output_text,
server/text_generation_server/models/idefics_causal_lm.py

@@ -8,7 +8,13 @@ import re
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase, ProcessorMixin
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
 from typing import Optional, Tuple, List, Type, Dict

 from text_generation_server.models import Model

@@ -23,7 +29,8 @@ from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sam
 import re

-IMAGES = re.compile(r'!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)')
+IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)")
+

 def split(string):
     parts = []
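Only the quoting style of the IMAGES pattern changes here. The pattern itself pulls the URL and an optional quoted title out of markdown image syntax, for example (the URL below is made up for illustration):

import re

IMAGES = re.compile(r"!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)")

match = IMAGES.search('See ![a cat](https://example.com/cat.png "A cat") here.')
print(match.group(1))  # https://example.com/cat.png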
@@ -41,6 +48,7 @@ def split(string):
     return parts

+
 tracer = trace.get_tracer(__name__)

@@ -141,8 +149,12 @@ class IdeficsCausalLMBatch(Batch):
         ).to(device)
         for _ in pb.requests:
             input_len = tokenized_inputs["input_ids"].shape[1]
-            prefix_offsets.append(input_len - 5)  # To decode without potential fallbacks errors
-            read_offsets.append(input_len)  # To decode without potential fallbacks errors
+            prefix_offsets.append(
+                input_len - 5
+            )  # To decode without potential fallbacks errors
+            read_offsets.append(
+                input_len
+            )  # To decode without potential fallbacks errors

         input_lengths = tokenized_inputs["attention_mask"].sum(1)
         max_input_length = input_lengths.max()

@@ -158,14 +170,21 @@ class IdeficsCausalLMBatch(Batch):
         attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]
         # Do the same for image_attention_mask
         image_attention_mask = input_ids.new_zeros(
-            (pb.size, max_input_length + padding_right_offset, tokenized_inputs["pixel_values"].size(1))
+            (
+                pb.size,
+                max_input_length + padding_right_offset,
+                tokenized_inputs["pixel_values"].size(1),
+            )
         )
-        image_attention_mask[:, :max_input_length, :] = tokenized_inputs["image_attention_mask"]
+        image_attention_mask[:, :max_input_length, :] = tokenized_inputs[
+            "image_attention_mask"
+        ]

         position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
         position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
-        all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)  # It's input_ids but splitted into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list
+        all_input_ids = tokenized_inputs["input_ids"].T.split(
+            1, dim=1
+        )  # It's input_ids but splitted into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list

         max_tokens = len(inputs) * (max_input_length + max_decode_tokens)

@@ -259,7 +278,7 @@ class IdeficsCausalLMBatch(Batch):
                 self.image_attention_mask.shape[1] - self.padding_right_offset
             )
             + new_padding_right_offset,
-            :
+            :,
         ]

         if self.image_hidden_states is None:
             image_hidden_states = None

@@ -308,7 +327,9 @@ class IdeficsCausalLMBatch(Batch):
     @classmethod
     @tracer.start_as_current_span("concatenate")
-    def concatenate(cls, batches: List["IdeficsCausalLMBatch"]) -> "IdeficsCausalLMBatch":
+    def concatenate(
+        cls, batches: List["IdeficsCausalLMBatch"]
+    ) -> "IdeficsCausalLMBatch":
         # It adds new requests to the batch
         # Used for padding
         total_batch_size = 0

@@ -383,12 +404,20 @@ class IdeficsCausalLMBatch(Batch):
             curr_batch_max_num_images = batch.pixel_values.size(1)
             if pixel_values is None:
-                pixel_values = batch.pixel_values.new_zeros((total_batch_size, max_num_images, 3, 224, 224))
-                pixel_values[start_index:end_index, :curr_batch_max_num_images] = batch.pixel_values
+                pixel_values = batch.pixel_values.new_zeros(
+                    (total_batch_size, max_num_images, 3, 224, 224)
+                )
+                pixel_values[
+                    start_index:end_index, :curr_batch_max_num_images
+                ] = batch.pixel_values

             if image_attention_mask is None:
                 image_attention_mask = batch.image_attention_mask.new_zeros(
-                    (total_batch_size, max_input_length + padding_right_offset, max_num_images)
+                    (
+                        total_batch_size,
+                        max_input_length + padding_right_offset,
+                        max_num_images,
+                    )
                 )

             # We need to slice the attention mask to remove padding from previous steps

@@ -409,11 +438,9 @@ class IdeficsCausalLMBatch(Batch):
             image_attention_mask[
                 start_index:end_index,
                 left_offset:-padding_right_offset,
-                :curr_batch_max_num_images
+                :curr_batch_max_num_images,
             ] = batch.image_attention_mask[
-                :,
-                batch_left_offset : -batch.padding_right_offset,
-                :
+                :, batch_left_offset : -batch.padding_right_offset, :
             ]

             # Create empty tensor

@@ -550,7 +577,9 @@ class IdeficsCausalLM(Model):
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
-        from text_generation_server.models.custom_modeling.idefics_modeling import IdeficsForVisionText2Text
+        from text_generation_server.models.custom_modeling.idefics_modeling import (
+            IdeficsForVisionText2Text,
+        )

         if torch.cuda.is_available():
             device = torch.device("cuda")

@@ -650,9 +679,13 @@ class IdeficsCausalLM(Model):
             # this is due to the nature IDEFICS: it's an encoder decoder, and so when decoding, only the currently generated
             # token need to attend to the encoder hidden states (i.e. the vision encoder)
             # Also see seq2seq_lm.Seq2SeqLM.generate_token which has roughly the same logic
-            image_attention_mask = batch.image_attention_mask[:, -(batch.padding_right_offset + 1)].unsqueeze(1)
+            image_attention_mask = batch.image_attention_mask[
+                :, -(batch.padding_right_offset + 1)
+            ].unsqueeze(1)
         else:
-            image_attention_mask = batch.image_attention_mask[:, : -batch.padding_right_offset]
+            image_attention_mask = batch.image_attention_mask[
+                :, : -batch.padding_right_offset
+            ]

         logits, past, image_hidden_states = self.forward(
             input_ids=batch.input_ids,

@@ -725,9 +758,12 @@ class IdeficsCausalLM(Model):
                 # Decode generated tokens
                 output_text, _, _ = self.decode_token(
                     all_input_ids[:, 0],
-                    prefix_offset=len(all_input_ids) - stopping_criteria.current_tokens - 1,
-                    read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
-                    skip_special_tokens=True
+                    prefix_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens
+                    - 1,
+                    read_offset=len(all_input_ids)
+                    - stopping_criteria.current_tokens,
+                    skip_special_tokens=True,
                 )
                 # Get seed
                 if isinstance(next_token_chooser.choice, Sampling):

@@ -761,7 +797,7 @@ class IdeficsCausalLM(Model):
             else:
                 prefill_tokens = None

-            top_tokens=None
+            top_tokens = None

             generation = Generation(
                 request.id,

@@ -771,7 +807,7 @@ class IdeficsCausalLM(Model):
                 next_token_text,
                 next_token_id_squeezed.item() in self.all_special_ids,
                 generated_text,
-                top_tokens
+                top_tokens,
             )

             generations.append(generation)

@@ -793,7 +829,9 @@ class IdeficsCausalLM(Model):
             # Update attention_mask as we added a new token to input_ids
             batch.attention_mask[:, -batch.padding_right_offset] = 1
-            batch.image_attention_mask[:, -batch.padding_right_offset, :] = batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :]
+            batch.image_attention_mask[
+                :, -batch.padding_right_offset, :
+            ] = batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :]
             # Decrease right offset
             batch.padding_right_offset -= 1
server/text_generation_server/models/model.py

@@ -71,7 +71,8 @@ class Model(ABC):
         # The prefix text is necessary only to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
         prefix_text = self.tokenizer.decode(
-            all_input_ids[prefix_offset:read_offset], skip_special_tokens=skip_special_tokens
+            all_input_ids[prefix_offset:read_offset],
+            skip_special_tokens=skip_special_tokens,
         )
         new_text = self.tokenizer.decode(
             all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
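The comment in this hunk explains why decode_token decodes a short prefix window along with the new ids: the tokenizer's cleanup heuristics need surrounding context to decide whether to add a leading space, and the prefix text is stripped afterwards. A toy version of that pattern, with a trivial stand-in for the tokenizer (the real method also tracks and returns offsets):

def incremental_decode(decode, all_ids, prefix_offset, read_offset):
    # Decode with a little left context, then drop that context's text so only
    # the newly generated text remains.
    prefix_text = decode(all_ids[prefix_offset:read_offset])
    full_text = decode(all_ids[prefix_offset:])
    return full_text[len(prefix_text):]


def toy_decode(ids):
    return "".join(ids)


print(incremental_decode(toy_decode, ["Hel", "lo", " wor", "ld"], 0, 2))  # " world"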