Unverified commit 9a092f37 authored by David Corvoysier, committed by GitHub

Update neuron backend (#2314)

* feat(neuron): align with latest optimum-neuron

* feat(neuron): support pre-exported neuron models (see the export sketch below)

* fix(neuron): correctly use max_length

* fix(neuron): adapt loglikelihood

The evaluation of log-likelihood was not working for neuron models that use
continuous batching, such as all cached neuron Llama models (a minimal sketch of the fix follows this list).

* refactor(neuron): remove dead code
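As a companion to the loglikelihood fix described above, here is a minimal sketch of the scoring loop the change introduces for models that only return logits for the last input position: prefill with the first token, then decode one position at a time while reusing the KV cache, and concatenate the per-step logits before the log-softmax. It assumes a model exposing `prepare_inputs_for_prefill`, `prepare_inputs_for_decode`, and a `forward()` whose output has a `.logits` attribute, as the optimum-neuron models in the diff below do; it is an illustration, not the exact harness code.

```python
import torch
import torch.nn.functional as F


def sequence_logprobs(model, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Sketch: accumulate logits token by token when the compiled model only
    returns logits for the last position (the continuous-batching case).

    `model` is assumed to expose prepare_inputs_for_prefill / prepare_inputs_for_decode
    and a forward() returning an object with a `.logits` attribute, as in optimum-neuron.
    """
    seq_len = input_ids.shape[1]
    # Prefill with the first token only, then decode the remaining positions,
    # letting the KV cache carry the context from step to step.
    inputs = model.prepare_inputs_for_prefill(input_ids[:, :1], attention_mask[:, :1])
    step_logits = [model.forward(**inputs).logits]
    for i in range(1, seq_len):
        inputs = model.prepare_inputs_for_decode(
            input_ids[:, : i + 1], attention_mask[:, : i + 1]
        )
        step_logits.append(model.forward(**inputs).logits)
    # [batch, seq_len, vocab] log-probabilities, ready for continuation scoring
    return F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
```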
parent 88ea85b4
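For the pre-exported model support mentioned above, the new code checks the `neuron` section of the model config and skips compilation when it is present. A model can be exported ahead of time roughly as follows; this is a hedged sketch: the model id, shapes, and output path are placeholders, and the keyword arguments mirror those used in the diff below.

```python
from optimum.neuron import NeuronModelForCausalLM

# Compile (export) the model once with static shapes on an inf2 instance.
# The exported model's config gains a `neuron` section, which the harness
# now detects in order to skip re-exporting at evaluation time.
model = NeuronModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder model id
    export=True,
    batch_size=1,            # placeholder static batch size
    sequence_length=2048,    # placeholder static sequence length
    num_cores=2,             # e.g. 2 for inf2.xlarge / inf2.8xlarge
    auto_cast_type="bf16",
)
model.save_pretrained("./tinyllama-neuron")  # placeholder output directory
```

The saved directory (or a cached neuron model on the Hub) can then be passed as `pretrained` to the harness, which takes the loading branch of the code below.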
 import copy
-import json
 import logging
-import subprocess
 from collections import defaultdict
 from typing import List, Optional, Union
@@ -33,54 +31,6 @@ except ImportError:
 logger = logging.getLogger(__name__)


-def get_nc_count() -> Union[int, None]:
-    """Returns the number of neuron cores on the current instance."""
-    try:
-        cmd = "neuron-ls --json-output"
-        result = subprocess.run(cmd, shell=True, capture_output=True)
-        print(f"inferring nc_count from `neuron-ls` {result.stdout}")
-        json_output = json.loads(result.stdout)
-        count = sum([x["nc_count"] for x in json_output])
-        print(f"nc_count={count}")
-        return count
-    except Exception:
-        return None
-
-
-def wrap_constant_batch_size(func):
-    def _decorator(self, input_ids):
-        """input_ids a 2D array with batch_size on dim=0
-
-        makes sure the func runs with self.batch_size
-        """
-        # access a from TestSample
-        batch_size = input_ids.shape[0]
-
-        if batch_size < self.batch_size:
-            # handle the event of input_ids.shape[0] != batch_size
-            # Neuron cores expect constant batch_size
-            input_ids = torch.concat(
-                (
-                    input_ids,
-                    # add missing_batch_size dummy
-                    torch.zeros(
-                        [self.batch_size - batch_size, *input_ids.size()[1:]],
-                        dtype=input_ids.dtype,
-                        device=input_ids.device,
-                    ),
-                ),
-                dim=0,
-            )
-        elif batch_size > self.batch_size:
-            raise ValueError(
-                f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
-            )
-        # return the forward pass that requires constant batch size
-        return func(self, input_ids)[:batch_size]
-
-    return _decorator
-
-
 class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
     """NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
@@ -146,7 +96,7 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
             raise ValueError(
                 f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
             )
-        elif batch_size < self.batch_size:
+        elif batch_size < self.batch_size and not self.continuous_batching:
             logger.warning(
                 "Inputs will be padded to match the model static batch size. This will increase latency."
             )
@@ -158,8 +108,6 @@ class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
         if attention_mask is not None:
             padding = torch.zeros(padding_shape, dtype=torch.int64)
             padded_attention_mask = torch.cat([attention_mask, padding])
-        # Drop the current generation context and clear the Key/Value cache
-        self.reset_generation()

         output_ids = self.generate_tokens(
             padded_input_ids,
@@ -179,8 +127,6 @@ class NEURON_HF(TemplateLM):
     Tested with neuron 2.17.0
     """

-    _DEFAULT_MAX_LENGTH = 2048
-
     def __init__(
         self,
         pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -203,7 +149,7 @@ class NEURON_HF(TemplateLM):
                 "please install neuron via pip install transformers-neuron ",
                 "also make sure you are running on an AWS inf2 instance",
             )
-        if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
+        if version.parse(optimum_neuron_version) != version.parse("0.0.24"):
             logger.warning(
                 '`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
                 "preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
@@ -217,35 +163,16 @@ class NEURON_HF(TemplateLM):
             self.batch_size_per_gpu = int(batch_size)
         batch_size = int(batch_size)
-        if tp_degree is None:
-            # execute `neuron-ls --json-output | jq '.[0].nc_count'`
-            # to get the number of neuron cores on your instance
-            tp_degree = get_nc_count()
-
-        assert isinstance(tp_degree, int), (
-            f"model_args must include tp_degree. tp_degree must be set to an integer,"
-            f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
-            "Set it to number of neuron cores on your instance."
-            " For inf2.xlarge and inf2.8xlarge, set it to `2`."
-            " For inf2.24xlarge, set it to `12`."
-            " For inf2.48xlarge, set it to `24`."
-        )
-
-        revision = str(revision)  # cast to string if not already one
-        # TODO: update this to be less of a hack once subfolder is fixed in HF
-        revision = revision + ("/" + subfolder if subfolder is not None else "")

         self._config = transformers.AutoConfig.from_pretrained(
             pretrained,
             revision=revision,
             trust_remote_code=trust_remote_code,
         )
-        torch_dtype = lm_eval.models.utils.get_dtype(dtype)

-        assert torch_dtype in [
-            torch.float16,
-            torch.bfloat16,
-        ], "Only float16 and bfloat16 are supported"
+        revision = str(revision)  # cast to string if not already one
+        # TODO: update this to be less of a hack once subfolder is fixed in HF
+        revision = revision + ("/" + subfolder if subfolder is not None else "")

         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
             pretrained if tokenizer is None else tokenizer,
@@ -254,7 +181,20 @@ class NEURON_HF(TemplateLM):
             use_fast=use_fast_tokenizer,
         )

-        # Neuron specific code
-        if torch_dtype == torch.float16:
-            self.amp_dtype = "f16"
-        elif torch_dtype == torch.bfloat16:
+        neuron_config = getattr(self._config, "neuron", None)
+        if neuron_config is None:
+            # Check export parameters
+            if tp_degree is not None:
+                assert isinstance(tp_degree, int), (
+                    f"tp_degree must be set to an integer,"
+                    f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
+                    "Set it to a number lower than the number of neuron cores on your instance."
+                    " For inf2.xlarge and inf2.8xlarge, set it to `2`."
+                    " For inf2.24xlarge, set it <= `12`."
+                    " For inf2.48xlarge, set it <= `24`."
+                )
+            torch_dtype = lm_eval.models.utils.get_dtype(dtype)
+            if torch_dtype == torch.float16:
+                self.amp_dtype = "f16"
+            elif torch_dtype == torch.bfloat16:
@@ -262,28 +202,37 @@
-        elif torch_dtype == torch.float32:
-            self.amp_dtype = "f32"
-        else:
-            raise NotImplementedError("Only float16 and bfloat16 are implemented.")
-        compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
-        input_shapes = {
-            "batch_size": batch_size,
-            "sequence_length": self._DEFAULT_MAX_LENGTH,
-        }
-
-        print(
-            f"{'='*20} \n loading model to neuron with"
-            f" {compiler_args}, {input_shapes}..."
-        )
-        self.model = CustomNeuronModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            low_cpu_mem_usage=low_cpu_mem_usage,
-            export=True,
-            **compiler_args,
-            **input_shapes,
-        )
-        print(f"SUCCESS: neuron model compiled. \n {'='*20}")
+            elif torch_dtype == torch.float32:
+                self.amp_dtype = "f32"
+            else:
+                raise NotImplementedError(
+                    "Only float16/bfloat16/float32 are supported."
+                )
+            print(f"{'='*20} \n exporting model to neuron")
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+                export=True,
+                batch_size=batch_size,
+                num_cores=tp_degree,
+                auto_cast_type=self.amp_dtype,
+                sequence_length=max_length,
+            )
+            neuron_config = self.model.config.neuron
+            print(
+                f"SUCCESS: neuron model exported with config {neuron_config}. \n {'='*20}"
+            )
+        else:
+            print(
+                f"{'='*20} \n loading neuron model with config" f" {neuron_config}..."
+            )
+            self.model = CustomNeuronModelForCausalLM.from_pretrained(
+                pretrained,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                low_cpu_mem_usage=low_cpu_mem_usage,
+            )
+            print(f"SUCCESS: neuron model loaded. \n {'='*20}")

         self.truncation = truncation
@@ -291,8 +240,6 @@ class NEURON_HF(TemplateLM):
             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

         self.add_bos_token = add_bos_token
-
-        self._max_length = max_length

         self.batch_schedule = 1
         self.batch_sizes = {}
@@ -313,17 +260,7 @@ class NEURON_HF(TemplateLM):
     @property
     def max_length(self):
-        if self._max_length:  # if max length manually set, return it
-            return self._max_length
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
+        return self.model.max_length

     @property
     def max_gen_toks(self) -> int:
@@ -391,34 +328,6 @@ class NEURON_HF(TemplateLM):
     def tok_decode(self, tokens):
         return self.tokenizer.decode(tokens)

-    @wrap_constant_batch_size
-    def _model_call(self, input_ids: torch.Tensor):
-        """
-        get logits for the entire sequence
-
-        :param input_ids: torch.Tensor
-            A torch tensor of shape [batch, sequence_cont]
-            the size of sequence may vary from call to call
-
-        :return
-            A torch tensor of shape [batch, sequence, vocab] with the
-            logits returned from the model's decoder-lm head
-        """
-        _, sequence_length = input_ids.shape
-        with torch.inference_mode():
-            cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
-            input_ids_split = input_ids.split(1, dim=1)
-            return torch.concat(
-                [
-                    self.model.forward(
-                        input_ids=input_id, cache_ids=cache_id, return_dict=False
-                    )[0]
-                    for input_id, cache_id in zip(input_ids_split, cache_ids)
-                ],
-                dim=1,
-            )
-
     def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # we require users to pass do_sample=True explicitly
         # for non-greedy gen. This should be reevaluated when considering beam search.
@@ -580,15 +489,41 @@ class NEURON_HF(TemplateLM):
                 cont_toks_list.append(continuation_enc)
                 inplens.append(inplen)

-            # create encoder attn mask and batched conts, if seq2seq
-            call_kwargs = {}
+            # Add dummy inputs up to the model static batch size
+            if len(inps) < self.batch_size:
+                inps = inps + [
+                    torch.zeros_like(inps[0]),
+                ] * (self.batch_size - len(inps))
+            masks = [torch.ones_like(inp) for inp in inps]
             batched_inps = lm_eval.models.utils.pad_and_concat(
                 padding_len_inp, inps, padding_side="right"
             )  # [batch, padding_len_inp]
+            batched_masks = lm_eval.models.utils.pad_and_concat(
+                padding_len_inp, masks, padding_side="right"
+            )

-            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs), dim=-1
-            )  # [batch, padding_length (inp or cont), vocab]
+            if self.model.model.neuron_config.output_all_logits:
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps, batched_masks
+                )
+                multi_logits = F.log_softmax(
+                    self.model.forward(**inputs).logits, dim=-1
+                )  # [batch, padding_length (inp or cont), vocab]
+            else:
+                # The model will only return the logits for the last input token, so we need
+                # to iterate over inputs to accumulate logits.
+                # To speed things up we use the KV cache as we would do when generating.
+                inputs = self.model.prepare_inputs_for_prefill(
+                    batched_inps[:, :1], batched_masks[:, :1]
+                )
+                outputs = [self.model.forward(**inputs).logits]
+                for i in range(1, padding_len_inp):
+                    inputs = self.model.prepare_inputs_for_decode(
+                        batched_inps[:, : i + 1], batched_masks[:, : i + 1]
+                    )
+                    outputs.append(self.model.forward(**inputs).logits)
+                multi_logits = F.log_softmax(torch.concat(outputs, dim=1), dim=-1)

             for (cache_key, _, _), logits, inplen, cont_toks in zip(
                 chunk, multi_logits, inplens, cont_toks_list
......
-import pytest
-import torch
-
-from lm_eval.models.neuron_optimum import wrap_constant_batch_size
-
-
-def test_wrap_constant_batch_size():
-    class Tester:
-        def __init__(self, batch_size):
-            self.batch_size = batch_size
-
-        @wrap_constant_batch_size
-        def test_constant_batch_size(self, inputs):
-            assert len(inputs) == self.batch_size
-            return inputs
-
-    batch_size_test = 8
-    for i in range(1, batch_size_test + 1):
-        tensor = torch.ones([i, 2, 2])
-        out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
-        torch.testing.assert_allclose(out, tensor)
-
-    with pytest.raises(ValueError):
-        Tester(batch_size=batch_size_test).test_constant_batch_size(
-            torch.ones([batch_size_test + 1, 2, 2])
-        )