"tests/optimization/test_optimization.py" did not exist on "e4f9dca018928db0911cfc949306c2db6326a3f0"
Unverified commit 690fe73f, authored by Patrick von Platen and committed by GitHub
Browse files

[Whisper] Finalize batched SOTA long-form generation (#27658)



* finalize

* make fix copies whisper

* [Tests] Make sure that we don't run tests multiple times

* Update src/transformers/models/whisper/modeling_whisper.py

* [Tests] Make sure that we don't run tests multiple times

* fix more

* improve

* improve

* improve further

* improve more

* improve

* fix more

* git commit and git push

* fix more

* fix more

* fix more

* New try

* Fix more whisper stuff

* Improve

* correct more

* correct more

* correct more

* Fix some tests

* Add more tests

* correct more

* correct more

* correct more

* push

* correct more

* Fix more

* Better

* without dec mask

* correct more

* clean

* save intermediate

* Fix more

* Fix VAD for large-v2

* Save new

* Correct more

* make cleaner

* correct tests

* correct src

* Finish

* Fix more

* Fix more

* finish

* Fix edge cases

* fix return_dict_in_generate

* fix all tests

* make style

* add docstrings

* add docstrings

* Fix logit processor

* make style

* fix pipeline test

* fix more style

* Apply suggestions from code review

* apply feedback Sanchit

* correct more

* Apply suggestions from code review
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* Apply suggestions from code review
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* correct more

* correct more

* correct more

* Fix staticmethod

* correct more

* fix

* fix slow tests

* make style

* fix tokenizer test

* fix tokenizer test

* Apply suggestions from code review
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* finish

* finish

* revert kwargs change

---------
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent d4fc1eb4
......@@ -95,6 +95,7 @@ class LogitsProcessorList(list):
scores = processor(input_ids, scores, **kwargs)
else:
scores = processor(input_ids, scores)
return scores
......@@ -1657,6 +1658,9 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
self.begin_suppress_tokens = list(begin_suppress_tokens)
self.begin_index = begin_index
def set_begin_index(self, begin_index):
self.begin_index = begin_index
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
if input_ids.shape[1] == self.begin_index:
......@@ -1778,6 +1782,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
max_initial_timestamp_index (`int`, *optional*, defaults to 1):
Used to set the maximum value of the initial timestamp. This is used to prevent the model from
predicting timestamps that are too far in the future.
begin_index (`int`, *optional*): Token index of the first token that is generated by the model.
_detect_timestamp_from_logprob (`bool`, *optional*): Whether timestamps can be predicted from logprobs over all timestamps.
Examples:
......@@ -1810,11 +1815,11 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
"""
def __init__(
self, generate_config, _detect_timestamp_from_logprob: Optional[bool] = None
self, generate_config, begin_index: Optional[int] = None, _detect_timestamp_from_logprob: Optional[bool] = None
): # support for the kwargs
self.eos_token_id = generate_config.eos_token_id
self.no_timestamps_token_id = generate_config.no_timestamps_token_id
self.timestamp_begin = generate_config.no_timestamps_token_id + 1
self.eos_token_id = generate_config.eos_token_id or generate_config.bos_token_id
# this variable is mostly just used for testing
self._detect_timestamp_from_logprob = (
......@@ -1823,10 +1828,17 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
else getattr(generate_config, "_detect_timestamp_from_logprob", True)
)
self.begin_index = (
len(generate_config.forced_decoder_ids) + 1 if generate_config.forced_decoder_ids is not None else 1
num_forced_ids = (
len(generate_config.forced_decoder_ids) if generate_config.forced_decoder_ids is not None else 0
)
self.begin_index = begin_index or (num_forced_ids + 1)
self.max_initial_timestamp_index = getattr(generate_config, "max_initial_timestamp_index", None)
# TODO(Patrick): Make sure that official models have max_initial_timestamp_index set to 50
# self.max_initial_timestamp_index = 50
def set_begin_index(self, begin_index):
self.begin_index = begin_index
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
......@@ -1878,6 +1890,60 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
return scores
class WhisperNoSpeechDetection(LogitsProcessor):
    r"""
    This processor can be used to detect silence when using Whisper. It should take as input unprocessed logits to
    follow the original implementation.

    The processor never modifies `scores`. At the generation step where `input_ids.shape[1] == begin_index` it
    records, for every row of the batch, the probability assigned to `no_speech_token`; the recorded value is
    available through the `no_speech_prob` property.

    Args:
        no_speech_token (`int`):
            Index of the no-speech token along the last dimension of the scores.
        begin_index (`int`):
            Length of `input_ids` at which the no-speech probability is computed. The initial value is also kept
            as the offset between the start-of-transcription token and the first generated token.
        scores_is_logprobs (`bool`, *optional*, defaults to `False`):
            Whether the `scores` passed to `__call__` are log-probabilities (converted with `exp`) rather than
            logits (converted with `softmax`).
    """

    def __init__(self, no_speech_token: int, begin_index: int, scores_is_logprobs: bool = False):
        self.no_speech_token = no_speech_token
        # offset between <start-of-transcription> token, <SOT>, in paper and first generated token
        # is equal to the position of the first generated token index
        self.start_of_trans_offset = begin_index

        # `self.begin_index` is a running value that is changed on the fly
        self.begin_index = begin_index
        self._no_speech_prob = [0.0]
        self.is_scores_logprobs = scores_is_logprobs

        # overwritten dynamically
        self.model = None
        self.inputs = None

    def set_model(self, model):
        # The model is only used by `set_inputs` and by `__call__` when logits have to be recomputed.
        self.model = model

    def set_inputs(self, inputs):
        # Requires `set_model` to have been called first. Explicitly passed `inputs` take precedence over the
        # values returned by `prepare_inputs_for_generation`.
        self.inputs = {**self.model.prepare_inputs_for_generation(**inputs), **inputs}
        # The model forward pass is called with `input_features`, so the generic `inputs` key is renamed.
        self.inputs["input_features"] = self.inputs.pop("inputs")

    @property
    def no_speech_prob(self):
        return self._no_speech_prob

    def set_begin_index(self, begin_index):
        self.begin_index = begin_index

    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Local copy: the recompute branch below must not permanently change the configured flag.
        is_scores_logprobs = self.is_scores_logprobs

        if input_ids.shape[1] == self.begin_index:
            if self.start_of_trans_offset > 1:
                # The incoming `scores` do not belong to the <SOT> position, so run the model again on the
                # stored inputs and read the logits at the matching position.
                with torch.no_grad():
                    logits = self.model(**self.inputs).logits

                no_speech_index = self.begin_index - self.start_of_trans_offset
                no_speech_scores = logits[:, no_speech_index]
                # Freshly computed model outputs are raw logits, never log-probabilities, whatever
                # `scores_is_logprobs` says about the incoming `scores`; they must go through softmax.
                is_scores_logprobs = False
            else:
                no_speech_scores = scores

            if is_scores_logprobs:
                probs = no_speech_scores.exp()
            else:
                probs = no_speech_scores.float().softmax(dim=-1)

            self._no_speech_prob = probs[:, self.no_speech_token]

        # Scores are returned untouched; this processor only observes them.
        return scores
class ClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] for classifier free guidance (CFG). The scores are split over the batch dimension,
......
......@@ -518,6 +518,8 @@ class GenerationMixin:
# exception: Donut checkpoints have task-specific decoder starts and don't expect a BOS token
elif self.config.model_type == "vision-encoder-decoder" and "donut" in self.name_or_path.lower():
pass
elif self.config.model_type in ["whisper"]:
pass
# user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust
# decoder_attention_mask if provided)
elif (decoder_input_ids[:, 0] != decoder_start_token_id).all().item():
......
This diff is collapsed.
......@@ -530,10 +530,21 @@ class WhisperTokenizer(PreTrainedTokenizer):
"""
timestamp_begin = self.all_special_ids[-1] + 1
outputs = [[]]
cur_max_timestamp = 0.0
prev_segments_len = 0.0
for token in token_ids:
if token >= timestamp_begin:
timestamp = f"<|{(token - timestamp_begin) * time_precision:.2f}|>"
outputs.append(timestamp)
timestamp = float((token - timestamp_begin) * time_precision)
if timestamp < cur_max_timestamp:
# next segment has started
prev_segments_len += cur_max_timestamp
cur_max_timestamp = timestamp
outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
outputs.append([])
else:
outputs[-1].append(token)
......@@ -631,7 +642,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
output_offsets: bool = False,
time_precision=0.02,
time_precision: float = 0.02,
decode_with_timestamps: bool = False,
normalize: bool = False,
basic_normalize: bool = False,
......
......@@ -224,10 +224,21 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
"""
timestamp_begin = self.all_special_ids[-1] + 1
outputs = [[]]
cur_max_timestamp = 0.0
prev_segments_len = 0.0
for token in token_ids:
if token >= timestamp_begin:
timestamp = f"<|{(token - timestamp_begin) * time_precision:.2f}|>"
outputs.append(timestamp)
timestamp = float((token - timestamp_begin) * time_precision)
if timestamp < cur_max_timestamp:
# next segment has started
prev_segments_len += cur_max_timestamp
cur_max_timestamp = timestamp
outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
outputs.append([])
else:
outputs[-1].append(token)
......@@ -330,7 +341,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = None,
output_offsets: bool = False,
time_precision=0.02,
time_precision: float = 0.02,
decode_with_timestamps: bool = False,
normalize: bool = False,
basic_normalize: bool = False,
......
......@@ -1152,7 +1152,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
@slow
def test_whisper_longform(self):
# fmt: off
EXPECTED_RESULT = """ Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out of fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct denny's, set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile!"""
EXPECTED_RESULT = """ Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile."""
# fmt: on
processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment