Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
74fb524e
Unverified
Commit
74fb524e
authored
Dec 05, 2022
by
Sanchit Gandhi
Committed by
GitHub
Dec 05, 2022
Browse files
[Whisper] Fix decoder ids methods (#20599)
* [Whisper] Fix decoder ids methods * enum property
parent
ef0f85cd
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
2 deletions
+22
-2
src/transformers/models/whisper/tokenization_whisper.py
src/transformers/models/whisper/tokenization_whisper.py
+3
-2
tests/models/whisper/test_processor_whisper.py
tests/models/whisper/test_processor_whisper.py
+19
-0
No files found.
src/transformers/models/whisper/tokenization_whisper.py
View file @
74fb524e
...
...
@@ -583,5 +583,6 @@ class WhisperTokenizer(PreTrainedTokenizer):
return
input_ids
def
get_decoder_prompt_ids
(
self
,
task
=
None
,
language
=
None
,
no_timestamps
=
True
):
self
.
set_prefix_tokens
(
task
=
task
,
language
=
language
,
predict_timestamps
=
no_timestamps
)
return
self
.
prefix_tokens
self
.
set_prefix_tokens
(
task
=
task
,
language
=
language
,
predict_timestamps
=
not
no_timestamps
)
forced_decoder_ids
=
[(
rank
+
1
,
token
)
for
rank
,
token
in
enumerate
(
self
.
prefix_tokens
)]
return
forced_decoder_ids
tests/models/whisper/test_processor_whisper.py
View file @
74fb524e
...
...
@@ -26,6 +26,11 @@ if is_speech_available():
from
transformers
import
WhisperFeatureExtractor
,
WhisperProcessor
START_OF_TRANSCRIPT
=
50257
TRANSCRIBE
=
50358
NOTIMESTAMPS
=
50362
@
require_torch
@
require_torchaudio
@
require_sentencepiece
...
...
@@ -128,3 +133,17 @@ class WhisperProcessorTest(unittest.TestCase):
feature_extractor
.
model_input_names
,
msg
=
"`processor` and `feature_extractor` model input names do not match"
,
)
def
test_get_decoder_prompt_ids
(
self
):
feature_extractor
=
self
.
get_feature_extractor
()
tokenizer
=
self
.
get_tokenizer
()
processor
=
WhisperProcessor
(
tokenizer
=
tokenizer
,
feature_extractor
=
feature_extractor
)
forced_decoder_ids
=
processor
.
get_decoder_prompt_ids
(
task
=
"transcribe"
,
no_timestamps
=
True
)
self
.
assertIsInstance
(
forced_decoder_ids
,
list
)
for
ids
in
forced_decoder_ids
:
self
.
assertIsInstance
(
ids
,
(
list
,
tuple
))
expected_ids
=
[
START_OF_TRANSCRIPT
,
TRANSCRIBE
,
NOTIMESTAMPS
]
self
.
assertListEqual
([
ids
[
-
1
]
for
ids
in
forced_decoder_ids
],
expected_ids
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment