"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "0951d31788251d11c9f9e8352853edd071297cb4"
Unverified commit c2882403 authored by Arthur, committed by GitHub

[Whisper Docs] Nits (#24367)



* nits

* config doc did not match

* Apply suggestions from code review
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

---------
Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
parent 83dc5762
@@ -127,7 +127,7 @@ class WhisperConfig(PretrainedConfig):
             Padding token id.
         bos_token_id (`int`, *optional*, defaults to 50256):
             Begin of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 50257):
+        eos_token_id (`int`, *optional*, defaults to 50256):
             End of stream token id.
         suppress_tokens (`List[int]`, *optional*):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
@@ -216,7 +216,7 @@ class WhisperConfig(PretrainedConfig):
         max_source_positions=1500,
         max_target_positions=448,
         pad_token_id=50256,
-        bos_token_id=50257,
+        bos_token_id=50256,
         eos_token_id=50256,
         suppress_tokens=None,
         begin_suppress_tokens=[220, 50256],
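The two hunks above bring the documented defaults in line with the `__init__` signature: `bos_token_id` and the `eos_token_id` docstring now both read 50256. As a quick check, a minimal sketch (not part of this commit) instantiates the default config and prints the fields touched here; the exact ids depend on the installed transformers version, and decoding starts from `decoder_start_token_id`, not from `bos_token_id`.

```python
from transformers import WhisperConfig

# All-default Whisper configuration, matching the docstring above.
config = WhisperConfig()

# pad/bos/eos token ids are now documented (and set) as 50256.
print(config.pad_token_id, config.bos_token_id, config.eos_token_id)

# Generation does not begin with bos_token_id: the decoder is seeded with
# decoder_start_token_id, which corresponds to the <|startoftranscript|> token.
print(config.decoder_start_token_id)
```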
@@ -231,8 +231,9 @@ class WhisperTokenizer(PreTrainedTokenizer):
         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (`str`, *optional*, defaults to `"<|startoftranscript|>"`):
-            The beginning of sequence token.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
+            `"<|startoftranscript|>"` when generating.
         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The end of sequence token.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
@@ -261,7 +262,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
         normalizer_file=None,
         errors="replace",
         unk_token="<|endoftext|>",
-        bos_token="<|startoftranscript|>",
+        bos_token="<|endoftext|>",
         eos_token="<|endoftext|>",
         pad_token=None,
         add_prefix_space=False,
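The docstring change above is the key nit: `bos_token` now documents the actual default, `"<|endoftext|>"`, while the first generated token is still `"<|startoftranscript|>"` via `decoder_start_token_id`. A minimal sketch (not part of this commit) of what that looks like in practice, using `"openai/whisper-tiny"` purely as an example checkpoint:

```python
from transformers import WhisperTokenizer

# Example checkpoint only; any Whisper checkpoint behaves the same way.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")

# bos_token and eos_token are both <|endoftext|> for the released checkpoints.
print(tokenizer.bos_token, tokenizer.eos_token)

# Encoded sequences nevertheless begin with <|startoftranscript|>: the tokenizer
# prepends its prefix tokens, and generate() seeds the decoder with
# decoder_start_token_id rather than bos_token_id.
ids = tokenizer("hello world").input_ids
print(tokenizer.convert_ids_to_tokens(ids)[:3])
```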
@@ -105,8 +105,9 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
         unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (`str`, *optional*, defaults to `<|startoftranscript|>`):
-            The beginning of sequence token.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
+            `"<|startoftranscript|>"` when generating.
         eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
@@ -138,7 +139,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
         normalizer_file=None,
         tokenizer_file=None,
         unk_token="<|endoftext|>",
-        bos_token="<|startoftranscript|>",
+        bos_token="<|endoftext|>",
         eos_token="<|endoftext|>",
         add_prefix_space=False,
         language=None,
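The same fix is mirrored in `WhisperTokenizerFast`, so the slow and fast tokenizers now document the same `bos_token` default. A small sketch (again assuming the `"openai/whisper-tiny"` example checkpoint) to confirm the two stay in sync:

```python
from transformers import WhisperTokenizer, WhisperTokenizerFast

# Example checkpoint only.
slow = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
fast = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny")

# Both should report <|endoftext|> as bos_token ...
print(slow.bos_token, fast.bos_token)

# ... and both should produce the same <|startoftranscript|>-prefixed ids.
print(slow("hi").input_ids == fast("hi").input_ids)
```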