# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils for tokenization."""

from typing import Optional

from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, ProcessorMixin


def get_tokenizer(model_path: str, override_chat_template: Optional[str] = None, **kwargs) -> PreTrainedTokenizer:
    """Create a huggingface pretrained tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(model_path, **kwargs)
    if override_chat_template is not None:
        tokenizer.chat_template = override_chat_template

    if tokenizer.bos_token == "<bos>" and tokenizer.eos_token == "<eos>":
        # The EOS token in gemma2 & gemma3 is ambiguous, which may worsen RL performance.
        # https://huggingface.co/google/gemma-2-2b-it/commit/17a01657f5c87135bcdd0ec7abb4b2dece04408a
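        # Note: "<end_of_turn>" resolves to token id 107 in the gemma vocabulary, so assigning
        # eos_token below also updates eos_token_id accordingly.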
        print("Found gemma model. Set eos_token and eos_token_id to <end_of_turn> and 107.")
        tokenizer.eos_token = "<end_of_turn>"

    if tokenizer.pad_token_id is None:
        print("Pad token is None. Set it to eos_token.")
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer


def get_processor(model_path: str, override_chat_template: Optional[str] = None, **kwargs) -> Optional[ProcessorMixin]:
    """Create a huggingface pretrained processor."""
    processor = AutoProcessor.from_pretrained(model_path, **kwargs)
    if override_chat_template is not None:
        processor.chat_template = override_chat_template

    # AutoProcessor may fall back to a plain tokenizer for text-only models; avoid returning it here. See:
    # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/auto/processing_auto.py#L344
    if processor is not None and "Processor" not in processor.__class__.__name__:
        processor = None

    return processor
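

# Example usage (illustrative sketch; the model names below are placeholders, not part of this module):
#
#     tokenizer = get_tokenizer("Qwen/Qwen2.5-7B-Instruct")
#     processor = get_processor("Qwen/Qwen2.5-VL-7B-Instruct")  # returns None for text-only models
#     prompt_ids = tokenizer.apply_chat_template(
#         [{"role": "user", "content": "Hello!"}], add_generation_prompt=True, tokenize=True
#     )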