# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import os
from unittest import TestCase

from llama.tokenizer import ChatFormat, Tokenizer


# TOKENIZER_PATH=<path> python -m unittest llama/test_tokenizer.py
class TokenizerTests(TestCase):
    def setUp(self):
        self.tokenizer = Tokenizer(os.environ["TOKENIZER_PATH"])
        self.format = ChatFormat(self.tokenizer)

    def test_special_tokens(self):
        self.assertEqual(
            self.tokenizer.special_tokens["<|begin_of_text|>"],
            128000,
        )

    def test_encode(self):
        self.assertEqual(
            self.tokenizer.encode(
                "This is a test sentence.",
                bos=True,
                eos=True,
            ),
            [128000, 2028, 374, 264, 1296, 11914, 13, 128001],
        )

    def test_decode(self):
        self.assertEqual(
            self.tokenizer.decode(
                [128000, 2028, 374, 264, 1296, 11914, 13, 128001],
            ),
            "<|begin_of_text|>This is a test sentence.<|end_of_text|>",
        )

    def test_encode_message(self):
        message = {
            "role": "user",
            "content": "This is a test sentence.",
        }
        self.assertEqual(
            self.format.encode_message(message),
            [
                128006,  # <|start_header_id|>
                882,  # "user"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
                2028, 374, 264, 1296, 11914, 13,  # "This is a test sentence."
                128009,  # <|eot_id|>
            ],
        )

    def test_encode_dialog(self):
        dialog = [
            {
                "role": "system",
                "content": "This is a test sentence.",
            },
            {
                "role": "user",
                "content": "This is a response.",
            },
        ]
        self.assertEqual(
            self.format.encode_dialog_prompt(dialog),
            [
                128000,  # <|begin_of_text|>
                128006,  # <|start_header_id|>
                9125,  # "system"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
                2028, 374, 264, 1296, 11914, 13,  # "This is a test sentence."
                128009,  # <|eot_id|>
                128006,  # <|start_header_id|>
                882,  # "user"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
                2028, 374, 264, 2077, 13,  # "This is a response."
                128009,  # <|eot_id|>
                128006,  # <|start_header_id|>
                78191,  # "assistant"
                128007,  # <|end_header_id|>
                271,  # "\n\n"
            ],
        )

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import os
from logging import getLogger
from pathlib import Path
from typing import (
    AbstractSet,
    cast,
    Collection,
    Dict,
    Iterator,
    List,
    Literal,
    Sequence,
    TypedDict,
    Union,
)

import tiktoken
from tiktoken.load import load_tiktoken_bpe

logger = getLogger(__name__)


Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    role: Role
    content: str


Dialog = Sequence[Message]


class Tokenizer:
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
    """

    special_tokens: Dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501

    def __init__(self, model_path: str):
        """
        Initializes the Tokenizer with a Tiktoken model.

        Args:
            model_path (str): The path to the Tiktoken model file.
        """
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {model_path}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
        self.pad_id: int = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
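        # Note (added for clarity; consistent with the unit tests above): for
        # the released Llama 3 tokenizer, num_base_tokens is 128000, so the 256
        # special tokens occupy IDs 128000-128255, e.g.
        # <|begin_of_text|> -> 128000 and <|eot_id|> -> 128009.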

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = (),
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The input string to be encoded.
            bos (bool): Whether to prepend the beginning-of-sequence token.
            eos (bool): Whether to append the end-of-sequence token.
            allowed_special ("all"|set[str]): special tokens allowed to appear in the string
            disallowed_special ("all"|set[str]): special tokens that raise an error when found in the string

        Returns:
            list[int]: A list of token IDs.

        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will cause all text corresponding
          to special tokens to be encoded as special tokens.
        """
        assert type(s) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: List[int] = []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t
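
    # A usage sketch added for illustration (the model path is hypothetical):
    # with the defaults, special-token text in the input is encoded as plain
    # text; passing allowed_special="all" maps it to special-token IDs instead.
    #
    #   tok = Tokenizer("tokenizer.model")
    #   tok.encode("<|eot_id|>", bos=False, eos=False)
    #   # -> IDs of the literal characters "<", "|", ... (plain text)
    #   tok.encode("<|eot_id|>", bos=False, eos=False, allowed_special="all")
    #   # -> [tok.special_tokens["<|eot_id|>"]], i.e. the single ID 128009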

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            t (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(List[int], t))
    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than
        `max_consecutive_slice_len` consecutive whitespaces or consecutive
        non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]
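
    # A worked example added for illustration: with max_consecutive_slice_len=3,
    # splitting "aaaa  bb" yields ["aaa", "a  bb"]. The first slice is emitted
    # once a fourth consecutive non-whitespace character is seen; every run in
    # the remainder stays at or below the limit, so it is yielded whole.
    #
    #   assert list(
    #       Tokenizer._split_whitespaces_or_nonwhitespaces("aaaa  bb", 3)
    #   ) == ["aaa", "a  bb"]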


class ChatFormat:
    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

    def encode_header(self, message: Message) -> List[int]:
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
        tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
        return tokens

    def encode_message(self, message: Message) -> List[int]:
        tokens = self.encode_header(message)
        tokens.extend(
            self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
        )
        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
        return tokens

    def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
        for message in dialog:
            tokens.extend(self.encode_message(message))
        # Add the start of an assistant message for the model to complete.
        tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
        return tokens
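
# A minimal end-to-end sketch (added for illustration; the tokenizer path is
# an assumption, any local Llama 3 tokenizer.model file works):
#
#   tokenizer = Tokenizer("Meta-Llama-3-8B-Instruct/original/tokenizer.model")
#   chat_format = ChatFormat(tokenizer)
#   prompt_tokens = chat_format.encode_dialog_prompt(
#       [{"role": "user", "content": "Hello!"}]
#   )
#   # prompt_tokens begins with <|begin_of_text|> (128000) and ends with the
#   # empty assistant header, leaving the model to generate the reply.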

# Unique model identifier
modelCode=596
# Model name
modelName=llama3_pytorch
# Model description
modelDescription=Meta's latest open-source model, Llama 3
# Application scenarios
appScenario=inference,dialogue Q&A,manufacturing,broadcast media,home,education
# Framework type
frameType=pytorch

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

from setuptools import find_packages, setup


def get_requirements(path: str):
    # One requirement per line; strip surrounding whitespace.
    with open(path) as f:
        return [line.strip() for line in f]


setup(
    name="llama3",
    version="0.0.1",
    packages=find_packages(),
    install_requires=get_requirements("requirements.txt"),
)
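
# Usage sketch (assumption: run from the repository root, where
# requirements.txt lives):
#   pip install -e .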

#!/bin/bash
echo "Export params ..."
export HIP_VISIBLE_DEVICES=0,1,2,3  # adjust to the GPU IDs and count you want to use
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1

echo "Start ..."
# Meta-Llama-3-8B-Instruct model
torchrun --nproc_per_node 1 example_chat_completion.py \
    --ckpt_dir ./Meta-Llama-3-8B-Instruct/original/ \
    --tokenizer_path ./Meta-Llama-3-8B-Instruct/original/tokenizer.model \
    --max_seq_len 512 --max_batch_size 6

# Meta-Llama-3-8B model
torchrun --nproc_per_node 1 example_text_completion.py \
    --ckpt_dir Meta-Llama-3-8B/original/ \
    --tokenizer_path Meta-Llama-3-8B/original/tokenizer.model \
    --max_seq_len 128 --max_batch_size 4
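
# A hedged variant for larger checkpoints (assumption: same repository
# layout); --nproc_per_node must match the checkpoint's model-parallel size,
# e.g. 8 for Meta-Llama-3-70B-Instruct:
#
# torchrun --nproc_per_node 8 example_chat_completion.py \
#     --ckpt_dir Meta-Llama-3-70B-Instruct/original/ \
#     --tokenizer_path Meta-Llama-3-70B-Instruct/original/tokenizer.model \
#     --max_seq_len 512 --max_batch_size 6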