"vscode:/vscode.git/clone" did not exist on "a362340b33258eae0f48504be09659e2e9dcd035"
chat.py 6.8 KB
Newer Older
WRH's avatar
WRH committed
1
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.

This submodule allows the user to chat with a language model through the
command line, and optionally to accelerate the model using backends such as
DeepSpeed.

Example 1: Chat with the default settings

```shell
python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
```

Example 2: Disable sampling

```shell
python -m lmdeploy.pytorch.chat \
    $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
    --temperature 0
```

Example 3: Accelerate with DeepSpeed inference

```shell
python -m lmdeploy.pytorch.chat \
    $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
    --accel deepspeed
```

Note: to use DeepSpeed, you need to install it first. To accelerate InternLM,
    you need a customized version:
    https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0

Example 4: Run the model with tensor parallelism on 2 GPUs

```shell
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
    $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
    --accel deepspeed
```

This module also allows the following control commands to change
generation behavior during chat.

- `exit`: terminate and exit chat
- `config set key=value`: change generation config `key` to `value`,
    e.g. `config set temperature=0` disables sampling for the following chats
- `clear`: clear chat history
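
For instance, the following commands typed during a chat disable sampling
for subsequent replies and then clear the accumulated history:

```text
config set temperature=0
clear
```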
"""

import itertools
import logging
from typing import Optional

import fire
import torch
from transformers import GenerationConfig, PreTrainedModel

from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control

logger = logging.getLogger(__name__)


def set_logging(log_file: Optional[str], debug: bool):
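    """Configure file logging; non-zero ranks write to a rank-suffixed file."""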
    torch.set_printoptions(linewidth=120)
    level = logging.DEBUG if debug else logging.INFO
    log_file = log_file or 'chat.log'
    if (r := get_rank()) != 0:
        log_file = log_file + f'.{r}'
    logging.basicConfig(level=level,
                        format=('%(filename)s: '
                                '%(levelname)s: '
                                '%(funcName)s(): '
                                '%(lineno)d:\t'
                                '%(message)s'),
                        filename=log_file,
                        filemode='w')
    print(f'Worker {get_rank()} logging to {log_file}')


def main(
    model_path: str,
    tokenizer_path: Optional[str] = None,
    accel: Optional[str] = None,
    max_new_tokens: int = 128,
    temperature: float = 0.8,
    top_p: float = 0.95,
    seed: int = 0,
    use_fast_tokenizer: bool = True,
    max_alloc: int = 2048,
    max_session_len: Optional[int] = None,
    log_file: Optional[str] = None,
    debug: bool = False,
    adapter: Optional[str] = None,
):
    """Chat with a model through the terminal.

    Args:
        model_path (str): Path to model.
        tokenizer_path (str): Path to tokenizer.
        accel (str): Model accelerator.
        max_new_tokens (int): Maximum number of tokens to generate.
        temperature (float): Temperature for sampling.
        top_p (float): Top p for sampling.
        seed (int): Random seed.
        use_fast_tokenizer (bool): Whether to use fast tokenizer.
            This argument is passed directly to transformers' ``AutoTokenizer.from_pretrained``.
            Generally, users should use the fast tokenizer.
            If the fast tokenizer raises an error, try forcing a slow one.
        max_alloc (int): Maximum memory to allocate (for deepspeed).
        max_session_len (int): Maximum number of tokens allowed for all chat sessions.
            This includes both the history and the current session.
        log_file (str): Path to log file.
        debug (bool): Whether to enable debug mode.
        adapter (str): Force a specific adapter.
            Generally, users should not need this argument because the adapter is selected
            based on the model type. It is only required when that is impossible, e.g. LLaMA 1
            and LLaMA 2 cannot be distinguished by the `LlamaForCausalLM` class alone.
            Currently, only "llama1" is accepted, for LLaMA 1 models.
    """  # noqa: E501
    set_logging(log_file, debug)

    # workers should sync in sampling
    torch.manual_seed(seed)

    local_rank = get_local_rank()
    world_size = get_world_size()
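    # local_rank selects the GPU used by this worker; world_size is the
    # tensor-parallel degree handed to accel_model below.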

    # Init model and tokenizer
    if not tokenizer_path:
        tokenizer_path = model_path

    model, tokenizer = init_model(
        model_path,
        tokenizer_path,
        use_fast_tokenizer=use_fast_tokenizer,
    )

    # Init adapter based on model and tokenizer
    adapter = init_adapter(model, tokenizer, adapter)

    # Accelerate model
    model: PreTrainedModel = accel_model(model,
                                         accel,
                                         max_alloc=max_alloc,
                                         tp_size=world_size)

    # warmup
    warmup_config = GenerationConfig(
        max_new_tokens=1,
        do_sample=temperature > 0,
        temperature=temperature,
        top_p=top_p,
    )
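    # A single dummy-token generation warms the model up (e.g. it triggers CUDA
    # kernel compilation and memory allocation) before the first real request.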
    model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config)

    gen_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,
        temperature=temperature,
        top_p=top_p,
    )

    # Session manager handling history
    max_session_len = max_alloc if max_session_len is None else max_session_len
    sm = BasicSessionManagerWithHistory(max_session_len=max_session_len,
                                        start_ids=adapter.start_ids,
                                        sep_ids=adapter.sep_ids)
    io = TerminalIO()
    streamer = BasicStreamer(adapter.decode, io.output)

    for r in itertools.count(1):
        # User input from IO
        logger.info(f'Round {r}')

        prompt: str = io.input()
        logger.info(f'User input: {prompt}')

        # Allow user to change config during runtime or exit
        if control(prompt, gen_config, sm):
            continue

        # Tokenize and apply model specific templates
        input_ids = adapter.encode_and_decorate(prompt)
        logger.info(f'Input ids:\n{input_ids}')

        # Prepend chat history (tensor concatenation)
        input_ids = sm.prepend_history(input_ids)
        logger.info(f'Input ids with history:\n{input_ids}')

        # Generate
        input_ids = input_ids.cuda(local_rank)
        # the returned tensor contains both the input and the generated tokens
        output = model.generate(input_ids,
                                gen_config,
                                streamer=streamer,
                                stopping_criteria=adapter.stopping_criteria)
        logger.info(f'Output:\n{output}')

        # Save output into session manager and maybe trim some history
        sm.add_to_history(output)


def cli():
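    """Entry point for ``python -m lmdeploy.pytorch.chat``."""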
    fire.Fire(main)


if __name__ == '__main__':
    cli()