inference.py
import argparse
import json
import os
from typing import Dict

import torch
from chatio import dummy_io, rich_io, simple_io
from coati.dataset.conversation import setup_conversation_template
from coati.models import generate_streaming
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel

from colossalai.logging import get_dist_logger

logger = get_dist_logger()


def get_gpu_memory(max_gpus=None):
    """
    Get the available memory for each GPU.

    Args:
        max_gpus (int, optional): The maximum number of GPUs to consider. Defaults to None.

    Returns:
        list: A list of available memory for each GPU.
    """
    gpu_memory = []
    num_gpus = torch.cuda.device_count() if max_gpus is None else min(max_gpus, torch.cuda.device_count())

    for gpu_id in range(num_gpus):
        # Free memory is estimated as total device capacity minus memory
        # currently allocated by this process (both converted to GiB below).
        with torch.cuda.device(gpu_id):
            device = torch.cuda.current_device()
            gpu_properties = torch.cuda.get_device_properties(device)
            total_memory = gpu_properties.total_memory / (1024**3)
            allocated_memory = torch.cuda.memory_allocated() / (1024**3)
            available_memory = total_memory - allocated_memory
            gpu_memory.append(available_memory)
    return gpu_memory
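
# Illustrative usage (values are hypothetical; requires at least one visible CUDA device):
#     free_gib_per_gpu = get_gpu_memory(max_gpus=2)  # e.g. [39.4, 39.4] on two 40 GB cards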


def load_model_and_tokenizer(model_path, tokenizer_path, device="cuda", **kwargs):
    """
    Load the model and tokenizer from the specified paths and move the model to the specified device.

    Args:
        model_path (str): The path to the pre-trained model.
        tokenizer_path (str): The path to the pre-trained tokenizer.
        device (str, optional): The device to move the model to. Defaults to "cuda".
        **kwargs: Additional keyword arguments to be passed to the `AutoModelForCausalLM.from_pretrained` function.

    Returns:
        tuple: A tuple containing the loaded model and tokenizer.
    """

    model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    try:
        # Some tokenizers (e.g., Qwen) do not allow setting pad_token manually.
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError as e:
        logger.warning(f"Unable to set pad token to eos token, {str(e)}")
    model.to(device)

    return model, tokenizer
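
# Illustrative usage (paths are placeholders; `torch_dtype` is a standard
# `from_pretrained` keyword forwarded through **kwargs):
#     model, tokenizer = load_model_and_tokenizer(
#         "path/to/model", "path/to/tokenizer", torch_dtype=torch.bfloat16
#     )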


def _set_default_generate_kwargs(model: PreTrainedModel) -> Dict:
    """
    Set default keyword arguments for generation based on the given model.

    Args:
        model (PreTrainedModel): The model used for generation.

    Returns:
        Dict: A dictionary containing the default keyword arguments for generation.
    """
    unwrapped_model = model
    new_kwargs = {}
    # Use huggingface models method directly
    if hasattr(unwrapped_model, "prepare_inputs_for_generation"):
        new_kwargs["prepare_inputs_fn"] = unwrapped_model.prepare_inputs_for_generation

    if hasattr(unwrapped_model, "_update_model_kwargs_for_generation"):
        new_kwargs["update_model_kwargs_fn"] = unwrapped_model._update_model_kwargs_for_generation
    return new_kwargs
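
# Illustrative result: for a standard HuggingFace causal LM this returns
#     {"prepare_inputs_fn": model.prepare_inputs_for_generation,
#      "update_model_kwargs_fn": model._update_model_kwargs_for_generation}
# and `main` forwards these entries to `generate_streaming` via `model_kwargs`.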


def generation_wrapper(*args, **kwargs):
    """
    Wrap `generate_streaming` so that it yields decoded text instead of token ids.

    The positional arguments are expected to start with (model, input_ids, tokenizer);
    prompt tokens are stripped so only newly generated text is yielded.
    """
    input_ids = args[1]
    tokenizer = args[2]
    for output in generate_streaming(*args, **kwargs):
        yield tokenizer.batch_decode(output[:, input_ids.size(1) :], skip_special_tokens=True)[0]

def main(args):
    with open(args.conversation_template_config, "r", encoding="utf8") as f:
        conversation_template_config = json.load(f)

    max_new_tokens = args.max_new_tokens
    model_max_length = args.model_max_length
    model, tokenizer = load_model_and_tokenizer(
        args.model_path, args.tokenizer_path or args.model_path, local_files_only=True
    )

    assert max_new_tokens <= model_max_length, "max_new_tokens must not exceed model_max_length"
    if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None:
        try:
            # Some tokenizers (e.g., Qwen) do not allow setting pad_token manually.
            tokenizer.pad_token = tokenizer.eos_token
        except AttributeError as e:
            logger.warning(f"Unable to set pad token to eos token, {str(e)}")
    tokenizer.padding_side = "left"

    model_kwargs = {
        "max_new_tokens": max_new_tokens,
        # Optional sampling controls, e.g.:
        # "early_stopping": True,
        # "top_k": -1,
        # "top_p": 1.0,
        # "temperature": 1.0,
    }
    round_idx = 1  # 1-based counter of chat rounds, used for the transcript file

    conv = setup_conversation_template(tokenizer, conversation_template_config, args.conversation_template_config)

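    # Interactive chat loop: read one user message per turn ("clear" resets the
    # conversation and the terminal, "exit" quits), then stream the assistant's reply.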
    while True:
        if args.io == "simple":
            chat_io = simple_io
        elif args.io == "rich":
            chat_io = rich_io
        elif args.io == "dummy":
            chat_io = dummy_io
        else:
            raise ValueError(f"Unknown io type: {args.io}")
        # raw_text = print(">>> Human:", end=" ")
        inp = chat_io.prompt_for_input("user")

        if not inp:
            print("prompt should not be empty!")
            continue

        if inp.strip() == "clear":
            conv.clear()
            os.system("clear")
            continue

        if inp.strip() == "exit":
            print("End of chat.")
            break

        query_text = inp.strip()

        conv.append_message("user", query_text)

        chat_io.prompt_for_output("assistant")

        prompt = conv.get_prompt(add_generation_prompt=True)
        print(prompt + "<end_of_prompt>")
        input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(
            torch.cuda.current_device()
        )
        default_generate_kwargs = _set_default_generate_kwargs(model)
        model_kwargs.update(default_generate_kwargs)
        output_stream = generation_wrapper(
            model,
            input_ids,
            tokenizer,
            max_length=model_max_length,
            temperature=0.7,
            early_stopping=True,
            stop_token_ids=conversation_template_config["stop_ids"],
            **model_kwargs,
        )

        # print(f">>> Assistant:", end=" ")
        outputs = chat_io.stream_output(output_stream)

        conv.append_message("assistant", outputs.strip())

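        # Append this round's accumulated prompt to a local transcript file for inspection.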
        with open("round.txt", mode="a", encoding="utf-8") as f:
            f.write("\n\n" + "=" * 10 + "\n")
            f.write(f"round {round}:\n{conv.save_prompt()}\n\n")
            f.write("=" * 10 + "\n")

        # print(f">>> Assistant:", end=" ")

        round_idx += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default=None)
    parser.add_argument("--tokenizer_path", type=str, default=None)
    parser.add_argument("--conversation_template_config", type=str, default=None)
    parser.add_argument("--model_max_length", type=int, default=2048)
    parser.add_argument("--max_new_tokens", type=int, default=512)
    parser.add_argument("--io", type=str, default="rich", choices=["simple", "rich", "dummy"])
    args = parser.parse_args()
    main(args)
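
# Example invocation (illustrative; paths and file names are placeholders):
#     python inference.py \
#         --model_path ./models/my-sft-model \
#         --tokenizer_path ./models/my-sft-model \
#         --conversation_template_config ./config/conversation_template.json \
#         --io simple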