# test_infer_engine.py

from itertools import accumulate

import pytest
import torch
import torch.nn as nn
from packaging import version
from transformers import BloomConfig, BloomForCausalLM, LlamaConfig, LlamaForCausalLM
from transformers.tokenization_utils_base import BatchEncoding

import colossalai
from colossalai.inference.tensor_parallel import TPInferEngine
from colossalai.inference.tensor_parallel.batch_infer_state import BatchInferState
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

# Sizes shared by the test below.
# TP_SIZE is handed to spawn() when launching the workers and is the tensor-parallel size
# the engine is asserted to report.
TP_SIZE = 2
# The next three values are passed positionally to TPInferEngine; MAX_BATCH_SIZE and
# MAX_INPUT_LEN also give the shape of the random input_ids used in the generate check.
MAX_BATCH_SIZE = 4
MAX_INPUT_LEN = 16
MAX_OUTPUT_LEN = 8

# True only when torch was built against a CUDA version newer than 11.5.
# `torch.version.cuda` is None on builds without CUDA; parsing None would raise at import time
# and break test collection, so that case is treated as "not supported" instead.
CUDA_SUPPORT = torch.version.cuda is not None and version.parse(torch.version.cuda) > version.parse('11.5')


@parameterize('test_config', [{
    'tp_size': TP_SIZE,
}])
def run(test_config):
    """Exercise TPInferEngine on a 4-layer Bloom model: init, batch-state preparation and generate.

    Args:
        test_config: dict supplied by ``parameterize``; only ``test_config['tp_size']`` is read,
            to decide whether tensor parallelism is enabled in the ShardConfig.
    """
    model_config = BloomConfig(num_hidden_layers=4, hidden_size=128, intermediate_size=256, num_attention_heads=4)
    model = BloomForCausalLM(model_config)
    model = model.half()
    model.to(torch.cuda.current_device())

    # 1. check TPInferEngine init and model optimization
    shard_config = ShardConfig(enable_tensor_parallelism=test_config['tp_size'] > 1, inference_only=True)
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    assert infer_engine.cache_manager is not None
    assert infer_engine.tp_size == TP_SIZE
    assert infer_engine.head_num == model_config.num_attention_heads // TP_SIZE

    # 2. check data preparation
    input_ids_list = [[80540, 15473, 3331, 11970, 90472, 361, 61335], [80540, 15473, 3331, 11970],
                      [80540, 15473, 3331, 11970], [80540, 15473]]
    batch_size = len(input_ids_list)
    max_seq_len = max(len(li) for li in input_ids_list)
    # One mask row per sequence: zeros on the left, then one 1 per token, each row max_seq_len long.
    attention_mask = [[0] * (max_seq_len - len(li)) + [1] * len(li) for li in input_ids_list]
    data = dict(input_ids=input_ids_list, attention_mask=attention_mask)
    inputs_batch_encoding = BatchEncoding(data=data)
    # Reference values; currently only the disabled assertions further down refer to them.
    seq_lengths = [len(li) for li in input_ids_list]
    start_loc = list(accumulate([0] + seq_lengths[:-1]))
    seq_lengths = torch.tensor(seq_lengths, dtype=torch.int32)
    start_loc = torch.tensor(start_loc, dtype=torch.int32)
    # BatchEncoding as inputs
    batch_state_out1 = infer_engine.prepare_batch_state(inputs_batch_encoding)
    # input token id list as inputs
    batch_state_out2 = infer_engine.prepare_batch_state(input_ids_list)

    # Both input forms must yield the same batch size and the same seq_len tensor.
    assert batch_state_out1.batch_size == batch_state_out2.batch_size == batch_size
    assert torch.equal(batch_state_out1.seq_len, batch_state_out2.seq_len)

    # The following tests are discarded for now, and will be reused after all features are added
    # assert torch.equal(batch_state_out1.seq_len.to(seq_lengths.device), seq_lengths)
    # assert torch.equal(batch_state_out2.seq_len.to(seq_lengths.device), seq_lengths)
    # assert torch.equal(batch_state_out1.start_loc.to(start_loc.device), start_loc)
    # assert torch.equal(batch_state_out2.start_loc.to(start_loc.device), start_loc)

    # 3. check optimized model generate
    # Only checks that generate() runs on a full-size batch; its return value is not inspected.
    input_ids = torch.randint(low=10, high=1000, size=(MAX_BATCH_SIZE, MAX_INPUT_LEN))
    generate_kwargs = dict(do_sample=False)
    infer_engine.generate(input_ids, **generate_kwargs)

    torch.cuda.empty_cache()


def check_engine(rank, world_size, port):
    """Worker entry point: disable existing loggers, launch colossalai, then call ``run``.

    Args:
        rank: forwarded to ``colossalai.launch`` as ``rank``.
        world_size: forwarded to ``colossalai.launch`` as ``world_size``.
        port: forwarded to ``colossalai.launch`` as ``port``; the host is fixed to 'localhost'.
    """
    disable_existing_loggers()
    launch_options = dict(
        config={},
        rank=rank,
        world_size=world_size,
        host='localhost',
        port=port,
        backend='nccl',
    )
    colossalai.launch(**launch_options)
    run()


@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_engine():
    """Pytest entry point: hand ``check_engine`` to ``spawn`` together with ``TP_SIZE``."""
    nprocs = TP_SIZE
    spawn(check_engine, nprocs)


# Allow running this file directly as a script; this calls the test function without going through pytest.
if __name__ == '__main__':
    test_engine()