eagle_x.py 6.85 KB
Newer Older
luopl's avatar
luopl committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy


# This function is used to split Eagle-X5-34B
def split_model(model_name):
    """Build a per-module device map spreading an Eagle-X5-34B checkpoint
    over the GPUs assigned to this rank.

    Models absent from the size table simply run on a single device and
    get the string ``'cuda'`` back instead of a dict.
    """
    import math

    rank, world_size = get_rank_and_world_size()
    gpus_per_rank = torch.cuda.device_count() // world_size

    num_layers_map = {
        'Eagle-X5-34B-Chat': 60,
        'Eagle-X5-34B-Plus': 60
    }
    if model_name not in num_layers_map:
        return 'cuda'

    # Budget 8 virtual layers on top of the real count, then take 4 back
    # from the first GPU: it also hosts the ViT/embeddings, so it should
    # carry roughly half a GPU's worth of transformer layers.
    total_layers = num_layers_map[model_name] + 8
    share = math.ceil(total_layers / gpus_per_rank)
    shares = [share] * gpus_per_rank
    shares[-1] = total_layers - sum(shares[:-1])
    shares[0] -= 4

    device_map = {}
    layer_idx = 0
    for gpu_idx, count in enumerate(shares):
        for _ in range(count):
            device_map[f'model.layers.{layer_idx}'] = rank + world_size * gpu_idx
            layer_idx += 1

    # Non-layer modules (plus the final layer) are pinned to this rank's
    # first GPU.
    for module in ('model.vision_tower', 'model.embed_tokens', 'model.norm',
                   'model.rotary_emb', 'model.mm_projector', 'lm_head',
                   f'model.layers.{total_layers - 1}'):
        device_map[module] = rank

    logging.warning("Remove L157-L158 in https://github.com/NVlabs/EAGLE/blob/fef95f103b5e9899acbbe2c237e5b99147ab7e8e/eagle/model/builder.py to make it work properly.")  # noqa: E501
    return device_map


class Eagle(BaseModel):
    """VLMEvalKit adapter for NVIDIA's Eagle-X vision-language models.

    Loads a checkpoint through the external ``eagle`` package (install from
    https://github.com/NVlabs/EAGLE.git) and implements the BaseModel
    generation / custom-prompt interface used by the evaluation harness.
    """

    INSTALL_REQ = True   # requires the third-party `eagle` package to be installed
    INTERLEAVE = True    # message lists may interleave text and image items

    def __init__(self,
                 model_path='NVEagle/Eagle-X5-7B',
                 **kwargs):
        """Load tokenizer, model and image processor from ``model_path``.

        Args:
            model_path: local checkpoint directory or a HuggingFace repo id
                of the form "org/name".
            **kwargs: overrides merged into the default generation config
                (do_sample, temperature, top_p, num_beams, max_new_tokens,
                use_cache).
        """
        try:
            from eagle.model.builder import load_pretrained_model
            from eagle.utils import disable_torch_init
            from eagle.mm_utils import get_model_name_from_path
        except Exception as e:
            logging.critical('''Please install eagle before using Eagle,
            you can install it from "https://github.com/NVlabs/EAGLE.git"''')
            raise e

        warnings.warn('Please install the latest version of eagle from github before you evaluate the Eagle model.')
        # Accept either an existing local path or an "org/name" repo id.
        assert osp.exists(model_path) or splitlen(model_path) == 2
        model_name = get_model_name_from_path(model_path)
        rank, world_size = get_rank_and_world_size()

        # Multi-GPU layer placement for the 34B variants; plain 'cuda' otherwise.
        device_map = split_model(model_path.split('/')[-1])

        self.tokenizer, self.model, self.image_processor, self.context_len = (
            load_pretrained_model(model_path, None, model_name, False, False, device_map=device_map)
        )
        self.model.eval()
        # NOTE(review): conversation template is hard-coded to vicuna_v1;
        # presumably correct for all Eagle-X checkpoints — confirm upstream.
        self.conv_mode = 'vicuna_v1'

        default_kwargs = dict(
            do_sample=True,
            temperature=0.2,
            top_p=0.5,
            num_beams=1,
            max_new_tokens=512,
            use_cache=True
        )

        # Caller-supplied kwargs take precedence over the defaults above.
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        """Run one generation round for a list of message items.

        Args:
            message: list of dicts with 'type' in {'image', 'text'} and a
                'value' payload (file path for images, str for text).
            dataset: unused here; kept for the BaseModel interface.

        Returns:
            The decoded model response as a stripped string.
        """
        try:
            from eagle import conversation as conversation_lib
            from eagle.constants import (IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN,
                                         DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
            from eagle.conversation import conv_templates, SeparatorStyle
            from eagle.mm_utils import tokenizer_image_token, process_images, KeywordsStoppingCriteria
        except Exception as e:
            logging.critical('''Please install eagle before using Eagle,
            you can install it from "https://github.com/NVlabs/EAGLE.git"''')
            raise e

        kwargs = self.kwargs

        # Split the interleaved message into image paths and one text prompt;
        # image positions within the text are not preserved — all image
        # placeholders are prepended to the prompt below.
        images = []
        prompt = ''

        for s in message:
            if s['type'] == 'image':
                images.append(s['value'])
            elif s['type'] == 'text':
                prompt += s['value']

        # NOTE(review): rebinds the imported constant locally — one <image>
        # placeholder token per input image.
        DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN * len(images)
        if self.model.config.mm_use_im_start_end:
            prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
        else:
            prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt

        # Wrap the prompt in the conversation template and leave the
        # assistant turn empty so get_prompt() ends at the reply position.
        conv = conv_templates[self.conv_mode].copy()
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        images = [Image.open(s).convert('RGB') for s in images]

        image_tensor = process_images(images, self.image_processor, self.model.config)
        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.to(device='cuda', non_blocking=True)
        image_tensor = image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids.unsqueeze(0),
                images=image_tensor,
                image_sizes=[img.size for img in images],
                **kwargs
            )

        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return outputs

    def use_custom_prompt(self, dataset):
        """Return True when build_prompt should handle `dataset`.

        Custom prompting is used for multiple-choice datasets and MMVet,
        but explicitly not for MMMU.
        """
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        """Build an interleaved message list for one dataset record.

        Args:
            line: a dataset row (pandas Series / dict-like) with at least a
                'question' field; MCQ rows may also carry option columns
                'A'..'Z' and an optional 'hint'.
            dataset: dataset name; must satisfy use_custom_prompt().

        Returns:
            A list of dicts: one text item followed by one image item per
            dumped image path.
        """
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            # Collect only the option columns that exist and are non-NaN.
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            # Fall back to direct answering when the row has no options.
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message