import logging
import os.path as osp

import torch
from PIL import Image

from .base import BaseModel
from ..smp import *


class InstructBLIP(BaseModel):
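    """VLMEvalKit wrapper around LAVIS's ``blip2_vicuna_instruct`` model.

    Supports the Vicuna-7B and Vicuna-13B InstructBLIP checkpoints via the
    bundled LAVIS config YAMLs. Requires ``lavis`` (and its ``omegaconf``
    dependency) to be installed.
    """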

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, name):
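        # Map each supported model name to its LAVIS config YAML, resolved
        # relative to this file's directory.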
        self.config_map = {
            'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml',
            'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml',
        }

        self.file_path = __file__
        config_root = osp.dirname(self.file_path)

        try:
            from lavis.models import load_preprocess
            from omegaconf import OmegaConf
            from lavis.common.registry import registry
        except Exception:
            logging.critical('Please install lavis before using InstructBLIP.')
            raise

        assert name in self.config_map
        cfg_path = osp.join(config_root, self.config_map[name])
        cfg = OmegaConf.load(cfg_path)

        model_cfg = cfg.model
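        # The LLM weights must be either a local checkpoint path or a two-part
        # HuggingFace repo id ('org/name'), which is what the
        # splitlen(...) == 2 check below accepts.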
        assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2
        model_cls = registry.get_model_class(name='blip2_vicuna_instruct')
        model = model_cls.from_config(model_cfg)
        model.eval()

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(self.device)
        self.model = model
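        # Default generation kwargs, forwarded to the model at inference time.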
        self.kwargs = {'max_length': 512}

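        # Build the LAVIS visual preprocessors; the 'eval' transform is
        # applied to raw PIL images at inference time.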
        preprocess_cfg = cfg.preprocess
        vis_processors, _ = load_preprocess(preprocess_cfg)
        self.vis_processors = vis_processors

    def generate_inner(self, message, dataset=None):
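        """Generate a response for a single-image message.

        ``message`` is a list of dicts with ``type``/``value`` keys;
        ``message_to_promptimg`` collapses it into one prompt string and one
        image path.
        """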
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        vis_processors = self.vis_processors
        raw_image = Image.open(image_path).convert('RGB')
        image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device)
        # Forward the stored generation kwargs (e.g. max_length) to LAVIS's
        # generate(), which accepts them as keyword arguments.
        outputs = self.model.generate(dict(image=image_tensor, prompt=prompt), **self.kwargs)
        return outputs[0]
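

if __name__ == '__main__':
    # Minimal usage sketch, assuming a local image at the hypothetical path
    # 'demo.jpg' and the VLMEvalKit message format (a list of dicts with
    # 'type' and 'value' keys). In practice the framework instantiates this
    # class itself and calls the higher-level generate() API instead of
    # generate_inner() directly.
    model = InstructBLIP('instructblip_7b')
    message = [
        dict(type='image', value='demo.jpg'),
        dict(type='text', value='Describe this image in one sentence.'),
    ]
    print(model.generate_inner(message))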