# idefics.py
import torch
import torch.distributed

from typing import List, Optional, Tuple

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoProcessor,
)

from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
from text_generation_server.models.custom_modeling.idefics_processing import (
    IdeficsProcessor,
)
from transformers import LlamaTokenizerFast
from text_generation_server.models.custom_modeling.idefics_modeling import (
    IdeficsForVisionText2Text,
)
from text_generation_server.models.idefics_causal_lm import IdeficsCausalLM
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)


class IDEFICSSharded(IdeficsCausalLM):
    """IDEFICS vision/text model built from safetensors weights.

    The constructor joins the torch distributed process group, picks a
    device and dtype for the local rank, loads the config, tokenizer and
    processor for ``model_id``, and builds ``IdeficsForVisionText2Text``
    from a ``Weights`` loader bound to that process group.
    """

    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        """Load the model, tokenizer and processor for ``model_id``.

        Args:
            model_id: Identifier forwarded to every ``from_pretrained`` call
                and to ``weight_files``.
            revision: Optional revision forwarded alongside ``model_id``.
            quantize: Stored on both ``config.quantize`` and
                ``config.vision_config.quantize``.
            speculator: Stored on ``config.speculator``.
            dtype: Explicit dtype for the weights. When ``None`` it defaults
                to ``torch.float16`` if CUDA is available and
                ``torch.float32`` otherwise.
            trust_remote_code: Forwarded to every ``from_pretrained`` call.
        """
        self.process_group, rank, world_size = initialize_torch_distributed()
        if torch.cuda.is_available():
            # Each rank is pinned to the CUDA device with the same index.
            device = torch.device(f"cuda:{rank}")
            # 9b seems to work correctly enough in float16, but 80b seems
            # to be really saturating for f16.
            dtype = torch.float16 if dtype is None else dtype
        else:
            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype
        self.device, self.dtype = device, dtype

        config = IdeficsConfig.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        config.quantize = quantize
        config.speculator = speculator
        # The nested vision config gets the same quantization setting as the
        # top-level config.
        config.vision_config.quantize = quantize

        tokenizer = LlamaTokenizerFast.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        self.processor = IdeficsProcessor.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )

        # Wait for every rank in the group before resolving weight files.
        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device=device,
            dtype=dtype,
            process_group=self.process_group,
        )

        model = IdeficsForVisionText2Text(config, weights)

        # Wait again so that all ranks have finished building the model.
        torch.distributed.barrier(group=self.process_group)
        # ``super(IdeficsCausalLM, self)`` starts the MRO lookup *after*
        # IdeficsCausalLM, so IdeficsCausalLM.__init__ is skipped on purpose
        # and the ready-made model/tokenizer are handed to its parent.
        # NOTE(review): the parent class is defined outside this file;
        # confirm it accepts these keyword arguments.
        super(IdeficsCausalLM, self).__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )