trans_cli_vision_demo.py 3.63 KB
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
3
4
5
6
7
8
9
10
11
12
"""
This script creates a CLI demo with transformers backend for the glm-4v-9b model,
allowing users to interact with the model through a command-line interface.

Usage:
- Run the script to start the CLI demo.
- Interact with the model by typing questions and receiving responses.

Note: The script includes a modification to handle markdown to plain text conversion,
ensuring that the CLI interface displays formatted text correctly.
"""

Rayyyyy's avatar
Rayyyyy committed
13
from threading import Thread
Rayyyyy's avatar
Rayyyyy committed
14

Rayyyyy's avatar
Rayyyyy committed
15
import torch
Rayyyyy's avatar
Rayyyyy committed
16
from PIL import Image
Rayyyyy's avatar
Rayyyyy committed
17
from transformers import (
Rayyyyy's avatar
Rayyyyy committed
18
    AutoModel,
Rayyyyy's avatar
Rayyyyy committed
19
20
21
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
Rayyyyy's avatar
Rayyyyy committed
22
    TextIteratorStreamer,
Rayyyyy's avatar
Rayyyyy committed
23
24
25
)


Rayyyyy's avatar
Rayyyyy committed
26
# Hugging Face model identifier (or a local checkpoint directory) to load.
MODEL_PATH = "THUDM/glm-4v-9b"

# trust_remote_code=True lets transformers execute the tokenizer/model code
# shipped alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, encode_special_tokens=True)

## For BF16 inference
# device_map="auto" delegates weight placement to transformers; .eval()
# switches the module to inference mode.
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

## For INT4 inference
# NOTE(review): enabling this variant also requires importing
# BitsAndBytesConfig from transformers; it is not in the import list above.
# model = AutoModel.from_pretrained(
#     MODEL_PATH,
#     trust_remote_code=True,
#     quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#     torch_dtype=torch.bfloat16,
#     low_cpu_mem_usage=True
# ).eval()
Rayyyyy's avatar
Rayyyyy committed
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89


class StopOnTokens(StoppingCriteria):
    """Stop generation once the most recent token is an end-of-sequence id.

    The stop ids are read from the module-level ``model``'s
    ``config.eos_token_id`` on every call. Only the first sequence in the
    batch (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of ``input_ids[0]`` is a stop id.

        Args:
            input_ids: Token ids generated so far; indexed as ``[0][-1]``.
            scores: Unused; accepted to satisfy the stopping-criteria call signature.
            **kwargs: Unused extra arguments.

        Returns:
            True if generation should stop, False otherwise.
        """
        stop_ids = model.config.eos_token_id
        # Configs may store a single id (int) or nothing at all; normalize so
        # the membership test below never iterates over an int or None.
        if stop_ids is None:
            return False
        if isinstance(stop_ids, int):
            stop_ids = [stop_ids]

        last_token = input_ids[0][-1]
        return any(last_token == stop_id for stop_id in stop_ids)


def build_messages(history, image=None):
    """Convert the chat history into the message list passed to the chat template.

    Args:
        history: Sequence of ``[user_text, model_text]`` pairs, oldest first.
            A final pair whose ``model_text`` is empty is the turn awaiting a
            reply.
        image: Optional image object to attach (under the ``"image"`` key) to
            the pending user turn. ``None`` attaches nothing.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts. Empty user or model
        texts in completed turns are skipped.
    """
    messages = []
    last_index = len(history) - 1
    for idx, (user_msg, model_msg) in enumerate(history):
        if idx == last_index and not model_msg:
            # Pending turn: always emit the user message, even if empty.
            pending = {"role": "user", "content": user_msg}
            if image is not None:
                pending["image"] = image
            messages.append(pending)
            break
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if model_msg:
            messages.append({"role": "assistant", "content": model_msg})
    return messages


def main():
    """Run the interactive GLM-4V command-line chat loop.

    Prompts once for an image path, then repeatedly reads user input, streams
    the model's reply to stdout, and records it in the history until the user
    types ``exit`` or ``quit``.
    """
    history = []
    max_length = 1024
    top_p = 0.8
    temperature = 0.6
    stop = StopOnTokens()
    uploaded = False
    image = None
    print("Welcome to the GLM-4-9B CLI chat. Type your messages below.")
    image_path = input("Image Path:")
    try:
        # convert() yields an independent in-memory image, so the file handle
        # can be released as soon as the with-block exits.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception:
        # Best-effort: any load failure falls back to text-only chat, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Invalid image path. Continuing with text conversation.")

    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ["exit", "quit"]:
            break
        history.append([user_input, ""])

        # NOTE(review): the image is attached only on the first turn after it
        # is loaded; later turns rebuild the prompt from text history alone.
        # Confirm this is the intended behavior.
        pending_image = image if (image is not None and not uploaded) else None
        messages = build_messages(history, pending_image)
        if pending_image is not None:
            uploaded = True

        model_inputs = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
        ).to(next(model.parameters()).device)
        streamer = TextIteratorStreamer(tokenizer=tokenizer, timeout=60, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = {
            **model_inputs,
            "streamer": streamer,
            "max_new_tokens": max_length,
            "do_sample": True,
            "top_p": top_p,
            "temperature": temperature,
            "stopping_criteria": StoppingCriteriaList([stop]),
            "repetition_penalty": 1.2,
            "eos_token_id": [151329, 151336, 151338],
        }
        # Generation runs in a worker thread so the main thread can print
        # tokens as the streamer yields them.
        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()
        print("GLM-4V:", end="", flush=True)
        for new_token in streamer:
            if new_token:
                print(new_token, end="", flush=True)
                history[-1][1] += new_token
        # Make sure generate() has fully returned before starting another turn.
        t.join()

        history[-1][1] = history[-1][1].strip()


if __name__ == "__main__":
    main()