# Video Demo
[Chinese README](./README_zh.md)
This folder contains sample code for running the CogVLM2-Video model.
## Installation
Before running the code, make sure that the dependencies in `basic_demo` and the additional dependencies in this folder are installed:
```shell
pip install -r requirements.txt
```
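If the dependencies from `basic_demo` are not installed yet, you can install both requirement files together (this assumes the default repository layout, with `basic_demo` located next to this folder):
```shell
pip install -r ../basic_demo/requirements.txt
pip install -r requirements.txt
```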
## CLI Demo
Run this script to start a conversation in the command line. Note that the model must be loaded on a single GPU:
```shell
CUDA_VISIBLE_DEVICES=0 python cli_demo.py
```
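`cli_demo.py` also accepts a `--quant` flag for 4-bit or 8-bit loading; if your GPU has less than 48 GB of memory, the script will ask you to either quantize or use a multi-GPU script, for example:
```shell
CUDA_VISIBLE_DEVICES=0 python cli_demo.py --quant 4
```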
## RESTful API Demo
Run this code to launch a RESTful API server:
```shell
python api_demo.py
```
This starts a RESTful API on port 5000. Run the following code to send a request to the server:
```shell
python test_api.py
```
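The server exposes a single `POST /video_qa` endpoint (see `api_demo.py`): it accepts a multipart `video` file plus optional `question` and `temperature` form fields and returns JSON with an `answer` field. A minimal client call, equivalent to `test_api.py` (the file name `test.mp4` is only a placeholder), looks like this:
```python
import requests

# Send a video and a question to the local api_demo.py server.
with open("test.mp4", "rb") as f:
    response = requests.post(
        "http://127.0.0.1:5000/video_qa",
        files={"video": f},
        data={"question": "Describe this video in detail.", "temperature": 0.2},
    )
print(response.json()["answer"])
```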
## Gradio Demo
After launching the RESTful API server, you can run this code to start a Gradio web demo; the web page forwards requests to the API server at `http://127.0.0.1:5000`:
```shell
python gradio_demo.py
```
Then open your browser and visit `http://0.0.0.0:7868/` to chat with the model.
# Video Demo
[Read this in English.](./README.md)
This folder contains sample code for running the CogVLM2-Video model.
## Installation
Before running the code, make sure that the dependencies in `basic_demo` and the additional dependencies in this folder are installed:
```shell
pip install -r requirements.txt
```
## CLI Demo
Run this script to start a conversation in the command line. Note that the model must be loaded on a single GPU:
```shell
CUDA_VISIBLE_DEVICES=0 python cli_demo.py
```
## RESTful API
Run the following code to launch a RESTful API server:
```shell
python api_demo.py
```
This starts a RESTful API on port 5000. Run the following code to send a request to the server:
```shell
python test_api.py
```
## Gradio Demo
After launching the RESTful API server, you can run the following code to start a Gradio web demo:
```shell
python gradio_demo.py
```
Then open your browser and visit `http://0.0.0.0:7868/` to chat with the model.
from flask import Flask, request, jsonify
import traceback
from inference import predict
app = Flask(__name__)
@app.route('/video_qa', methods=['POST'])
def video_qa():
if 'video' not in request.files:
return jsonify({'error': 'no video file found'}), 400
video = request.files['video']
if video.filename == '':
return jsonify({'error': 'no chosen file'}), 400
if 'question' not in request.form:
question = ""
else:
question = request.form['question']
if question is None or question == "" or question == "@Caption":
question = "Please describe the video in detail."
print("Get question:", question)
if 'temperature' not in request.form:
temperature = 0.001
print("No temperature found, use default value 0.001")
else:
temperature = float(request.form['temperature'])
print("Get temperature:", temperature)
try:
answer = predict(prompt=question, video_data=video.read(), temperature=temperature)
return jsonify(
{"answer": answer})
    except Exception:
traceback.print_exc()
return jsonify({"error": traceback.format_exc()}), 500
if __name__ == '__main__':
app.run(debug=False, host="0.0.0.0", port=5000)
import io
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import argparse
MODEL_PATH = "THUDM/cogvlm2-video-llama3-chat"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
0] >= 8 else torch.float16
parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0)
args = parser.parse_args()
if 'int4' in MODEL_PATH:
args.quant = 4
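# Frame sampling used by load_video below:
#   - 'base' strategy: sample num_frames (24) frames evenly from the first 60 seconds of the clip.
#   - 'chat' strategy: take the frame closest to each whole second, up to num_frames frames.
# The returned tensor is permuted from (T, H, W, C) to (C, T, H, W) before being fed to the model.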
def load_video(video_path, strategy='chat'):
bridge.set_bridge('torch')
with open(video_path, 'rb') as f:
mp4_stream = f.read()
num_frames = 24
if mp4_stream is not None:
decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
else:
decord_vr = VideoReader(video_path, ctx=cpu(0))
frame_id_list = None
total_frames = len(decord_vr)
if strategy == 'base':
clip_end_sec = 60
clip_start_sec = 0
start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
end_frame = min(total_frames,
int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames
frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
elif strategy == 'chat':
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = []
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index)
if len(frame_id_list) >= num_frames:
break
video_data = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# padding_side="left"
)
if torch.cuda.is_available() and torch.cuda.get_device_properties(0).total_memory < 48 * 1024 ** 3 and not args.quant:
print("GPU memory is less than 48GB. Please use cli_demo_multi_gpus.py or pass `--quant 4` or `--quant 8`.")
exit()
# Load the model
if args.quant == 4:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True
).eval()
elif args.quant == 8:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_8bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True
).eval()
else:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True
).eval().to(DEVICE)
while True:
    strategy = 'base' if 'cogvlm2-video-llama3-base' in MODEL_PATH else 'chat'
    print(f"Using the '{strategy}' conversation template")
video_path = input("video path >>>>> ")
if video_path == '':
        print('No video path was entered; the following will be a plain-text conversation.')
video = None
else:
video = load_video(video_path, strategy=strategy)
history = []
while True:
query = input("Human:")
if query == "clear":
break
inputs = model.build_conversation_input_ids(
tokenizer=tokenizer,
query=query,
images=[video],
history=history,
template_version=strategy
)
inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[inputs['images'][0].to(DEVICE).to(TORCH_TYPE)]],
}
gen_kwargs = {
"max_new_tokens": 2048,
"pad_token_id": 128002,
"top_k": 1,
"do_sample": True,
"top_p": 0.1,
"temperature": 0.1,
}
with torch.no_grad():
outputs = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nCogVLM2-Video:", response)
history.append((query, response))
import gradio as gr
import requests
def load_video_data(video_path):
with open(video_path, 'rb') as file:
video_data = file.read()
return video_data
class ChatAgent:
def __init__(self):
pass
def answer(self, video_path, prompt, max_new_tokens, num_beams, temperature):
url = 'http://127.0.0.1:5000/video_qa'
files = {'video': open(video_path, 'rb')}
data = {'question': prompt, 'temperature': temperature}
response = requests.post(url, files=files, data=data)
if response.status_code != 200:
return f"Something went wrong: {response.text}"
else:
return response.json()["answer"]
def gradio_reset():
return (
None,
gr.update(value=None, interactive=True),
gr.update(placeholder='Please upload your video first', interactive=False),
gr.update(value="Upload & Start Chat", interactive=True),
)
def upload_video(gr_video):
if gr_video is None:
return None, gr.update(interactive=True, placeholder='Please upload video/image first!'), gr.update(
interactive=True)
else:
print(f"Get video: {gr_video}")
return (
gr.update(interactive=True),
gr.update(interactive=True, placeholder='Type and press Enter'),
gr.update(value="Start Chatting", interactive=False)
)
def gradio_ask(user_message, chatbot):
if len(user_message) == 0:
return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot
chatbot = chatbot + [[user_message, None]]
return '', chatbot
def gradio_answer(video_path, chatbot, num_beams, temperature):
if len(chatbot) == 0 or video_path is None:
return chatbot
response = agent.answer(video_path=video_path, prompt=chatbot[-1][0], max_new_tokens=200, num_beams=num_beams,
temperature=temperature)
print(f"Question: {chatbot[-1][0]} Answer: {response}")
chatbot[-1][1] = response
return chatbot
agent = ChatAgent()
def main():
with gr.Blocks(title="VideoHub",
                   css="#chatbot {overflow:auto; height:500px;} #InputVideo {overflow:visible; height:320px;} footer {visibility: hidden}") as demo:
with gr.Row():
with gr.Column(scale=0.5, visible=True) as video_upload:
with gr.Tab("Video", elem_id='video_tab'):
up_video = gr.Video(interactive=True, include_audio=True, elem_id="video_upload", height=360)
upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
temperature = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.1,
step=0.1,
interactive=True,
label="Temperature",
)
num_beams = gr.Slider(
minimum=1,
maximum=5,
value=1,
step=1,
interactive=True,
label="beam search numbers",
)
with gr.Column(visible=True) as input_raws:
chatbot = gr.Chatbot(elem_id="chatbot", label='VideoHub')
with gr.Row():
with gr.Column(scale=0.7):
text_input = gr.Textbox(show_label=False, placeholder='Please upload your video first',
interactive=False, container=False)
with gr.Column(scale=0.15, min_width=0):
run = gr.Button("💭Send")
with gr.Column(scale=0.15, min_width=0):
clear = gr.Button("🔄Clear")
upload_button.click(upload_video, [up_video],
[up_video, text_input, upload_button])
text_input.submit(gradio_ask, [text_input, chatbot],
[text_input, chatbot]).then(
gradio_answer, [up_video, chatbot, num_beams, temperature], [chatbot]
)
run.click(gradio_ask, [text_input, chatbot], [text_input, chatbot]).then(
gradio_answer, [up_video, chatbot, num_beams, temperature], [chatbot]
)
run.click(lambda: "", None, text_input)
clear.click(gradio_reset, [],
[chatbot, up_video, text_input, upload_button], queue=False)
demo.launch(server_name="0.0.0.0", server_port=7868)
if __name__ == '__main__':
main()
import io
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
MODEL_PATH = "THUDM/cogvlm2-video-llama3-chat"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
0] >= 8 else torch.float16
parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0)
args = parser.parse_args([])
def load_video(video_data, strategy='chat'):
bridge.set_bridge('torch')
mp4_stream = video_data
num_frames = 24
decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
frame_id_list = None
total_frames = len(decord_vr)
if strategy == 'base':
clip_end_sec = 60
clip_start_sec = 0
start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
end_frame = min(total_frames,
int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames
frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
elif strategy == 'chat':
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = []
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index)
if len(frame_id_list) >= num_frames:
break
# while len(frame_id_list) < num_frames:
# frame_id_list.append(frame_id_list[-1])
video_data = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# padding_side="left"
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True
).eval().to(DEVICE)
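# predict() performs a single-turn video QA pass: it samples frames from the raw MP4 bytes,
# builds the conversation inputs with the 'chat' template, and decodes greedily
# (do_sample=False), so the temperature argument currently has no effect on the generated answer.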
def predict(prompt, video_data, temperature):
strategy = 'chat'
video = load_video(video_data, strategy=strategy)
history = []
query = prompt
inputs = model.build_conversation_input_ids(
tokenizer=tokenizer,
query=query,
images=[video],
history=history,
template_version=strategy
)
    inputs = {
        'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE),
        'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE),
        'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE),
        'images': [[inputs['images'][0].to(DEVICE).to(TORCH_TYPE)]],
    }
gen_kwargs = {
"max_new_tokens": 2048,
"pad_token_id": 128002,
"top_k": 1,
"do_sample": False,
"top_p": 0.1,
"temperature": temperature,
}
with torch.no_grad():
outputs = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return response
decord>=0.6.0
# See https://download.pytorch.org/whl/torch/; the supported Python versions are [3.8, 3.11]
torch==2.1.0
torchvision==0.16.0
pytorchvideo==0.1.5
xformers
transformers==4.42.4
# If transformers raises an error at runtime, it may be an issue in the transformers code; try installing the latest version:
#git+https://github.com/huggingface/transformers.git
huggingface-hub>=0.23.0
pillow
chainlit>=1.0
pydantic>=2.7.1
timm>=0.9.16
openai>=1.30.1
loguru>=0.7.2
einops
sse-starlette>=2.1.0
flask
gunicorn
gevent
requests
gradio
import requests
url = 'http://127.0.0.1:5000/video_qa'
video_file = "test.mp4"
question = "Describe this video in detail."
temperature = 0.2
files = {'video': open(video_file, 'rb')}
data = {'question': question, 'temperature': temperature}
response = requests.post(url, files=files, data=data)
print(response.json()["answer"])
"""
This is a simple chat demo using CogVLM2 model in ChainLit.
"""
import os
import dataclasses
from typing import List
from PIL import Image
import chainlit as cl
from chainlit.input_widget import Slider
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from huggingface_hub.inference._generated.types import TextGenerationStreamOutput, TextGenerationStreamOutputToken
import threading
import torch
os.environ["HIP_VISIBLE_DEVICES"] = "7"
MODEL_PATH = './cogvlm2-llama3-chinese-chat-19B'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
0] >= 8 else torch.float16
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
quant = int(os.environ.get('QUANT', 0))
if 'int4' in MODEL_PATH:
quant = 4
print(f'Quant = {quant}')
# Load the model
if quant == 4:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
load_in_4bit=True,
low_cpu_mem_usage=True
).eval()
elif quant == 8:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
load_in_8bit=True, # Assuming transformers support this argument; check documentation if not
low_cpu_mem_usage=True
).eval()
else:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True
).eval().to(DEVICE)
@cl.on_chat_start
def on_chat_start():
    print("Welcome to the CogVLM2 chat demo")
async def get_response(query, history, gen_kwargs, images=None):
if images is None:
input_by_model = model.build_conversation_input_ids(
tokenizer,
query=query,
history=history,
template_version='chat'
)
else:
input_by_model = model.build_conversation_input_ids(
tokenizer,
query=query,
history=history,
            images=images[-1:],  # only use the last image; CogVLM2 supports a single image
template_version='chat'
)
inputs = {
'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if images is not None else None,
}
streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs['streamer'] = streamer
gen_kwargs = {**gen_kwargs, **inputs}
with torch.no_grad():
thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
thread.start()
for next_text in streamer:
yield TextGenerationStreamOutput(
index=0,
token=TextGenerationStreamOutputToken(
id=0,
logprob=0,
text=next_text,
special=False,
)
)
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
roles: List[str]
messages: List[List[str]]
version: str = "Unknown"
def append_message(self, role, message):
self.messages.append([role, message])
def get_prompt(self):
if not self.messages:
return None, []
last_role, last_msg = self.messages[-2]
if isinstance(last_msg, tuple):
query, _ = last_msg
else:
query = last_msg
history = []
for role, msg in self.messages[:-2]:
if isinstance(msg, tuple):
text, _ = msg
else:
text = msg
if role == "USER":
history.append((text, ""))
else:
if history:
history[-1] = (history[-1][0], text)
return query, history
def get_images(self):
for role, msg in reversed(self.messages):
if isinstance(msg, tuple):
msg, image = msg
if image is None:
continue
if image.mode != 'RGB':
image = image.convert('RGB')
width, height = image.size
if width > 1344 or height > 1344:
max_len = 1344
aspect_ratio = width / height
if width > height:
new_width = max_len
new_height = int(new_width / aspect_ratio)
else:
new_height = max_len
new_width = int(new_height * aspect_ratio)
image = image.resize((new_width, new_height))
return [image]
return None
def copy(self):
return Conversation(
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
version=self.version,
)
    def dict(self):
        images = self.get_images()
        if images:
return {
"roles": self.roles,
"messages": [
[x, y[0] if type(y) is tuple else y] for x, y in self.messages
],
}
return {
"roles": self.roles,
"messages": self.messages,
}
default_conversation = Conversation(
roles=("USER", "ASSISTANT"),
messages=()
)
async def request(conversation: Conversation, settings):
gen_kwargs = {
"temperature": settings["temperature"],
"top_p": settings["top_p"],
"max_new_tokens": int(settings["max_token"]),
"top_k": int(settings["top_k"]),
"do_sample": True,
}
query, history = conversation.get_prompt()
images = conversation.get_images()
chainlit_message = cl.Message(content="", author="CogVLM2")
text = ""
async for response in get_response(query, history, gen_kwargs, images):
output = response.token.text
text += output
conversation.messages[-1][-1] = text
await chainlit_message.stream_token(text, is_sequence=True)
await chainlit_message.send()
return conversation
@cl.on_chat_start
async def start():
settings = await cl.ChatSettings(
[
Slider(id="temperature", label="Temperature", initial=0.5, min=0.01, max=1, step=0.05),
Slider(id="top_p", label="Top P", initial=0.7, min=0, max=1, step=0.1),
Slider(id="top_k", label="Top K", initial=5, min=0, max=50, step=1),
Slider(id="max_token", label="Max output tokens", initial=2048, min=0, max=8192, step=1),
]
).send()
conversation = default_conversation.copy()
cl.user_session.set("conversation", conversation)
cl.user_session.set("settings", settings)
@cl.on_settings_update
async def setup_agent(settings):
cl.user_session.set("settings", settings)
@cl.on_message
async def main(message: cl.Message):
image = next(
(
Image.open(file.path)
for file in message.elements or []
if "image" in file.mime and file.path is not None
),
None,
)
conv = cl.user_session.get("conversation") # type: Conversation
settings = cl.user_session.get("settings")
text = message.content
conv_message = (text, image)
conv.append_message(conv.roles[0], conv_message)
conv.append_message(conv.roles[1], None)
conv = await request(conv, settings)
cl.user_session.set("conversation", conv)
chainlit run web_demo.py
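web_demo.py also reads an optional QUANT environment variable (set it to 4 or 8) to load the model with quantization, so it can be launched as, for example, QUANT=4 chainlit run web_demo.py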