"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "7566734d6f6467f1442bf0494cb14108d72c61b3"
Unverified Commit e316c521 authored by Raushan Turganbay, committed by GitHub

VideoLLaVa: fix chat format in docs (#32083)

fix chat format
parent 22f888b3
```diff
@@ -98,7 +98,7 @@ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
 video = read_video_pyav(container, indices)
 
 # For better results, we recommend prompting the model in the following format
-prompt = "USER: <video>Why is this funny? ASSISTANT:"
+prompt = "USER: <video>\nWhy is this funny? ASSISTANT:"
 inputs = processor(text=prompt, videos=video, return_tensors="pt")
 
 out = model.generate(**inputs, max_new_tokens=60)
```
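For reference, the corrected single-video flow runs end to end like the sketch below. It stitches together the surrounding docs snippet: the `read_video_pyav` helper mirrors the one defined earlier on the same docs page, and the uniform 8-frame sampling matches the `indices` line in the hunk header.

```python
import av
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

def read_video_pyav(container, indices):
    # Decode only the frames whose index is in `indices` and stack them into
    # a (num_frames, height, width, 3) uint8 array.
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
container = av.open(video_path)
total_frames = container.streams.video[0].frames

# Sample 8 frames uniformly across the clip.
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
video = read_video_pyav(container, indices)

# The "\n" after <video> is exactly what this commit fixes.
prompt = "USER: <video>\nWhy is this funny? ASSISTANT:"
inputs = processor(text=prompt, videos=video, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)
print(processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0])
```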
````diff
@@ -108,7 +108,7 @@ processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spac
 For multi-turn conversations, change the prompt format to:
 
 ```bash
-"USER: <video>What do you see in this video? ASSISTANT: A baby reading a book. USER: Why is it funny? ASSISTANT:"
+"USER: <video>\nWhat do you see in this video? ASSISTANT: A baby reading a book. USER: Why is it funny? ASSISTANT:"
 ```
 
 ### Mixed Media Mode
````
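In practice the multi-turn string is built by appending each reply before asking the follow-up. A minimal sketch, assuming the `processor`, `model`, and `video` objects from the previous snippet; the `split("ASSISTANT:")` extraction is an illustrative convenience, not a library API.

```python
# First turn: ask about the video.
history = "USER: <video>\nWhat do you see in this video? ASSISTANT:"
inputs = processor(text=history, videos=video, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)

# The decoded text echoes the prompt, so take everything after the last
# "ASSISTANT:" as the model's reply.
reply = processor.batch_decode(out, skip_special_tokens=True)[0].split("ASSISTANT:")[-1].strip()

# Second turn: append the reply and the follow-up question, then generate again.
history = f"{history} {reply} USER: Why is it funny? ASSISTANT:"
inputs = processor(text=history, videos=video, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)
print(processor.batch_decode(out, skip_special_tokens=True)[0])
```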
```diff
@@ -123,7 +123,7 @@ import requests
 
 # Load an image and write a new prompt
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
-prompt = "USER: <image> How many cats are there in the image? ASSISTANT: There are two cats. USER: <video>Why is this video funny? ASSISTANT:"
+prompt = "USER: <image>\nHow many cats are there in the image? ASSISTANT: There are two cats. USER: <video>\nWhy is this video funny? ASSISTANT:"
 inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
```
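Put together, the mixed-media call looks like the sketch below, assuming `model` and `processor` from earlier and a sampled frame array `clip` built the same way as `video` above.

```python
import requests
from PIL import Image

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# One prompt interleaves an <image> turn and a <video> turn; in the fixed
# format each media token is followed by "\n".
prompt = "USER: <image>\nHow many cats are there in the image? ASSISTANT: There are two cats. USER: <video>\nWhy is this video funny? ASSISTANT:"
inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)
print(processor.batch_decode(out, skip_special_tokens=True)[0])
```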
```diff
@@ -456,7 +456,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
 >>> model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
 >>> processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
 
->>> prompt = "USER: <video>Why is this video funny? ASSISTANT:"
+>>> prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
 >>> video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
 >>> container = av.open(video_path)
```
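The docstring loads the 7B checkpoint in full precision. As a hedged variant (not part of the original example), half precision on a GPU is a common way to keep memory manageable; CUDA availability and the `video` array (sampled as in the docstring) are assumptions here.

```python
import torch
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

# Sketch only: fp16 GPU variant of the docstring setup.
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, device_map="cuda"
)
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
# Move tensors to the GPU and cast floating-point inputs to fp16.
inputs = processor(text=prompt, videos=video, return_tensors="pt").to("cuda", torch.float16)
out = model.generate(**inputs, max_new_tokens=60)
print(processor.batch_decode(out, skip_special_tokens=True)[0])
```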
```diff
@@ -476,8 +476,8 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
 >>> prompt = [
-...     "USER: <image> How many cats do you see? ASSISTANT:",
-...     "USER: <video>Why is this video funny? ASSISTANT:"
+...     "USER: <image>\nHow many cats do you see? ASSISTANT:",
+...     "USER: <video>\nWhy is this video funny? ASSISTANT:"
 ... ]
 >>> inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
```
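Because the image-only and video-only prompts tokenize to different lengths, `padding=True` is what makes them batchable. A short sketch of the generate-and-decode step that would follow, assuming the `inputs` built above:

```python
# Generate for both prompts in one batch and decode each result.
out = model.generate(**inputs, max_new_tokens=40)
for text in processor.batch_decode(out, skip_special_tokens=True):
    print(text)
```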