chenpangpang / transformers

Commit e316c521 (unverified)
Authored Jul 19, 2024 by Raushan Turganbay; committed by GitHub on Jul 19, 2024

VideoLLaVa: fix chat format in docs (#32083)

fix chat format
Parent: 22f888b3

Showing 2 changed files with 6 additions and 6 deletions (+6, -6):
docs/source/en/model_doc/video_llava.md (+3, -3)
src/transformers/models/video_llava/modeling_video_llava.py (+3, -3)
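For context before the diffs: the documented prompts were missing the "\n" that Video-LLaVA expects between the `<video>`/`<image>` token and the question text. Below is a minimal sketch of the corrected single-video usage. The checkpoint, sample video, and `read_video_pyav` helper all come from the Video-LLaVA docs touched by this commit, but the assembled script itself is illustrative, not part of the change:

```python
import av
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor


def read_video_pyav(container, indices):
    """Decode only the frames at `indices` and stack them as (num_frames, H, W, 3)."""
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > indices[-1]:
            break
        if i in indices:
            frames.append(frame)
    return np.stack([f.to_ndarray(format="rgb24") for f in frames])


model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")

video_path = hf_hub_download(
    repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
)
container = av.open(video_path)
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
video = read_video_pyav(container, indices)

# The corrected format: a "\n" separates the <video> token from the question.
prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
inputs = processor(text=prompt, videos=video, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)
print(processor.batch_decode(out, skip_special_tokens=True)[0])
```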
docs/source/en/model_doc/video_llava.md

@@ -98,7 +98,7 @@ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
 video = read_video_pyav(container, indices)
 
 # For better results, we recommend to prompt the model in the following format
-prompt = "USER: <video>Why is this funny? ASSISTANT:"
+prompt = "USER: <video>\nWhy is this funny? ASSISTANT:"
 inputs = processor(text=prompt, videos=video, return_tensors="pt")
 
 out = model.generate(**inputs, max_new_tokens=60)

@@ -108,7 +108,7 @@ processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spac
 For multiple turns conversation change the prompt format to:
 
 ```bash
-"USER: <video>What do you see in this video? ASSISTANT: A baby reading a book. USER: Why is the it funny? ASSISTANT:"
+"USER: <video>\nWhat do you see in this video? ASSISTANT: A baby reading a book. USER: Why is the it funny? ASSISTANT:"
 ```
 
 ### Mixed Media Mode

@@ -123,7 +123,7 @@ import requests
 # Load and image and write a new prompt
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
-prompt = "USER: <image> How many cats are there in the image? ASSISTANT: There are two cats. USER: <video>Why is this video funny? ASSISTANT:"
+prompt = "USER: <image>\nHow many cats are there in the image? ASSISTANT: There are two cats. USER: <video>\nWhy is this video funny? ASSISTANT:"
 inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
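A usage note on the multi-turn format shown in the docs hunk above: the whole conversation is carried in one string, with each prior turn spelled out. A hedged sketch, reusing `processor`, `model`, and `video` from the snippet earlier on this page (the first answer is hard-coded here for illustration; in practice it would be decoded from the first `generate()` call):

```python
# Fold the first exchange into the prompt, then ask the follow-up question.
first_answer = "A baby reading a book."  # hypothetical; normally taken from the first response
prompt = (
    "USER: <video>\nWhat do you see in this video? "
    f"ASSISTANT: {first_answer} "
    "USER: Why is it funny? ASSISTANT:"
)
inputs = processor(text=prompt, videos=video, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)
print(processor.batch_decode(out, skip_special_tokens=True)[0])
```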
src/transformers/models/video_llava/modeling_video_llava.py

@@ -456,7 +456,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
 >>> model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
 >>> processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
->>> prompt = "USER: <video>Why is this video funny? ASSISTANT:"
+>>> prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
 >>> video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
 >>> container = av.open(video_path)

@@ -476,8 +476,8 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
 >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
 >>> prompt = [
-...     "USER: <image> How many cats do you see? ASSISTANT:",
-...     "USER: <video>Why is this video funny? ASSISTANT:"
+...     "USER: <image>\nHow many cats do you see? ASSISTANT:",
+...     "USER: <video>\nWhy is this video funny? ASSISTANT:"
 ... ]
 >>> inputs = processor(text=prompt, images=image, videos=clip, padding=True, return_tensors="pt")
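The docstring hunk above also covers batched mixed-media prompts, where the same newline rule applies to every entry in the list. A small sketch, assuming an `image` and an 8-frame `clip` array prepared as in the surrounding docstring:

```python
prompts = [
    "USER: <image>\nHow many cats do you see? ASSISTANT:",
    "USER: <video>\nWhy is this video funny? ASSISTANT:",
]
# padding=True is needed because the two prompts tokenize to different lengths.
inputs = processor(text=prompts, images=image, videos=clip, padding=True, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=60)
for text in processor.batch_decode(out, skip_special_tokens=True):
    print(text)
```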