build:
  gpu: true
  cuda: "12.4"
  python_version: "3.10"
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
    - "ffmpeg"
  python_packages:
    - "torch==2.4.0"
    - "torchvision"
    - "transformers==4.42.3"
    - "opencv-python-headless<4.10"
    - "peft<0.14.0"
    - "timm==1.0.9"
    - "einops==0.8.0"
    - "sentencepiece==0.2.0"
    - "mmengine<1"
    - "accelerate"
    - "numpy<2"
  run:
    - FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
predict: "predict.py:Predictor"
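The build section above pins CUDA 12.4 and the Python dependencies, skips the from-source flash-attn CUDA build so a prebuilt wheel is used, and fetches pget for fast weight downloads; the predict entry points Cog at the Predictor class in predict.py. Assuming the Cog CLI is installed and the weights URL is reachable, a single prediction can then be run locally (demo.jpg is a placeholder input):

cog predict -i image=@demo.jpg -i instruction="Please segment the person."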
import argparse
import os

import cv2
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

try:
    from mmengine.visualization import Visualizer
except ImportError:
    Visualizer = None
    print("Warning: mmengine is not installed, visualization is disabled.")


def parse_args():
    parser = argparse.ArgumentParser(description='Video Reasoning Segmentation')
    parser.add_argument('image_folder', help='Path to a folder of video frames')
    parser.add_argument('--model_path', default="ByteDance/Sa2VA-8B")
    parser.add_argument('--work-dir', default=None, help='The dir to save results.')
    parser.add_argument('--text', type=str, default="<image>Please describe the video content.")
    parser.add_argument('--select', type=int, default=-1,
                        help='1-based index of a single frame to run on; -1 runs on the whole video.')
    return parser.parse_args()


def visualize(pred_mask, image_path, work_dir):
    visualizer = Visualizer()
    img = cv2.imread(image_path)
    visualizer.set_image(img)
    visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
    visual_result = visualizer.get_image()
    output_path = os.path.join(work_dir, os.path.basename(image_path))
    cv2.imwrite(output_path, visual_result)


if __name__ == "__main__":
    cfg = parse_args()
    model_path = cfg.model_path
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
    )

    # Collect the frame images from the input folder in sorted order.
    image_paths = []
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}
    for filename in sorted(os.listdir(cfg.image_folder)):
        if os.path.splitext(filename)[1].lower() in image_extensions:
            image_paths.append(os.path.join(cfg.image_folder, filename))

    vid_frames = [Image.open(p).convert('RGB') for p in image_paths]

    if cfg.select > 0:
        # Single-frame mode: run the model on one selected frame.
        img_frame = vid_frames[cfg.select - 1]
        print(f"Selected frame {cfg.select}")
        print(f"The input is:\n{cfg.text}")
        result = model.predict_forward(
            image=img_frame,
            text=cfg.text,
            tokenizer=tokenizer,
        )
    else:
        # Video mode: run the model on all frames.
        print(f"The input is:\n{cfg.text}")
        result = model.predict_forward(
            video=vid_frames,
            text=cfg.text,
            tokenizer=tokenizer,
        )

    prediction = result['prediction']
    print(f"The output is:\n{prediction}")

    if '[SEG]' in prediction and Visualizer is not None:
        _seg_idx = 0
        pred_masks = result['prediction_masks'][_seg_idx]
        work_dir = cfg.work_dir or './temp_visualize_results'
        os.makedirs(work_dir, exist_ok=True)
        # In single-frame mode only one mask is returned, so iterate over the
        # masks rather than over all input frames.
        for mask_idx in range(len(pred_masks)):
            frame_idx = (cfg.select - 1) if cfg.select > 0 else mask_idx
            visualize(pred_masks[mask_idx], image_paths[frame_idx], work_dir)
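Assuming the demo script above is saved as demo.py (the filename is a guess; any name works), a typical run over a folder of extracted frames looks like:

python demo.py ./frames --model_path ByteDance/Sa2VA-8B --text "<image>Please segment the person in red." --work-dir ./results

The <image> token at the start of the prompt is where these scripts splice in the visual input; overlay images are written to --work-dir (or ./temp_visualize_results by default) only when the answer contains the [SEG] token and mmengine is installed.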
import os
import subprocess
import time
from typing import Optional

import cv2
import numpy as np
from cog import BasePredictor, BaseModel, Input, Path
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_CACHE = "checkpoints"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-4B/model.tar"
MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-8B/model.tar"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-26B/model.tar"


class Output(BaseModel):
    img: Optional[Path]
    response: str


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        # Download weights if they don't exist
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)
        # Load model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_CACHE,
            torch_dtype="auto",
            device_map="cuda:0",
            trust_remote_code=True,
        ).eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_CACHE,
            trust_remote_code=True,
        )

    def predict(
        self,
        image: Path = Input(description="Input image for segmentation"),
        instruction: str = Input(description="Text instruction for the model"),
    ) -> Output:
        """Run a single prediction on the model"""
        # Prepare the image
        image = Image.open(str(image)).convert('RGB')
        # Prepare the input; <image> marks where the visual input enters the prompt
        text_prompts = f"<image>{instruction}"
        input_dict = {
            'image': image,
            'text': text_prompts,
            'past_text': '',
            'mask_prompts': None,
            'tokenizer': self.tokenizer,
        }
        # Get model prediction
        return_dict = self.model.predict_forward(**input_dict)
        answer = return_dict["prediction"]

        # Handle segmentation if present
        output_path = None
        if '[SEG]' in answer:
            pred_masks = return_dict["prediction_masks"][0]
            # Ensure mask is in the correct format
            if isinstance(pred_masks, np.ndarray):
                binary_mask = (pred_masks > 0.5).astype('uint8') * 255
            else:
                binary_mask = (pred_masks.cpu().numpy() > 0.5).astype('uint8') * 255
            # Ensure mask has valid dimensions
            if binary_mask.ndim == 2:
                height, width = binary_mask.shape
            elif binary_mask.ndim == 3:
                # If we have a 3D array, take the first channel
                binary_mask = binary_mask[0] if binary_mask.shape[0] == 1 else binary_mask[:, :, 0]
                height, width = binary_mask.shape
            else:
                return Output(img=None, response=str(answer))
            # Check if dimensions are valid and mask is not empty
            if width > 0 and height > 0 and np.any(binary_mask):
                # Create output directory if it doesn't exist
                os.makedirs("/tmp", exist_ok=True)
                # Save the binary mask
                output_path = "/tmp/output.png"
                if cv2.imwrite(output_path, binary_mask):
                    return Output(img=Path(output_path), response=str(answer))
        return Output(img=None, response=str(answer))
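A minimal local smoke test for the predictor above, assuming the weights already sit in checkpoints/ and the module is saved as predict.py (both file names are assumptions; demo.jpg is a placeholder):

from cog import Path
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads and unpacks the weights on first run
out = predictor.predict(image=Path("demo.jpg"), instruction="Please segment the dog.")
print(out.response)  # text answer; contains [SEG] when a mask was produced
print(out.img)       # path to the binary mask PNG, or None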
import os
import shutil
import subprocess
import tempfile
import time
from typing import Optional

import cv2
from cog import BasePredictor, BaseModel, Input, Path
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from mmengine.visualization import Visualizer

from third_parts import VideoReader

MODEL_CACHE = "checkpoints"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-4B/model.tar"
MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-8B/model.tar"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-26B/model.tar"


class Output(BaseModel):
    masked_video: Optional[Path]
    response: str


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


def read_video(video_path, video_interval):
    # First verify the video can be opened
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise ValueError(f"Failed to open video file: {video_path}")
    cap.release()

    # Read every `video_interval`-th frame using VideoReader
    vid_frames = VideoReader(video_path)[::video_interval]
    if len(vid_frames) == 0:
        raise ValueError(f"No frames could be read from video: {video_path}")

    temp_dir = tempfile.mkdtemp()
    image_paths = []
    processed_frames = []
    for frame_idx, frame_image in enumerate(vid_frames):
        if frame_image is None:
            continue
        # Convert BGR (OpenCV order) to RGB (PIL order)
        frame_image = Image.fromarray(frame_image[..., ::-1])
        processed_frames.append(frame_image)

        image_path = os.path.join(temp_dir, f"frame_{frame_idx:04d}.jpg")
        frame_image.save(image_path, format="JPEG")
        image_paths.append(image_path)

    if not processed_frames:
        raise ValueError("No valid frames were processed from the video")
    return processed_frames, image_paths


def visualize(pred_mask, image_path, work_dir):
    visualizer = Visualizer()
    img = cv2.imread(image_path)
    visualizer.set_image(img)
    visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
    visual_result = visualizer.get_image()
    output_path = os.path.join(work_dir, os.path.basename(image_path))
    cv2.imwrite(output_path, visual_result)
    return output_path


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        # Download weights if they don't exist
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)
        # Load model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_CACHE,
            torch_dtype="auto",
            device_map="cuda:0",
            trust_remote_code=True,
        ).eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_CACHE,
            trust_remote_code=True,
        )

    def predict(
        self,
        video: Path = Input(description="Input video for segmentation"),
        instruction: str = Input(description="Text instruction for the model"),
        frame_interval: int = Input(description="Frame interval for processing", default=6, ge=1, le=30),
    ) -> Output:
        """Run a single prediction on the model"""
        # Clean up the /tmp/output folder from past runs
        if os.path.exists("/tmp/output"):
            shutil.rmtree("/tmp/output")
        os.makedirs("/tmp/output")

        # Process video frames
        vid_frames, image_paths = read_video(str(video), frame_interval)

        # Get video properties for the output video
        cap = cv2.VideoCapture(str(video))
        if not cap.isOpened():
            raise ValueError("Failed to open video file")
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        if original_fps == 0:
            original_fps = 30.0  # Default to 30fps if unable to read
        # Sampling every Nth frame divides the playback rate by N,
        # e.g. a 30fps source with frame_interval=6 yields a 5fps output.
        new_fps = original_fps / frame_interval
        cap.release()

        # Prepare the input
        question = f"<image>{instruction}"
        result = self.model.predict_forward(
            video=vid_frames,
            text=question,
            tokenizer=self.tokenizer,
        )
        prediction = result['prediction']

        masked_video_path = None
        if '[SEG]' in prediction:
            _seg_idx = 0
            pred_masks = result['prediction_masks'][_seg_idx]
            seg_frames = []
            masked_only_frames = []
            temp_dir = tempfile.mkdtemp()
            # Process each frame
            for frame_idx in range(len(vid_frames)):
                pred_mask = pred_masks[frame_idx]
                # Create visualized frame with segmentation overlay
                seg_frame = visualize(pred_mask, image_paths[frame_idx], temp_dir)
                seg_frames.append(seg_frame)
                # Create binary mask frame
                binary_mask = pred_mask.astype('uint8') * 255
                binary_mask_path = os.path.join(temp_dir, f"binary_mask_{frame_idx}.png")
                cv2.imwrite(binary_mask_path, binary_mask)
                masked_only_frames.append(binary_mask_path)

            # Read first frame for dimensions
            frame = cv2.imread(seg_frames[0])
            height, width = frame.shape[:2]

            # Create output video files
            masked_video_path = "/tmp/output/masked_video.mp4"
            temp_masked_path = "/tmp/output/temp_masked.avi"

            # Write the grayscale mask frames with a basic codec first
            fourcc = cv2.VideoWriter_fourcc(*'MJPG')
            masked_video_writer = cv2.VideoWriter(temp_masked_path, fourcc, new_fps, (width, height), isColor=False)
            for mask_frame_path in masked_only_frames:
                mask_frame = cv2.imread(mask_frame_path, cv2.IMREAD_GRAYSCALE)
                masked_video_writer.write(mask_frame)
            masked_video_writer.release()

            # Convert to web-compatible MP4 using ffmpeg (-y overwrites any stale output)
            subprocess.run([
                'ffmpeg', '-y', '-i', temp_masked_path, '-c:v', 'libx264',
                '-preset', 'fast', '-pix_fmt', 'yuv420p', masked_video_path
            ], check=True)
            # Clean up temporary file
            os.remove(temp_masked_path)

        return Output(
            masked_video=Path(masked_video_path) if masked_video_path else None,
            response=str(prediction),
        )
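This video variant is a drop-in replacement for predict.py in cog.yaml. A sketch of an invocation via the Cog CLI, with clip.mp4 as a placeholder input:

cog predict -i video=@clip.mp4 -i instruction="Please segment the moving car." -i frame_interval=6

The returned masked_video holds only the binary masks (white where the model segmented), played back at original_fps / frame_interval; the frames go through an MJPG-in-AVI intermediate because that is a basic codec OpenCV can always write, with ffmpeg doing the final web-compatible H.264 encode.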
#torch==2.3.1
#torchvision==0.18.1
transformers==4.42.3
opencv-python-headless<4.10
peft<0.14.0
timm==1.0.9
einops==0.8.0
flash_attn
sentencepiece==0.2.0
mmengine<1
gradio==4.44.0