build:
  gpu: true
  cuda: "12.4"
  python_version: "3.10"
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
    - "ffmpeg"
  python_packages:
    - "torch==2.4.0"
    - "torchvision"
    - "transformers==4.42.3"
    - "opencv-python-headless<4.10"
    - "peft<0.14.0"
    - "timm==1.0.9"
    - "einops==0.8.0"
    - "sentencepiece==0.2.0"
    - "mmengine<1"
    - "accelerate"
    - "numpy<2"
  run:
    - FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
predict: "predict.py:Predictor"
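The build section above pins CUDA 12.4 and the Python dependencies, skips the from-source flash-attn CUDA build so a prebuilt wheel is used, and fetches pget for fast weight downloads; the predict entry points Cog at the Predictor class in predict.py. Assuming the Cog CLI is installed and the weights URL is reachable, a single prediction can then be run locally (demo.jpg is a placeholder input):

cog predict -i image=@demo.jpg -i instruction="Please segment the person."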
import argparse
import os

import cv2
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

try:
    from mmengine.visualization import Visualizer
except ImportError:
    Visualizer = None
    print("Warning: mmengine is not installed, visualization is disabled.")


def parse_args():
    parser = argparse.ArgumentParser(description='Video Reasoning Segmentation')
    parser.add_argument('image_folder', help='Path to a folder of video frames')
    parser.add_argument('--model_path', default="ByteDance/Sa2VA-8B")
    parser.add_argument('--work-dir', default=None, help='The dir to save results.')
    parser.add_argument('--text', type=str, default="<image>Please describe the video content.")
    parser.add_argument('--select', type=int, default=-1,
                        help='1-based index of a single frame to run on; -1 runs on the whole video.')
    return parser.parse_args()


def visualize(pred_mask, image_path, work_dir):
    visualizer = Visualizer()
    img = cv2.imread(image_path)
    visualizer.set_image(img)
    visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
    visual_result = visualizer.get_image()
    output_path = os.path.join(work_dir, os.path.basename(image_path))
    cv2.imwrite(output_path, visual_result)


if __name__ == "__main__":
    cfg = parse_args()
    model_path = cfg.model_path
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
    )

    # Collect the frame images from the input folder in sorted order.
    image_paths = []
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}
    for filename in sorted(os.listdir(cfg.image_folder)):
        if os.path.splitext(filename)[1].lower() in image_extensions:
            image_paths.append(os.path.join(cfg.image_folder, filename))

    vid_frames = [Image.open(p).convert('RGB') for p in image_paths]

    if cfg.select > 0:
        # Single-frame mode: run the model on one selected frame.
        img_frame = vid_frames[cfg.select - 1]
        print(f"Selected frame {cfg.select}")
        print(f"The input is:\n{cfg.text}")
        result = model.predict_forward(
            image=img_frame,
            text=cfg.text,
            tokenizer=tokenizer,
        )
    else:
        # Video mode: run the model on all frames.
        print(f"The input is:\n{cfg.text}")
        result = model.predict_forward(
            video=vid_frames,
            text=cfg.text,
            tokenizer=tokenizer,
        )

    prediction = result['prediction']
    print(f"The output is:\n{prediction}")

    if '[SEG]' in prediction and Visualizer is not None:
        _seg_idx = 0
        pred_masks = result['prediction_masks'][_seg_idx]
        work_dir = cfg.work_dir or './temp_visualize_results'
        os.makedirs(work_dir, exist_ok=True)
        # In single-frame mode only one mask is returned, so iterate over the
        # masks rather than over all input frames.
        for mask_idx in range(len(pred_masks)):
            frame_idx = (cfg.select - 1) if cfg.select > 0 else mask_idx
            visualize(pred_masks[mask_idx], image_paths[frame_idx], work_dir)
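Assuming the demo script above is saved as demo.py (the filename is a guess; any name works), a typical run over a folder of extracted frames looks like:

python demo.py ./frames --model_path ByteDance/Sa2VA-8B --text "<image>Please segment the person in red." --work-dir ./results

The <image> token at the start of the prompt is where these scripts splice in the visual input; overlay images are written to --work-dir (or ./temp_visualize_results by default) only when the answer contains the [SEG] token and mmengine is installed.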
import os
import subprocess
import time
from typing import Optional

import cv2
import numpy as np
from cog import BasePredictor, BaseModel, Input, Path
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_CACHE = "checkpoints"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-4B/model.tar"
MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-8B/model.tar"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-26B/model.tar"


class Output(BaseModel):
    img: Optional[Path]
    response: str


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        # Download weights if they don't exist
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)
        # Load model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_CACHE,
            torch_dtype="auto",
            device_map="cuda:0",
            trust_remote_code=True,
        ).eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_CACHE,
            trust_remote_code=True,
        )

    def predict(
        self,
        image: Path = Input(description="Input image for segmentation"),
        instruction: str = Input(description="Text instruction for the model"),
    ) -> Output:
        """Run a single prediction on the model"""
        # Prepare the image
        image = Image.open(str(image)).convert('RGB')
        # Prepare the input; <image> marks where the visual input enters the prompt
        text_prompts = f"<image>{instruction}"
        input_dict = {
            'image': image,
            'text': text_prompts,
            'past_text': '',
            'mask_prompts': None,
            'tokenizer': self.tokenizer,
        }
        # Get model prediction
        return_dict = self.model.predict_forward(**input_dict)
        answer = return_dict["prediction"]

        # Handle segmentation if present
        output_path = None
        if '[SEG]' in answer:
            pred_masks = return_dict["prediction_masks"][0]
            # Ensure mask is in the correct format
            if isinstance(pred_masks, np.ndarray):
                binary_mask = (pred_masks > 0.5).astype('uint8') * 255
            else:
                binary_mask = (pred_masks.cpu().numpy() > 0.5).astype('uint8') * 255
            # Ensure mask has valid dimensions
            if binary_mask.ndim == 2:
                height, width = binary_mask.shape
            elif binary_mask.ndim == 3:
                # If we have a 3D array, take the first channel
                binary_mask = binary_mask[0] if binary_mask.shape[0] == 1 else binary_mask[:, :, 0]
                height, width = binary_mask.shape
            else:
                return Output(img=None, response=str(answer))
            # Check if dimensions are valid and mask is not empty
            if width > 0 and height > 0 and np.any(binary_mask):
                # Create output directory if it doesn't exist
                os.makedirs("/tmp", exist_ok=True)
                # Save the binary mask
                output_path = "/tmp/output.png"
                if cv2.imwrite(output_path, binary_mask):
                    return Output(img=Path(output_path), response=str(answer))
        return Output(img=None, response=str(answer))
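A minimal local smoke test for the predictor above, assuming the weights already sit in checkpoints/ and the module is saved as predict.py (both file names are assumptions; demo.jpg is a placeholder):

from cog import Path
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads and unpacks the weights on first run
out = predictor.predict(image=Path("demo.jpg"), instruction="Please segment the dog.")
print(out.response)  # text answer; contains [SEG] when a mask was produced
print(out.img)       # path to the binary mask PNG, or None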
import os
import shutil
import subprocess
import tempfile
import time
from typing import Optional

import cv2
from cog import BasePredictor, BaseModel, Input, Path
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from mmengine.visualization import Visualizer

from third_parts import VideoReader

MODEL_CACHE = "checkpoints"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-4B/model.tar"
MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-8B/model.tar"
# MODEL_URL = "https://weights.replicate.delivery/default/ByteDance/Sa2VA-26B/model.tar"


class Output(BaseModel):
    masked_video: Optional[Path]
    response: str


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


def read_video(video_path, video_interval):
    # First verify the video can be opened
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise ValueError(f"Failed to open video file: {video_path}")
    cap.release()

    # Read every `video_interval`-th frame using VideoReader
    vid_frames = VideoReader(video_path)[::video_interval]
    if len(vid_frames) == 0:
        raise ValueError(f"No frames could be read from video: {video_path}")

    temp_dir = tempfile.mkdtemp()
    image_paths = []
    processed_frames = []
    for frame_idx, frame_image in enumerate(vid_frames):
        if frame_image is None:
            continue
        # Convert BGR (OpenCV order) to RGB (PIL order)
        frame_image = Image.fromarray(frame_image[..., ::-1])
        processed_frames.append(frame_image)

        image_path = os.path.join(temp_dir, f"frame_{frame_idx:04d}.jpg")
        frame_image.save(image_path, format="JPEG")
        image_paths.append(image_path)

    if not processed_frames:
        raise ValueError("No valid frames were processed from the video")
    return processed_frames, image_paths


def visualize(pred_mask, image_path, work_dir):
    visualizer = Visualizer()
    img = cv2.imread(image_path)
    visualizer.set_image(img)
    visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
    visual_result = visualizer.get_image()
    output_path = os.path.join(work_dir, os.path.basename(image_path))
    cv2.imwrite(output_path, visual_result)
    return output_path


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        os.environ["TRANSFORMERS_OFFLINE"] = "1"
        # Download weights if they don't exist
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)
        # Load model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_CACHE,
            torch_dtype="auto",
            device_map="cuda:0",
            trust_remote_code=True,
        ).eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_CACHE,
            trust_remote_code=True,
        )

    def predict(
        self,
        video: Path = Input(description="Input video for segmentation"),
        instruction: str = Input(description="Text instruction for the model"),
        frame_interval: int = Input(description="Frame interval for processing", default=6, ge=1, le=30),
    ) -> Output:
        """Run a single prediction on the model"""
        # Clean up the /tmp/output folder from past runs
        if os.path.exists("/tmp/output"):
            shutil.rmtree("/tmp/output")
        os.makedirs("/tmp/output")

        # Process video frames
        vid_frames, image_paths = read_video(str(video), frame_interval)

        # Get video properties for the output video
        cap = cv2.VideoCapture(str(video))
        if not cap.isOpened():
            raise ValueError("Failed to open video file")
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        if original_fps == 0:
            original_fps = 30.0  # Default to 30fps if unable to read
        # Sampling every Nth frame divides the playback rate by N,
        # e.g. a 30fps source with frame_interval=6 yields a 5fps output.
        new_fps = original_fps / frame_interval
        cap.release()

        # Prepare the input
        question = f"<image>{instruction}"
        result = self.model.predict_forward(
            video=vid_frames,
            text=question,
            tokenizer=self.tokenizer,
        )
        prediction = result['prediction']

        masked_video_path = None
        if '[SEG]' in prediction:
            _seg_idx = 0
            pred_masks = result['prediction_masks'][_seg_idx]
            seg_frames = []
            masked_only_frames = []
            temp_dir = tempfile.mkdtemp()
            # Process each frame
            for frame_idx in range(len(vid_frames)):
                pred_mask = pred_masks[frame_idx]
                # Create visualized frame with segmentation overlay
                seg_frame = visualize(pred_mask, image_paths[frame_idx], temp_dir)
                seg_frames.append(seg_frame)
                # Create binary mask frame
                binary_mask = pred_mask.astype('uint8') * 255
                binary_mask_path = os.path.join(temp_dir, f"binary_mask_{frame_idx}.png")
                cv2.imwrite(binary_mask_path, binary_mask)
                masked_only_frames.append(binary_mask_path)

            # Read first frame for dimensions
            frame = cv2.imread(seg_frames[0])
            height, width = frame.shape[:2]

            # Create output video files
            masked_video_path = "/tmp/output/masked_video.mp4"
            temp_masked_path = "/tmp/output/temp_masked.avi"

            # Write the grayscale mask frames with a basic codec first
            fourcc = cv2.VideoWriter_fourcc(*'MJPG')
            masked_video_writer = cv2.VideoWriter(temp_masked_path, fourcc, new_fps, (width, height), isColor=False)
            for mask_frame_path in masked_only_frames:
                mask_frame = cv2.imread(mask_frame_path, cv2.IMREAD_GRAYSCALE)
                masked_video_writer.write(mask_frame)
            masked_video_writer.release()

            # Convert to web-compatible MP4 using ffmpeg (-y overwrites any stale output)
            subprocess.run([
                'ffmpeg', '-y', '-i', temp_masked_path, '-c:v', 'libx264',
                '-preset', 'fast', '-pix_fmt', 'yuv420p', masked_video_path
            ], check=True)
            # Clean up temporary file
            os.remove(temp_masked_path)

        return Output(
            masked_video=Path(masked_video_path) if masked_video_path else None,
            response=str(prediction),
        )
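This video variant is a drop-in replacement for predict.py in cog.yaml. A sketch of an invocation via the Cog CLI, with clip.mp4 as a placeholder input:

cog predict -i video=@clip.mp4 -i instruction="Please segment the moving car." -i frame_interval=6

The returned masked_video holds only the binary masks (white where the model segmented), played back at original_fps / frame_interval; the frames go through an MJPG-in-AVI intermediate because that is a basic codec OpenCV can always write, with ffmpeg doing the final web-compatible H.264 encode.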
#torch==2.3.1
#torchvision==0.18.1
transformers==4.42.3
opencv-python-headless<4.10
peft<0.14.0
timm==1.0.9
einops==0.8.0
flash_attn
sentencepiece==0.2.0
mmengine<1
gradio==4.44.0