"git@developer.sourcefind.cn:wuxk1/megatron-lm.git" did not exist on "816fb89025e8ed67035b04c66c7f74da19c9bb74"
Commit c04f261a authored by dongchy920

InstructBLIP

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from PIL import Image\n",
"\n",
"from lavis.models import load_model_and_preprocess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load an example image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_image = Image.open(\"../docs/_static/merlion.png\").convert(\"RGB\")\n",
"caption = \"a large fountain spewing water into the air\"\n",
"\n",
"display(raw_image.resize((596, 437)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# setup device to use\n",
"device = torch.device(\"cuda\") if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model, vis_processors, txt_processors = load_model_and_preprocess(name=\"blip2_feature_extractor\", model_type=\"pretrain\", is_eval=True, device=device)\n",
"image = vis_processors[\"eval\"](raw_image).unsqueeze(0).to(device)\n",
"text_input = txt_processors[\"eval\"](caption)\n",
"sample = {\"image\": image, \"text_input\": [text_input]}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Multimodal features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features_multimodal = model.extract_features(sample)\n",
"print(features_multimodal.multimodal_embeds.shape)\n",
"# torch.Size([1, 32, 768]), 32 is the number of queries"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Unimodal features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features_image = model.extract_features(sample, mode=\"image\")\n",
"features_text = model.extract_features(sample, mode=\"text\")\n",
"print(features_image.image_embeds.shape)\n",
"# torch.Size([1, 32, 768])\n",
"print(features_text.text_embeds.shape)\n",
"# torch.Size([1, 12, 768])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Normalized low-dimensional unimodal features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# low-dimensional projected features\n",
"print(features_image.image_embeds_proj.shape)\n",
"# torch.Size([1, 32, 256])\n",
"print(features_text.text_embeds_proj.shape)\n",
"# torch.Size([1, 12, 256])\n",
"similarity = (features_image.image_embeds_proj @ features_text.text_embeds_proj[:,0,:].t()).max()\n",
"print(similarity)\n",
"# tensor([[0.3642]])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
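
The last cell above reduces the projected features to a single similarity score. A minimal standalone sketch of that reduction, using dummy tensors with the shapes printed in the notebook (the real embeddings come from model.extract_features):

import torch

# Dummy stand-ins with the notebook's shapes: 32 learned queries for the
# image, 12 text tokens, both projected into a 256-d shared space.
image_embeds_proj = torch.randn(1, 32, 256)
text_embeds_proj = torch.randn(1, 12, 256)

text_cls = text_embeds_proj[:, 0, :]        # text [CLS] projection, shape [1, 256]
scores = image_embeds_proj @ text_cls.t()   # one score per query, shape [1, 32, 1]
similarity = scores.max()                   # keep the best-matching query
print(similarity)                           # 0-dim tensor, e.g. tensor(0.3642)
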
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from PIL import Image\n",
"\n",
"from lavis.models import load_model_and_preprocess\n",
"from lavis.processors import load_processor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load an example image and text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_image = Image.open(\"../docs/_static/merlion.png\").convert(\"RGB\")\n",
"display(raw_image.resize((596, 437)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# setup device to use\n",
"device = torch.device(\"cuda\") if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"caption = \"merlion in Singapore\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load model and preprocessors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model, vis_processors, text_processors = load_model_and_preprocess(\"blip2_image_text_matching\", \"pretrain\", device=device, is_eval=True)\n",
"# model, vis_processors, text_processors = load_model_and_preprocess(\"blip2_image_text_matching\", \"coco\", device=device, is_eval=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Preprocess image and text inputs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img = vis_processors[\"eval\"](raw_image).unsqueeze(0).to(device)\n",
"txt = text_processors[\"eval\"](caption)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compute image-text matching (ITM) score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"itm_output = model({\"image\": img, \"text_input\": txt}, match_head=\"itm\")\n",
"itm_scores = torch.nn.functional.softmax(itm_output, dim=1)\n",
"print(f'The image and text are matched with a probability of {itm_scores[:, 1].item():.3%}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"itc_score = model({\"image\": img, \"text_input\": txt}, match_head='itc')\n",
"print('The image feature and text feature has a cosine similarity of %.4f'%itc_score)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
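
For reference, a minimal sketch of the ITM post-processing in the cells above, with dummy logits standing in for the model output (the two-logit order [no-match, match] is assumed from the indexing itm_scores[:, 1]):

import torch

itm_output = torch.tensor([[-1.2, 2.3]])  # dummy logits for one image-text pair
itm_scores = torch.nn.functional.softmax(itm_output, dim=1)
print(f"The image and text are matched with a probability of {itm_scores[:, 1].item():.3%}")
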
import os
import json
import sys


def compute_accuracy(path):
    with open(path, 'r') as f:
        data = json.load(f)
    correct_answers = 0
    total_questions = len(data)
    for item in data:
        if item['pred_ans'] == item['gt_ans']:
            correct_answers += 1
    return correct_answers / total_questions


def find_latest_subdir(base_dir):
    subdirs = [os.path.join(base_dir, d) for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    latest_subdir = max(subdirs, key=os.path.getmtime)
    return latest_subdir


def save_accuracy_to_json(path, accuracy):
    with open(path, 'w') as f:
        json.dump({"test_accuracy": accuracy}, f, indent=4)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <int_value>")
        sys.exit(1)
    int_value = int(sys.argv[1])
    # TODO: adjust this base_path for each local environment
    base_path = f"/input/results/iconqa/iconqa_{int_value}"
    latest_dir = find_latest_subdir(base_path)
    json_path = os.path.join(latest_dir, "result/test_iconqa_result.json")
    if os.path.exists(json_path):
        accuracy = compute_accuracy(json_path)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        # Save accuracy to a new JSON file in the same directory
        accuracy_json_path = os.path.join(latest_dir, "result/test_accuracy.json")
        save_accuracy_to_json(accuracy_json_path, accuracy)
    else:
        print(f"JSON file not found at {json_path}")
import os
import json
import sys


def compute_accuracy(path):
    with open(path, 'r') as f:
        data = json.load(f)
    correct_answers = 0
    total_questions = len(data)
    for item in data:
        if item['pred_ans'] == item['gt_ans']:
            correct_answers += 1
    return correct_answers / total_questions


def find_latest_subdir(base_dir):
    subdirs = [os.path.join(base_dir, d) for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    latest_subdir = max(subdirs, key=os.path.getmtime)
    return latest_subdir


def save_accuracy_to_json(path, accuracy):
    with open(path, 'w') as f:
        json.dump({"test_accuracy": accuracy}, f, indent=4)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <int_value>")
        sys.exit(1)
    int_value = int(sys.argv[1])
    # TODO: adjust this base_path for each local environment
    base_path = f"/input/results/scienceqa/scienceqa_{int_value}"
    latest_dir = find_latest_subdir(base_path)
    json_path = os.path.join(latest_dir, "result/test_scienceqa_result.json")
    if os.path.exists(json_path):
        accuracy = compute_accuracy(json_path)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        # Save accuracy to a new JSON file in the same directory
        accuracy_json_path = os.path.join(latest_dir, "result/test_accuracy.json")
        save_accuracy_to_json(accuracy_json_path, accuracy)
    else:
        print(f"JSON file not found at {json_path}")
import json

with open('problems.json', 'r') as f:
    data = json.load(f)

train, test, val = [], [], []
for key, value in data.items():
    split = value["split"]
    ques_type = value["ques_type"]
    if ques_type == "choose_txt":
        # Copy the record under a new name so we don't shadow the dict being iterated.
        item = value
        item['id'] = key
        if split == "train":
            train.append(item)
        elif split == "test":
            test.append(item)
        elif split == "val":
            val.append(item)

with open('/input/iconqa/annotations/train.json', 'w') as train_file:
    json.dump(train, train_file, ensure_ascii=False, indent=4)
with open('/input/iconqa/annotations/test.json', 'w') as test_file:
    json.dump(test, test_file, ensure_ascii=False, indent=4)
with open('/input/iconqa/annotations/val.json', 'w') as val_file:
    json.dump(val, val_file, ensure_ascii=False, indent=4)
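
The split script assumes problems.json maps question ids to records carrying at least "split" and "ques_type"; a hypothetical minimal entry (field values are illustrative):

problems = {
    "12345": {
        "split": "train",
        "ques_type": "choose_txt",   # only text-choice questions are kept
        "question": "Which shape comes next?",
    }
}
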
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import sys
from omegaconf import OmegaConf
from lavis.common.registry import registry
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.tasks import *
root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
# The library root determines the default output directory, "lavis/output".
# Changing it shifts every derived absolute path and breaks resolution of the YAML config files.
registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)
registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
import cv2


class CannyDetector:
    def __call__(self, img, low_threshold, high_threshold):
        return cv2.Canny(img, low_threshold, high_threshold)
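
A usage sketch for CannyDetector; the two thresholds are cv2.Canny's standard hysteresis bounds, and the input here is a dummy image:

import numpy as np

detector = CannyDetector()
img = np.zeros((256, 256, 3), dtype=np.uint8)  # dummy 3-channel image
edges = detector(img, low_threshold=100, high_threshold=200)
print(edges.shape)  # (256, 256): single-channel edge map with values 0 or 255
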
#!/bin/bash
wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt
wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth
import numpy as np
import cv2
import os
import torch
from einops import rearrange
from annotator.util import annotator_ckpts_path
# HED (holistically-nested edge detection) network: five VGG-style stages,
# each with a 1x1 score head; the five side outputs are upsampled to the
# input resolution and fused by a 1x1 convolution followed by a sigmoid.
class Network(torch.nn.Module):
    def __init__(self, model_path):
        super().__init__()

        self.netVggOne = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggTwo = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggThr = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggFou = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggFiv = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netScoreOne = torch.nn.Conv2d(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreTwo = torch.nn.Conv2d(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreThr = torch.nn.Conv2d(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreFou = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreFiv = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)

        self.netCombine = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0),
            torch.nn.Sigmoid()
        )

        # Remap the original pytorch-hed checkpoint keys ("module*") onto this class's attribute names ("net*").
        self.load_state_dict({strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.load(model_path).items()})

    def forward(self, tenInput):
        # Undo the [0, 1] scaling and subtract the Caffe-style BGR channel means.
        tenInput = tenInput * 255.0
        tenInput = tenInput - torch.tensor(data=[104.00698793, 116.66876762, 122.67891434], dtype=tenInput.dtype, device=tenInput.device).view(1, 3, 1, 1)

        tenVggOne = self.netVggOne(tenInput)
        tenVggTwo = self.netVggTwo(tenVggOne)
        tenVggThr = self.netVggThr(tenVggTwo)
        tenVggFou = self.netVggFou(tenVggThr)
        tenVggFiv = self.netVggFiv(tenVggFou)

        tenScoreOne = self.netScoreOne(tenVggOne)
        tenScoreTwo = self.netScoreTwo(tenVggTwo)
        tenScoreThr = self.netScoreThr(tenVggThr)
        tenScoreFou = self.netScoreFou(tenVggFou)
        tenScoreFiv = self.netScoreFiv(tenVggFiv)

        # Upsample every side output back to the input resolution before fusing.
        tenScoreOne = torch.nn.functional.interpolate(input=tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreTwo = torch.nn.functional.interpolate(input=tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreThr = torch.nn.functional.interpolate(input=tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreFou = torch.nn.functional.interpolate(input=tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreFiv = torch.nn.functional.interpolate(input=tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)

        return self.netCombine(torch.cat([tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv], 1))


class HEDdetector:
    def __init__(self):
        remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth"
        modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pth")
        if not os.path.exists(modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
        self.netNetwork = Network(modelpath).cuda().eval()

    def __call__(self, input_image):
        assert input_image.ndim == 3
        # Flip the channel order so the BGR statistics in Network.forward apply.
        input_image = input_image[:, :, ::-1].copy()
        with torch.no_grad():
            image_hed = torch.from_numpy(input_image).float().cuda()
            image_hed = image_hed / 255.0
            image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
            edge = self.netNetwork(image_hed)[0]
            edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8)
            return edge[0]


def nms(x, t, s):
    # Smooth with a Gaussian of sigma s, keep pixels that are local maxima
    # along any of four orientations (horizontal, vertical, both diagonals),
    # then binarize at threshold t.
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)

    y = np.zeros_like(x)
    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)

    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z
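
A usage sketch chaining the detector with nms; the edge map here is a random stand-in for HEDdetector output, and the threshold t and Gaussian sigma s values are illustrative:

import numpy as np

edge = (np.random.rand(256, 256) * 255).astype(np.uint8)  # stand-in for HEDdetector output
thin = nms(edge, t=127, s=3.0)  # orientation-wise NMS, binarized to {0, 255}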