"git@developer.sourcefind.cn:wuxk1/megatron-lm.git" did not exist on "816fb89025e8ed67035b04c66c7f74da19c9bb74"
Commit c04f261a authored by dongchy920

InstructBLIP

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from PIL import Image\n",
"\n",
"from lavis.models import load_model_and_preprocess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load an example image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_image = Image.open(\"../docs/_static/merlion.png\").convert(\"RGB\")\n",
"caption = \"a large fountain spewing water into the air\"\n",
"\n",
"display(raw_image.resize((596, 437)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# setup device to use\n",
"device = torch.device(\"cuda\") if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model, vis_processors, txt_processors = load_model_and_preprocess(name=\"blip2_feature_extractor\", model_type=\"pretrain\", is_eval=True, device=device)\n",
"image = vis_processors[\"eval\"](raw_image).unsqueeze(0).to(device)\n",
"text_input = txt_processors[\"eval\"](caption)\n",
"sample = {\"image\": image, \"text_input\": [text_input]}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Multimodal features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features_multimodal = model.extract_features(sample)\n",
"print(features_multimodal.multimodal_embeds.shape)\n",
"# torch.Size([1, 32, 768]), 32 is the number of queries"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Unimodal features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"features_image = model.extract_features(sample, mode=\"image\")\n",
"features_text = model.extract_features(sample, mode=\"text\")\n",
"print(features_image.image_embeds.shape)\n",
"# torch.Size([1, 32, 768])\n",
"print(features_text.text_embeds.shape)\n",
"# torch.Size([1, 12, 768])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Normalized low-dimensional unimodal features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# low-dimensional projected features\n",
"print(features_image.image_embeds_proj.shape)\n",
"# torch.Size([1, 32, 256])\n",
"print(features_text.text_embeds_proj.shape)\n",
"# torch.Size([1, 12, 256])\n",
"similarity = (features_image.image_embeds_proj @ features_text.text_embeds_proj[:,0,:].t()).max()\n",
"print(similarity)\n",
"# tensor([[0.3642]])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
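
The last cell above reduces the projected features to a single similarity score. A minimal standalone sketch of that reduction, using dummy tensors with the shapes printed in the notebook (the real embeddings come from model.extract_features):

import torch

# Dummy stand-ins with the notebook's shapes: 32 learned queries for the
# image, 12 text tokens, both projected into a 256-d shared space.
image_embeds_proj = torch.randn(1, 32, 256)
text_embeds_proj = torch.randn(1, 12, 256)

text_cls = text_embeds_proj[:, 0, :]        # text [CLS] projection, shape [1, 256]
scores = image_embeds_proj @ text_cls.t()   # one score per query, shape [1, 32, 1]
similarity = scores.max()                   # keep the best-matching query
print(similarity)                           # 0-dim tensor, e.g. tensor(0.3642)
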
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from PIL import Image\n",
"\n",
"from lavis.models import load_model_and_preprocess\n",
"from lavis.processors import load_processor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load an example image and text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_image = Image.open(\"../docs/_static/merlion.png\").convert(\"RGB\")\n",
"display(raw_image.resize((596, 437)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# setup device to use\n",
"device = torch.device(\"cuda\") if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"caption = \"merlion in Singapore\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load model and preprocessors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model, vis_processors, text_processors = load_model_and_preprocess(\"blip2_image_text_matching\", \"pretrain\", device=device, is_eval=True)\n",
"# model, vis_processors, text_processors = load_model_and_preprocess(\"blip2_image_text_matching\", \"coco\", device=device, is_eval=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Preprocess image and text inputs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img = vis_processors[\"eval\"](raw_image).unsqueeze(0).to(device)\n",
"txt = text_processors[\"eval\"](caption)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Compute image-text matching (ITM) score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"itm_output = model({\"image\": img, \"text_input\": txt}, match_head=\"itm\")\n",
"itm_scores = torch.nn.functional.softmax(itm_output, dim=1)\n",
"print(f'The image and text are matched with a probability of {itm_scores[:, 1].item():.3%}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"itc_score = model({\"image\": img, \"text_input\": txt}, match_head='itc')\n",
"print('The image feature and text feature has a cosine similarity of %.4f'%itc_score)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
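
For reference, a minimal sketch of the ITM post-processing in the cells above, with dummy logits standing in for the model output (the two-logit order [no-match, match] is assumed from the indexing itm_scores[:, 1]):

import torch

itm_output = torch.tensor([[-1.2, 2.3]])  # dummy logits for one image-text pair
itm_scores = torch.nn.functional.softmax(itm_output, dim=1)
print(f"The image and text are matched with a probability of {itm_scores[:, 1].item():.3%}")
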
import os
import json
import sys


def compute_accuracy(path):
    with open(path, 'r') as f:
        data = json.load(f)
    correct_answers = 0
    total_questions = len(data)
    for item in data:
        if item['pred_ans'] == item['gt_ans']:
            correct_answers += 1
    return correct_answers / total_questions


def find_latest_subdir(base_dir):
    subdirs = [os.path.join(base_dir, d) for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    latest_subdir = max(subdirs, key=os.path.getmtime)
    return latest_subdir


def save_accuracy_to_json(path, accuracy):
    with open(path, 'w') as f:
        json.dump({"test_accuracy": accuracy}, f, indent=4)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <int_value>")
        sys.exit(1)
    int_value = int(sys.argv[1])
    # TODO: adjust this base_path for each local environment
    base_path = f"/input/results/iconqa/iconqa_{int_value}"
    latest_dir = find_latest_subdir(base_path)
    json_path = os.path.join(latest_dir, "result/test_iconqa_result.json")
    if os.path.exists(json_path):
        accuracy = compute_accuracy(json_path)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        # Save accuracy to a new JSON file in the same directory
        accuracy_json_path = os.path.join(latest_dir, "result/test_accuracy.json")
        save_accuracy_to_json(accuracy_json_path, accuracy)
    else:
        print(f"JSON file not found at {json_path}")
import os
import json
import sys


def compute_accuracy(path):
    with open(path, 'r') as f:
        data = json.load(f)
    correct_answers = 0
    total_questions = len(data)
    for item in data:
        if item['pred_ans'] == item['gt_ans']:
            correct_answers += 1
    return correct_answers / total_questions


def find_latest_subdir(base_dir):
    subdirs = [os.path.join(base_dir, d) for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    latest_subdir = max(subdirs, key=os.path.getmtime)
    return latest_subdir


def save_accuracy_to_json(path, accuracy):
    with open(path, 'w') as f:
        json.dump({"test_accuracy": accuracy}, f, indent=4)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <int_value>")
        sys.exit(1)
    int_value = int(sys.argv[1])
    # TODO: adjust this base_path for each local environment
    base_path = f"/input/results/scienceqa/scienceqa_{int_value}"
    latest_dir = find_latest_subdir(base_path)
    json_path = os.path.join(latest_dir, "result/test_scienceqa_result.json")
    if os.path.exists(json_path):
        accuracy = compute_accuracy(json_path)
        print(f"Accuracy: {accuracy * 100:.2f}%")
        # Save accuracy to a new JSON file in the same directory
        accuracy_json_path = os.path.join(latest_dir, "result/test_accuracy.json")
        save_accuracy_to_json(accuracy_json_path, accuracy)
    else:
        print(f"JSON file not found at {json_path}")
import json

with open('problems.json', 'r') as f:
    data = json.load(f)

train, test, val = [], [], []
for key, value in data.items():
    split = value["split"]
    ques_type = value["ques_type"]
    if ques_type == "choose_txt":
        # Copy the record under a new name so we don't shadow the dict being iterated.
        item = value
        item['id'] = key
        if split == "train":
            train.append(item)
        elif split == "test":
            test.append(item)
        elif split == "val":
            val.append(item)

with open('/input/iconqa/annotations/train.json', 'w') as train_file:
    json.dump(train, train_file, ensure_ascii=False, indent=4)
with open('/input/iconqa/annotations/test.json', 'w') as test_file:
    json.dump(test, test_file, ensure_ascii=False, indent=4)
with open('/input/iconqa/annotations/val.json', 'w') as val_file:
    json.dump(val, val_file, ensure_ascii=False, indent=4)
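
The split script assumes problems.json maps question ids to records carrying at least "split" and "ques_type"; a hypothetical minimal entry (field values are illustrative):

problems = {
    "12345": {
        "split": "train",
        "ques_type": "choose_txt",   # only text-choice questions are kept
        "question": "Which shape comes next?",
    }
}
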
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import os
import sys
from omegaconf import OmegaConf
from lavis.common.registry import registry
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.tasks import *
root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
# The library root determines the default output directory, "lavis/output".
# Changing it shifts every derived absolute path and breaks resolution of the YAML config files.
registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)
registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
import cv2


class CannyDetector:
    def __call__(self, img, low_threshold, high_threshold):
        return cv2.Canny(img, low_threshold, high_threshold)
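
A usage sketch for CannyDetector; the two thresholds are cv2.Canny's standard hysteresis bounds, and the input here is a dummy image:

import numpy as np

detector = CannyDetector()
img = np.zeros((256, 256, 3), dtype=np.uint8)  # dummy 3-channel image
edges = detector(img, low_threshold=100, high_threshold=200)
print(edges.shape)  # (256, 256): single-channel edge map with values 0 or 255
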
#!/bin/bash
wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt
wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth
import numpy as np
import cv2
import os
import torch
from einops import rearrange
from annotator.util import annotator_ckpts_path
# HED (holistically-nested edge detection) network: five VGG-style stages,
# each with a 1x1 score head; the five side outputs are upsampled to the
# input resolution and fused by a 1x1 convolution followed by a sigmoid.
class Network(torch.nn.Module):
    def __init__(self, model_path):
        super().__init__()

        self.netVggOne = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggTwo = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggThr = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggFou = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netVggFiv = torch.nn.Sequential(
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False),
            torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(inplace=False)
        )

        self.netScoreOne = torch.nn.Conv2d(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreTwo = torch.nn.Conv2d(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreThr = torch.nn.Conv2d(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreFou = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
        self.netScoreFiv = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)

        self.netCombine = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0),
            torch.nn.Sigmoid()
        )

        # Remap the original pytorch-hed checkpoint keys ("module*") onto this class's attribute names ("net*").
        self.load_state_dict({strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.load(model_path).items()})

    def forward(self, tenInput):
        # Undo the [0, 1] scaling and subtract the Caffe-style BGR channel means.
        tenInput = tenInput * 255.0
        tenInput = tenInput - torch.tensor(data=[104.00698793, 116.66876762, 122.67891434], dtype=tenInput.dtype, device=tenInput.device).view(1, 3, 1, 1)

        tenVggOne = self.netVggOne(tenInput)
        tenVggTwo = self.netVggTwo(tenVggOne)
        tenVggThr = self.netVggThr(tenVggTwo)
        tenVggFou = self.netVggFou(tenVggThr)
        tenVggFiv = self.netVggFiv(tenVggFou)

        tenScoreOne = self.netScoreOne(tenVggOne)
        tenScoreTwo = self.netScoreTwo(tenVggTwo)
        tenScoreThr = self.netScoreThr(tenVggThr)
        tenScoreFou = self.netScoreFou(tenVggFou)
        tenScoreFiv = self.netScoreFiv(tenVggFiv)

        # Upsample every side output back to the input resolution before fusing.
        tenScoreOne = torch.nn.functional.interpolate(input=tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreTwo = torch.nn.functional.interpolate(input=tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreThr = torch.nn.functional.interpolate(input=tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreFou = torch.nn.functional.interpolate(input=tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
        tenScoreFiv = torch.nn.functional.interpolate(input=tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)

        return self.netCombine(torch.cat([tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv], 1))


class HEDdetector:
    def __init__(self):
        remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth"
        modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pth")
        if not os.path.exists(modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
        self.netNetwork = Network(modelpath).cuda().eval()

    def __call__(self, input_image):
        assert input_image.ndim == 3
        # Flip the channel order so the BGR statistics in Network.forward apply.
        input_image = input_image[:, :, ::-1].copy()
        with torch.no_grad():
            image_hed = torch.from_numpy(input_image).float().cuda()
            image_hed = image_hed / 255.0
            image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
            edge = self.netNetwork(image_hed)[0]
            edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8)
            return edge[0]


def nms(x, t, s):
    # Smooth with a Gaussian of sigma s, keep pixels that are local maxima
    # along any of four orientations (horizontal, vertical, both diagonals),
    # then binarize at threshold t.
    x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)

    f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
    f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
    f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
    f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)

    y = np.zeros_like(x)
    for f in [f1, f2, f3, f4]:
        np.putmask(y, cv2.dilate(x, kernel=f) == x, x)

    z = np.zeros_like(y, dtype=np.uint8)
    z[y > t] = 255
    return z
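
A usage sketch chaining the detector with nms; the edge map here is a random stand-in for HEDdetector output, and the threshold t and Gaussian sigma s values are illustrative:

import numpy as np

edge = (np.random.rand(256, 256) * 255).astype(np.uint8)  # stand-in for HEDdetector output
thin = nms(edge, t=127, s=3.0)  # orientation-wise NMS, binarized to {0, 255}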