Commit 5c60119d authored by mashun1

uitars
[
{
"role": "user",
"content": "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use Chinese in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n## User Instruction\nCould you help me set the image to Palette-Based?"
},
{
"role": "assistant",
"content": "Thought: 我看到屏幕上弹出了一个颜色配置文件转换的对话框,要继续操作的话得先处理掉它。对话框右下角有个\"Convert\"按钮,我需要点击它来确认这个转换操作。\nAction: click(start_box='(424,452)')"
},
{
"role": "assistant",
"content": "Thought: 看到这张复古电视机的照片,我需要把它设置成基于调色板的方式。让我先点击顶部菜单栏的\"Tools\"选项,这样就能找到相关的颜色设置选项了。\nAction: click(start_box='(154,49)')"
},
{
"role": "assistant",
"content": "Thought: 刚才点开了Tools菜单,但发现这不是正确的路径。要设置图片为基于调色板的方式,我需要先关闭当前打开的Tools菜单,然后重新寻找正确的颜色设置选项。让我点击一下Tools按钮把它关掉。\nAction: click(start_box='(154,49)')"
},
{
"role": "user",
"content": [
{
"type": "image",
"url": "data\\resized_images\\resized_image_0.png"
}
]
},
{
"role": "assistant",
"content": "Thought: 刚才在Tools菜单里没找到想要的选项,让我换个思路。我注意到顶部菜单栏有个\"Colors\"选项,这应该就是处理颜色设置的地方。我需要点击它来继续寻找将图片设置为基于调色板方式的选项。\nAction: click(start_box='(134,49)')"
},
{
"role": "user",
"content": [
{
"type": "image",
"url": "data\\resized_images\\resized_image_1.png"
}
]
},
{
"role": "assistant",
"content": "Thought: 我在Colors菜单里仔细浏览了一遍,发现这里并没有直接设置调色板的选项。看来得换个思路了 - 先把这个菜单关掉,然后去Edit菜单看看,那里应该能找到更多关于图像处理的选项。毕竟要实现基于调色板的设置,可能需要通过其他路径来实现。\nAction: click(start_box='(134,49)')"
},
{
"role": "user",
"content": [
{
"type": "image",
"url": "data\\resized_images\\resized_image_2.png"
}
]
},
{
"role": "assistant",
"content": "Thought: 我在Colors菜单里没找到想要的选项,让我换个思路。我注意到顶部菜单栏有个\"Edit\"选项,这应该就是处理颜色设置的地方。我需要点击它来继续寻找将图片设置为基于调色板方式的选项。\nAction: click(start_box='(45,49)')"
},
{
"role": "user",
"content": [
{
"type": "image",
"url": "data\\resized_images\\resized_image_3.png"
}
]
},
{
"role": "assistant",
"content": "Thought: 我在Edit菜单里发现了Preferences选项,这正是我需要的。通过这个选项,我应该能找到调整图片颜色模式的设置。让我点击它进入设置界面,这样就能把图片转换成基于调色板的方式了。\nAction: click(start_box='(67,421)')"
},
{
"role": "user",
"content": [
{
"type": "image",
"url": "data\\resized_images\\resized_image_4.png"
}
]
}
]
[
{
"role": "system",
"content": "You are a helpful assistant.",
"loss_mask": 0
},
{
"role": "user",
"content": "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n\n## Output Format\n\nAction: ...\n\n\n## Action Space\nclick(start_box='<|box_start|>(x1,y1)<|box_end|>')\nselect(start_box='<|box_start|>(x1,y1)<|box_end|>') # To expand a dropdown menu or select an item from it.\ntype(content='', start_box='<|box_start|>(x1,y1)<|box_end|>') # First click on the textbox, and then type the content.\n\n## User Instruction\nBuy a pop rock album CD from the United Kingdom that was released in 2016, is between £15 and £20 and in perfect condition.\n\n ",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(495,30)<|box_end|>')",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(381,153)<|box_end|>')",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(66,263)<|box_end|>')",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(66,851)<|box_end|>')",
"loss_mask": 0
},
{
"role": "user",
"content": "<|vision_start|><|image_pad|>...<|image_pad|><|vision_end|>",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(66,638)<|box_end|>')",
"loss_mask": 0
},
{
"role": "user",
"content": "<|vision_start|><|image_pad|>...<|image_pad|><|vision_end|>",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(66,86)<|box_end|>')",
"loss_mask": 0
},
{
"role": "user",
"content": "<|vision_start|><|image_pad|>...<|image_pad|><|vision_end|>",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(66,5)<|box_end|>')",
"loss_mask": 0
},
{
"role": "user",
"content": "<|vision_start|><|image_pad|>...<|image_pad|><|vision_end|>",
"loss_mask": 0
},
{
"role": "assistant",
"content": "Action: click(start_box='<|box_start|>(293,218)<|box_end|>')",
"loss_mask": 1
}
]
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
import json
import re


def standardize_messages(messages):
    for message in messages:
        if isinstance(message["content"], str):
            # If the content is a plain string, convert it to the standard list format.
            message["content"] = [
                {
                    "type": "text",
                    "text": message["content"]
                }
            ]
    return messages


def add_box_token(input_string):
    # Step 1: Split the string into individual actions.
    if "Action: " in input_string and "start_box=" in input_string:
        suffix = input_string.split("Action: ")[0] + "Action: "
        actions = input_string.split("Action: ")[1:]
        processed_actions = []
        for action in actions:
            action = action.strip()
            # Step 2: Extract coordinates (start_box or end_box) using a regex.
            coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
            updated_action = action  # Start with the original action.
            for coord_type, x, y in coordinates:
                # Step 3: Wrap each coordinate pair with <|box_start|>/<|box_end|> tokens
                # (assumes no whitespace after the comma, matching how actions are serialized here).
                updated_action = updated_action.replace(f"{coord_type}='({x},{y})'", f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'")
            processed_actions.append(updated_action)
        # Step 4: Reconstruct the final string.
        final_string = suffix + "\n\n".join(processed_actions)
    else:
        final_string = input_string
    return final_string
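# A quick illustration of the transformation (coordinate values taken from the
# sample trace data in this commit):
#   add_box_token("Action: click(start_box='(424,452)')")
#   -> "Action: click(start_box='<|box_start|>(424,452)<|box_end|>')"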
messages = json.load(open("./data/test_messages_07.json"))
for message in messages:
    if message["role"] == "assistant":
        message["content"] = add_box_token(message["content"])
        print(message["content"])
messages = standardize_messages(messages)

if __name__ == "__main__":
    from argparse import ArgumentParser
    from utils.coordinate_extract import extract_coordinates
    from utils.plot_image import plot

    parser = ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True)
    args = parser.parse_args()
    model = AutoModelForVision2Seq.from_pretrained(
        args.model_path,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(args.model_path)
    # if hasattr(processor, "image_processor"):
    #     image_processor = processor.image_processor
    #     # Override the image size limits.
    #     image_processor.size = {
    #         "shortest_edge": 768,
    #         "longest_edge": 1024
    #     }
    #     processor.image_processor = image_processor
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=400)
    # Strip the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text)
    model_raw_response = output_text[0]
    # Parse the predicted (x, y) coordinate out of the raw response with a regex.
    coordinate = extract_coordinates(model_raw_response)
    model_output_width, model_output_height = coordinate
    plot(model_output_width, model_output_height)
import torch
import re
import json
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor


def standardize_messages(messages):
    for message in messages:
        if isinstance(message["content"], str):
            # If the content is a plain string, convert it to the standard list format.
            message["content"] = [
                {
                    "type": "text",
                    "text": message["content"]
                }
            ]
    return messages


def add_box_token(input_string):
    # Step 1: Split the string into individual actions.
    if "Action: " in input_string and "start_box=" in input_string:
        suffix = input_string.split("Action: ")[0] + "Action: "
        actions = input_string.split("Action: ")[1:]
        processed_actions = []
        for action in actions:
            action = action.strip()
            # Step 2: Extract coordinates (start_box or end_box) using a regex.
            coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
            updated_action = action  # Start with the original action.
            for coord_type, x, y in coordinates:
                # Step 3: Wrap each coordinate pair with <|box_start|>/<|box_end|> tokens.
                updated_action = updated_action.replace(f"{coord_type}='({x},{y})'", f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'")
            processed_actions.append(updated_action)
        # Step 4: Reconstruct the final string.
        final_string = suffix + "\n\n".join(processed_actions)
    else:
        final_string = input_string
    return final_string


messages = json.load(open("./data/test_messages_07.json"))
for message in messages:
    if message["role"] == "assistant":
        message["content"] = add_box_token(message["content"])
        print(message["content"])
messages = standardize_messages(messages)

if __name__ == "__main__":
    from argparse import ArgumentParser
    from utils.coordinate_extract import extract_coordinates
    from utils.plot_image import plot

    parser = ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True)
    args = parser.parse_args()
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        args.model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        attn_implementation="sdpa"
    )
    processor = AutoProcessor.from_pretrained(args.model_path)
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=400)
    # Strip the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text)
    model_raw_response = output_text[0]
    # Parse the predicted (x, y) coordinate out of the raw response with a regex.
    coordinate = extract_coordinates(model_raw_response)
    model_output_width, model_output_height = coordinate
    plot(model_output_width, model_output_height)
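# Example invocation (script and checkpoint paths are illustrative):
#   python infer_qwen2_5_vl.py --model_path /path/to/UI-TARS-checkpoint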
# Unique model identifier
modelCode=1520
# Model name
modelName=UI-TARS_pytorch
# Model description
modelDescription=End-to-end agent model
# Application scenarios
appScenario=reasoning,dialogue Q&A,e-commerce,education,transportation,energy
# Framework type
frameType=Pytorch
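A minimal sketch of loading this metadata in Python (the filename model.properties is an assumption; only `#` comments and key=value lines are handled):

metadata = {}
with open("model.properties", encoding="utf8") as f:  # hypothetical filename
    for line in f:
        line = line.strip()
        if line and not line.startswith("#"):
            key, _, value = line.partition("=")
            metadata[key] = value
print(metadata["modelName"])  # -> UI-TARS_pytorch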
COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
hotkey(key='')
type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
## User Instruction
{instruction}
"""
MOBILE_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
## Output Format
```
Thought: ...
Action: ...
```
## Action Space
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
long_press(start_box='<|box_start|>(x1,y1)<|box_end|>')
type(content='') #If you want to submit your input, use "\\n" at the end of `content`.
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left')
open_app(app_name=\'\')
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
press_home()
press_back()
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
## Note
- Use {language} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
## User Instruction
{instruction}
"""
import json
import re


def add_box_token(input_string):
    # Step 1: Split the string into individual actions.
    if "Action: " in input_string and "start_box=" in input_string:
        suffix = input_string.split("Action: ")[0] + "Action: "
        actions = input_string.split("Action: ")[1:]
        processed_actions = []
        for action in actions:
            action = action.strip()
            # Step 2: Extract coordinates (start_box or end_box) using a regex.
            coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action)
            updated_action = action  # Start with the original action.
            for coord_type, x, y in coordinates:
                # Step 3: Wrap each coordinate pair with <|box_start|>/<|box_end|> tokens.
                updated_action = updated_action.replace(f"{coord_type}='({x},{y})'", f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'")
            processed_actions.append(updated_action)
        # Step 4: Reconstruct the final string.
        final_string = suffix + "\n\n".join(processed_actions)
    else:
        final_string = input_string
    return final_string


messages = json.load(open("./data/test_messages.json"))
for message in messages:
    if message["role"] == "assistant":
        message["content"] = add_box_token(message["content"])
        print(message["content"])
import re


def extract_coordinates(input_string):
    # Pull the coordinates out with a regular expression.
    coordinates = re.findall(r"start_box='?\((\d+),(\d+)\)'?", input_string)
    # If anything matched, return the first pair as integers.
    if coordinates:
        return tuple(map(int, coordinates[0]))
    return None  # No coordinates found.


# Example call:
# input_string = 'Thought: I am now in the Preferences window, but this is the system resources page. To set the image to palette-based I need to close this window and find the correct color setting. Let me click the "Cancel" button first, then keep looking.\nAction: click(start_box=\'(395,810)\')'
# coords = extract_coordinates(input_string)
# print(coords)  # Output: (395, 810)
# 1. Read the base64-encoded image
# 2. Save the image
# 3. Scale it proportionally
# 4. Save the scaled image
# 5. Update the data file format accordingly
# 6. Save the data file
import base64
from PIL import Image
from io import BytesIO
import json
import os
import numpy as np
import cv2
import re


def decode_image(data_uri):
    # Split off the "data:image/png;base64," header.
    header, encoded = data_uri.split(",", 1)
    img_bytes = base64.b64decode(encoded)
    img = Image.open(BytesIO(img_bytes))
    return img


def scale_action_coordinates(content_str: str, scale: float) -> str:
    pattern = r"start_box='\((\d+),\s*(\d+)\)'"
    match = re.search(pattern, content_str)
    if match:
        x, y = map(int, match.groups())
        new_x = int(x * scale)
        new_y = int(y * scale)
        new_box = f"start_box='({new_x},{new_y})'"
        # Replace the coordinates in the original string.
        content_str = re.sub(pattern, new_box, content_str)
    return content_str
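# For example (illustrative values):
#   scale_action_coordinates("Action: click(start_box='(100,200)')", 0.7)
#   -> "Action: click(start_box='(70,140)')"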
def resize_image(img, scale: float = 0.7):
    img_np = np.array(img)
    img_cv2 = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
    new_w = int(img_cv2.shape[1] * scale)
    new_h = int(img_cv2.shape[0] * scale)
    resized_img = cv2.resize(img_cv2, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
    result_img = Image.fromarray(cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB))
    return result_img


def generate(data_path: str,
             save_root: str,
             messages_file_name: str,
             scale: float = 0.7):
    with open(data_path, "r", encoding="utf8") as f:
        messages = json.load(f)
    idx = 0
    ori_img_save_dir = os.path.join(save_root, "images")
    resized_img_save_dir = os.path.join(save_root, "resized_images")
    os.makedirs(ori_img_save_dir, exist_ok=True)
    os.makedirs(resized_img_save_dir, exist_ok=True)
    for block in messages:
        role = block.get("role", "")
        content = block.get("content", None)
        if role.lower() == "user":
            if isinstance(content, list):
                for sub_block in content:
                    image_url = sub_block.get("image_url", None)
                    if image_url:
                        url = image_url.get("url", "")
                        if url:
                            # Decode, save, and resize the screenshot, then point the
                            # message at the resized file instead of the base64 payload.
                            original_img = decode_image(url)
                            resized_img = resize_image(original_img, scale=scale)
                            original_img.save(os.path.join(ori_img_save_dir, f"image_{idx}.png"))
                            resized_img.save(os.path.join(resized_img_save_dir, f"resized_image_{idx}.png"))
                            del sub_block['image_url']
                            sub_block['type'] = "image"
                            sub_block['url'] = os.path.join(resized_img_save_dir, f"resized_image_{idx}.png")
                            idx += 1
        elif role.lower() == "assistant":
            # Rescale the action coordinates to match the resized screenshots.
            block['content'] = scale_action_coordinates(content, scale)
    messages_save_path = os.path.join(save_root, messages_file_name)
    with open(messages_save_path, "w", encoding='utf8') as f:
        json.dump(messages, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    generate("data/test_messages.json", "data", "test_messages_07.json")
from PIL import Image
import matplotlib.pyplot as plt
import json
import base64
from io import BytesIO
import math

IMAGE_FACTOR = 28
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200

VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
FRAME_FACTOR = 2
FPS = 2.0
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768


def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return round(number / factor) * factor


def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return math.ceil(number / factor) * factor


def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return math.floor(number / factor) * factor


def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """
    Rescales the image so that the following conditions are met:
    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.
    """
    if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
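# Sanity check (hypothetical numbers): a 1080x1920 screenshot already falls inside
# [MIN_PIXELS, MAX_PIXELS], so each side is only snapped to the nearest multiple of 28:
#   smart_resize(1080, 1920) -> (1092, 1932)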
def plot(model_output_width,
         model_output_height,
         image_path: str = './data/resized_images/resized_image_4.png',
         save_path: str = "test.png"):
    # Open the screenshot the model was shown.
    img = Image.open(image_path)
    width, height = img.size
    print(f'Original size: {width}, {height}')
    # The processor feeds the model a smart_resize'd image, so map the model's
    # output coordinate back onto the original pixel grid.
    new_height, new_width = smart_resize(height, width)
    new_coordinate = (int(model_output_width / new_width * width), int(model_output_height / new_height * height))
    print(f'Resized dimensions: {new_width}, {new_height}')
    print(new_coordinate)
    # Display the image and mark the predicted point with a red dot.
    plt.imshow(img)
    plt.scatter([new_coordinate[0]], [new_coordinate[1]], c='red', s=50)
    plt.axis('off')  # Hide the axes.
    plt.savefig(save_path, dpi=350, bbox_inches='tight', pad_inches=0)