Unverified Commit 4419410c authored by MPU王荣胜's avatar MPU王荣胜 Committed by GitHub
Browse files

add data

parent c3a4340d
import xml.etree.ElementTree as ET
import os
import shutil
from tqdm import tqdm
# Copy every image referenced by the numbered report XML files into ./images/,
# renaming each one to <report number>_<position of its <url> in the report>.png.

# Create the destination folder up front: shutil.copy does not create missing
# directories, so the copy would otherwise fail on a fresh checkout.
os.makedirs('./images', exist_ok=True)

for i in tqdm(range(1, 4000)):
    file_path = 'ecgen-radiology/' + str(i) + '.xml'
    # Skip report numbers for which no XML file exists.
    if not os.path.isfile(file_path):
        continue

    # Parse the report and walk over every <url> element it contains.
    root = ET.parse(file_path).getroot()
    for num, url in enumerate(root.findall('.//url'), start=1):
        s = str(url.text)
        # Keep only the last path component, cut at its first dot.
        filename = s.split("/")[-1].split(".")[0]
        src_file = './NLMCXR_png/' + str(filename) + '.png'
        dst_file = './images/' + str(i) + '_' + str(num) + '.png'
        shutil.copy(src_file, dst_file)
\ No newline at end of file
import json
from tqdm import tqdm
# Export every caption in openi-en.json to a Markdown file, one caption per
# paragraph (each caption is followed by a blank line).
with open('./openi-en.json', encoding='utf-8') as f:
    data = json.load(f)

annotations = data['annotations']

# Build the document with a single join instead of growing a string
# caption by caption.
markdown_content = ''.join(
    str(annotation['caption']) + '\n\n' for annotation in tqdm(annotations)
)

# Write UTF-8 explicitly so the output does not depend on the platform's
# default encoding.
with open('openi-en-md.md', 'w', encoding='utf-8') as f1:
    f1.write(markdown_content)

print(len(annotations))
\ No newline at end of file
import json
from tqdm import tqdm
# Read data_ch.md and keep its non-blank lines, stripped of surrounding
# whitespace.
with open('data_ch.md', 'r', encoding='utf-8') as f:
    data_lines = [line.strip() for line in f if line.strip()]
print(len(data_lines))

with open('./filter_cap.json', encoding='utf-8') as f:
    data = json.load(f)
annotations = data['annotations']
print(len(annotations))

# Lines are matched to annotations purely by position, so a count mismatch
# means the captions cannot be paired one-to-one.
if len(data_lines) < len(annotations):
    raise ValueError(
        'data_ch.md has {} non-blank lines but filter_cap.json has {} '
        'annotations'.format(len(data_lines), len(annotations))
    )
if len(data_lines) > len(annotations):
    print(
        'warning: data_ch.md has {} non-blank lines but filter_cap.json has '
        'only {} annotations; extra lines are ignored'.format(
            len(data_lines), len(annotations)
        )
    )

# Replace each caption with the line at the same position.
for annotation, line in zip(tqdm(annotations), data_lines):
    annotation['caption'] = line

# json.dump keeps its default ASCII escaping, so the file content is the same
# as before; the encoding is only made explicit.
with open('data_ch.json', 'w', encoding='utf-8') as f1:
    json.dump(data, f1)

# Read the file back and print the annotation count as a sanity check.
with open('./data_ch.json', encoding='utf-8') as f2:
    data2 = json.load(f2)
print(len(data2['annotations']))
\ No newline at end of file
This diff is collapsed.
import json
import openai
import time
from tqdm import tqdm
# Set the OpenAI API account information.
# NOTE(review): "xxx" looks like a placeholder — presumably it has to be
# replaced with a real key before running; confirm, and avoid committing one.
openai.api_key = "xxx"
# Translation helper.
def translate_text(text):
    """Translate English text to Chinese with the ``gpt-3.5-turbo`` chat model.

    Args:
        text: The English text to translate. It is converted with ``str()``
            before being appended to the prompt.

    Returns:
        The ``content`` of the first returned choice's message, as a ``str``.
    """
    # Alternative instruction left by the original author (translated from
    # Chinese): "Please translate into Chinese; you may polish the wording
    # as long as the sentence reads smoothly and the meaning is unchanged:"
    #
    # The ": " separator keeps the instruction from running straight into the
    # first word of the text being translated.
    prompt = "Translate the following English text to Chinese: " + str(text)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "你是一个非常优秀的中英文翻译器。"},
            {"role": "user", "content": prompt},
        ],
    )
    return str(completion.choices[0].message['content'])
# Translate every caption in openi-en.json and save the result as
# openi-zh.json.
with open('./openi-en.json', encoding='utf-8') as f:
    data = json.load(f)

for annotation in tqdm(data['annotations']):
    # Overwrite the caption in place with its translation.
    annotation['caption'] = str(translate_text(annotation['caption']))

# Write the updated data to the JSON file. json.dump keeps its default ASCII
# escaping; the encoding is only made explicit.
with open('openi-zh.json', 'w', encoding='utf-8') as f1:
    json.dump(data, f1)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment