add data

4419410c · MPU王荣胜 · GitHub · c3a4340d · 4419410c · 4419410c
Unverified Commit 4419410c authored May 25, 2023 by MPU王荣胜 Committed by GitHub May 25, 2023
11 changed files
--- a/data/demo/rub.png
+++ b/data/demo/rub.png
--- a/data/demo/tianye.png
+++ b/data/demo/tianye.png
--- a/data/demo/titan.png
+++ b/data/demo/titan.png
--- a/data/demo/tower.png
+++ b/data/demo/tower.png
--- a/data/demo/traf.png
+++ b/data/demo/traf.png
--- a/data/demo/woman.png
+++ b/data/demo/woman.png
--- a/data/from_xml_get_images_id.py
+++ b/data/from_xml_get_images_id.py
+import xml.etree.ElementTree as ET
+import os
+import shutil
+from tqdm import tqdm
+
+# Make sure the destination folder exists; shutil.copy does not create it.
+os.makedirs('./images', exist_ok=True)
+
+for i in tqdm(range(1, 4000)):
+    file_path = 'ecgen-radiology/' + str(i) + '.xml'
+    # Skip ids that have no XML file.
+    if not os.path.isfile(file_path):
+        continue
+    # Parse the XML file.
+    root = ET.parse(file_path).getroot()
+    # Walk every <url> tag in the file; copies are numbered from 1 per XML file.
+    for num, url in enumerate(root.findall('.//url'), start=1):
+        # Image name: last path component of the URL text, up to its first dot.
+        filename = str(url.text).split("/")[-1].split(".")[0]
+        src_file = './NLMCXR_png/' + filename + '.png'
+        dst_file = './images/' + str(i) + '_' + str(num) + '.png'
+        shutil.copy(src_file, dst_file)
\ No newline at end of file
--- a/data/json2md.py
+++ b/data/json2md.py
+import json
+from tqdm import tqdm
+
+
+# Decode explicitly as UTF-8 instead of relying on the platform default.
+with open('./openi-en.json', encoding='utf-8') as f:
+    data = json.load(f)
+
+annotations = data['annotations']
+# Each caption is followed by a blank line; join once instead of growing a
+# string inside the loop.
+markdown_content = ''.join(
+    str(entry['caption']) + '\n\n' for entry in tqdm(annotations)
+)
+
+with open('openi-en-md.md', 'w', encoding='utf-8') as f1:
+    f1.write(markdown_content)
+
+print(len(annotations))
\ No newline at end of file
--- a/data/merge_ch2json.py
+++ b/data/merge_ch2json.py
+import json
+from tqdm import tqdm
+
+# Read data_ch.md and collect its non-empty lines; each one is used as a caption.
+with open('data_ch.md', 'r', encoding='utf-8') as f:
+    data_lines = [line.strip() for line in f if line.strip()]
+
+print(len(data_lines))
+
+# Decode explicitly as UTF-8 instead of relying on the platform default.
+with open('./filter_cap.json', encoding='utf-8') as f:
+    data = json.load(f)
+
+annotations = data['annotations']
+print(len(annotations))
+
+# Captions are matched to annotations purely by position, so a count mismatch
+# would pair captions with the wrong entries; fail before writing anything.
+if len(data_lines) != len(annotations):
+    raise ValueError(
+        'data_ch.md has %d non-empty lines but filter_cap.json has %d annotations'
+        % (len(data_lines), len(annotations))
+    )
+
+for annotation, caption in zip(tqdm(annotations), data_lines):
+    annotation['caption'] = caption
+
+# json.dump escapes non-ASCII characters by default, so the output is ASCII.
+with open('data_ch.json', 'w') as f1:
+    json.dump(data, f1)
+
+# Reload the written file and print its annotation count as a sanity check.
+with open('./data_ch.json') as f2:
+    data2 = json.load(f2)
+
+print(len(data2['annotations']))
\ No newline at end of file
--- a/data/openi-en.json
+++ b/data/openi-en.json
--- a/data/translation_en2zh.py
+++ b/data/translation_en2zh.py
+import json 
+import openai
+import time
+from tqdm import tqdm
+
+# Set the OpenAI API key. Prefer the OPENAI_API_KEY environment variable so a
+# real key never has to be written into this file; fall back to the placeholder.
+import os
+openai.api_key = os.environ.get("OPENAI_API_KEY", "xxx")
+
+# Translation helper.
+def translate_text(text):
+    """Translate English *text* to Chinese with the OpenAI chat API.
+
+    Sends one ``gpt-3.5-turbo`` chat completion request and returns the
+    content of the first choice as a string. Nothing is caught here, so any
+    error raised by the ``openai`` client propagates to the caller.
+    """
+    # Keep a separator between the instruction and the text so the two do not
+    # run together in the prompt.
+    prompt = "Translate the following English text to Chinese: " + str(text)
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            # System message (Chinese): "You are an excellent Chinese-English translator."
+            {"role": "system", "content": "你是一个非常优秀的中英文翻译器。"},
+            {"role": "user", "content": prompt},
+        ],
+    )
+    return str(completion.choices[0].message['content'])
+
+# Decode explicitly as UTF-8 instead of relying on the platform default.
+with open('./openi-en.json', encoding='utf-8') as f:
+    data = json.load(f)
+
+# Replace each caption in place with its translation.
+for entry in tqdm(data['annotations']):
+    entry['caption'] = str(translate_text(entry['caption']))
+
+# Write the JSON file; json.dump escapes non-ASCII characters by default.
+with open('openi-zh.json', 'w') as f1:
+    json.dump(data, f1)
\ No newline at end of file