Commit 6f3e0c0c authored by Dingquan Yu
Browse files

Now use an asynchronous version in parse_msa_data

parent aec12764
...@@ -21,7 +21,7 @@ import dataclasses ...@@ -21,7 +21,7 @@ import dataclasses
from multiprocessing import cpu_count from multiprocessing import cpu_count
import tempfile import tempfile
from typing import Mapping, Optional, Sequence, Any, MutableMapping, Union from typing import Mapping, Optional, Sequence, Any, MutableMapping, Union
import asyncio
import numpy as np import numpy as np
import torch import torch
...@@ -737,30 +737,37 @@ class DataPipeline: ...@@ -737,30 +737,37 @@ class DataPipeline:
fp.close() fp.close()
else: else:
for f in os.listdir(alignment_dir): # Now will split the following steps into multiple processes
path = os.path.join(alignment_dir, f) async def parse_stockholm_file(alignment_dir: str, stockholm_file: str):
filename, ext = os.path.splitext(f) path = os.path.join(alignment_dir, stockholm_file)
file_name,_ = os.path.splitext(stockholm_file)
if(ext == ".a3m"): with open(path, "r") as infile:
import time msa = parsers.parse_stockholm(infile.read())
start = time.time() infile.close()
with open(path, "r") as fp: return {file_name: msa}
msa = parsers.parse_a3m(fp.read())
end = time.time() async def parse_a3m_file(alignment_dir: str, a3m_file: str):
calculate_elapse(start, end, "parser.parse_a3m") path = os.path.join(alignment_dir, a3m_file)
elif(ext == ".sto" and not "hmm_output" == filename): file_name,_ = os.path.splitext(a3m_file)
import time with open(path, "r") as infile:
start = time.time() msa = parsers.parse_a3m(infile.read())
with open(path, "r") as fp: infile.close()
msa = parsers.parse_stockholm( return {file_name: msa}
fp.read()
) async def run_parse_all_msa_files(stockholm_files: list, a3m_files: list, alignment_dir:str):
end = time.time() all_tasks = [asyncio.create_task(parse_stockholm_file(alignment_dir, sto)) for sto in stockholm_files]
calculate_elapse(start, end, "parsers.parse_stockholm") all_tasks += [asyncio.create_task(parse_a3m_file(alignment_dir, a3m)) for a3m in a3m_files]
else: results = await asyncio.gather(*all_tasks)
continue return results
stockholm_files = [i for i in os.listdir(alignment_dir) if (i.endswith('.sto') and ("hmm_output" not in i))]
msa_data[f] = msa a3m_files = [i for i in os.listdir(alignment_dir) if i.endswith('.a3m')]
import time
start = time.time()
msa_results = asyncio.run(run_parse_all_msa_files(stockholm_files, a3m_files, alignment_dir))
end = time.time()
calculate_elapse(start, end, "asynchronised version")
for i in msa_results:
msa_data.update({k:v for k,v in i.items()})
return msa_data return msa_data
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment