Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
c3c627e7
Commit
c3c627e7
authored
Dec 01, 2023
by
Dingquan Yu
Browse files
now run in a subprocess
parent
2e1941a0
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
65 additions
and
45 deletions
+65
-45
openfold/data/data_pipeline.py
openfold/data/data_pipeline.py
+14
-45
openfold/data/parse_msa_files.py
openfold/data/parse_msa_files.py
+51
-0
No files found.
openfold/data/data_pipeline.py
View file @
c3c627e7
...
@@ -21,15 +21,14 @@ import dataclasses
...
@@ -21,15 +21,14 @@ import dataclasses
from
multiprocessing
import
cpu_count
from
multiprocessing
import
cpu_count
import
tempfile
import
tempfile
from
typing
import
Mapping
,
Optional
,
Sequence
,
Any
,
MutableMapping
,
Union
from
typing
import
Mapping
,
Optional
,
Sequence
,
Any
,
MutableMapping
,
Union
import
asyncio
,
multiprocessing
import
subprocess
import
concurrent.futures
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
from
openfold.data
import
templates
,
parsers
,
mmcif_parsing
,
msa_identifiers
,
msa_pairing
,
feature_processing_multimer
from
openfold.data
import
templates
,
parsers
,
mmcif_parsing
,
msa_identifiers
,
msa_pairing
,
feature_processing_multimer
from
openfold.data.templates
import
get_custom_template_features
,
empty_template_feats
from
openfold.data.templates
import
get_custom_template_features
,
empty_template_feats
from
openfold.data.tools
import
jackhmmer
,
hhblits
,
hhsearch
,
hmmsearch
from
openfold.data.tools
import
jackhmmer
,
hhblits
,
hhsearch
,
hmmsearch
from
openfold.data.tools.utils
import
to_date
from
openfold.data.tools.utils
import
to_date
,
NonDaemonicProcess
,
NonDaemonicProcessPool
from
openfold.np
import
residue_constants
,
protein
from
openfold.np
import
residue_constants
,
protein
...
@@ -739,51 +738,21 @@ class DataPipeline:
...
@@ -739,51 +738,21 @@ class DataPipeline:
fp
.
close
()
fp
.
close
()
else
:
else
:
# Now will split the following steps into multiple processes
# Now will split the following steps into multiple processes
def
parse_stockholm_file
(
alignment_dir
:
str
,
stockholm_file
:
str
,
queue
:
multiprocessing
.
Queue
):
path
=
os
.
path
.
join
(
alignment_dir
,
stockholm_file
)
file_name
,
_
=
os
.
path
.
splitext
(
stockholm_file
)
with
open
(
path
,
"r"
)
as
infile
:
msa
=
parsers
.
parse_stockholm
(
infile
.
read
())
infile
.
close
()
queue
.
put
({
file_name
:
msa
})
def
parse_a3m_file
(
alignment_dir
:
str
,
a3m_file
:
str
,
queue
:
multiprocessing
.
Queue
):
# a3m_tasks = [(alignment_dir, a3m) for a3m in a3m_files]
path
=
os
.
path
.
join
(
alignment_dir
,
a3m_file
)
# sto_tasks = [(alignment_dir, sto) for sto in stockholm_files]
file_name
,
_
=
os
.
path
.
splitext
(
a3m_file
)
# with NonDaemonicProcessPool(len(a3m_tasks) + len(sto_tasks)) as pool:
with
open
(
path
,
"r"
)
as
infile
:
# a3m_results = pool.starmap(parse_a3m_file, a3m_tasks)
msa
=
parsers
.
parse_a3m
(
infile
.
read
())
# sto_results = pool.starmap(parse_stockholm_file, sto_tasks)
infile
.
close
()
# msa_results = {**a3m_results, **sto_results}
queue
.
put
({
file_name
:
msa
})
import
time
,
json
current_directory
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
def
run_parse_all_msa_files_multiprocessing
(
stockholm_files
:
list
,
a3m_files
:
list
,
alignment_dir
:
str
):
print
(
f
"#### line 764 start running in multiprocessing way"
)
msa_results
=
{}
processes
=
[]
queue
=
multiprocessing
.
Queue
()
for
f
in
stockholm_files
:
process
=
multiprocessing
.
Process
(
target
=
parse_stockholm_file
,
args
=
(
alignment_dir
,
f
,
queue
))
processes
.
append
(
process
)
process
.
start
()
for
f
in
a3m_files
:
process
=
multiprocessing
.
Process
(
target
=
parse_a3m_file
,
args
=
(
alignment_dir
,
f
,
queue
))
processes
.
append
(
process
)
process
.
start
()
for
p
in
processes
:
res
=
queue
.
get
()
msa_results
.
update
(
res
)
p
.
join
()
return
msa_results
stockholm_files
=
[
i
for
i
in
os
.
listdir
(
alignment_dir
)
if
(
i
.
endswith
(
'.sto'
)
and
(
"hmm_output"
not
in
i
))]
a3m_files
=
[
i
for
i
in
os
.
listdir
(
alignment_dir
)
if
i
.
endswith
(
'.a3m'
)]
import
time
start
=
time
.
time
()
start
=
time
.
time
()
# Run the MSA parsing in a separate Python process and read its JSON
# output from stdout.
# Fixed three defects in the original call:
#   * capture_output=True is a subprocess.run() parameter; passing it to
#     check_output() raises ValueError (check_output already captures stdout).
#   * the script path and alignment_dir were fused into a single argv
#     element ('python', 'path dir'); they must be separate list items.
#   * json.load() expects a file object; the captured output is a str,
#     so json.loads() is required.
script_path = os.path.join(current_directory, "parse_msa_files.py")
msa_data = subprocess.check_output(
    ["python", script_path, alignment_dir],
    text=True,
)
msa_data = json.loads(msa_data)
end
=
time
.
time
()
end
=
time
.
time
()
calculate_elapse
(
start
,
end
,
"
asynchronised
version"
)
calculate_elapse
(
start
,
end
,
"
multiprocessing
version"
)
return
msa_data
return
msa_data
...
...
openfold/data/parse_msa_files.py
0 → 100644
View file @
c3c627e7
import
os
,
multiprocessing
,
argparse
,
json
from
openfold.data
import
parsers
def parse_stockholm_file(alignment_dir: str, stockholm_file: str, queue: multiprocessing.Queue) -> None:
    """Parse one Stockholm (.sto) MSA file and put the result on *queue*.

    The queue receives a single-entry dict mapping the file's basename
    (extension stripped) to the object returned by
    ``parsers.parse_stockholm``.

    Args:
        alignment_dir: Directory containing the alignment file.
        stockholm_file: File name (not a path) of the .sto file.
        queue: Multiprocessing queue collecting one result per worker.
    """
    path = os.path.join(alignment_dir, stockholm_file)
    file_name, _ = os.path.splitext(stockholm_file)
    # The context manager closes the file; the original's explicit
    # infile.close() inside the `with` block was redundant and is removed.
    with open(path, "r") as infile:
        msa = parsers.parse_stockholm(infile.read())
    queue.put({file_name: msa})
def parse_a3m_file(alignment_dir: str, a3m_file: str, queue: multiprocessing.Queue) -> None:
    """Parse one A3M (.a3m) MSA file and put the result on *queue*.

    The queue receives a single-entry dict mapping the file's basename
    (extension stripped) to the object returned by ``parsers.parse_a3m``.

    Args:
        alignment_dir: Directory containing the alignment file.
        a3m_file: File name (not a path) of the .a3m file.
        queue: Multiprocessing queue collecting one result per worker.
    """
    path = os.path.join(alignment_dir, a3m_file)
    file_name, _ = os.path.splitext(a3m_file)
    # The context manager closes the file; the original's explicit
    # infile.close() inside the `with` block was redundant and is removed.
    with open(path, "r") as infile:
        msa = parsers.parse_a3m(infile.read())
    queue.put({file_name: msa})
def run_parse_all_msa_files_multiprocessing(stockholm_files: list, a3m_files: list, alignment_dir: str) -> dict:
    """Parse all given MSA files in parallel, one child process per file.

    Args:
        stockholm_files: Names of .sto files inside *alignment_dir*.
        a3m_files: Names of .a3m files inside *alignment_dir*.
        alignment_dir: Directory containing the alignment files.

    Returns:
        Dict mapping each file's basename (extension stripped) to its
        parsed MSA object.
    """
    # Removed the original debug print: this script's stdout is captured
    # and JSON-parsed by the parent process, so any extra output on
    # stdout corrupts the result.
    msa_results = {}
    processes = []
    queue = multiprocessing.Queue()
    # Fixed: the original set `process.deamon` (typo) on the .sto branch,
    # silently creating an unused attribute instead of configuring the
    # daemon flag. daemon=False is the default, so it is simply omitted.
    for file_name in stockholm_files:
        proc = multiprocessing.Process(
            target=parse_stockholm_file, args=(alignment_dir, file_name, queue)
        )
        processes.append(proc)
        proc.start()
    for file_name in a3m_files:
        proc = multiprocessing.Process(
            target=parse_a3m_file, args=(alignment_dir, file_name, queue)
        )
        processes.append(proc)
        proc.start()
    # Drain one result per started process BEFORE joining: joining a
    # process that still has data buffered on the queue can deadlock.
    for proc in processes:
        msa_results.update(queue.get())
        proc.join()
    # Fixed: the original fell off the end and implicitly returned None,
    # so the caller received nothing to serialize.
    return msa_results
def main(alignment_dir=None):
    """Entry point: parse every MSA file under *alignment_dir* in parallel.

    Prints the parsed results as JSON on stdout (so a parent process using
    ``subprocess.check_output`` can capture them) and also returns the
    JSON string.

    Args:
        alignment_dir: Directory to scan. When None (the normal CLI case),
            it is read from the command line instead.

    Returns:
        JSON string mapping file basenames to their parsed MSAs.
        NOTE(review): this assumes the parser return values are
        JSON-serializable — confirm, or supply a custom encoder.
    """
    if alignment_dir is None:
        # Fixed: the original declared alignment_dir as type=int with
        # nargs='+' and a copy-pasted help string, never read the parsed
        # args, and was called without its required positional parameter
        # (TypeError). The argument is now a plain string path, and the
        # parameter defaults to None for backward-compatible direct calls.
        parser = argparse.ArgumentParser(description='Process msa files in parallel')
        parser.add_argument(
            'alignment_dir',
            type=str,
            help='directory containing the .sto and .a3m alignment files',
        )
        args = parser.parse_args()
        alignment_dir = args.alignment_dir
    # hmm_output .sto files are intentionally excluded.
    stockholm_files = [
        i for i in os.listdir(alignment_dir)
        if i.endswith('.sto') and "hmm_output" not in i
    ]
    a3m_files = [i for i in os.listdir(alignment_dir) if i.endswith('.a3m')]
    msa_data = run_parse_all_msa_files_multiprocessing(
        stockholm_files, a3m_files, alignment_dir
    )
    output = json.dumps(msa_data)
    # Fixed: the original only returned the JSON; nothing reached stdout,
    # so the parent's check_output() captured an empty string.
    print(output)
    return output
# Script entry point: data_pipeline.py runs this file as a subprocess and
# captures its stdout.
# NOTE(review): main is defined as main(alignment_dir) — calling it with no
# argument raises TypeError; confirm the intended CLI wiring.
if __name__ == "__main__":
    main()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment