Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
3ced7cf2
"git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "e466496785ef8990d996ebd9d321505fa42c0660"
Commit
3ced7cf2
authored
Aug 01, 2022
by
Gustaf Ahdritz
Browse files
Improve validation download script
parent
200d9517
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
85 additions
and
20 deletions
+85
-20
README.md
README.md
+4
-0
scripts/download_cameo.py
scripts/download_cameo.py
+81
-20
No files found.
README.md
View file @
3ced7cf2
...
...
@@ -289,6 +289,10 @@ python3 scripts/generate_chain_data_cache.py \
where the `cluster_file` argument is a file of chain clusters, one cluster per line (e.g. [PDB40](https://cdn.rcsb.org/resources/sequence/clusters/clusters-by-entity-40.txt)).
Optionally, download an AlphaFold-style validation set from CAMEO using `scripts/download_cameo.py`. Use the resulting FASTA files to generate validation alignments.
Finally, call the training script:
```
bash
...
...
scripts/download_cameo.py
View file @
3ced7cf2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import
argparse
import
json
import
os
import
urllib.
request
import
request
s
""" Downloads CAMEO proteins from PDB. The "cameo_table_path" should be a file
containing a CAMEO target table (exported using the "Copy to clipboard"
option). Useful for constructing validation sets.
E.g. https://www.cameo3d.org/modeling/targets/3-months/?to_date=2022-07-02
"""
from
openfold.data
import
mmcif_parsing
# Period lengths accepted for the "period" command-line argument. The chosen
# value is inserted as-is into the CAMEO URL path by generate_url().
VALID_PERIODS = [
    "1-year",
    "6-months",
    "3-months",
    "1-month",
    "1-week",
]
def generate_url(period, end_date):
    """Build the CAMEO AJAX URL that lists modeling targets for a period.

    Args:
        period: Length of the period, used verbatim as a URL path segment
            (e.g. "3-months").
        end_date: Date marking the end of the period, passed as the
            ``to_date`` query parameter (e.g. "2022-07-02").

    Returns:
        The target-list URL as a string.
    """
    # The base carries no trailing slash: joining "https://www.cameo3d.org/"
    # with "/" would produce "//modeling" in the path.
    return '/'.join([
        "https://www.cameo3d.org",
        "modeling",
        "targets",
        period,
        "ajax",
        f"?to_date={end_date}",
    ])
def main(args):
    """Download a CAMEO validation set as per-chain FASTA and mmCIF files.

    Fetches the CAMEO target list for ``args.period`` ending at
    ``args.end_date``, downloads each target's mmCIF file from RCSB, and
    writes ``<pdb_id>_<chain_id>.fasta`` into ``<output_dir>/fasta_dir`` and
    ``<pdb_id>.cif`` into ``<output_dir>/data_dir``.

    Args:
        args: Parsed command-line arguments providing ``output_dir``,
            ``period``, ``end_date`` and ``max_seqlen``.

    Raises:
        Exception: The first error reported by ``mmcif_parsing.parse`` when a
            downloaded file yields no mmCIF object.
    """
    data_dir_path = os.path.join(args.output_dir, "data_dir")
    fasta_dir_path = os.path.join(args.output_dir, "fasta_dir")

    os.makedirs(data_dir_path, exist_ok=True)
    os.makedirs(fasta_dir_path, exist_ok=True)

    url = generate_url(args.period, args.end_date)
    raw_data = requests.get(url).text
    parsed_data = json.loads(raw_data)

    chain_data = parsed_data["aaData"]
    for chain in chain_data:
        pdb_id = chain["pdbid"]
        chain_id = chain["pdbid_chain"]

        pdb_url = f"https://files.rcsb.org/view/{pdb_id.upper()}.cif"
        pdb_file = requests.get(pdb_url).text

        parsed_cif = mmcif_parsing.parse(
            file_id=pdb_id, mmcif_string=pdb_file
        )
        mmcif_object = parsed_cif.mmcif_object
        if mmcif_object is None:
            # Surface the parser's own error instead of failing later on None.
            raise list(parsed_cif.errors.values())[0]

        # NOTE(review): assumes chain_to_seqres maps a chain id to its
        # sequence string -- confirm against openfold.data.mmcif_parsing.
        seq = mmcif_object.chain_to_seqres[chain_id]

        # A non-positive max_seqlen (e.g. -1) disables the length filter.
        # Compare against the limit itself; comparing len(seq) with len(seq)
        # can never be true and would let every chain through.
        if args.max_seqlen > 0 and len(seq) > args.max_seqlen:
            continue

        fasta_file = '\n'.join([
            f">{pdb_id}_{chain_id}",
            seq,
        ])

        fasta_filename = f"{pdb_id}_{chain_id}.fasta"
        with open(os.path.join(fasta_dir_path, fasta_filename), "w") as fp:
            fp.write(fasta_file)

        cif_filename = f"{pdb_id}.cif"
        with open(os.path.join(data_dir_path, cif_filename), "w") as fp:
            fp.write(pdb_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "period", type=str,
        help=f"""The length of the period from which to draw CAMEO proteins.
             Choose from {VALID_PERIODS}"""
    )
    parser.add_argument(
        "end_date", type=str,
        help="The date marking the end of the period (YYYY-MM-DD)"
    )
    # Registered exactly once: a second positional with the same name would
    # demand an extra command-line value and overwrite the first.
    parser.add_argument("output_dir")
    parser.add_argument(
        "--max_seqlen", type=int, default=700,
        help="The maximum length in residues of downloaded proteins (or -1)"
    )

    args = parser.parse_args()

    # Validate before main() so an invalid period fails without any
    # directory creation or network access.
    if args.period not in VALID_PERIODS:
        raise ValueError(f"Invalid period. Choose from {VALID_PERIODS}")

    main(args)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment