Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
39a6d0e6
Commit
39a6d0e6
authored
Apr 09, 2023
by
Christina Floristean
Browse files
Merging in main branch
parents
d8ee9c5f
84659c93
Changes
101
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
614 additions
and
37 deletions
+614
-37
scripts/colabfold_search.sh
scripts/colabfold_search.sh
+1
-1
scripts/convert_of_weights_to_jax.py
scripts/convert_of_weights_to_jax.py
+103
-0
scripts/download_cameo.py
scripts/download_cameo.py
+104
-0
scripts/download_colabfold_envdb.sh
scripts/download_colabfold_envdb.sh
+1
-1
scripts/download_mgnify.sh
scripts/download_mgnify.sh
+0
-2
scripts/download_openfold_params.sh
scripts/download_openfold_params.sh
+34
-0
scripts/download_openfold_params_gdrive.sh
scripts/download_openfold_params_gdrive.sh
+67
-0
scripts/download_openfold_params_huggingface.sh
scripts/download_openfold_params_huggingface.sh
+32
-0
scripts/download_pdb70.sh
scripts/download_pdb70.sh
+1
-1
scripts/download_roda_pdbs.sh
scripts/download_roda_pdbs.sh
+52
-0
scripts/download_uniref30.sh
scripts/download_uniref30.sh
+2
-1
scripts/download_uniref90.sh
scripts/download_uniref90.sh
+1
-2
scripts/flatten_roda.sh
scripts/flatten_roda.sh
+42
-0
scripts/generate_alphafold_feature_dict.py
scripts/generate_alphafold_feature_dict.py
+39
-11
scripts/generate_chain_data_cache.py
scripts/generate_chain_data_cache.py
+2
-3
scripts/install_hh_suite.sh
scripts/install_hh_suite.sh
+8
-8
scripts/install_third_party_dependencies.sh
scripts/install_third_party_dependencies.sh
+15
-4
scripts/precompute_alignments.py
scripts/precompute_alignments.py
+1
-1
scripts/prep_mmseqs_dbs.sh
scripts/prep_mmseqs_dbs.sh
+2
-2
scripts/slurm_scripts/run_uniclust30_search.sh
scripts/slurm_scripts/run_uniclust30_search.sh
+107
-0
No files found.
scripts/colabfold_search.sh
View file @
39a6d0e6
#!/bin/bash -e
# Copied from colabfold.mmseqs.com
#!/bin/bash -e
MMSEQS
=
"
$1
"
QUERY
=
"
$2
"
DBBASE
=
"
$3
"
...
...
scripts/convert_of_weights_to_jax.py
0 → 100644
View file @
39a6d0e6
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Converts OpenFold .pt checkpoints into AlphaFold .npz ones, which can then be
# used to run inference using DeepMind's JAX code.
import
argparse
import
numpy
as
np
import
torch
from
openfold.config
import
model_config
from
openfold.model.model
import
AlphaFold
from
openfold.utils.import_weights
import
(
Param
,
ParamType
,
generate_translation_dict
,
process_translation_dict
,
)
from
openfold.utils.tensor_utils
import
tree_map
def reshape_fn(of_param, af_weight):
    """Convert one OpenFold parameter into the layout of its AlphaFold twin.

    Args:
        of_param: Object exposing ``param`` (a tensor, or a sequence of
            tensors when ``stacked`` is truthy), ``stacked`` and
            ``param_type`` (a ``ParamType`` member).
        af_weight: The AlphaFold array the result is destined for; only its
            ``shape`` is read, and only by the conversions that reshape.

    Returns:
        A torch tensor built from ``of_param.param`` with the conversion
        selected by ``of_param.param_type`` applied.

    Raises:
        KeyError: If ``of_param.param_type`` has no registered conversion.
    """
    def swap_last_two(w):
        return w.transpose(-1, -2)

    def swap_then_match(w):
        return w.transpose(-1, -2).reshape(af_weight.shape)

    def match_only(w):
        return w.reshape(af_weight.shape)

    def keep(w):
        return w

    dispatch = {
        ParamType.LinearWeight: swap_last_two,
        ParamType.LinearWeightMHA: swap_then_match,
        ParamType.LinearMHAOutputWeight: swap_then_match,
        ParamType.LinearBiasMHA: match_only,
        ParamType.LinearWeightOPM: swap_then_match,
        ParamType.Other: keep,
    }

    if of_param.stacked:
        # Stacked parameters arrive as a sequence; join them along a new
        # leading dimension.
        pieces = [torch.Tensor(piece) for piece in of_param.param]
        of_weight = torch.stack(pieces)
    else:
        of_weight = torch.Tensor(of_param.param)

    convert = dispatch[of_param.param_type]
    return convert(of_weight)
def transfer(of_dict, af_weight_template):
    """Copy OpenFold parameters into an AlphaFold weight template in place.

    Walks ``of_dict`` recursively. Nested mappings are descended into using
    the same key on ``af_weight_template``; every other value is treated as
    a leaf, converted with ``reshape_fn`` and written into the matching
    template array with ``np.copyto``.

    Args:
        of_dict: (Possibly nested) dict of OpenFold parameter descriptors.
        af_weight_template: Dict with the same key structure whose leaves
            are arrays that receive the converted values.

    Returns:
        None. ``af_weight_template`` is modified in place.
    """
    for k, of_value in of_dict.items():
        # isinstance (rather than an exact type comparison) so that dict
        # subclasses such as OrderedDict are recursed into instead of being
        # mistaken for a leaf parameter.
        if isinstance(of_value, dict):
            transfer(of_value, af_weight_template[k])
        else:
            reshaped = reshape_fn(of_value, af_weight_template[k])
            reshaped = reshaped.detach().numpy()
            np.copyto(af_weight_template[k], reshaped)
def main(args):
    """Convert an OpenFold ``.pt`` checkpoint into an AlphaFold ``.npz`` file.

    Loads the checkpoint into an ``AlphaFold`` model built from
    ``args.config_preset``, builds the OpenFold-to-AlphaFold translation
    dict, zeroes a template of AlphaFold arrays restricted to the translated
    keys, fills it from the model and saves it.

    Args:
        args: Namespace with ``of_pt_path``, ``config_preset``,
            ``template_npz_path`` and ``out_path``.
    """
    # map_location="cpu": the model below is built on the CPU and the
    # weights are exported through .numpy(), so there is no reason to
    # require the device the checkpoint was saved from.
    d = torch.load(args.of_pt_path, map_location="cpu")

    config = model_config(args.config_preset)
    model = AlphaFold(config)
    model.load_state_dict(d)

    translation = generate_translation_dict(model, args.config_preset)
    translation = process_translation_dict(translation)

    # Materialize the needed arrays while the archive is open, then close it.
    with np.load(args.template_npz_path) as template_npz:
        af_weight_template = {
            k: v for k, v in template_npz.items() if k in translation
        }

    # Zero every array so that only transferred values end up in the output.
    zero = lambda n: n * 0
    af_weight_template = tree_map(zero, af_weight_template, np.ndarray)

    transfer(translation, af_weight_template)

    np.savez(args.out_path, **af_weight_template)
# Command-line entry point: three positional arguments (checkpoint path,
# config preset, output path) plus an optional AlphaFold template archive.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "of_pt_path", type=str, help="Path to OpenFold .pt checkpoint file"
    )
    parser.add_argument(
        "config_preset", type=str, help="The corresponding config preset"
    )
    parser.add_argument(
        "out_path", type=str, help="Path for output .npz file"
    )
    # The template supplies the key names and array shapes of the output.
    parser.add_argument(
        "--template_npz_path",
        type=str,
        default="openfold/resources/params/params_model_1_ptm.npz",
        help="""Path to an AlphaFold checkpoint w/ a superset of the OF
checkpoint's parameters. params_model_1_ptm.npz always works.
"""
    )

    args = parser.parse_args()

    main(args)
scripts/download_cameo.py
0 → 100644
View file @
39a6d0e6
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import
argparse
import
json
import
os
import
re
import
requests
from
openfold.data
import
mmcif_parsing
# Period names accepted for the "period" command-line argument. The chosen
# value is inserted verbatim as a path segment of the CAMEO URL.
VALID_PERIODS = [
    "1-year",
    "6-months",
    "3-months",
    "1-month",
    "1-week",
]
def generate_url(period, end_date):
    """Build the CAMEO target-list URL for a period ending at ``end_date``.

    Args:
        period: Period name used as a path segment (e.g. ``"1-week"``).
        end_date: Value placed in the ``to_date`` query parameter.

    Returns:
        The URL string, with all segments joined by ``/``.
    """
    query = f"?to_date={end_date}"
    segments = (
        "https://www.cameo3d.org/",
        "modeling",
        "targets",
        period,
        "ajax",
        query,
    )
    return '/'.join(segments)
def main(args):
    """Download the CAMEO targets of a period as FASTA and mmCIF files.

    Fetches the CAMEO target list for ``args.period`` ending at
    ``args.end_date``. For every listed chain the mmCIF file is fetched from
    RCSB and parsed; the chain's SEQRES sequence is written to
    ``<output_dir>/fasta_dir/<pdb_id>_<chain_id>.fasta`` and the raw mmCIF
    text to ``<output_dir>/data_dir/<pdb_id>.cif``.

    Args:
        args: Namespace with ``output_dir``, ``period``, ``end_date`` and
            ``max_seqlen``. A non-positive ``max_seqlen`` disables the
            length filter.

    Raises:
        Exception: The first error recorded by ``mmcif_parsing.parse`` when
            a downloaded file yields no mmCIF object.
    """
    data_dir_path = os.path.join(args.output_dir, "data_dir")
    fasta_dir_path = os.path.join(args.output_dir, "fasta_dir")

    os.makedirs(data_dir_path, exist_ok=True)
    os.makedirs(fasta_dir_path, exist_ok=True)

    url = generate_url(args.period, args.end_date)
    raw_data = requests.get(url).text
    parsed_data = json.loads(raw_data)

    chain_data = parsed_data["aaData"]
    for chain in chain_data:
        pdb_id = chain["pdbid"]
        chain_id = chain["pdbid_chain"]

        pdb_url = f"https://files.rcsb.org/view/{pdb_id.upper()}.cif"
        pdb_file = requests.get(pdb_url).text

        parsed_cif = mmcif_parsing.parse(
            file_id=pdb_id, mmcif_string=pdb_file
        )
        mmcif_object = parsed_cif.mmcif_object
        if mmcif_object is None:
            raise list(parsed_cif.errors.values())[0]

        seq = mmcif_object.chain_to_seqres[chain_id]

        # Skip chains longer than the requested maximum. The comparison must
        # be against args.max_seqlen; comparing the sequence length with
        # itself would never filter anything.
        if args.max_seqlen > 0 and len(seq) > args.max_seqlen:
            continue

        fasta_file = '\n'.join([
            f">{pdb_id}_{chain_id}",
            seq,
        ])

        fasta_filename = f"{pdb_id}_{chain_id}.fasta"
        with open(os.path.join(fasta_dir_path, fasta_filename), "w") as fp:
            fp.write(fasta_file)

        cif_filename = f"{pdb_id}.cif"
        with open(os.path.join(data_dir_path, cif_filename), "w") as fp:
            fp.write(pdb_file)
# Command-line entry point: parses the arguments, validates the period name
# and the end-date format, then runs the download.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "period", type=str,
        help=f"""The length of the period from which to draw CAMEO proteins.
Choose from {VALID_PERIODS}
"""
    )
    parser.add_argument(
        "end_date", type=str,
        help="The date marking the end of the period (YYYY-MM-DD)"
    )
    parser.add_argument("output_dir")
    parser.add_argument(
        "--max_seqlen", type=int, default=700,
        help="The maximum length in residues of downloaded proteins (or -1)"
    )

    args = parser.parse_args()

    if(args.period not in VALID_PERIODS):
        raise ValueError(f"Invalid period. Choose from {VALID_PERIODS}")

    # Only the YYYY-MM-DD shape is checked here, not whether the date exists.
    date_regex = re.compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}$")
    if(not date_regex.match(args.end_date)):
        raise ValueError(
            f"Invalid end_date: {args.end_date}. Use YYYY-MM-DD format"
        )

    main(args)
scripts/download_colabfold_envdb.sh
View file @
39a6d0e6
...
...
@@ -35,4 +35,4 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.ta
BASENAME
=
$(
basename
"
${
SOURCE_URL
}
"
)
mkdir
--parents
"
${
ROOT_DIR
}
"
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
-x
4
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
-x
4
--check-certificate
=
false
scripts/download_mgnify.sh
View file @
39a6d0e6
...
...
@@ -38,6 +38,4 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir
--parents
"
${
ROOT_DIR
}
"
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
pushd
"
${
ROOT_DIR
}
"
gunzip
"
${
ROOT_DIR
}
/
${
BASENAME
}
"
popd
scripts/download_
data
.sh
→
scripts/download_
openfold_params
.sh
View file @
39a6d0e6
...
...
@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads
and unzips all required data for AlphaFold
.
# Downloads
OpenFold parameters
.
#
# Usage: bash download_
all_data
.sh /path/to/download/directory
# Usage: bash download_
openfold_params_huggingface
.sh /path/to/download/directory
set
-e
if
[[
$#
-eq
0
]]
;
then
...
...
@@ -24,28 +24,11 @@ if [[ $# -eq 0 ]]; then
exit
1
fi
if
!
command
-v
a
ria2c
&> /dev/null
;
then
echo
"Error: a
ria2c
could not be found. Please install a
ria2c (sudo apt install aria2)
."
if
!
command
-v
a
ws
&> /dev/null
;
then
echo
"Error: a
ws
could not be found. Please install a
ws
."
exit
1
fi
DOWNLOAD_DIR
=
"
$1
"
DOWNLOAD_MODE
=
"
${
2
:-
full_dbs
}
"
# Default mode to full_dbs.
if
[[
"
${
DOWNLOAD_MODE
}
"
!=
full_dbs
&&
"
${
DOWNLOAD_MODE
}
"
!=
reduced_dbs
]]
then
echo
"DOWNLOAD_MODE
${
DOWNLOAD_MODE
}
not recognized."
exit
1
fi
SCRIPT_DIR
=
"
$(
dirname
"
$(
realpath
"
$0
"
)
"
)
"
echo
"Downloading AlphaFold parameters..."
bash
"
${
SCRIPT_DIR
}
/download_alphafold_params.sh"
"
${
DOWNLOAD_DIR
}
"
echo
"Downloading PDB70..."
bash
"
${
SCRIPT_DIR
}
/download_pdb70.sh"
"
${
DOWNLOAD_DIR
}
"
echo
"Downloading PDB mmCIF files..."
bash
"
${
SCRIPT_DIR
}
/download_pdb_mmcif.sh"
"
${
DOWNLOAD_DIR
}
"
echo
"All data downloaded."
DOWNLOAD_DIR
=
"
${
1
}
/openfold_params"
mkdir
-p
"
${
DOWNLOAD_DIR
}
"
aws s3
cp
--no-sign-request
--region
us-east-1 s3://openfold/openfold_params/
"
${
DOWNLOAD_DIR
}
"
--recursive
scripts/download_openfold_params_gdrive.sh
0 → 100755
View file @
39a6d0e6
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips OpenFold parameters from Google Drive. Alternative to
# the HuggingFace version.
#
# Usage: bash download_openfold_params_gdrive.sh /path/to/download/directory
# Abort on the first command that fails.
set -e

# The download directory is the only argument and it is required.
if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# Google Drive ID of the parameter archive, and the filename the download is
# compared against before it is extracted.
FILE_ID="1GVzZA2nbdBbz6TKydvzquhfELJ3Movnb"
FILENAME="openfold_params_07_22.tar.gz"
# Downloads a Google Drive file and prints the path it was saved to.
#
# Arguments:
#   $1 - Google Drive file ID
#   $2 - directory to save the file in
#
# The first request saves session cookies and returns a page from which the
# "confirm" token and the file's name are scraped with sed; the second
# request replays the cookies with that token and writes the file.
#
# NOTE: FILE_ID and FILENAME are assigned without `local`, so they shadow the
# script-level variables of the same name inside whatever shell runs this
# function. The script calls it inside $( ... ), so those assignments stay in
# the command-substitution subshell.
download_from_gdrive() {
    FILE_ID="$1"
    OUT_DIR="$2"
    # Fetch the landing page to stdout (-O-), storing cookies in a temp file
    # named after this shell's PID.
    MSG=$(wget \
        --quiet \
        --save-cookies /tmp/cookies_$$.txt \
        --keep-session-cookies \
        --no-check-certificate \
        "https://docs.google.com/uc?export=download&id=${FILE_ID}" \
        -O- \
    )
    # Token following "confirm=" in the page.
    CONFIRM=$(echo $MSG | sed -rn "s/.*confirm=([0-9A-Za-z_]+).*/\1\n/p")
    # Link text of the <a href="/open?id=..."> anchor, i.e. the filename.
    FILENAME=$(echo $MSG | sed -e "s/.*<a href=\"\/open?id=${FILE_ID}\">\(.*\)<\/a> (.*/\1/")
    FILEPATH="${OUT_DIR}/${FILENAME}"
    wget \
        --quiet \
        --load-cookies /tmp/cookies_$$.txt \
        "https://docs.google.com/uc?export=download&confirm=${CONFIRM}&id=${FILE_ID}" \
        -O "${FILEPATH}"
    rm /tmp/cookies_$$.txt
    # The saved path is the function's only output; callers capture it.
    echo $FILEPATH
}
# Download the archive into the requested directory; the function prints the
# path of the file it saved.
DOWNLOAD_DIR="$1"
mkdir -p "${DOWNLOAD_DIR}"
DOWNLOAD_PATH=$(download_from_gdrive $FILE_ID "${DOWNLOAD_DIR}")

# Refuse to unpack anything that is not the archive we expect. The right-hand
# side is quoted so it is compared literally rather than as a glob pattern.
DOWNLOAD_FILENAME=$(basename "${DOWNLOAD_PATH}")
if [[ "${FILENAME}" != "${DOWNLOAD_FILENAME}" ]]; then
    echo "Error: Downloaded filename ${DOWNLOAD_FILENAME} does not match expected filename ${FILENAME}"
    rm "${DOWNLOAD_PATH}"
    # Exit non-zero: a bare `exit` would return the status of the `rm` above
    # and report this failure as success.
    exit 1
fi

# Unpack next to the archive, then remove the archive.
tar --extract --verbose --file="${DOWNLOAD_PATH}" \
    --directory="${DOWNLOAD_DIR}" --preserve-permissions
rm "${DOWNLOAD_PATH}"
scripts/download_openfold_params_huggingface.sh
0 → 100755
View file @
39a6d0e6
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads OpenFold parameters by cloning their HuggingFace repository.
#
# Usage: bash download_openfold_params_huggingface.sh /path/to/download/directory
# Stop at the first failing command.
set -e

# A target directory is mandatory; bail out early without one.
if (( $# < 1 )); then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

# The parameters land in <target>/openfold_params/.
DOWNLOAD_DIR="${1}/openfold_params/"
URL="https://huggingface.co/nz/OpenFold"

mkdir -p "${DOWNLOAD_DIR}"
git clone $URL "${DOWNLOAD_DIR}"

# Only the checked-out files are wanted, not the repository metadata.
rm -rf "${DOWNLOAD_DIR}/.git"
scripts/download_pdb70.sh
View file @
39a6d0e6
...
...
@@ -35,7 +35,7 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/
BASENAME
=
$(
basename
"
${
SOURCE_URL
}
"
)
mkdir
--parents
"
${
ROOT_DIR
}
"
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
--check-certificate
=
false
tar
--extract
--verbose
--file
=
"
${
ROOT_DIR
}
/
${
BASENAME
}
"
\
--directory
=
"
${
ROOT_DIR
}
"
rm
"
${
ROOT_DIR
}
/
${
BASENAME
}
"
scripts/download_roda_pdbs.sh
0 → 100755
View file @
39a6d0e6
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratories
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads .cif files matching the RODA alignments. Outputs a list of
# RODA alignments for which .cif files could not be found.
if [[ $# != 2 ]]; then
    echo "usage: ./download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>"
    exit 1
fi

OUT_DIR="$1"
RODA_ALIGNMENT_DIR="$2"

# The output directory must not exist yet: the flattening steps below move
# and delete everything underneath it.
if [[ -d "${OUT_DIR}" ]]; then
    echo "${OUT_DIR} already exists. Download failed..."
    exit 1
fi

SERVER=snapshotrsync.rcsb.org  # RCSB server name
PORT=873                       # port RCSB server is using

# NOTE: with this redirection order rsync's stdout is discarded while its
# stderr is sent to this script's stdout.
rsync -rlpt -v -z --delete --port="${PORT}" \
    "${SERVER}::20220103/pub/pdb/data/structures/divided/mmCIF/" "${OUT_DIR}" 2>&1 > /dev/null

# Move every downloaded file to the top level of OUT_DIR and decompress it.
# The file list is read line by line into an array (instead of word-splitting
# the output of find) so that paths containing spaces stay intact.
mapfile -t DOWNLOADED_FILES < <(find "${OUT_DIR}" -mindepth 2 -type f)
for f in "${DOWNLOADED_FILES[@]}"; do
    mv "${f}" "${OUT_DIR}"
    BASENAME=$(basename "${f}")
    gunzip "${OUT_DIR}/${BASENAME}"
done

# Remove the now-empty subdirectories (and any symlinks).
find "${OUT_DIR}" -mindepth 1 -type d,l -delete

# Report every alignment directory whose PDB ID (the part of the directory
# name before the first underscore) has no matching .cif file.
mapfile -t ALIGNMENT_DIRS < <(find "${RODA_ALIGNMENT_DIR}" -mindepth 1 -maxdepth 1 -type d)
for d in "${ALIGNMENT_DIRS[@]}"; do
    BASENAME=$(basename "${d}")
    PDB_ID=$(echo "${BASENAME}" | cut -d '_' -f 1)
    CIF_PATH="${OUT_DIR}/${PDB_ID}.cif"
    if [[ ! -f "${CIF_PATH}" ]]; then
        echo "${d}"
    fi
done
scripts/download_uniref30.sh
View file @
39a6d0e6
...
...
@@ -35,4 +35,5 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
BASENAME
=
$(
basename
"
${
SOURCE_URL
}
"
)
mkdir
--parents
"
${
ROOT_DIR
}
"
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
-x
4
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
-x
4
--check-certificate
=
false
gunzip
"
${
ROOT_DIR
}
/
${
BASENAME
}
"
scripts/download_uniref90.sh
View file @
39a6d0e6
...
...
@@ -36,6 +36,5 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir
--parents
"
${
ROOT_DIR
}
"
aria2c
"
${
SOURCE_URL
}
"
--dir
=
"
${
ROOT_DIR
}
"
pushd
"
${
ROOT_DIR
}
"
gunzip
"
${
ROOT_DIR
}
/
${
BASENAME
}
"
popd
scripts/flatten_roda.sh
0 → 100755
View file @
39a6d0e6
#!/usr/bin/env bash
#
# Flattens a downloaded RODA database into the format expected by OpenFold
# Args:
#     roda_dir:
#         The path to the database you want to flatten. E.g. "roda/pdb"
#         or "roda/uniclust30". Note that, to save space, this script
#         will empty this directory.
#     output_dir:
#         The directory in which to construct the reformatted data
#
# The interpreter must be bash: the script relies on the [[ ... ]] test
# syntax, which a plain POSIX sh (e.g. dash) does not provide.

if [[ $# != 2 ]]; then
    echo "usage: ./flatten_roda.sh <roda_dir> <output_dir>"
    exit 1
fi

RODA_DIR="$1"
OUTPUT_DIR="$2"

DATA_DIR="${OUTPUT_DIR}/data"
ALIGNMENT_DIR="${OUTPUT_DIR}/alignments"

mkdir -p "${DATA_DIR}"
mkdir -p "${ALIGNMENT_DIR}"

# Each entry of RODA_DIR is one chain directory. Its "pdb"/"cif"
# subdirectories hold structure files, which all go into the shared DATA_DIR;
# the contents of every other subdirectory go into a per-chain directory
# under ALIGNMENT_DIR.
for chain_dir in $(ls "${RODA_DIR}"); do
    CHAIN_DIR_PATH="${RODA_DIR}/${chain_dir}"
    for subdir in $(ls "${CHAIN_DIR_PATH}"); do
        if [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${DATA_DIR}"
        else
            CHAIN_ALIGNMENT_DIR="${ALIGNMENT_DIR}/${chain_dir}"
            mkdir -p "${CHAIN_ALIGNMENT_DIR}"
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${CHAIN_ALIGNMENT_DIR}"
        fi
    done
done

# Drop the data directory again if no structure files were found.
NO_DATA_FILES=$(find "${DATA_DIR}" -type f | wc -l)
if [[ $NO_DATA_FILES = 0 ]]; then
    # Quoted so that an output path containing spaces cannot make rm -rf
    # operate on the wrong paths.
    rm -rf "${DATA_DIR}"
fi
scripts/generate_alphafold_feature_dict.py
View file @
39a6d0e6
...
...
@@ -2,35 +2,62 @@ import argparse
import
os
import
pickle
from
alphafold.data
import
pipeline
,
templates
from
alphafold.data
import
pipeline
,
pipeline_multimer
,
templates
from
alphafold.data.tools
import
hmmsearch
,
hhsearch
from
scripts.utils
import
add_data_args
def
main
(
args
):
template_featurizer
=
templates
.
TemplateHitFeaturizer
(
mmcif_dir
=
args
.
mmcif_dir
,
max_template_date
=
args
.
max_template_date
,
max_hits
=
20
,
kalign_binary_path
=
args
.
kalign_binary_path
,
release_dates_path
=
None
,
obsolete_pdbs_path
=
args
.
obsolete_pdbs_path
,
)
if
(
args
.
multimer
):
template_searcher
=
hmmsearch
.
Hmmsearch
(
binary_path
=
args
.
hmmsearch_binary_path
,
hmmbuild_binary_path
=
args
.
hmmbuild_binary_path
,
database_path
=
args
.
pdb_seqres_database_path
,
)
template_featurizer
=
templates
.
HmmsearchHitFeaturizer
(
mmcif_dir
=
args
.
template_mmcif_dir
,
max_template_date
=
args
.
max_template_date
,
max_hits
=
20
,
kalign_binary_path
=
args
.
kalign_binary_path
,
release_dates_path
=
args
.
release_dates_path
,
obsolete_pdbs_path
=
args
.
obsolete_pdbs_path
)
else
:
template_searcher
=
hhsearch
.
HHSearch
(
binary_path
=
args
.
hhsearch_binary_path
,
databases
=
[
args
.
pdb70_database_path
],
)
template_featurizer
=
templates
.
HhsearchHitFeaturizer
(
mmcif_dir
=
args
.
template_mmcif_dir
,
max_template_date
=
args
.
max_template_date
,
max_hits
=
20
,
kalign_binary_path
=
args
.
kalign_binary_path
,
release_dates_path
=
None
,
obsolete_pdbs_path
=
args
.
obsolete_pdbs_path
)
data_pipeline
=
pipeline
.
DataPipeline
(
jackhmmer_binary_path
=
args
.
jackhmmer_binary_path
,
hhblits_binary_path
=
args
.
hhblits_binary_path
,
hhsearch_binary_path
=
args
.
hhsearch_binary_path
,
uniref90_database_path
=
args
.
uniref90_database_path
,
mgnify_database_path
=
args
.
mgnify_database_path
,
bfd_database_path
=
args
.
bfd_database_path
,
uniclust30_database_path
=
args
.
uniclust30_database_path
,
pdb70_database_path
=
args
.
pdb70_database_path
,
small_bfd_database_path
=
None
,
template_featurizer
=
template_featurizer
,
template_searcher
=
template_searcher
,
use_small_bfd
=
False
,
)
if
(
args
.
multimer
):
data_pipeline
=
pipeline_multimer
.
DataPipeline
(
monomer_data_pipeline
=
data_pipeline
,
jackhmmer_binary_path
=
args
.
jackhmmer_binary_path
,
uniprot_database_path
=
args
.
uniprot_database_path
)
feature_dict
=
data_pipeline
.
process
(
input_fasta_path
=
args
.
fasta_path
,
msa_output_dir
=
args
.
output_dir
,
...
...
@@ -44,6 +71,7 @@ if __name__ == "__main__":
parser
.
add_argument
(
"fasta_path"
,
type
=
str
)
parser
.
add_argument
(
"mmcif_dir"
,
type
=
str
)
parser
.
add_argument
(
"output_dir"
,
type
=
str
)
parser
.
add_argument
(
"--multimer"
,
action
=
'store_true'
)
add_data_args
(
parser
)
args
=
parser
.
parse_args
()
...
...
scripts/generate_chain_data_cache.py
View file @
39a6d0e6
...
...
@@ -54,9 +54,8 @@ def parse_file(
chain_dict
[
"seq"
]
=
residue_constants
.
aatype_to_str_sequence
(
protein_object
.
aatype
,
)
local_data
[
"resolution"
]
=
0.
cluster_size
=
chain_cluster_size_dict
.
get
(
file_id
.
upper
(),
-
1
)
chain_dict
[
"resolution"
]
=
0.
if
(
chain_cluster_size_dict
is
not
None
):
cluster_size
=
chain_cluster_size_dict
.
get
(
full_name
.
upper
(),
-
1
...
...
scripts/install_hh_suite.sh
View file @
39a6d0e6
#!/bin/bash
git clone
--branch
v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite
&&
mkdir
/tmp/hh-suite/build
&&
pushd
/tmp/hh-suite/build
&&
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/hhsuite ..
&&
make
-j
4
&&
make
install
&&
ln
-s
/opt/hhsuite/bin/
*
/usr/bin
&&
popd
&&
rm
-rf
/tmp/hh-suite
git clone
--branch
v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite
\
&&
mkdir
/tmp/hh-suite/build
\
&&
pushd
/tmp/hh-suite/build
\
&&
cmake
-DCMAKE_INSTALL_PREFIX
=
/opt/hhsuite ..
\
&&
make
-j
4
&&
make
install
\
&&
ln
-s
f
/opt/hhsuite/bin/
*
/usr/bin
\
&&
popd
\
&&
rm
-rf
/tmp/hh-suite
scripts/install_third_party_dependencies.sh
View file @
39a6d0e6
...
...
@@ -15,7 +15,15 @@ wget -P /tmp \
export
PATH
=
lib/conda/bin:
$PATH
lib/conda/bin/python3
-m
pip
install
nvidia-pyindex
conda
env
create
--name
=
${
ENV_NAME
}
-f
environment.yml
source
activate
${
ENV_NAME
}
source
scripts/activate_conda_env.sh
echo
"Attempting to install FlashAttention"
git clone https://github.com/HazyResearch/flash-attention
CUR_DIR
=
$PWD
cd
flash-attention
git checkout 5b838a8bef
python3 setup.py
install
cd
$CUR_DIR
# Install DeepMind's OpenMM patch
OPENFOLD_DIR
=
$PWD
...
...
@@ -24,15 +32,18 @@ pushd lib/conda/envs/$ENV_NAME/lib/python3.7/site-packages/ \
&&
popd
# Download folding resources
wget
-
q
-P
openfold/resources
\
wget
-
-no-check-certificate
-P
openfold/resources
\
https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
# Certain tests need access to this file
mkdir
-p
tests/test_data/alphafold/common
ln
-rs
openfold/resources/stereo_chemical_props.txt tests/test_data/alphafold/common
# Download pretrained openfold weights
scripts/download_alphafold_params.sh openfold/resources
echo
"Downloading OpenFold parameters..."
bash scripts/download_openfold_params.sh openfold/resources
echo
"Downloading AlphaFold parameters..."
bash scripts/download_alphafold_params.sh openfold/resources
# Decompress test data
gunzip
tests/test_data/sample_feats.pickle.gz
scripts/precompute_alignments.py
View file @
39a6d0e6
...
...
@@ -227,7 +227,7 @@ if __name__ == "__main__":
)
add_data_args
(
parser
)
parser
.
add_argument
(
"--raise_errors"
,
type
=
bool
,
default
=
False
,
"--raise_errors"
,
action
=
"store_true"
,
default
=
False
,
help
=
"Whether to crash on parsing errors"
)
parser
.
add_argument
(
...
...
scripts/prep_mmseqs_dbs.sh
View file @
39a6d0e6
...
...
@@ -23,12 +23,12 @@ DOWNLOAD_DIR="$1"
ROOT_DIR
=
"
${
DOWNLOAD_DIR
}
/mmseqs_dbs"
mkdir
-p
$ROOT_DIR
for
f
in
$(
ls
${
DOWNLOAD_DIR
}
/
*
.tar
.gz
)
for
f
in
$(
ls
${
DOWNLOAD_DIR
}
/
*
.tar
*
)
do
tar
--extract
--verbose
--file
=
"
${
f
}
"
\
--directory
=
$ROOT_DIR
rm
"
${
f
}
"
BASENAME
=
"
$(
basename
{
f%%.
*
}
)
"
BASENAME
=
"
$(
basename
$
{
f
%%.*
}
)
"
DB_NAME
=
"
${
BASENAME
}
_db"
OLD_PWD
=
$(
pwd
)
cd
$ROOT_DIR
...
...
scripts/slurm_scripts/run_uniclust30_search.sh
0 → 100755
View file @
39a6d0e6
#!/bin/bash
# Generates uniclust30 all-against-all alignments on a SLURM cluster.
# Thanks to Milot Mirdita for help & feedback on this script.
# Abort on the first command that fails.
set -e

if [[ $# != 3 ]]; then
    echo "usage: ./run_uniclust30_search.sh <uniclust30_path> <scratch_dir> <out_dir>"
    # Exit non-zero: a bare `exit` would return the status of the echo above
    # and report the usage error as success.
    exit 1
fi

UNICLUST_PATH=$1
SCRATCH_DIR_BN=$2
OUT_DIR=$3

CPUS_PER_TASK=4
MAX_SIZE=10000000000 # 10GB

# One scratch directory per node, distinguished by the node ID.
SCRATCH_DIR="${SCRATCH_DIR_BN}_${SLURM_NODEID}"

mkdir -p ${SCRATCH_DIR}
mkdir -p ${OUT_DIR}

# copy database to local ssd
DB_BN=$(basename $UNICLUST_PATH)
DB_DIR="/dev/shm/uniclust30"
mkdir -p $DB_DIR
cp ${UNICLUST_PATH}*.ff* $DB_DIR
DB="${DB_DIR}/${DB_BN}"

# Collect the directory names already present in the zip archives in OUT_DIR
# so that they can be filtered out of the index.
for f in $(ls $OUT_DIR/*.zip)
do
    zipinfo -1 $f '*/' | awk -F / '{print $(NF-1)}' >> ${DB_DIR}/already_searched.txt
done

python3 filter_ffindex.py ${DB}_a3m.ffindex ${DB_DIR}/already_searched.txt ${DB_DIR}/filtered_a3m.ffindex

# Give this node its share of the filtered index: line-based chunk number
# SLURM_NODEID + 1 out of SLURM_JOB_NUM_NODES chunks.
TARGET="${DB}_a3m_${SLURM_NODEID}.ffindex"
split -n "l/$((SLURM_NODEID + 1))/${SLURM_JOB_NUM_NODES}" "${DB_DIR}/filtered_a3m.ffindex" > $TARGET
# Initializes a token pool on file descriptor 3 with $1 tokens.
#
# A FIFO named after this shell's PID is created, opened for reading and
# writing on fd 3, and then unlinked (the open descriptor keeps it usable).
# Each token is the 3-character string "000".
open_sem() {
    mkfifo pipe-$$
    exec 3<>pipe-$$
    rm pipe-$$
    local i=$1
    for ((; i>0; i--)); do
        printf %s 000 >&3
    done
}
# run the given command asynchronously and pop/push tokens
#
# A 3-character token is read from fd 3 before the command is started. Every
# token is either an initial "000" or the zero-padded exit status a finished
# command pushed back, so a non-zero token makes the script exit with that
# status.
run_with_lock() {
    local x
    # this read waits until there is something to read
    read -u 3 -n 3 x && ((0==x)) || exit $x
    (
        ( "$@"; )
        # push the return code of the command to the semaphore
        printf '%.3d' $? >&3
    )&
}
# Runs hhblits on one entry of the a3m database.
#
# dd extracts LEN bytes starting at byte offset OFF from ${DB}_a3m.ffdata
# (ibs=1 makes skip/count byte-granular) and pipes them to hhblits, which
# writes its alignment to ${SCRATCH_DIR}/${KEY}/uniclust30.a3m.
#
# NOTE: KEY, OFF and LEN are read from the enclosing shell's variables; the
# function does not look at its positional arguments.
task() {
    dd if="${DB}_a3m.ffdata" ibs=1 skip="${OFF}" count="${LEN}" status=none | \
        hhblits \
        -i stdin \
        -oa3m "${SCRATCH_DIR}/${KEY}/uniclust30.a3m" \
        -v 0 \
        -o /dev/null \
        -cpu $CPUS_PER_TASK \
        -d $DB \
        -n 3 \
        -e 0.001
}
# Archives the contents of SCRATCH_DIR into a randomly named zip file in
# OUT_DIR and then removes the subdirectories of SCRATCH_DIR.
#
# NOTE(review): the size check is commented out and replaced by a condition
# that is always true, so SIZE is computed but never used and every call
# waits for all background jobs and zips. Confirm whether this is intended
# before restoring the MAX_SIZE comparison; the final call of the script
# relies on the archive being written unconditionally.
zip_or_not() {
    SIZE=$(du -hbs $SCRATCH_DIR | sed 's/|/ /' | awk '{print $1}')
    #if [[ "$SIZE" -gt "$MAX_SIZE" ]]
    if [[ "2" -gt "1" ]]
    then
        # Let every running search finish before its output is archived.
        wait
        RANDOM_NAME=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 32)
        zip -r "${OUT_DIR}/${RANDOM_NAME}.zip" $SCRATCH_DIR
        find $SCRATCH_DIR -mindepth 1 -type d -exec rm -rf {} +
    fi
}
# Number of tokens in the pool: available processors divided by the CPUs
# given to each hhblits run.
N=$(($(nproc)/${CPUS_PER_TASK}))
open_sem $N
# Each line of this node's index supplies a key, a byte offset and a length.
while read -r KEY OFF LEN; do
    PROT_DIR="${SCRATCH_DIR}/${KEY}"
    # An existing directory means this key was already started; skip it.
    if [[ -d $PROT_DIR ]]
    then
        continue
    fi

    mkdir -p $PROT_DIR
    run_with_lock task "${KEY}" "${OFF}" "${LEN}"

    zip_or_not
done < $TARGET

# Wait for the remaining background jobs, then archive whatever is left.
wait
zip_or_not
wait
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment