Unverified Commit 19ce8406 authored by shenggan's avatar shenggan Committed by GitHub
Browse files

Align datasets with AlphaFold v2.3 (#140)

* Update to the AlphaFold v2.3 datasets

* Fix the UniProt dataset
parent da5fe1a6
......@@ -126,9 +126,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \
--gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
......@@ -150,9 +150,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \
--gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
......@@ -173,9 +173,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \
--gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
......@@ -194,11 +194,11 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--gpus 2 \
--model_preset multimer \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--uniprot_database_path data/uniprot/uniprot_sprot.fasta \
--uniprot_database_path data/uniprot/uniprot.fasta \
--pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt \
--param_path data/params/params_model_1_multimer.npz \
--model_name model_1_multimer \
......
......@@ -270,7 +270,7 @@ class AlignmentRunner:
uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None,
uniref30_database_path: Optional[str] = None,
pdb70_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None,
no_cpus: Optional[int] = None,
......@@ -296,14 +296,14 @@ class AlignmentRunner:
Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be
provided.
uniclust30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is
uniref30_database_path:
Path to uniref30. Searched alongside BFD if use_small_bfd is
false.
pdb70_database_path:
Path to pdb70 database.
use_small_bfd:
Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits.
in conjunction with uniref30 with hhblits.
no_cpus:
The number of CPUs available for alignment. By default, all
CPUs are used.
......@@ -367,7 +367,7 @@ class AlignmentRunner:
)
self.jackhmmer_small_bfd_runner = None
self.hhblits_bfd_uniclust_runner = None
self.hhblits_bfd_uniref_runner = None
if(bfd_database_path is not None):
if use_small_bfd:
self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
......@@ -377,9 +377,9 @@ class AlignmentRunner:
)
else:
dbs = [bfd_database_path]
if(uniclust30_database_path is not None):
dbs.append(uniclust30_database_path)
self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
if(uniref30_database_path is not None):
dbs.append(uniref30_database_path)
self.hhblits_bfd_uniref_runner = hhblits.HHBlits(
binary_path=hhblits_binary_path,
databases=dbs,
n_cpu=no_cpus,
......@@ -446,14 +446,14 @@ class AlignmentRunner:
bfd_out_path = os.path.join(output_dir, "small_bfd_hits.sto")
with open(bfd_out_path, "w") as f:
f.write(jackhmmer_small_bfd_result["sto"])
elif(self.hhblits_bfd_uniclust_runner is not None):
hhblits_bfd_uniclust_result = (
self.hhblits_bfd_uniclust_runner.query(fasta_path)
elif(self.hhblits_bfd_uniref_runner is not None):
hhblits_bfd_uniref_result = (
self.hhblits_bfd_uniref_runner.query(fasta_path)
)
if output_dir is not None:
bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m")
bfd_out_path = os.path.join(output_dir, "bfd_uniref_hits.a3m")
with open(bfd_out_path, "w") as f:
f.write(hhblits_bfd_uniclust_result["a3m"])
f.write(hhblits_bfd_uniref_result["a3m"])
......@@ -470,7 +470,7 @@ class AlignmentRunnerMultimer:
uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None,
uniref30_database_path: Optional[str] = None,
uniprot_database_path: Optional[str] = None,
pdb_seqres_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None,
......@@ -495,12 +495,12 @@ class AlignmentRunnerMultimer:
Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be
provided.
uniclust30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is
uniref30_database_path:
Path to uniref30. Searched alongside BFD if use_small_bfd is
false.
use_small_bfd:
Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits.
in conjunction with uniref30 with hhblits.
no_cpus:
The number of CPUs available for alignment. By default, all
CPUs are used.
......@@ -559,7 +559,7 @@ class AlignmentRunnerMultimer:
)
self.jackhmmer_small_bfd_runner = None
self.hhblits_bfd_uniclust_runner = None
self.hhblits_bfd_uniref_runner = None
if(bfd_database_path is not None):
if use_small_bfd:
self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
......@@ -569,9 +569,9 @@ class AlignmentRunnerMultimer:
)
else:
dbs = [bfd_database_path]
if(uniclust30_database_path is not None):
dbs.append(uniclust30_database_path)
self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
if(uniref30_database_path is not None):
dbs.append(uniref30_database_path)
self.hhblits_bfd_uniref_runner = hhblits.HHBlits(
binary_path=hhblits_binary_path,
databases=dbs,
n_cpu=no_cpus,
......@@ -647,10 +647,10 @@ class AlignmentRunnerMultimer:
msa_out_path=bfd_out_path,
msa_format="sto",
)
elif(self.hhblits_bfd_uniclust_runner is not None):
bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m")
hhblits_bfd_uniclust_result = run_msa_tool(
msa_runner=self.hhblits_bfd_uniclust_runner,
elif(self.hhblits_bfd_uniref_runner is not None):
bfd_out_path = os.path.join(output_dir, "bfd_uniref_hits.a3m")
hhblits_bfd_uniref_result = run_msa_tool(
msa_runner=self.hhblits_bfd_uniref_runner,
fasta_path=fasta_path,
msa_out_path=bfd_out_path,
msa_format="a3m",
......
......@@ -16,7 +16,7 @@ class FastFoldDataWorkFlow:
uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None,
uniref30_database_path: Optional[str] = None,
pdb70_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None,
no_cpus: Optional[int] = None,
......@@ -154,13 +154,13 @@ class FastFoldDataWorkFlow:
if not self.use_small_bfd:
# Run HHBlits on BFD
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.a3m")
bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.a3m")
# generate workflow for STEP4
bfd_node = self.hhblits_bfd_factory.gen_node(fasta_path, bfd_out_path)
else:
# Run Jackhmmer on small_bfd
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.a3m")
bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.a3m")
# generate workflow for STEP4_2
bfd_node = self.jackhmmer_small_bfd_factory.gen_node(fasta_path, bfd_out_path, output_format="sto")
......
......@@ -19,7 +19,7 @@ class FastFoldMultimerDataWorkFlow:
uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None,
uniref30_database_path: Optional[str] = None,
uniprot_database_path: Optional[str] = None,
pdb_seqres_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None,
......@@ -171,13 +171,13 @@ class FastFoldMultimerDataWorkFlow:
if not self.use_small_bfd:
# Run HHBlits on BFD
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.a3m")
bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.a3m")
# generate workflow for STEP4
bfd_node = self.hhblits_bfd_factory.gen_node(fasta_path, bfd_out_path)
else:
# Run Jackhmmer on small_bfd
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.sto")
bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.sto")
# generate workflow for STEP4_2
bfd_node = self.jackhmmer_small_bfd_factory.gen_node(fasta_path, bfd_out_path, output_format="sto")
......
......@@ -71,7 +71,7 @@ def add_data_args(parser: argparse.ArgumentParser):
default=None,
)
parser.add_argument(
'--uniclust30_database_path',
'--uniref30_database_path',
type=str,
default=None,
)
......@@ -181,7 +181,7 @@ def inference_multimer_model(args):
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
uniref30_database_path=args.uniref30_database_path,
uniprot_database_path=args.uniprot_database_path,
pdb_seqres_database_path=args.pdb_seqres_database_path,
use_small_bfd=(args.bfd_database_path is None),
......@@ -196,7 +196,7 @@ def inference_multimer_model(args):
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
uniref30_database_path=args.uniref30_database_path,
uniprot_database_path=args.uniprot_database_path,
pdb_seqres_database_path=args.pdb_seqres_database_path,
use_small_bfd=(args.bfd_database_path is None),
......@@ -341,7 +341,7 @@ def inference_monomer_model(args):
assert args.bfd_database_path is not None
else:
assert args.bfd_database_path is not None
assert args.uniclust30_database_path is not None
assert args.uniref30_database_path is not None
data_processor = data_pipeline.DataPipeline(template_featurizer=template_featurizer,)
......@@ -385,7 +385,7 @@ def inference_monomer_model(args):
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
uniref30_database_path=args.uniref30_database_path,
pdb70_database_path=args.pdb70_database_path,
use_small_bfd=use_small_bfd,
no_cpus=args.cpus,
......@@ -401,7 +401,7 @@ def inference_monomer_model(args):
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
uniref30_database_path=args.uniref30_database_path,
pdb70_database_path=args.pdb70_database_path,
use_small_bfd=use_small_bfd,
no_cpus=args.cpus,
......
......@@ -7,9 +7,9 @@
python inference.py target.fasta data/pdb_mmcif/mmcif_files \
--output_dir ./ \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
......
......@@ -8,16 +8,16 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files \
--output_dir ./ \
--gpus 1 \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \
--pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt \
--uniprot_database_path data/uniprot/uniprot_sprot.fasta \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniprot_database_path data/uniprot/uniprot.fasta \
--uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
--hhsearch_binary_path `which hhsearch` \
--kalign_binary_path `which kalign` \
--model_preset multimer \
--param_path data/params/params_model_1_multimer_v2.npz \
--param_path data/params/params_model_1_multimer_v3.npz \
--model_name model_1_multimer \
......@@ -59,17 +59,17 @@ bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB mmCIF files..."
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniclust30..."
bash "${SCRIPT_DIR}/download_uniclust30.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniref30..."
bash "${SCRIPT_DIR}/download_uniref30.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniref90..."
bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}"
# UniProt and PDB SeqRes for multimer version
# echo "Downloading UniProt..."
# bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}"
echo "Downloading UniProt..."
bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}"
# echo "Downloading PDB SeqRes..."
# bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB SeqRes..."
bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}"
echo "All data downloaded."
......@@ -31,7 +31,7 @@ fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/params"
SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar"
SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
......
......@@ -32,8 +32,8 @@ fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
# Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz"
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
......
......@@ -36,3 +36,7 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# Keep only protein sequences.
grep --after-context=1 --no-group-separator '>.* mol:protein' "${ROOT_DIR}/pdb_seqres.txt" > "${ROOT_DIR}/pdb_seqres_filtered.txt"
mv "${ROOT_DIR}/pdb_seqres_filtered.txt" "${ROOT_DIR}/pdb_seqres.txt"
......@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the Uniclust30 database for AlphaFold.
# Downloads and unzips the UniRef30 database for AlphaFold.
#
# Usage: bash download_uniclust30.sh /path/to/download/directory
# Usage: bash download_uniref30.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
......@@ -30,10 +30,10 @@ if ! command -v aria2c &> /dev/null ; then
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniclust30"
ROOT_DIR="${DOWNLOAD_DIR}/uniref30"
# Mirror of:
# http://wwwuser.gwdg.de/~compbiol/uniclust/2018_08/uniclust30_2018_08_hhsuite.tar.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz"
# https://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/UniRef30_2021_03.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment