Unverified Commit 19ce8406 authored by shenggan's avatar shenggan Committed by GitHub
Browse files

Align dataset with alphafold v2.3 (#140)

* update to alphafold 2.3 dataset

* fix uniprot dataset
parent da5fe1a6
...@@ -126,9 +126,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \ ...@@ -126,9 +126,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \ --output_dir ./ \
--gpus 2 \ --gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \ --uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \ --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \ --pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \ --uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \ --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \ --jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \ --hhblits_binary_path `which hhblits` \
...@@ -150,9 +150,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \ ...@@ -150,9 +150,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \ --output_dir ./ \
--gpus 2 \ --gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \ --uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \ --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \ --pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \ --uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \ --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \ --jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \ --hhblits_binary_path `which hhblits` \
...@@ -173,9 +173,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \ ...@@ -173,9 +173,9 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \ --output_dir ./ \
--gpus 2 \ --gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \ --uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \ --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \ --pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \ --uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \ --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \ --jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \ --hhblits_binary_path `which hhblits` \
...@@ -194,11 +194,11 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \ ...@@ -194,11 +194,11 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--gpus 2 \ --gpus 2 \
--model_preset multimer \ --model_preset multimer \
--uniref90_database_path data/uniref90/uniref90.fasta \ --uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \ --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \ --pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \ --uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \ --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--uniprot_database_path data/uniprot/uniprot_sprot.fasta \ --uniprot_database_path data/uniprot/uniprot.fasta \
--pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt \ --pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt \
--param_path data/params/params_model_1_multimer.npz \ --param_path data/params/params_model_1_multimer.npz \
--model_name model_1_multimer \ --model_name model_1_multimer \
......
...@@ -270,7 +270,7 @@ class AlignmentRunner: ...@@ -270,7 +270,7 @@ class AlignmentRunner:
uniref90_database_path: Optional[str] = None, uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None, mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None, bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None, uniref30_database_path: Optional[str] = None,
pdb70_database_path: Optional[str] = None, pdb70_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None, use_small_bfd: Optional[bool] = None,
no_cpus: Optional[int] = None, no_cpus: Optional[int] = None,
...@@ -296,14 +296,14 @@ class AlignmentRunner: ...@@ -296,14 +296,14 @@ class AlignmentRunner:
Path to BFD database. Depending on the value of use_small_bfd, Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be one of hhblits_binary_path or jackhmmer_binary_path must be
provided. provided.
uniclust30_database_path: uniref30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is Path to uniref30. Searched alongside BFD if use_small_bfd is
false. false.
pdb70_database_path: pdb70_database_path:
Path to pdb70 database. Path to pdb70 database.
use_small_bfd: use_small_bfd:
Whether to search the BFD database alone with jackhmmer or Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits. in conjunction with uniref30 with hhblits.
no_cpus: no_cpus:
The number of CPUs available for alignment. By default, all The number of CPUs available for alignment. By default, all
CPUs are used. CPUs are used.
...@@ -367,7 +367,7 @@ class AlignmentRunner: ...@@ -367,7 +367,7 @@ class AlignmentRunner:
) )
self.jackhmmer_small_bfd_runner = None self.jackhmmer_small_bfd_runner = None
self.hhblits_bfd_uniclust_runner = None self.hhblits_bfd_uniref_runner = None
if(bfd_database_path is not None): if(bfd_database_path is not None):
if use_small_bfd: if use_small_bfd:
self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer( self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
...@@ -377,9 +377,9 @@ class AlignmentRunner: ...@@ -377,9 +377,9 @@ class AlignmentRunner:
) )
else: else:
dbs = [bfd_database_path] dbs = [bfd_database_path]
if(uniclust30_database_path is not None): if(uniref30_database_path is not None):
dbs.append(uniclust30_database_path) dbs.append(uniref30_database_path)
self.hhblits_bfd_uniclust_runner = hhblits.HHBlits( self.hhblits_bfd_uniref_runner = hhblits.HHBlits(
binary_path=hhblits_binary_path, binary_path=hhblits_binary_path,
databases=dbs, databases=dbs,
n_cpu=no_cpus, n_cpu=no_cpus,
...@@ -446,14 +446,14 @@ class AlignmentRunner: ...@@ -446,14 +446,14 @@ class AlignmentRunner:
bfd_out_path = os.path.join(output_dir, "small_bfd_hits.sto") bfd_out_path = os.path.join(output_dir, "small_bfd_hits.sto")
with open(bfd_out_path, "w") as f: with open(bfd_out_path, "w") as f:
f.write(jackhmmer_small_bfd_result["sto"]) f.write(jackhmmer_small_bfd_result["sto"])
elif(self.hhblits_bfd_uniclust_runner is not None): elif(self.hhblits_bfd_uniref_runner is not None):
hhblits_bfd_uniclust_result = ( hhblits_bfd_uniref_result = (
self.hhblits_bfd_uniclust_runner.query(fasta_path) self.hhblits_bfd_uniref_runner.query(fasta_path)
) )
if output_dir is not None: if output_dir is not None:
bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m") bfd_out_path = os.path.join(output_dir, "bfd_uniref_hits.a3m")
with open(bfd_out_path, "w") as f: with open(bfd_out_path, "w") as f:
f.write(hhblits_bfd_uniclust_result["a3m"]) f.write(hhblits_bfd_uniref_result["a3m"])
...@@ -470,7 +470,7 @@ class AlignmentRunnerMultimer: ...@@ -470,7 +470,7 @@ class AlignmentRunnerMultimer:
uniref90_database_path: Optional[str] = None, uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None, mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None, bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None, uniref30_database_path: Optional[str] = None,
uniprot_database_path: Optional[str] = None, uniprot_database_path: Optional[str] = None,
pdb_seqres_database_path: Optional[str] = None, pdb_seqres_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None, use_small_bfd: Optional[bool] = None,
...@@ -495,12 +495,12 @@ class AlignmentRunnerMultimer: ...@@ -495,12 +495,12 @@ class AlignmentRunnerMultimer:
Path to BFD database. Depending on the value of use_small_bfd, Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be one of hhblits_binary_path or jackhmmer_binary_path must be
provided. provided.
uniclust30_database_path: uniref30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is Path to uniref30. Searched alongside BFD if use_small_bfd is
false. false.
use_small_bfd: use_small_bfd:
Whether to search the BFD database alone with jackhmmer or Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits. in conjunction with uniref30 with hhblits.
no_cpus: no_cpus:
The number of CPUs available for alignment. By default, all The number of CPUs available for alignment. By default, all
CPUs are used. CPUs are used.
...@@ -559,7 +559,7 @@ class AlignmentRunnerMultimer: ...@@ -559,7 +559,7 @@ class AlignmentRunnerMultimer:
) )
self.jackhmmer_small_bfd_runner = None self.jackhmmer_small_bfd_runner = None
self.hhblits_bfd_uniclust_runner = None self.hhblits_bfd_uniref_runner = None
if(bfd_database_path is not None): if(bfd_database_path is not None):
if use_small_bfd: if use_small_bfd:
self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer( self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
...@@ -569,9 +569,9 @@ class AlignmentRunnerMultimer: ...@@ -569,9 +569,9 @@ class AlignmentRunnerMultimer:
) )
else: else:
dbs = [bfd_database_path] dbs = [bfd_database_path]
if(uniclust30_database_path is not None): if(uniref30_database_path is not None):
dbs.append(uniclust30_database_path) dbs.append(uniref30_database_path)
self.hhblits_bfd_uniclust_runner = hhblits.HHBlits( self.hhblits_bfd_uniref_runner = hhblits.HHBlits(
binary_path=hhblits_binary_path, binary_path=hhblits_binary_path,
databases=dbs, databases=dbs,
n_cpu=no_cpus, n_cpu=no_cpus,
...@@ -647,10 +647,10 @@ class AlignmentRunnerMultimer: ...@@ -647,10 +647,10 @@ class AlignmentRunnerMultimer:
msa_out_path=bfd_out_path, msa_out_path=bfd_out_path,
msa_format="sto", msa_format="sto",
) )
elif(self.hhblits_bfd_uniclust_runner is not None): elif(self.hhblits_bfd_uniref_runner is not None):
bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m") bfd_out_path = os.path.join(output_dir, "bfd_uniref_hits.a3m")
hhblits_bfd_uniclust_result = run_msa_tool( hhblits_bfd_uniref_result = run_msa_tool(
msa_runner=self.hhblits_bfd_uniclust_runner, msa_runner=self.hhblits_bfd_uniref_runner,
fasta_path=fasta_path, fasta_path=fasta_path,
msa_out_path=bfd_out_path, msa_out_path=bfd_out_path,
msa_format="a3m", msa_format="a3m",
......
...@@ -16,7 +16,7 @@ class FastFoldDataWorkFlow: ...@@ -16,7 +16,7 @@ class FastFoldDataWorkFlow:
uniref90_database_path: Optional[str] = None, uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None, mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None, bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None, uniref30_database_path: Optional[str] = None,
pdb70_database_path: Optional[str] = None, pdb70_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None, use_small_bfd: Optional[bool] = None,
no_cpus: Optional[int] = None, no_cpus: Optional[int] = None,
...@@ -154,13 +154,13 @@ class FastFoldDataWorkFlow: ...@@ -154,13 +154,13 @@ class FastFoldDataWorkFlow:
if not self.use_small_bfd: if not self.use_small_bfd:
# Run HHBlits on BFD # Run HHBlits on BFD
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.a3m") bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.a3m")
# generate workflow for STEP4 # generate workflow for STEP4
bfd_node = self.hhblits_bfd_factory.gen_node(fasta_path, bfd_out_path) bfd_node = self.hhblits_bfd_factory.gen_node(fasta_path, bfd_out_path)
else: else:
# Run Jackhmmer on small_bfd # Run Jackhmmer on small_bfd
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.a3m") bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.a3m")
# generate workflow for STEP4_2 # generate workflow for STEP4_2
bfd_node = self.jackhmmer_small_bfd_factory.gen_node(fasta_path, bfd_out_path, output_format="sto") bfd_node = self.jackhmmer_small_bfd_factory.gen_node(fasta_path, bfd_out_path, output_format="sto")
......
...@@ -19,7 +19,7 @@ class FastFoldMultimerDataWorkFlow: ...@@ -19,7 +19,7 @@ class FastFoldMultimerDataWorkFlow:
uniref90_database_path: Optional[str] = None, uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None, mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None, bfd_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None, uniref30_database_path: Optional[str] = None,
uniprot_database_path: Optional[str] = None, uniprot_database_path: Optional[str] = None,
pdb_seqres_database_path: Optional[str] = None, pdb_seqres_database_path: Optional[str] = None,
use_small_bfd: Optional[bool] = None, use_small_bfd: Optional[bool] = None,
...@@ -171,13 +171,13 @@ class FastFoldMultimerDataWorkFlow: ...@@ -171,13 +171,13 @@ class FastFoldMultimerDataWorkFlow:
if not self.use_small_bfd: if not self.use_small_bfd:
# Run HHBlits on BFD # Run HHBlits on BFD
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.a3m") bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.a3m")
# generate workflow for STEP4 # generate workflow for STEP4
bfd_node = self.hhblits_bfd_factory.gen_node(fasta_path, bfd_out_path) bfd_node = self.hhblits_bfd_factory.gen_node(fasta_path, bfd_out_path)
else: else:
# Run Jackhmmer on small_bfd # Run Jackhmmer on small_bfd
bfd_out_path = os.path.join(alignment_dir, "bfd_uniclust_hits.sto") bfd_out_path = os.path.join(alignment_dir, "bfd_uniref_hits.sto")
# generate workflow for STEP4_2 # generate workflow for STEP4_2
bfd_node = self.jackhmmer_small_bfd_factory.gen_node(fasta_path, bfd_out_path, output_format="sto") bfd_node = self.jackhmmer_small_bfd_factory.gen_node(fasta_path, bfd_out_path, output_format="sto")
......
...@@ -71,7 +71,7 @@ def add_data_args(parser: argparse.ArgumentParser): ...@@ -71,7 +71,7 @@ def add_data_args(parser: argparse.ArgumentParser):
default=None, default=None,
) )
parser.add_argument( parser.add_argument(
'--uniclust30_database_path', '--uniref30_database_path',
type=str, type=str,
default=None, default=None,
) )
...@@ -181,7 +181,7 @@ def inference_multimer_model(args): ...@@ -181,7 +181,7 @@ def inference_multimer_model(args):
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniref30_database_path=args.uniref30_database_path,
uniprot_database_path=args.uniprot_database_path, uniprot_database_path=args.uniprot_database_path,
pdb_seqres_database_path=args.pdb_seqres_database_path, pdb_seqres_database_path=args.pdb_seqres_database_path,
use_small_bfd=(args.bfd_database_path is None), use_small_bfd=(args.bfd_database_path is None),
...@@ -196,7 +196,7 @@ def inference_multimer_model(args): ...@@ -196,7 +196,7 @@ def inference_multimer_model(args):
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniref30_database_path=args.uniref30_database_path,
uniprot_database_path=args.uniprot_database_path, uniprot_database_path=args.uniprot_database_path,
pdb_seqres_database_path=args.pdb_seqres_database_path, pdb_seqres_database_path=args.pdb_seqres_database_path,
use_small_bfd=(args.bfd_database_path is None), use_small_bfd=(args.bfd_database_path is None),
...@@ -341,7 +341,7 @@ def inference_monomer_model(args): ...@@ -341,7 +341,7 @@ def inference_monomer_model(args):
assert args.bfd_database_path is not None assert args.bfd_database_path is not None
else: else:
assert args.bfd_database_path is not None assert args.bfd_database_path is not None
assert args.uniclust30_database_path is not None assert args.uniref30_database_path is not None
data_processor = data_pipeline.DataPipeline(template_featurizer=template_featurizer,) data_processor = data_pipeline.DataPipeline(template_featurizer=template_featurizer,)
...@@ -385,7 +385,7 @@ def inference_monomer_model(args): ...@@ -385,7 +385,7 @@ def inference_monomer_model(args):
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniref30_database_path=args.uniref30_database_path,
pdb70_database_path=args.pdb70_database_path, pdb70_database_path=args.pdb70_database_path,
use_small_bfd=use_small_bfd, use_small_bfd=use_small_bfd,
no_cpus=args.cpus, no_cpus=args.cpus,
...@@ -401,7 +401,7 @@ def inference_monomer_model(args): ...@@ -401,7 +401,7 @@ def inference_monomer_model(args):
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniref30_database_path=args.uniref30_database_path,
pdb70_database_path=args.pdb70_database_path, pdb70_database_path=args.pdb70_database_path,
use_small_bfd=use_small_bfd, use_small_bfd=use_small_bfd,
no_cpus=args.cpus, no_cpus=args.cpus,
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
python inference.py target.fasta data/pdb_mmcif/mmcif_files \ python inference.py target.fasta data/pdb_mmcif/mmcif_files \
--output_dir ./ \ --output_dir ./ \
--uniref90_database_path data/uniref90/uniref90.fasta \ --uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \ --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \ --pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \ --uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \ --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \ --jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \ --hhblits_binary_path `which hhblits` \
......
...@@ -8,16 +8,16 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files \ ...@@ -8,16 +8,16 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files \
--output_dir ./ \ --output_dir ./ \
--gpus 1 \ --gpus 1 \
--uniref90_database_path data/uniref90/uniref90.fasta \ --uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \ --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
--pdb70_database_path data/pdb70/pdb70 \ --pdb70_database_path data/pdb70/pdb70 \
--pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt \ --pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt \
--uniprot_database_path data/uniprot/uniprot_sprot.fasta \ --uniprot_database_path data/uniprot/uniprot.fasta \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \ --uniref30_database_path data/uniref30/UniRef30_2021_03 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \ --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \ --jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \ --hhblits_binary_path `which hhblits` \
--hhsearch_binary_path `which hhsearch` \ --hhsearch_binary_path `which hhsearch` \
--kalign_binary_path `which kalign` \ --kalign_binary_path `which kalign` \
--model_preset multimer \ --model_preset multimer \
--param_path data/params/params_model_1_multimer_v2.npz \ --param_path data/params/params_model_1_multimer_v3.npz \
--model_name model_1_multimer \ --model_name model_1_multimer \
...@@ -59,17 +59,17 @@ bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}" ...@@ -59,17 +59,17 @@ bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB mmCIF files..." echo "Downloading PDB mmCIF files..."
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}" bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniclust30..." echo "Downloading Uniref30..."
bash "${SCRIPT_DIR}/download_uniclust30.sh" "${DOWNLOAD_DIR}" bash "${SCRIPT_DIR}/download_uniref30.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniref90..." echo "Downloading Uniref90..."
bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}" bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}"
# UniProt and PDB SeqRes for multimer version # UniProt and PDB SeqRes for multimer version
# echo "Downloading UniProt..." echo "Downloading UniProt..."
# bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}" bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}"
# echo "Downloading PDB SeqRes..." echo "Downloading PDB SeqRes..."
# bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}" bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}"
echo "All data downloaded." echo "All data downloaded."
...@@ -31,7 +31,7 @@ fi ...@@ -31,7 +31,7 @@ fi
DOWNLOAD_DIR="$1" DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/params" ROOT_DIR="${DOWNLOAD_DIR}/params"
SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar" SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar"
BASENAME=$(basename "${SOURCE_URL}") BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}" mkdir --parents "${ROOT_DIR}"
......
...@@ -32,8 +32,8 @@ fi ...@@ -32,8 +32,8 @@ fi
DOWNLOAD_DIR="$1" DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mgnify" ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
# Mirror of: # Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz # ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz" SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz"
BASENAME=$(basename "${SOURCE_URL}") BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}" mkdir --parents "${ROOT_DIR}"
......
...@@ -36,3 +36,7 @@ BASENAME=$(basename "${SOURCE_URL}") ...@@ -36,3 +36,7 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}" mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# Keep only protein sequences.
grep --after-context=1 --no-group-separator '>.* mol:protein' "${ROOT_DIR}/pdb_seqres.txt" > "${ROOT_DIR}/pdb_seqres_filtered.txt"
mv "${ROOT_DIR}/pdb_seqres_filtered.txt" "${ROOT_DIR}/pdb_seqres.txt"
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
# Downloads and unzips the Uniclust30 database for AlphaFold. # Downloads and unzips the UniRef30 database for AlphaFold.
# #
# Usage: bash download_uniclust30.sh /path/to/download/directory # Usage: bash download_uniref30.sh /path/to/download/directory
set -e set -e
if [[ $# -eq 0 ]]; then if [[ $# -eq 0 ]]; then
...@@ -30,10 +30,10 @@ if ! command -v aria2c &> /dev/null ; then ...@@ -30,10 +30,10 @@ if ! command -v aria2c &> /dev/null ; then
fi fi
DOWNLOAD_DIR="$1" DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniclust30" ROOT_DIR="${DOWNLOAD_DIR}/uniref30"
# Mirror of: # Mirror of:
# http://wwwuser.gwdg.de/~compbiol/uniclust/2018_08/uniclust30_2018_08_hhsuite.tar.gz # https://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz" SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/UniRef30_2021_03.tar.gz"
BASENAME=$(basename "${SOURCE_URL}") BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}" mkdir --parents "${ROOT_DIR}"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment