Commit 15cd3506 authored by mashun1's avatar mashun1
Browse files

Merge branch 'dtk24.04.1'

parents 24e633dc 19085464
......@@ -184,13 +184,6 @@ class NotebookUtilsTest(parameterized.TestCase):
[np.array([], dtype=templates.TEMPLATE_FEATURES[feat_name]).dtype
for feat_name in template_features])
def test_get_pae_json(self):
  """get_pae_json rounds PAE values to one decimal and embeds max_pae."""
  # Expected serialisation: compact JSON, values rounded to one decimal place.
  expected_json = (
      '[{"predicted_aligned_error":[[0.0,13.1],[20.1,0.0]],'
      '"max_predicted_aligned_error":31.75}]'
  )
  pae_matrix = np.array([[0.01, 13.12345], [20.0987, 0.0]])
  actual_json = notebook_utils.get_pae_json(pae=pae_matrix, max_pae=31.75)
  self.assertEqual(actual_json, expected_json)
def test_check_cell_execution_order_correct(self):
notebook_utils.check_cell_execution_order({1, 2}, 3)
......
......@@ -27,19 +27,10 @@ from alphafold.relax import utils
import ml_collections
import numpy as np
import jax
try:
# openmm >= 7.6
import openmm
from openmm import unit
from openmm import app as openmm_app
from openmm.app.internal.pdbstructure import PdbStructure
except ImportError:
# openmm < 7.6
from simtk import openmm
from simtk import unit
from simtk.openmm import app as openmm_app
from simtk.openmm.app.internal.pdbstructure import PdbStructure
import openmm
from openmm import unit
from openmm import app as openmm_app
from openmm.app.internal.pdbstructure import PdbStructure
ENERGY = unit.kilocalories_per_mole
......@@ -101,7 +92,7 @@ def _openmm_minimize(
_add_restraints(system, pdb, stiffness, restraint_set, exclude_residues)
integrator = openmm.LangevinIntegrator(0, 0.01, 0.0)
platform = openmm.Platform.getPlatformByName("HIP" if use_gpu else "CPU")
platform = openmm.Platform.getPlatformByName("CUDA" if use_gpu else "CPU")
simulation = openmm_app.Simulation(
pdb.topology, system, integrator, platform)
simulation.context.setPositions(pdb.positions)
......@@ -497,7 +488,7 @@ def run_pipeline(
else:
pdb_string = ret["min_pdb"]
# Calculation of violations can cause CUDA errors for some JAX versions.
with jax.default_device(jax.devices("cpu")[0]):
with jax.default_device(jax.local_devices(backend="cpu")[0]):
ret.update(get_violation_metrics(prot))
ret.update({
"num_exclusions": len(exclude_residues),
......
......@@ -20,8 +20,8 @@ cases like removing chains of length one (see clean_structure).
import io
import pdbfixer
from simtk.openmm import app
from simtk.openmm.app import element
from openmm import app
from openmm.app import element
def fix_pdb(pdbfile, alterations_info):
......
......@@ -17,7 +17,7 @@ import io
from absl.testing import absltest
from alphafold.relax import cleanup
from simtk.openmm.app.internal import pdbstructure
from openmm.app.internal import pdbstructure
def _pdb_to_structure(pdb_str):
......
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Single source of truth for the AlphaFold version."""
__version__ = '2.3.2'
......@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
ARG CUDA=11.1.1
FROM nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu18.04
ARG CUDA=12.2.2
FROM nvidia/cuda:${CUDA}-cudnn8-runtime-ubuntu20.04
# FROM directive resets ARGS, so we specify again (the value is retained if
# previously set).
ARG CUDA
......@@ -53,14 +53,11 @@ RUN wget -q -P /tmp \
# Install conda packages.
ENV PATH="/opt/conda/bin:$PATH"
RUN conda install -qy conda==4.13.0 \
&& conda install -y -c conda-forge \
openmm=7.5.1 \
cudatoolkit==${CUDA_VERSION} \
pdbfixer \
pip \
python=3.8 \
&& conda clean --all --force-pkgs-dirs --yes
ENV LD_LIBRARY_PATH="/opt/conda/lib:$LD_LIBRARY_PATH"
RUN conda install -qy conda==24.1.2 pip python=3.11 \
&& conda install -y -c nvidia cuda=${CUDA_VERSION} \
&& conda install -y -c conda-forge openmm=8.0.0 pdbfixer \
&& conda clean --all --force-pkgs-dirs --yes
COPY . /app/alphafold
RUN wget -q -P /app/alphafold/alphafold/common/ \
......@@ -70,17 +67,16 @@ RUN wget -q -P /app/alphafold/alphafold/common/ \
RUN pip3 install --upgrade pip --no-cache-dir \
&& pip3 install -r /app/alphafold/requirements.txt --no-cache-dir \
&& pip3 install --upgrade --no-cache-dir \
jax==0.3.25 \
jaxlib==0.3.25+cuda11.cudnn805 \
jax==0.4.26 \
jaxlib==0.4.26+cuda12.cudnn89 \
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
# Apply OpenMM patch.
WORKDIR /opt/conda/lib/python3.8/site-packages
RUN patch -p0 < /app/alphafold/docker/openmm.patch
# Add SETUID bit to the ldconfig binary so that non-root users can run it.
RUN chmod u+s /sbin/ldconfig.real
# Currently needed to avoid undefined_symbol error.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libffi.so.7 /opt/conda/lib/libffi.so.7
# We need to run `ldconfig` first to ensure GPUs are visible, due to some quirk
# with Debian. See https://github.com/NVIDIA/nvidia-docker/issues/1399 for
# details.
......
image.png

130 KB

......@@ -8,11 +8,11 @@
"source": [
"# AlphaFold Colab\n",
"\n",
"This Colab notebook allows you to easily predict the structure of a protein using a slightly simplified version of [AlphaFold v2.3.1](https://doi.org/10.1038/s41586-021-03819-2). \n",
"This Colab notebook allows you to easily predict the structure of a protein using a slightly simplified version of [AlphaFold v2.3.2](https://doi.org/10.1038/s41586-021-03819-2). \n",
"\n",
"**Differences to AlphaFold v2.3.1**\n",
"**Differences to AlphaFold v2.3.2**\n",
"\n",
"In comparison to AlphaFold v2.3.1, this Colab notebook uses **no templates (homologous structures)** and a selected portion of the [BFD database](https://bfd.mmseqs.com/). We have validated these changes on several thousand recent PDB structures. While accuracy will be near-identical to the full AlphaFold system on many targets, a small fraction have a large drop in accuracy due to the smaller MSA and lack of templates. For best reliability, we recommend instead using the [full open source AlphaFold](https://github.com/deepmind/alphafold/), or the [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).\n",
"In comparison to AlphaFold v2.3.2, this Colab notebook uses **no templates (homologous structures)** and a selected portion of the [BFD database](https://bfd.mmseqs.com/). We have validated these changes on several thousand recent PDB structures. While accuracy will be near-identical to the full AlphaFold system on many targets, a small fraction have a large drop in accuracy due to the smaller MSA and lack of templates. For best reliability, we recommend instead using the [full open source AlphaFold](https://github.com/deepmind/alphafold/), or the [AlphaFold Protein Structure Database](https://alphafold.ebi.ac.uk/).\n",
"\n",
"**This Colab has a small drop in average accuracy for multimers compared to a local AlphaFold installation; for full multimer accuracy, it is highly recommended to run [AlphaFold locally](https://github.com/deepmind/alphafold#running-alphafold).** Moreover, AlphaFold-Multimer requires searching for an MSA for every unique sequence in the complex, hence it is substantially slower. If your notebook times out due to a slow multimer MSA search, we recommend either using Colab Pro or running AlphaFold locally.\n",
"\n",
......@@ -90,7 +90,7 @@
" with tqdm.notebook.tqdm(total=100, bar_format=TQDM_BAR_FORMAT) as pbar:\n",
" with io.capture_output() as captured:\n",
" # Uninstall default Colab version of TF.\n",
" %shell pip uninstall -y tensorflow\n",
" %shell pip uninstall -y tensorflow keras\n",
"\n",
" %shell sudo apt install --quiet --yes hmmer\n",
" pbar.update(6)\n",
......@@ -109,10 +109,10 @@
"\n",
" PATH=%env PATH\n",
" %env PATH=/opt/conda/bin:{PATH}\n",
" %shell conda install -qy conda==4.13.0 \\\n",
" %shell conda install -qy conda==24.1.2 \\\n",
" \u0026\u0026 conda install -qy -c conda-forge \\\n",
" python=3.9 \\\n",
" openmm=7.5.1 \\\n",
" python=3.10 \\\n",
" openmm=8.0.0 \\\n",
" pdbfixer\n",
" pbar.update(80)\n",
"\n",
......@@ -160,19 +160,14 @@
" %shell pip3 install -r ./alphafold/requirements.txt\n",
" # Run setup.py to install only AlphaFold.\n",
" %shell pip3 install --no-dependencies ./alphafold\n",
" %shell pip3 install --upgrade pyopenssl\n",
" %shell pip3 install pyopenssl==22.0.0\n",
" pbar.update(10)\n",
"\n",
" # Apply OpenMM patch.\n",
" %shell pushd /opt/conda/lib/python3.9/site-packages/ \u0026\u0026 \\\n",
" patch -p0 \u003c /content/alphafold/docker/openmm.patch \u0026\u0026 \\\n",
" popd\n",
"\n",
" # Make sure stereo_chemical_props.txt is in all locations where it could be searched for.\n",
" %shell mkdir -p /content/alphafold/alphafold/common\n",
" %shell cp -f /content/stereo_chemical_props.txt /content/alphafold/alphafold/common\n",
" %shell mkdir -p /opt/conda/lib/python3.9/site-packages/alphafold/common/\n",
" %shell cp -f /content/stereo_chemical_props.txt /opt/conda/lib/python3.9/site-packages/alphafold/common/\n",
" %shell mkdir -p /opt/conda/lib/python3.10/site-packages/alphafold/common/\n",
" %shell cp -f /content/stereo_chemical_props.txt /opt/conda/lib/python3.10/site-packages/alphafold/common/\n",
"\n",
" # Load parameters\n",
" %shell mkdir --parents \"{PARAMS_DIR}\"\n",
......@@ -197,7 +192,7 @@
"\n",
"# Make sure everything we need is on the path.\n",
"import sys\n",
"sys.path.append('/opt/conda/lib/python3.9/site-packages')\n",
"sys.path.append('/opt/conda/lib/python3.10/site-packages')\n",
"sys.path.append('/content/alphafold')\n",
"\n",
"executed_cells.add(2)"
......@@ -374,6 +369,7 @@
"from alphafold.data import pipeline_multimer\n",
"from alphafold.data.tools import jackhmmer\n",
"\n",
"from alphafold.common import confidence\n",
"from alphafold.common import protein\n",
"\n",
"from alphafold.relax import relax\n",
......@@ -786,7 +782,7 @@
"pae_output_path = os.path.join(output_dir, 'predicted_aligned_error.json')\n",
"if pae_outputs:\n",
" # Save predicted aligned error in the same format as the AF EMBL DB.\n",
" pae_data = notebook_utils.get_pae_json(pae=pae, max_pae=max_pae.item())\n",
" pae_data = confidence.pae_json(pae=pae, max_pae=max_pae.item())\n",
" with open(pae_output_path, 'w') as f:\n",
" f.write(pae_data)\n",
"\n",
......
absl-py==1.0.0
biopython==1.79
chex==0.0.7
dm-haiku==0.0.9
dm-tree==0.1.6
# docker==5.0.0
chex==0.1.86
dm-haiku==0.0.12
dm-tree==0.1.8
docker==5.0.0
immutabledict==2.0.0
# jax==0.3.25
jax==0.4.26
ml-collections==0.1.0
numpy==1.21.6
pandas==1.3.4
scipy==1.7.0
# tensorflow-cpu==2.11.0
numpy==1.24.3
pandas==2.0.3
scipy==1.11.1
tensorflow-cpu==2.16.1
absl-py==1.0.0
biopython==1.79
chex==0.1.86
dm-tree==0.1.8
docker==5.0.0
immutabledict==2.0.0
ml-collections==0.1.0
numpy==1.24.3
pandas==2.0.3
scipy==1.11.1
tensorflow-cpu==2.16.1
matplotlib
cython
\ No newline at end of file
......@@ -22,11 +22,12 @@ import random
import shutil
import sys
import time
from typing import Any, Dict, Mapping, Union
from typing import Any, Dict, Union
from absl import app
from absl import flags
from absl import logging
from alphafold.common import confidence
from alphafold.common import protein
from alphafold.common import residue_constants
from alphafold.data import pipeline
......@@ -60,7 +61,6 @@ flags.DEFINE_list(
'basename is used to name the output directories for each prediction.')
flags.DEFINE_string('data_dir', None, 'Path to directory of supporting data.')
flags.DEFINE_list('model_names', None, 'Names of models to use.')
flags.DEFINE_string('output_dir', None, 'Path to a directory that will '
'store the results.')
flags.DEFINE_string('jackhmmer_binary_path', shutil.which('jackhmmer'),
......@@ -172,6 +172,63 @@ def _jnp_to_np(output: Dict[str, Any]) -> Dict[str, Any]:
return output
def _save_confidence_json_file(
    plddt: np.ndarray, output_dir: str, model_name: str
) -> None:
  """Serialize the confidence values to JSON and write them to a file.

  The file is written to `<output_dir>/confidence_<model_name>.json`.

  Args:
    plddt: Array handed to `confidence.confidence_json` for serialization.
    output_dir: Directory to which the file is saved.
    model_name: Name of a model; becomes part of the output file name.
  """
  # Build the JSON payload first so that no file is created if this fails.
  json_payload = confidence.confidence_json(plddt)
  target_path = os.path.join(output_dir, f'confidence_{model_name}.json')
  with open(target_path, 'w') as out_file:
    out_file.write(json_payload)
def _save_mmcif_file(
    prot: protein.Protein,
    output_dir: str,
    model_name: str,
    file_id: str,
    model_type: str,
) -> None:
  """Create an mmCIF string for a protein and save it to a file.

  The file is written to `<output_dir>/<model_name>.cif`.

  Args:
    prot: Protein object.
    output_dir: Directory to which files are saved.
    model_name: Name of a model; used as the file name stem.
    file_id: The file ID (usually the PDB ID) to be used in the mmCIF.
    model_type: Monomer or multimer.
  """
  # Convert first so that no file is created if the conversion fails.
  cif_text = protein.to_mmcif(prot, file_id, model_type)
  target_path = os.path.join(output_dir, f'{model_name}.cif')
  with open(target_path, 'w') as out_file:
    out_file.write(cif_text)
def _save_pae_json_file(
    pae: np.ndarray, max_pae: float, output_dir: str, model_name: str
) -> None:
  """Serialize PAE data to JSON and save it to a file.

  The file is written to `<output_dir>/pae_<model_name>.json`. This function
  does not check whether PAE data exists; it always writes what it is given.

  Args:
    pae: The n_res x n_res PAE array.
    max_pae: The maximum possible PAE value.
    output_dir: Directory to which files are saved.
    model_name: Name of a model; becomes part of the output file name.
  """
  # Build the JSON payload first so that no file is created if this fails.
  json_payload = confidence.pae_json(pae, max_pae)
  target_path = os.path.join(output_dir, f'pae_{model_name}.json')
  with open(target_path, 'w') as out_file:
    out_file.write(json_payload)
def predict_structure(
fasta_path: str,
fasta_name: str,
......@@ -181,7 +238,10 @@ def predict_structure(
amber_relaxer: relax.AmberRelaxation,
benchmark: bool,
random_seed: int,
models_to_relax: ModelsToRelax):
models_to_relax: ModelsToRelax,
model_type: str,
):
"""Predicts structure using AlphaFold for the given sequence."""
logging.info('Predicting %s', fasta_name)
timings = {}
......@@ -194,11 +254,6 @@ def predict_structure(
# Get features.
t_0 = time.time()
# features_output_path = os.path.join(output_dir, 'features.pkl')
# if os.path.exists(features_output_path):
# feature_dict = pickle.load(open(features_output_path, 'rb'))
# else:
feature_dict = data_pipeline.process(
input_fasta_path=fasta_path,
msa_output_dir=msa_output_dir)
......@@ -219,6 +274,7 @@ def predict_structure(
num_models = len(model_runners)
for model_index, (model_name, model_runner) in enumerate(
model_runners.items()):
logging.info('Running model %s on %s', model_name, fasta_name)
t_0 = time.time()
model_random_seed = model_index + random_seed * num_models
......@@ -246,8 +302,17 @@ def predict_structure(
model_name, fasta_name, t_diff)
plddt = prediction_result['plddt']
_save_confidence_json_file(plddt, output_dir, model_name)
ranking_confidences[model_name] = prediction_result['ranking_confidence']
if (
'predicted_aligned_error' in prediction_result
and 'max_predicted_aligned_error' in prediction_result
):
pae = prediction_result['predicted_aligned_error']
max_pae = prediction_result['max_predicted_aligned_error']
_save_pae_json_file(pae, float(max_pae), output_dir, model_name)
# Remove jax dependency from results.
np_prediction_result = _jnp_to_np(dict(prediction_result))
......@@ -272,6 +337,14 @@ def predict_structure(
with open(unrelaxed_pdb_path, 'w') as f:
f.write(unrelaxed_pdbs[model_name])
_save_mmcif_file(
prot=unrelaxed_protein,
output_dir=output_dir,
model_name=f'unrelaxed_{model_name}',
file_id=str(model_index),
model_type=model_type,
)
# Rank by model confidence.
ranked_order = [
model_name for model_name, confidence in
......@@ -303,6 +376,15 @@ def predict_structure(
with open(relaxed_output_path, 'w') as f:
f.write(relaxed_pdb_str)
relaxed_protein = protein.from_pdb_string(relaxed_pdb_str)
_save_mmcif_file(
prot=relaxed_protein,
output_dir=output_dir,
model_name=f'relaxed_{model_name}',
file_id='0',
model_type=model_type,
)
# Write out relaxed PDBs in rank order.
for idx, model_name in enumerate(ranked_order):
ranked_output_path = os.path.join(output_dir, f'ranked_{idx}.pdb')
......@@ -312,6 +394,19 @@ def predict_structure(
else:
f.write(unrelaxed_pdbs[model_name])
if model_name in relaxed_pdbs:
protein_instance = protein.from_pdb_string(relaxed_pdbs[model_name])
else:
protein_instance = protein.from_pdb_string(unrelaxed_pdbs[model_name])
_save_mmcif_file(
prot=protein_instance,
output_dir=output_dir,
model_name=f'ranked_{idx}',
file_id=str(idx),
model_type=model_type,
)
ranking_output_path = os.path.join(output_dir, 'ranking_debug.json')
with open(ranking_output_path, 'w') as f:
label = 'iptm+ptm' if 'iptm' in prediction_result else 'plddts'
......@@ -348,6 +443,7 @@ def main(argv):
should_be_set=not use_small_bfd)
run_multimer_system = 'multimer' in FLAGS.model_preset
model_type = 'Multimer' if run_multimer_system else 'Monomer'
_check_flag('pdb70_database_path', 'model_preset',
should_be_set=not run_multimer_system)
_check_flag('pdb_seqres_database_path', 'model_preset',
......@@ -362,6 +458,7 @@ def main(argv):
# Check for duplicate FASTA file names.
fasta_names = [pathlib.Path(p).stem for p in FLAGS.fasta_paths]
if len(fasta_names) != len(set(fasta_names)):
raise ValueError('All FASTA paths must have a unique basename.')
......@@ -414,8 +511,7 @@ def main(argv):
data_pipeline = monomer_data_pipeline
model_runners = {}
# model_names = config.MODEL_PRESETS[FLAGS.model_preset]
model_names = FLAGS.model_names
model_names = config.MODEL_PRESETS[FLAGS.model_preset]
for model_name in model_names:
model_config = config.model_config(model_name)
if run_multimer_system:
......@@ -456,7 +552,9 @@ def main(argv):
amber_relaxer=amber_relaxer,
benchmark=FLAGS.benchmark,
random_seed=random_seed,
models_to_relax=FLAGS.models_to_relax)
models_to_relax=FLAGS.models_to_relax,
model_type=model_type,
)
if __name__ == '__main__':
......@@ -464,7 +562,6 @@ if __name__ == '__main__':
'fasta_paths',
'output_dir',
'data_dir',
'model_names',
'uniref90_database_path',
'mgnify_database_path',
'template_mmcif_dir',
......
......@@ -24,6 +24,8 @@ import mock
import numpy as np
# Internal import (7716).
TEST_DATA_DIR = 'alphafold/common/testdata/'
class RunAlphafoldTest(parameterized.TestCase):
......@@ -58,7 +60,18 @@ class RunAlphafoldTest(parameterized.TestCase):
'max_predicted_aligned_error': np.array(0.),
}
model_runner_mock.multimer_mode = False
amber_relaxer_mock.process.return_value = ('RELAXED', None, [1., 0., 0.])
with open(
os.path.join(
absltest.get_default_test_srcdir(), TEST_DATA_DIR, 'glucagon.pdb'
)
) as f:
pdb_string = f.read()
amber_relaxer_mock.process.return_value = (
pdb_string,
None,
[1.0, 0.0, 0.0],
)
out_dir = self.create_tempdir().full_path
fasta_path = os.path.join(out_dir, 'target.fasta')
......@@ -76,7 +89,8 @@ class RunAlphafoldTest(parameterized.TestCase):
benchmark=False,
random_seed=0,
models_to_relax=models_to_relax,
)
model_type='Monomer',
)
base_output_files = os.listdir(out_dir)
self.assertIn('target.fasta', base_output_files)
......@@ -84,11 +98,22 @@ class RunAlphafoldTest(parameterized.TestCase):
target_output_files = os.listdir(os.path.join(out_dir, 'test'))
expected_files = [
'features.pkl', 'msas', 'ranked_0.pdb', 'ranking_debug.json',
'result_model1.pkl', 'timings.json', 'unrelaxed_model1.pdb',
'confidence_model1.json',
'features.pkl',
'msas',
'pae_model1.json',
'ranked_0.cif',
'ranked_0.pdb',
'ranking_debug.json',
'result_model1.pkl',
'timings.json',
'unrelaxed_model1.cif',
'unrelaxed_model1.pdb',
]
if models_to_relax == run_alphafold.ModelsToRelax.ALL:
expected_files.extend(['relaxed_model1.pdb', 'relax_metrics.json'])
expected_files.extend(
['relaxed_model1.cif', 'relaxed_model1.pdb', 'relax_metrics.json']
)
with open(os.path.join(out_dir, 'test', 'relax_metrics.json')) as f:
relax_metrics = json.loads(f.read())
self.assertDictEqual({'model1': {'remaining_violations': [1.0, 0.0, 0.0],
......
download_dir=/data/alphafold2
#!/bin/bash
download_dir=/home/chuangkj/alphafold2_jax/downloads
python3 run_alphafold.py \
--fasta_paths=monomer.fasta \
--fasta_paths=rcsb_pdb_8U23.fasta \
--output_dir=./ \
--use_precomputed_msas=false \
--data_dir=$download_dir \
--model_names="model_1" \
--uniref90_database_path=$download_dir/uniref90/uniref90.fasta \
--mgnify_database_path=$download_dir/mgnify/mgy_clusters_2022_05.fa \
--mgnify_database_path=$download_dir/mgnify/mgy_clusters_2018_12.fa \
--bfd_database_path=$download_dir/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--uniref30_database_path=$download_dir/uniref30/UniRef30_2021_03 \
--uniref30_database_path=$download_dir/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--pdb70_database_path=$download_dir/pdb70/pdb70 \
--template_mmcif_dir=$download_dir/pdb_mmcif/mmcif_files \
--obsolete_pdbs_path=$download_dir/pdb_mmcif/obsolete.dat \
--max_template_date=2020-05-14 \
--max_template_date=2024-05-14 \
--model_preset=monomer \
--db_preset=full_dbs \
--models_to_relax=best \
......
download_dir=/data/alphafold2
python3 run_alphafold.py \
#!/bin/bash
download_dir=/home/chuangkj/alphafold2_jax/downloads
python3 run_alphafold.py \
--fasta_paths=multimer.fasta \
--output_dir=./ \
--use_precomputed_msas=false \
--num_multimer_predictions_per_model=1 \
--data_dir=$download_dir \
--model_names="model_1_multimer_v3" \
--uniref90_database_path=$download_dir/uniref90/uniref90.fasta \
--mgnify_database_path=$download_dir/mgnify/mgy_clusters_2022_05.fa \
--mgnify_database_path=$download_dir/mgnify/mgy_clusters_2018_12.fa \
--bfd_database_path=$download_dir/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--uniref30_database_path=$download_dir/uniref30/UniRef30_2021_03 \
--uniprot_database_path=$download_dir/uniprot/uniprot.fasta \
--uniref30_database_path=$download_dir/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--uniprot_database_path=$download_dir/uniprot/uniprot_trembl.fasta \
--pdb_seqres_database_path=$download_dir/pdb_seqres/pdb_seqres.txt \
--template_mmcif_dir=$download_dir/pdb_mmcif/mmcif_files \
--obsolete_pdbs_path=$download_dir/pdb_mmcif/obsolete.dat \
--max_template_date=2020-05-14 \
--max_template_date=2024-05-14 \
--model_preset=multimer \
--db_preset=full_dbs \
--models_to_relax=best \
--use_gpu_relax=false \
--benchmark=true
......@@ -32,12 +32,12 @@ fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
# Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/mgy_clusters.fa.gz
# https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# pushd "${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
# popd
popd
......@@ -45,24 +45,9 @@ echo " * rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/mmCIF/ (Eur
echo " * ftp.pdbj.org::ftp_data/structures/divided/mmCIF/ (Asia)"
echo "or see https://www.wwpdb.org/ftp/pdb-ftp-sites for more download options."
mkdir --parents "${RAW_DIR}"
# rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \
# rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
# "${RAW_DIR}"
# (Asia)
rsync -rlpt -v -z --info=progress2 --delete \
ftp.pdbj.org::ftp_data/structures/divided/mmCIF/ \
"${RAW_DIR}"
# (Europe)
# rsync -rlpt -v -z --info=progress2 --delete \
# rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/mmCIF/ \
# "${RAW_DIR}"
# fast
# rsync --recursive --links --perms --times --compress --info=progress2 --delete \
# data.pdbj.org::ftp_data/structures/divided/mmCIF/ "${RAW_DIR}"
rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \
rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
"${RAW_DIR}"
echo "Unzipping all mmCIF files..."
find "${RAW_DIR}/" -type f -iname "*.gz" -exec gunzip {} +
......@@ -77,4 +62,4 @@ done
# Delete empty download directory structure.
find "${RAW_DIR}" -type d -empty -delete
aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${ROOT_DIR}"
aria2c "https://files.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${ROOT_DIR}"
......@@ -31,7 +31,7 @@ fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb_seqres"
SOURCE_URL="ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"
SOURCE_URL="https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
......
......@@ -36,6 +36,6 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# pushd "${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
# popd
popd
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment