Docker testing

3c89c1c7 · Brian Loyal · 148afc98 · 3c89c1c7 · 3c89c1c7 · 3c89c1c7
Commit 3c89c1c7 authored Jun 15, 2022 by Brian Loyal
Show whitespace changes
Inline Side-by-side

Showing with 234 additions and 20 deletions

Dockerfile Dockerfile +29 -20

ec2-testing.txt ec2-testing.txt +172 -0

run_batch_job.sh run_batch_job.sh +33 -0

No files found.
--- a/Dockerfile
+++ b/Dockerfile
 FROM nvidia/cuda:10.2-cudnn8-runtime-ubuntu18.04
-RUN apt-key del 7fa2af80
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
-RUN apt-get update && apt-get install -y wget cuda-minimal-build-10-2 git
+# Install OS-level build/runtime tools. unzip is listed explicitly because the
+# AWS CLI install step below calls it; without this it is presumably only
+# pulled in as a Recommends of zip (TODO confirm). The apt package lists are
+# removed in the same layer so they are not kept in the image.
+RUN apt-get update \
-RUN wget -P /tmp \
+    && apt-get install -y wget cuda-minimal-build-10-2 git unzip zip \
-    "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
-    && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
-    && rm /tmp/Miniconda3-latest-Linux-x86_64.sh
-ENV PATH /opt/conda/bin:$PATH
-COPY environment.yml /opt/openfold/environment.yml
+# Install Mambaforge into /opt/mamba from a pinned release. The "latest"
+# download URL is not reproducible, and newer Miniforge releases no longer
+# publish Mambaforge installers, so an explicit version is required.
+ARG MAMBAFORGE_VERSION=4.12.0-2
+RUN wget -q -P /tmp \
+  "https://github.com/conda-forge/miniforge/releases/download/${MAMBAFORGE_VERSION}/Mambaforge-${MAMBAFORGE_VERSION}-Linux-x86_64.sh" \
+  && bash "/tmp/Mambaforge-${MAMBAFORGE_VERSION}-Linux-x86_64.sh" -b -p /opt/mamba \
+  && rm "/tmp/Mambaforge-${MAMBAFORGE_VERSION}-Linux-x86_64.sh"
-# installing into the base environment since the docker container wont do anything other than run openfold
+ENV PATH="/opt/mamba/bin:$PATH"
-RUN conda env update -n base --file /opt/openfold/environment.yml && conda clean --all
+# Install AWS CLI v2. The archive is downloaded and unpacked under /tmp, and
+# both the archive and the extracted installer directory are deleted in the
+# same layer so neither is kept in the image.
+RUN wget -q -O /tmp/awscliv2.zip "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" \
+    && unzip -q /tmp/awscliv2.zip -d /tmp \
+    && /tmp/aws/install \
+    && rm -rf /tmp/awscliv2.zip /tmp/aws
+# Fetch OpenFold and pin the checkout to a known commit. --hard is required:
+# a plain `git reset <commit>` moves HEAD and the index but leaves the files on
+# disk at the tip of main, so the image would not contain the pinned code.
+# Directories not needed at runtime are removed in the same layer.
+RUN git clone --branch main --single-branch https://github.com/aqlaboratory/openfold.git /opt/openfold \
+    && git -C /opt/openfold reset --hard ec5619fc970e28e7b81ce452f5e08e7dd6a7cb31 \
+    && rm -rf /opt/openfold/imgs /opt/openfold/notebooks /opt/openfold/tests
+# Install the OpenFold dependencies into the base environment, then drop the
+# package caches in the same layer. --yes keeps the clean step explicitly
+# non-interactive instead of relying on how the confirmation prompt behaves
+# when no terminal is attached during the build.
+RUN mamba env update -n base --file /opt/openfold/environment.yml \
+    && mamba clean --all --yes
-COPY openfold /opt/openfold/openfold
-COPY scripts /opt/openfold/scripts
-COPY run_pretrained_openfold.py /opt/openfold/run_pretrained_openfold.py
-COPY train_openfold.py /opt/openfold/train_openfold.py
-COPY setup.py /opt/openfold/setup.py
-COPY lib/openmm.patch /opt/openfold/lib/openmm.patch
 RUN wget -q -P /opt/openfold/openfold/resources \
    https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
-RUN patch -p0 -d /opt/conda/lib/python3.7/site-packages/ < /opt/openfold/lib/openmm.patch
+RUN patch -p0 -d /opt/mamba/lib/python3.7/site-packages/ < /opt/openfold/lib/openmm.patch
+COPY run_batch_job.sh /opt/openfold
 WORKDIR /opt/openfold 
 RUN python3 setup.py install
\ No newline at end of file
--- a/ec2-testing.txt
+++ b/ec2-testing.txt
+python3 run_pretrained_openfold.py \
+    target.fasta \
+    data/pdb_mmcif/mmcif_files/ \
+    --uniref90_database_path data/uniref90/uniref90.fasta \
+    --mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
+    --pdb70_database_path data/pdb70/pdb70 \
+    --uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
+    --output_dir ./ \
+    --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
+    --model_device cuda:1 \
+    --jackhmmer_binary_path lib/conda/envs/openfold_venv/bin/jackhmmer \
+    --hhblits_binary_path lib/conda/envs/openfold_venv/bin/hhblits \
+    --hhsearch_binary_path lib/conda/envs/openfold_venv/bin/hhsearch \
+    --kalign_binary_path lib/conda/envs/openfold_venv/bin/kalign
+docker run \
+--gpus all \
+-v $PWD/:/data \
+-v /mnt/alphafold_database/:/database \
+-ti openfold:latest \
+python3 /opt/openfold/run_pretrained_openfold.py \
+/data/input.fasta \
+/database/pdb_mmcif/mmcif_files/ \
+--uniref90_database_path /database/uniref90/uniref90.fasta \
+--mgnify_database_path /database/mgnify/mgy_clusters_2018_12.fa \
+--pdb70_database_path /database/pdb70/pdb70 \
+--uniclust30_database_path /database/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
+--output_dir /data \
+--bfd_database_path /database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
+--model_device cuda:0 \
+--jackhmmer_binary_path /opt/conda/bin/jackhmmer \
+--hhblits_binary_path /opt/conda/bin/hhblits \
+--hhsearch_binary_path /opt/conda/bin/hhsearch \
+--kalign_binary_path /opt/conda/bin/kalign \
+--param_path /database/params/params_model_1.npz
+# T1083
+docker run \
+--rm \
+--gpus all \
+-v /home/ec2-user/data:/data \
+-v /fsx/:/database/ \
+-e CUDA_VISIBLE_DEVICES=0 \
+617302ff1b5a \
+python3 /opt/openfold/run_pretrained_openfold.py \
+/data/T1083/input.fasta \
+/database/pdb_mmcif/mmcif_files/ \
+--use_precomputed_alignments /data \
+--output_dir /data/T1083 \
+--model_device cuda:0 \
+--param_path /database/params/params_model_1.npz
+# T1070
+docker run \
+--rm \
+--gpus all \
+-v /home/ec2-user/data:/data \
+-v /fsx/:/database/ \
+-e CUDA_VISIBLE_DEVICES=0 \
+617302ff1b5a \
+python3 /opt/openfold/run_pretrained_openfold.py \
+/data/T1070/input.fasta \
+/database/pdb_mmcif/mmcif_files/ \
+--use_precomputed_alignments /data \
+--output_dir /data/T1070 \
+--model_device cuda:0 \
+--param_path /database/params/params_model_1.npz
+# Notes: The current implementation of OpenFold assumes that the sequences 
+# in the input FASTA file are each in a single line. Sequences that span multiple
+# lines are interpreted as sequence IDs and cause errors.
+# https://github.com/aqlaboratory/openfold/issues/89
+# Also, it seems like it's not possible to run the MSA step with the reduced DB at this time
+### MMseqs 2 
+python3 scripts/precompute_alignments_mmseqs.py \
+    /home/ubuntu/data/T1082/input.fasta \
+    /fsx/mmseqs_dbs \
+    uniref30_2103_db \
+    /home/ubuntu/data/T1082 \
+    /home/ubuntu/openfold/mmseqs/bin/mmseqs \
+    --hhsearch_binary_path /home/ubuntu/openfold/lib/conda/envs/openfold_venv/bin/hhsearch \
+    --pdb70 pdb70 \
+    --env_db colabfold_envdb_202108_db
+bash scripts/colabfold_search.sh \
+    /home/ubuntu/openfold/mmseqs/bin/mmseqs \
+    /home/ubuntu/data/T1080/input.fasta \
+    /fsx/mmseqs_dbs \
+    /home/ubuntu/data/T1080/output \
+    uniref30_2103_db \
+    "" \
+    colabfold_envdb_202108_db \
+    1 0 1 1 0
+# Note: this seems to take a fair amount of time to finish (30 minutes - not faster than hhblits).
+# Need to retry using the memory recommendations from https://colabfold.mmseqs.com/,
+# e.g. 768 GiB RAM. An x2gd.12xlarge looks to be the most cost-effective instance for this ($4/hr), or else
+# an r5.24xlarge if we need an AMD64 (x86_64) instance ($6/hr).
+sudo wget -P /tmp "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
+    && sudo bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
+    && sudo rm /tmp/Miniconda3-latest-Linux-x86_64.sh
+# 2022-06-14
+docker run \
+--gpus all \
+-v /home/ec2-user/data:/data \
+-v /fsx/:/database/ \
+-ti openfold:latest \
+python3 /opt/openfold/run_pretrained_openfold.py \
+/data/fasta_dir \
+/database/pdb_mmcif/mmcif_files/ \
+--use_precomputed_alignments /data/alignments/ \
+--output_dir /data \
+--model_device cuda:0 \
+--jax_param_path /database/params/params_model_1.npz
+python3 /opt/openfold/run_pretrained_openfold.py \
+/data/fasta_dir \
+/database/pdb_mmcif/mmcif_files/ \
+--use_precomputed_alignments /data/alignments/ \
+--output_dir /data \
+--model_device cuda:0 \
+--jax_param_path /database/params/params_model_1.npz
+docker run \
+--gpus all \
+-v /home/ec2-user/data:/data \
+-v /fsx/:/database/ \
+-ti openfold:latest \
+python3 /opt/openfold/run_pretrained_openfold.py \
+/data/fasta_dir \
+/database/pdb_mmcif/mmcif_files/ \
+--uniref90_database_path /database/uniref90/uniref90.fasta \
+--mgnify_database_path /database/mgnify/mgy_clusters_2018_12.fa \
+--pdb70_database_path /database/pdb70/pdb70 \
+--uniclust30_database_path /database/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
+--output_dir /data \
+--model_device cuda:0 \
+--jackhmmer_binary_path /opt/conda/bin/jackhmmer \
+--hhblits_binary_path /opt/conda/bin/hhblits \
+--hhsearch_binary_path /opt/conda/bin/hhsearch \
+--kalign_binary_path /opt/conda/bin/kalign \
+--jax_param_path /database/params/params_model_1.npz
+docker run --gpus all -v /home/ec2-user/data:/data -v /fsx/:/database/ -ti openfold:latest bash run_batch_job.sh \
+    s3://sagemaker-us-east-2-032243382548/openfold_testing/T1084.fasta \
+    /data/fasta_dir \
+    "python3 /opt/openfold/run_pretrained_openfold.py \
+    /data/fasta_dir \
+    /database/pdb_mmcif/mmcif_files/ \
+    --uniref90_database_path /database/uniref90/uniref90.fasta \
+    --mgnify_database_path /database/mgnify/mgy_clusters_2018_12.fa \
+    --pdb70_database_path /database/pdb70/pdb70 \
+    --uniclust30_database_path /database/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
+    --output_dir /data \
+    --model_device cuda:0 \
+    --jackhmmer_binary_path /opt/mamba/bin/jackhmmer \
+    --hhblits_binary_path /opt/mamba/bin/hhblits \
+    --hhsearch_binary_path /opt/mamba/bin/hhsearch \
+    --kalign_binary_path /opt/mamba/bin/kalign \
+    --jax_param_path /database/params/params_model_1.npz" \
+    /data \
+    s3://sagemaker-us-east-2-032243382548/openfold_testing/
\ No newline at end of file
--- a/run_batch_job.sh
+++ b/run_batch_job.sh
+#!/bin/bash
+################
+# Copy an input object from S3 to a local path, run a command, then copy the
+# command's output back to S3.
+#
+# Usage:
+#   run_batch_job.sh INPUT_SOURCE INPUT_DESTINATION SCRIPT OUTPUT_SOURCE OUTPUT_DESTINATION
+#
+#   INPUT_SOURCE        source passed to `aws s3 cp` for the input download
+#   INPUT_DESTINATION   destination passed to `aws s3 cp` for the input download
+#   SCRIPT              command line to run, passed as ONE quoted string
+#   OUTPUT_SOURCE       local file or directory to upload once SCRIPT finishes
+#   OUTPUT_DESTINATION  destination passed to `aws s3 cp` for the upload
+#
+# Example CMD (kept as a comment so that running this script does not
+# re-invoke itself; binary paths match the /opt/mamba prefix used by the image):
+#
+# ./run_batch_job.sh \
+#     s3://sagemaker-us-east-2-032243382548/openfold_testing/T1084.fasta \
+#     /data/fasta_dir \
+#     "python3 /opt/openfold/run_pretrained_openfold.py \
+#     /data/fasta_dir \
+#     /database/pdb_mmcif/mmcif_files/ \
+#     --uniref90_database_path /database/uniref90/uniref90.fasta \
+#     --mgnify_database_path /database/mgnify/mgy_clusters_2018_12.fa \
+#     --pdb70_database_path /database/pdb70/pdb70 \
+#     --uniclust30_database_path /database/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
+#     --output_dir /data \
+#     --model_device cuda:0 \
+#     --jackhmmer_binary_path /opt/mamba/bin/jackhmmer \
+#     --hhblits_binary_path /opt/mamba/bin/hhblits \
+#     --hhsearch_binary_path /opt/mamba/bin/hhsearch \
+#     --kalign_binary_path /opt/mamba/bin/kalign \
+#     --jax_param_path /database/params/params_model_1.npz" \
+#     /data \
+#     s3://sagemaker-us-east-2-032243382548/openfold_testing/
+################
+
+# Stop at the first failing step so that a failed download or a failed job is
+# reported through the exit status instead of being masked by the upload.
+set -euo pipefail
+
+if [ "$#" -ne 5 ]; then
+    echo "Usage: $0 INPUT_SOURCE INPUT_DESTINATION SCRIPT OUTPUT_SOURCE OUTPUT_DESTINATION" >&2
+    exit 1
+fi
+
+input_source=$1
+input_destination=$2
+script=$3
+output_source=$4
+output_destination=$5
+
+aws s3 cp "$input_source" "$input_destination"
+
+# Intentionally unquoted: the command string is word-split into the program
+# name and its arguments, exactly as it was before.
+# shellcheck disable=SC2086
+$script
+
+# `aws s3 cp` only uploads a directory when --recursive is given.
+if [ -d "$output_source" ]; then
+    aws s3 cp --recursive "$output_source" "$output_destination"
+else
+    aws s3 cp "$output_source" "$output_destination"
+fi
\ No newline at end of file