Merge branch 'main' of https://github.com/hpcaitech/FastFold

b14e47f4 · zhuwenwen · 490cb6f5 · 05681304 · b14e47f4 · b14e47f4
Commit b14e47f4 authored Apr 26, 2023 by zhuwenwen
20 changed files
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
+name: Build
+# Builds FastFold and runs the unit tests on a self-hosted GPU runner whenever
+# a pull request receives new commits or a label.
+on:
+  pull_request:
+    types: [synchronize, labeled]
+jobs:
+  build:
+    name: Build and Test FastFold
+    # Run only for non-draft PRs that target main of hpcaitech/FastFold and
+    # carry the 'Run Build and Test' label.
+    if: |
+        github.event.pull_request.draft == false &&
+        github.base_ref == 'main' &&
+        github.event.pull_request.base.repo.full_name == 'hpcaitech/FastFold' &&
+        contains( github.event.pull_request.labels.*.name, 'Run Build and Test')
+    runs-on: [self-hosted, gpu]
+    container:
+      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      options: --gpus all --rm -v /data/scratch/fastfold:/data/scratch/fastfold
+    timeout-minutes: 40
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          repository: hpcaitech/FastFold
+          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+      - name: Install FastFold
+        run: |
+          # Restore build artifacts cached by a previous run, if there are any.
+          [ ! -z "$(ls -A /github/home/fastfold_cache/)" ] && cp -r /github/home/fastfold_cache/* /__w/FastFold/FastFold/
+          pip install -r requirements/requirements.txt
+          pip install -e .
+          pip install -r requirements/test_requirements.txt
+          # The cache directory does not exist on the very first run; create it
+          # so that build/ and the *.so files land inside it with a stable layout.
+          mkdir -p /github/home/fastfold_cache
+          cp -r /__w/FastFold/FastFold/build /github/home/fastfold_cache/
+          cp /__w/FastFold/FastFold/*.so /github/home/fastfold_cache/
+      - name: Unit Testing
+        run: |
+          PYTHONPATH=$PWD pytest tests
+        env:
+          # Environment values are strings; quote to avoid implicit integer typing.
+          NCCL_SHM_DISABLE: "1"
--- a/.github/workflows/release_bdist.yml
+++ b/.github/workflows/release_bdist.yml
+name: Release bdist wheel
+# Manually triggered workflow: builds FastFold binary wheels for the requested
+# torch / CUDA versions and copies them to the private PyPI host via scp.
+on:
+  workflow_dispatch:
+    inputs:
+      torch_version:
+        type: string
+        description: torch version, separated by comma
+        required: true
+        default: "all"
+      cuda_version:
+        type: string
+        description: cuda version, separated by comma
+        required: true
+      github_ref:
+        type: string
+        description: Branch or Tag
+        default: 'main'
+        required: true
+jobs:
+  # Turns the comma-separated CUDA versions into a JSON matrix of container
+  # images ({"container": ["hpcaitech/cuda-conda:<version>", ...]}).
+  matrix_preparation:
+    name: Prepare Container List
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - id: set-matrix
+        env:
+          TORCH_VERSIONS: ${{ inputs.torch_version }}
+          CUDA_VERSIONS: ${{ inputs.cuda_version }}
+        run: |
+          echo "$TORCH_VERSIONS"
+          echo "$CUDA_VERSIONS"
+          IFS=','
+          DOCKER_IMAGE=()
+          # Intentionally unquoted: relies on IFS=',' to split the version list.
+          for cv in $CUDA_VERSIONS
+          do
+              DOCKER_IMAGE+=("\"hpcaitech/cuda-conda:${cv}\"")
+          done
+          container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" )
+          container="[${container}]"
+          echo "$container"
+          # The `::set-output` workflow command is deprecated; write the step
+          # output to the $GITHUB_OUTPUT file instead.
+          echo "matrix={\"container\":${container}}" >> "$GITHUB_OUTPUT"
+  # Builds the wheels once per container image produced above.
+  build:
+    name: Release bdist wheels
+    needs: matrix_preparation
+    # Restricted to the upstream repository and to a fixed list of maintainers.
+    if: github.repository == 'hpcaitech/FastFold' && contains(fromJson('["FrankLeeeee", "feifeibear", "Shenggan", "Gy-Lu"]'), github.actor)
+    runs-on: [self-hosted, gpu]
+    strategy:
+      fail-fast: false
+      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
+    container:
+      image: ${{ matrix.container }}
+      options: --gpus all --rm
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Copy scripts and checkout
+        run: |
+          # Copy the build scripts out of the tree before switching refs, since
+          # the requested ref may not contain them.
+          cp -r ./.github/workflows/scripts/build* ./
+          ln -s /github/home/pip_wheels ./pip_wheels
+          git checkout "$git_ref"
+        env:
+          git_ref: ${{ github.event.inputs.github_ref }}
+      - name: Build bdist wheel
+        run: |
+          pip install beautifulsoup4 requests packaging
+          python ./build_fastfold_wheel.py --torch_version "$TORCH_VERSIONS"
+        env:
+          TORCH_VERSIONS: ${{ inputs.torch_version }}
+      - name: 🚀 Deploy
+        uses: garygrossgarten/github-action-scp@release
+        with:
+          local: all_dist
+          remote: ${{ secrets.PRIVATE_PYPI_DIR }}
+          host: ${{ secrets.PRIVATE_PYPI_HOST }}
+          username: ${{ secrets.PRIVATE_PYPI_USER }}
+          password: ${{ secrets.PRIVATE_PYPI_PASSWD }}
\ No newline at end of file
--- a/.github/workflows/scripts/build_fastfold_wheel.py
+++ b/.github/workflows/scripts/build_fastfold_wheel.py
+import requests
+from bs4 import BeautifulSoup
+import argparse
+import os
+import subprocess
+from packaging import version
+from functools import cmp_to_key
+WHEEL_TEXT_ROOT_URL = 'https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels'
+RAW_TEXT_FILE_PREFIX = 'https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/torch_build/torch_wheels'
+CUDA_HOME = os.environ['CUDA_HOME']
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--torch_version', type=str)
+    return parser.parse_args()
+def get_cuda_bare_metal_version():
+    raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+    return bare_metal_major, bare_metal_minor
+def all_wheel_info():
+    page_text = requests.get(WHEEL_TEXT_ROOT_URL).text
+    soup = BeautifulSoup(page_text)
+    all_a_links = soup.find_all('a')
+    wheel_info = dict()
+    for a_link in all_a_links:
+        if 'cuda' in a_link.text and '.txt' in a_link.text:
+            filename = a_link.text
+            torch_version, cuda_version = filename.rstrip('.txt').split('-')
+            cuda_version = cuda_version.lstrip('cuda')
+            if float(cuda_version) < 11.1:
+                continue
+            if torch_version not in wheel_info:
+                wheel_info[torch_version] = dict()
+            wheel_info[torch_version][cuda_version] = dict()
+            file_text = requests.get(f'{RAW_TEXT_FILE_PREFIX}/{filename}').text
+            lines = file_text.strip().split('\n')
+            for line in lines:
+                parts = line.split('\t')
+                method, url, python_version = parts[:3]
+                if float(python_version) < 3.8 or method == "conda":
+                    continue
+                wheel_info[torch_version][cuda_version][python_version] = dict(url=url)
+    return wheel_info
+def build_fastfold(wheel_info):
+    cuda_version_major, cuda_version_minor = get_cuda_bare_metal_version()
+    cuda_version_on_host = f'{cuda_version_major}.{cuda_version_minor}'
+    for torch_version, cuda_versioned_wheel_info in wheel_info.items():
+        for cuda_version, python_versioned_wheel_info in cuda_versioned_wheel_info.items():
+            if cuda_version_on_host == cuda_version:
+                for python_version, wheel_info in python_versioned_wheel_info.items():
+                    url = wheel_info['url']
+                    filename = url.split('/')[-1].replace('%2B', '+')
+                    cmd = f'bash ./build_fastfold_wheel.sh {url} {filename} {cuda_version} {python_version}'
+                    os.system(cmd)
+def main():
+    args = parse_args()
+    wheel_info = all_wheel_info()
+    # filter wheels on condition
+    all_torch_versions = list(wheel_info.keys())
+    def _compare_version(a, b):
+        if version.parse(a) > version.parse(b):
+            return 1
+        else:
+            return -1
+    all_torch_versions.sort(key=cmp_to_key(_compare_version))
+    if args.torch_version != 'all':
+        torch_versions = args.torch_version.split(',')
+        # only keep the torch versions specified
+        for key in all_torch_versions:
+            if key not in torch_versions:
+                wheel_info.pop(key)
+    build_fastfold(wheel_info)
+if __name__ == '__main__':
+    main()
--- a/.github/workflows/scripts/build_fastfold_wheel.sh
+++ b/.github/workflows/scripts/build_fastfold_wheel.sh
+#!/usr/bin/env bash
+url=${1}
+filename=${2}
+cuda_version=${3}
+python_version=${4}
+git reset --hard HEAD
+mkdir -p ./all_dist
+source activate base
+conda create -n $python_version -y python=$python_version
+source activate $python_version
+wget -nc -q -O ./$filename $url
+pip install ./$filename
+pip install numpy
+python setup.py bdist_wheel
+mv ./dist/* ./all_dist
+python setup.py clean
+conda deactivate
+conda env remove -n $python_version
\ No newline at end of file
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# vscode
+.vscode/
+# setup
+dist/
+build/
\ No newline at end of file
--- a/LICENSE
+++ b/LICENSE
--- a/README.md
+++ b/README.md
+![](/assets/fold.jpg)
+# FastFold
+[![](https://img.shields.io/badge/Paper-PDF-green?style=flat&logo=arXiv&logoColor=green)](https://arxiv.org/abs/2203.00854)
+![](https://img.shields.io/badge/Made%20with-ColossalAI-blueviolet?style=flat)
+![](https://img.shields.io/badge/Habana-support-blue?style=flat&logo=intel&logoColor=blue)
+![](https://img.shields.io/github/v/release/hpcaitech/FastFold)
+[![GitHub license](https://img.shields.io/github/license/hpcaitech/FastFold)](https://github.com/hpcaitech/FastFold/blob/main/LICENSE)
+## News :triangular_flag_on_post:
+- [2023/01] Compatible with AlphaFold v2.3
+- [2023/01] Added support for inference and training of AlphaFold on [Intel Habana](https://habana.ai/) platform. For usage instructions, see [here](#Inference-or-Training-on-Intel-Habana).
+<br>
+Optimizing Protein Structure Prediction Model Training and Inference on Heterogeneous Clusters
+FastFold provides a **high-performance implementation of Evoformer** with the following characteristics.
+1. Excellent kernel performance on GPU platform
+2. Supporting Dynamic Axial Parallelism(DAP)
+    * Break the memory limit of single GPU and reduce the overall training time
+    * DAP can significantly speed up inference and make ultra-long sequence inference possible
+3. Ease of use
+    * Huge performance gains with only a few lines of changes
+    * You don't need to care about how the parallel part is implemented
+4. Faster data processing: about 3x faster on monomers and about 3Nx faster on multimers with N sequences.
+5. Greatly reduced GPU memory usage, enabling inference on sequences containing more than **10000** residues.
+## Installation
+To install FastFold, you will need:
+* Python 3.8 or 3.9
+* [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 11.3 or above
+* PyTorch 1.12 or above
+For now, you can install FastFold as follows:
+### Using Conda (Recommended)
+We highly recommend setting up an Anaconda or Miniconda environment and installing PyTorch with conda.
+The commands below create a new conda environment called "fastfold":
+```shell
+git clone https://github.com/hpcaitech/FastFold
+cd FastFold
+conda env create --name=fastfold -f environment.yml
+conda activate fastfold
+python setup.py install
+```
+#### Advanced
+To leverage the full power of FastFold, we recommend installing [Triton](https://github.com/openai/triton).
+**NOTE: Triton needs CUDA 11.4 to run.**
+```bash
+pip install -U --pre triton
+```
+## Use Docker
+### Build On Your Own
+Run the following command to build a docker image from Dockerfile provided.
+> Building FastFold from scratch requires GPU support, you need to use Nvidia Docker Runtime as the default when doing `docker build`. More details can be found [here](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime).
+```shell
+cd FastFold
+docker build -t fastfold ./docker
+```
+Run the following command to start the docker container in interactive mode.
+```shell
+docker run -ti --gpus all --rm --ipc=host fastfold bash
+```
+## Usage
+You can use `Evoformer` as `nn.Module` in your project after `from fastfold.model.fastnn import Evoformer`:
+```python
+from fastfold.model.fastnn import Evoformer
+evoformer_layer = Evoformer()
+```
+If you want to use Dynamic Axial Parallelism, add a line that initializes it with `fastfold.distributed.init_dap`.
+```python
+from fastfold.distributed import init_dap
+init_dap(args.dap_size)
+```
+### Download the dataset
+You can download the dataset used to train FastFold with the script `download_all_data.sh`:
+    ./scripts/download_all_data.sh data/
+### Inference
+You can use FastFold with `inject_fastnn`. This will replace the evoformer from OpenFold with the high performance evoformer from FastFold.
+```python
+from fastfold.utils import inject_fastnn
+model = AlphaFold(config)
+import_jax_weights_(model, args.param_path, version=args.model_name)
+model = inject_fastnn(model)
+```
+For Dynamic Axial Parallelism, you can refer to `./inference.py`. Here is an example of 2 GPUs parallel inference:
+```shell
+python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
+    --output_dir .outputs/ \
+    --gpus 2 \
+    --uniref90_database_path data/uniref90/uniref90.fasta \
+    --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
+    --pdb70_database_path data/pdb70/pdb70 \
+    --uniref30_database_path data/uniref30/UniRef30_2021_03 \
+    --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
+    --jackhmmer_binary_path `which jackhmmer` \
+    --hhblits_binary_path `which hhblits` \
+    --hhsearch_binary_path `which hhsearch` \
+    --kalign_binary_path `which kalign` \
+    --enable_workflow \
+    --inplace
+```
+Alternatively, run the script `./inference.sh`; you can change the parameters in the script, especially the data paths.
+```shell
+./inference.sh
+```
+AlphaFold's data pre-processing takes a lot of time, so we speed it up with a [ray](https://docs.ray.io/en/latest/workflows/concepts.html) workflow, which makes it about 3x faster. To run inference with the ray workflow, we add the parameter `--enable_workflow` by default.
+To reduce the memory usage of the embedding representations, we also add the parameter `--inplace` by default, which shares memory.
+#### inference with lower memory usage
+AlphaFold's embedding representations take up a lot of memory as the sequence length increases. To reduce memory usage,
+add the parameter `--chunk_size [N]` to the command line or to the shell script `./inference.sh`.
+The smaller you set N, the less memory is used, at the cost of speed. We can run inference on
+a sequence of length 10000 in bf16 with 61GB of memory on an NVIDIA A100 (80GB). For fp32, the maximum length is 8000.
+> You need to set `PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:15000` to run inference on such an extremely long sequence.
+```shell
+python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
+    --output_dir .outputs/ \
+    --gpus 2 \
+    --uniref90_database_path data/uniref90/uniref90.fasta \
+    --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
+    --pdb70_database_path data/pdb70/pdb70 \
+    --uniref30_database_path data/uniref30/UniRef30_2021_03 \
+    --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
+    --jackhmmer_binary_path `which jackhmmer` \
+    --hhblits_binary_path `which hhblits` \
+    --hhsearch_binary_path `which hhsearch` \
+    --kalign_binary_path `which kalign`  \
+    --enable_workflow \
+    --inplace \
+    --chunk_size N
+```
+#### inference multimer sequence
+AlphaFold Multimer is supported. You can use the following command or the shell script `./inference_multimer.sh`.
+Workflow and memory parameters mentioned above can also be used.
+```shell
+python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
+    --output_dir ./ \
+    --gpus 2 \
+    --model_preset multimer \
+    --uniref90_database_path data/uniref90/uniref90.fasta \
+    --mgnify_database_path data/mgnify/mgy_clusters_2022_05.fa \
+    --pdb70_database_path data/pdb70/pdb70 \
+    --uniref30_database_path data/uniref30/UniRef30_2021_03 \
+    --bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
+    --uniprot_database_path data/uniprot/uniprot.fasta \
+    --pdb_seqres_database_path data/pdb_seqres/pdb_seqres.txt  \
+    --param_path data/params/params_model_1_multimer.npz \
+    --model_name model_1_multimer \
+    --jackhmmer_binary_path `which jackhmmer` \
+    --hhblits_binary_path `which hhblits` \
+    --hhsearch_binary_path `which hhsearch` \
+    --kalign_binary_path `which kalign`
+```
+### Inference or Training on Intel Habana
+To run AlphaFold inference or training on Intel Habana, you can follow the instructions in the [Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/) to set up your environment on Amazon EC2 DL1 instances or on-premise environments, and please use SynapseAI R1.7.1 to test as it was verified internally.
+Once you have prepared your dataset and installed fastfold, you can use the following scripts:
+```shell
+# The build below is for Gaudi; for Gaudi2, use setup2.py instead of setup.py.
+cd fastfold/habana/fastnn/custom_op/; python setup.py build; cd -
+bash habana/inference.sh
+bash habana/train.sh
+```
+## Performance Benchmark
+We have included a performance benchmark script in `./benchmark`. You can benchmark the performance of Evoformer using different settings.
+```shell
+cd ./benchmark
+torchrun --nproc_per_node=1 perf.py --msa-length 128 --res-length 256
+```
+Benchmark Dynamic Axial Parallelism with 2 GPUs:
+```shell
+cd ./benchmark
+torchrun --nproc_per_node=2 perf.py --msa-length 128 --res-length 256 --dap-size 2
+```
+If you want to benchmark with [OpenFold](https://github.com/aqlaboratory/openfold), you need to install OpenFold first and benchmark with option `--openfold`:
+```shell
+torchrun --nproc_per_node=1 perf.py --msa-length 128 --res-length 256 --openfold
+```
+## Cite us
+Cite this paper, if you use FastFold in your research publication.
+```
+@misc{cheng2022fastfold,
+      title={FastFold: Reducing AlphaFold Training Time from 11 Days to 67 Hours}, 
+      author={Shenggan Cheng and Ruidong Wu and Zhongming Yu and Binrui Li and Xiwen Zhang and Jian Peng and Yang You},
+      year={2022},
+      eprint={2203.00854},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
+## Acknowledgments
+We would like to extend our special thanks to the Intel Habana team for their support in providing us with technology and resources on the Habana platform.
--- a/assets/fold.jpg
+++ b/assets/fold.jpg
--- a/benchmark/perf.py
+++ b/benchmark/perf.py
+import argparse
+import os
+import torch
+import torch.nn as nn
+from fastfold.distributed import init_dap
+from fastfold.model.fastnn import Evoformer
+def main():
+    parser = argparse.ArgumentParser(description='Evoformer Standalone Perf Benchmark')
+    parser.add_argument("--dap-size", default=1, type=int, help='batch size')
+    parser.add_argument('--batch-size', default=1, type=int, help='batch size')
+    parser.add_argument('--msa-length', default=132, type=int, help='Sequence Length of MSA')
+    parser.add_argument('--res-length',
+                        default=256,
+                        type=int,
+                        help='Sequence Length of Residues')
+    parser.add_argument('--trials', default=50, type=int, help='Number of Trials to Execute')
+    parser.add_argument('--warmup-trials', default=5, type=int, help='Warmup Trials to discard')
+    parser.add_argument('--layers',
+                        default=12,
+                        type=int,
+                        help='Evoformer Layers to Execute')
+    parser.add_argument('--cm', default=256, type=int, help='MSA hidden dimension')
+    parser.add_argument('--cz', default=128, type=int, help='Pair hidden dimension')
+    parser.add_argument('--heads', default=8, type=int, help='Number of Multihead Attention heads')
+    parser.add_argument('--openfold',
+                        action='store_true',
+                        help='Benchmark with Evoformer Implementation from OpenFold.')
+    parser.add_argument('--fwd', action='store_true', help='Only execute Fwd Pass.')
+    parser.add_argument('--prof', action='store_true', help='run with profiler.')
+    args = parser.parse_args()
+    init_dap(args.dap_size)
+    precision = torch.bfloat16
+    if args.dap_size > 1:
+        # (PyTorch issue) Currently All2All communication does not support the Bfloat16 datatype in PyTorch
+        precision = torch.float16
+    if not torch.cuda.is_available():
+        raise NotImplementedError('Running on CPU is not supported')
+    torch.manual_seed(42)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(42)
+    if args.openfold:
+        from openfold.model.evoformer import EvoformerBlock
+        class OpenFoldEvoformer(nn.Module):
+            def __init__(self, d_node, d_pair):
+                super(OpenFoldEvoformer, self).__init__()
+                self.d_node = d_node
+                self.d_pair = d_pair
+                self.c_hidden_msa_att = int(d_node / 8)
+                self.c_hidden_pair_att = int(d_pair / 8)
+                self.EvoformerBlock = EvoformerBlock(c_m=d_node,
+                                                     c_z=d_pair,
+                                                     c_hidden_msa_att=self.c_hidden_msa_att,
+                                                     c_hidden_opm=self.c_hidden_msa_att,
+                                                     c_hidden_mul=self.d_pair,
+                                                     c_hidden_pair_att=self.c_hidden_pair_att,
+                                                     no_heads_msa=8,
+                                                     no_heads_pair=4,
+                                                     transition_n=4,
+                                                     msa_dropout=0.15,
+                                                     pair_dropout=0.25,
+                                                     inf=1e9,
+                                                     eps=1e-10)
+            def forward(self, node, pair, node_mask, pair_mask):
+                node, pair = self.EvoformerBlock(node, pair, node_mask, pair_mask)
+                return node, pair
+    attn_layers = []
+    for idx in range(0, args.layers):
+        if args.openfold:
+            attn_layers.append(OpenFoldEvoformer(d_node=args.cm, d_pair=args.cz))
+        else:
+            attn_layers.append(Evoformer(d_node=args.cm, d_pair=args.cz))
+        attn_layers[idx].cuda()
+        attn_layers[idx].to(dtype=precision)
+    start_evt_fwd = []
+    start_evt_bwd = []
+    stop_evt_bwd = []
+    for recorded_trial in range(0, args.trials):
+        start_evt_fwd.append(torch.cuda.Event(enable_timing=True))
+        start_evt_bwd.append(torch.cuda.Event(enable_timing=True))
+        stop_evt_bwd.append(torch.cuda.Event(enable_timing=True))
+    inputs_node = torch.randn(args.batch_size,
+                              args.msa_length // args.dap_size,
+                              args.res_length,
+                              args.cm,
+                              dtype=precision,
+                              device=torch.device("cuda")).requires_grad_(True)
+    inputs_pair = torch.randn(args.batch_size,
+                              args.res_length // args.dap_size,
+                              args.res_length,
+                              args.cz,
+                              dtype=precision,
+                              device=torch.device("cuda")).requires_grad_(True)
+    node_mask = torch.ones((args.batch_size, args.msa_length, args.res_length),
+                           dtype=precision,
+                           device=torch.device("cuda")).requires_grad_(False)
+    pair_mask = torch.ones((args.batch_size, args.res_length, args.res_length),
+                           dtype=precision,
+                           device=torch.device("cuda")).requires_grad_(False)
+    grads_node = torch.randn_like(inputs_pair)
+    if args.prof:
+        prof = torch.profiler.profile(
+            schedule=torch.profiler.schedule(wait=1,
+                                             warmup=args.warmup_trials,
+                                             active=args.trials,
+                                             repeat=1),
+            on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/fastfold'),
+            profile_memory=False,
+            record_shapes=False,
+            with_stack=False)
+        prof.start()
+    for trial in range(0, args.trials + args.warmup_trials):
+        layer_inputs = inputs_node, inputs_pair
+        evt_idx = trial - args.warmup_trials
+        torch.distributed.barrier()
+        torch.cuda.synchronize()
+        if evt_idx >= 0:
+            start_evt_fwd[evt_idx].record()
+        for lyr_idx in range(0, args.layers):
+            layer_inputs = attn_layers[lyr_idx].forward(*layer_inputs, node_mask, pair_mask)
+        torch.cuda.synchronize()
+        if evt_idx >= 0:
+            start_evt_bwd[evt_idx].record()
+        if not args.fwd:
+            layer_inputs[1].backward(grads_node)
+        if evt_idx >= 0:
+            stop_evt_bwd[evt_idx].record()
+        if args.prof:
+            prof.step()
+    if args.prof:
+        prof.stop()
+    torch.distributed.barrier()
+    torch.cuda.synchronize()
+    elapsed_time_fwd = 0.0
+    elapsed_time_bwd = 0.0
+    for evt_idx in range(0, args.trials):
+        elapsed_time_fwd += start_evt_fwd[evt_idx].elapsed_time(start_evt_bwd[evt_idx])
+        elapsed_time_bwd += start_evt_bwd[evt_idx].elapsed_time(stop_evt_bwd[evt_idx])
+    print("[ MSA Attn ] Input: {:4d}, {:4d}, {:4d}, ({:4d} {:4d}) Fwd Time / Layer: {:.3f} ms Bwd Time / Layer: {:.3f} ms".format(
+        args.batch_size, args.msa_length, args.res_length,     \
+        args.cm, args.cz,                                      \
+        elapsed_time_fwd / ( args.trials * args.layers ),      \
+        elapsed_time_bwd / ( args.trials * args.layers )))
+if __name__ == '__main__':
+    main()
--- a/demo.py
+++ b/demo.py
+# Copyright 2023 HPC-AI Tech Inc.
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import time
+import fastfold
+import numpy as np
+import torch
+import torch.multiprocessing as mp
+from fastfold.config import model_config
+from fastfold.data import data_transforms
+from fastfold.model.fastnn import set_chunk_size
+from fastfold.model.hub import AlphaFold
+from fastfold.utils.inject_fastnn import inject_fastnn
+from fastfold.utils.tensor_utils import tensor_tree_map
+if int(torch.__version__.split(".")[0]) >= 1 and int(torch.__version__.split(".")[1]) > 11:
+    torch.backends.cuda.matmul.allow_tf32 = True
+def random_template_feats(n_templ, n):
+    b = []
+    batch = {
+        "template_mask": np.random.randint(0, 2, (*b, n_templ)),
+        "template_pseudo_beta_mask": np.random.randint(0, 2, (*b, n_templ, n)),
+        "template_pseudo_beta": np.random.rand(*b, n_templ, n, 3),
+        "template_aatype": np.random.randint(0, 22, (*b, n_templ, n)),
+        "template_all_atom_mask": np.random.randint(0, 2, (*b, n_templ, n, 37)),
+        "template_all_atom_positions": np.random.rand(*b, n_templ, n, 37, 3) * 10,
+        "template_torsion_angles_sin_cos": np.random.rand(*b, n_templ, n, 7, 2),
+        "template_alt_torsion_angles_sin_cos": np.random.rand(*b, n_templ, n, 7, 2),
+        "template_torsion_angles_mask": np.random.rand(*b, n_templ, n, 7),
+    }
+    batch = {k: v.astype(np.float32) for k, v in batch.items()}
+    batch["template_aatype"] = batch["template_aatype"].astype(np.int64)
+    return batch
+def random_extra_msa_feats(n_extra, n):
+    b = []
+    batch = {
+        "extra_msa": np.random.randint(0, 22, (*b, n_extra, n)).astype(np.int64),
+        "extra_has_deletion": np.random.randint(0, 2, (*b, n_extra, n)).astype(np.float32),
+        "extra_deletion_value": np.random.rand(*b, n_extra, n).astype(np.float32),
+        "extra_msa_mask": np.random.randint(0, 2, (*b, n_extra, n)).astype(np.float32),
+    }
+    return batch
+def generate_batch(n_res):
+    """Build a random input batch for a sequence of ``n_res`` residues.
+
+    The batch combines random target, MSA (128 rows), template (4 templates)
+    and extra-MSA (5120 rows) features. Every tensor finally gets a trailing
+    dimension of size 3 appended.
+
+    Returns:
+        A dict mapping feature names to torch tensors.
+    """
+    batch = {}
+    # Random residue types in [0, 21), one-hot encoded over 22 classes.
+    tf = torch.randint(21, size=(n_res,))
+    batch["target_feat"] = torch.nn.functional.one_hot(tf, 22).float()
+    batch["aatype"] = torch.argmax(batch["target_feat"], dim=-1)
+    batch["residue_index"] = torch.arange(n_res)
+    batch["msa_feat"] = torch.rand((128, n_res, 49))
+    t_feats = random_template_feats(4, n_res)
+    batch.update({k: torch.tensor(v) for k, v in t_feats.items()})
+    extra_feats = random_extra_msa_feats(5120, n_res)
+    batch.update({k: torch.tensor(v) for k, v in extra_feats.items()})
+    batch["msa_mask"] = torch.randint(low=0, high=2, size=(128, n_res)).float()
+    batch["seq_mask"] = torch.randint(low=0, high=2, size=(n_res,)).float()
+    # NOTE(review): presumably adds atom14-related mask features derived from
+    # the entries already in the batch -- confirm against data_transforms.
+    batch.update(data_transforms.make_atom14_masks(batch))
+    batch["no_recycling_iters"] = torch.tensor(2.)
+    # Append a trailing dimension of size 3 to every tensor (presumably one
+    # slice per recycling iteration -- TODO confirm).
+    add_recycling_dims = lambda t: (t.unsqueeze(-1).expand(*t.shape, 3))
+    batch = tensor_tree_map(add_recycling_dims, batch)
+    return batch
+def inference_model(rank, world_size, result_q, batch, args):
+    """Per-process worker: run one AlphaFold forward pass on GPU ``rank``.
+
+    Args:
+        rank: index of this process; also used as the CUDA device index.
+        world_size: total number of worker processes.
+        result_q: queue on which the model output is put.
+        batch: dict of input features; each value is moved to the GPU.
+        args: parsed command-line options (``model_name``, ``chunk_size``,
+            ``inplace`` are read here).
+    """
+    # Exported before init_dap() is called (presumably read by it -- TODO confirm).
+    os.environ['RANK'] = str(rank)
+    os.environ['LOCAL_RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(world_size)
+    # init distributed for Dynamic Axial Parallelism
+    fastfold.distributed.init_dap()
+    torch.cuda.set_device(rank)
+    config = model_config(args.model_name)
+    # Only override the configured chunk size when one was given explicitly.
+    if args.chunk_size:
+        config.globals.chunk_size = args.chunk_size
+    config.globals.inplace = args.inplace
+    config.globals.is_multimer = False
+    model = AlphaFold(config)
+    # Swap in FastFold's modules before moving the model to the GPU.
+    model = inject_fastnn(model)
+    model = model.eval()
+    model = model.cuda()
+    set_chunk_size(model.globals.chunk_size)
+    with torch.no_grad():
+        batch = {k: torch.as_tensor(v).cuda() for k, v in batch.items()}
+        t = time.perf_counter()
+        out = model(batch)
+        print(f"Inference time: {time.perf_counter() - t}")
+    # Convert the output tensors to numpy arrays on the CPU before queueing them.
+    out = tensor_tree_map(lambda x: np.array(x.cpu()), out)
+    result_q.put(out)
+    # Keep all ranks alive until every one of them has finished.
+    torch.distributed.barrier()
+    torch.cuda.synchronize()
+def inference_monomer_model(args):
+    """Generate a random batch and run inference on ``args.gpus`` worker processes.
+    Args:
+      args: Parsed CLI namespace; reads ``n_res`` and ``gpus`` here and is
+        forwarded unchanged to every worker.
+    """
+    batch = generate_batch(args.n_res)
+    # Keep the manager referenced for as long as its queue is in use.
+    queue_manager = mp.Manager()
+    results = queue_manager.Queue()
+    worker_args = (args.gpus, results, batch, args)
+    torch.multiprocessing.spawn(
+        inference_model,
+        args=worker_args,
+        nprocs=args.gpus,
+    )
+    # Take one worker's output off the queue.
+    out = results.get()
+    # get unrelaxed pdb and save
+    # batch = tensor_tree_map(lambda x: np.array(x[..., -1].cpu()), batch)
+    # plddt = out["plddt"]
+    # plddt_b_factors = np.repeat(plddt[..., None], residue_constants.atom_type_num, axis=-1)
+    # unrelaxed_protein = protein.from_prediction(features=batch,
+    #                                             result=out,
+    #                                             b_factors=plddt_b_factors)
+    # with open('demo_unrelex.pdb', 'w+') as fp:
+    #     fp.write(unrelaxed_protein)
+def main(args):
+    """Script entry point: run monomer inference with the parsed ``args``."""
+    inference_monomer_model(args)
+if __name__ == "__main__":
+    # Command-line options for the random-data inference run.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--gpus",
+        type=int,
+        default=1,
+        help="Number of GPUs with which to run inference",
+    )
+    parser.add_argument(
+        "--n_res",
+        type=int,
+        default=50,
+        help="virtual residue number of random data",
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="model_1",
+        help="model name of alphafold",
+    )
+    parser.add_argument("--chunk_size", type=int, default=None)
+    parser.add_argument("--inplace", default=False, action="store_true")
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
+# Base image; the tag suggests PyTorch 1.12.0 built against CUDA 11.3.0.
+FROM hpcaitech/pytorch-cuda:1.12.0-11.3.0
+# Conda packages: OpenMM and pdbfixer from conda-forge, then HMMER, HH-suite
+# and Kalign from bioconda.
+RUN conda install openmm=7.7.0 pdbfixer -c conda-forge -y \
+ && conda install hmmer==3.3.2 hhsuite=3.3.0 kalign2=2.04 -c bioconda -y
+# Python dependencies installed with pip.
+RUN pip install biopython==1.79 dm-tree==0.1.6 ml-collections==0.1.0 \
+scipy==1.7.1 ray pyarrow pandas einops
+RUN pip install colossalai
+# Clone the FastFold sources and install the package into the image.
+RUN git clone https://github.com/hpcaitech/FastFold.git \
+ && cd ./FastFold \
+ && python setup.py install
--- a/environment.yml
+++ b/environment.yml
+# Conda environment specification for FastFold.
+name: fastfold
+# Channels searched for the conda packages listed under `dependencies`.
+channels:
+  - conda-forge
+  - bioconda
+  - pytorch
+dependencies:
+  # Packages installed through pip inside the environment.
+  - pip:
+      - biopython==1.79
+      - dm-tree==0.1.6
+      - ml-collections==0.1.0
+      - PyYAML==5.4.1
+      - requests==2.26.0
+      - scipy==1.7.1
+      - tqdm==4.62.2
+      - typing-extensions==4.3.0
+      - einops
+      - ray==2.0.0
+      - pyarrow
+      - pandas
+      - colossalai==0.2.7
+  # Conda packages; the `channel::` prefix names the channel each comes from.
+  # PyTorch 1.12 with cudatoolkit 11.3 on Python 3.8.
+  - pytorch::pytorch=1.12
+  - pytorch::torchvision
+  - pytorch::torchaudio
+  - conda-forge::cudatoolkit=11.3
+  - conda-forge::python=3.8
+  - conda-forge::setuptools=59.5.0
+  - conda-forge::pip
+  # Structure-relaxation tooling (OpenMM, pdbfixer) -- presumably used for
+  # relaxing predicted structures; confirm against the callers.
+  - conda-forge::openmm=7.7.0
+  - conda-forge::pdbfixer
+  # Sequence search / alignment tools.
+  - bioconda::hmmer==3.3.2
+  - bioconda::hhsuite==3.3.0
+  - bioconda::kalign2==2.04
--- a/fastfold/__init__.py
+++ b/fastfold/__init__.py
+# Version string of the fastfold package.
+VERSION = "0.1.0-beta"
\ No newline at end of file
--- a/fastfold/common/__init__.py
+++ b/fastfold/common/__init__.py
--- a/fastfold/common/protein.py
+++ b/fastfold/common/protein.py
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Protein data type."""
+import dataclasses
+import io
+from typing import Any, Mapping, Optional
+import re
+from fastfold.common import residue_constants
+from Bio.PDB import PDBParser
+import numpy as np
+# Type aliases: a feature dictionary of arrays and a (nested) model output.
+FeatureDict = Mapping[str, np.ndarray]
+ModelOutput = Mapping[str, Any]  # Is a nested dict.
+# Factor that converts picometers to angstroms.
+PICO_TO_ANGSTROM = 0.01
+# Characters available as PDB chain identifiers: upper case, lower case, digits.
+PDB_CHAIN_IDS = (
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz"
+    "0123456789"
+)
+PDB_MAX_CHAINS = len(PDB_CHAIN_IDS)
+assert PDB_MAX_CHAINS == 62
+@dataclasses.dataclass(frozen=True)
+class Protein:
+    """Immutable bundle of the per-residue arrays that describe a protein."""
+    # Atom coordinates in angstroms, ordered like residue_constants.atom_types
+    # (so the first three entries are N, CA, CB).
+    atom_positions: np.ndarray  # [num_res, num_atom_type, 3]
+    # Integer amino-acid type of each residue, from 0 to 20; 20 stands for 'X'.
+    aatype: np.ndarray  # [num_res]
+    # Float mask: 1.0 where an atom is present, 0.0 where it is not. Intended
+    # for masking losses.
+    atom_mask: np.ndarray  # [num_res, num_atom_type]
+    # Residue numbering as in the PDB; may have gaps and need not start at 0.
+    residue_index: np.ndarray  # [num_res]
+    # 0-based index of the chain each residue belongs to.
+    chain_index: np.ndarray # [num_res]
+    # B-factors (temperature factors) in square angstroms: the displacement
+    # of the residue from its ground truth mean value.
+    b_factors: np.ndarray  # [num_res, num_atom_type]
+    def __post_init__(self):
+        # Reject instances with more distinct chains than there are chain IDs.
+        n_chains = len(np.unique(self.chain_index))
+        if n_chains <= PDB_MAX_CHAINS:
+            return
+        raise ValueError(
+            f"Cannot build an instance with more than {PDB_MAX_CHAINS} "
+            "chains because these cannot be written to PDB format"
+        )
+def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
+    """Parses the text of a PDB file into a `Protein`.
+    Residues of a non-standard type are stored as the unknown type, atoms whose
+    names are not in `residue_constants.atom_types` are dropped, and residues
+    left without any recognised atom are skipped entirely.
+    Args:
+      pdb_str: Full text of the PDB file.
+      chain_id: Optional chain ID (e.g. A). When given, only that chain is
+        read; otherwise every chain is read.
+    Returns:
+      The `Protein` built from the parsed structure.
+    Raises:
+      ValueError: If the PDB does not hold exactly one model, or if a parsed
+        residue carries an insertion code.
+    """
+    structure = PDBParser(QUIET=True).get_structure("none", io.StringIO(pdb_str))
+    models = list(structure.get_models())
+    if len(models) != 1:
+        raise ValueError(
+            f"Only single model PDBs are supported. Found {len(models)} models."
+        )
+    n_atom_types = residue_constants.atom_type_num
+    all_positions, all_aatype, all_masks = [], [], []
+    all_res_numbers, all_chain_ids, all_b_factors = [], [], []
+    for chain in models[0]:
+        if chain_id is not None and chain.id != chain_id:
+            continue
+        for res in chain:
+            if res.id[2] != " ":
+                raise ValueError(
+                    f"PDB contains an insertion code at chain {chain.id} and residue "
+                    f"index {res.id[1]}. These are not supported."
+                )
+            # Unknown 3-letter names become "X", which maps to restype_num.
+            one_letter = residue_constants.restype_3to1.get(res.resname, "X")
+            type_idx = residue_constants.restype_order.get(
+                one_letter, residue_constants.restype_num
+            )
+            pos = np.zeros((n_atom_types, 3))
+            mask = np.zeros((n_atom_types,))
+            res_b_factors = np.zeros((n_atom_types,))
+            for atom in res:
+                if atom.name in residue_constants.atom_types:
+                    slot = residue_constants.atom_order[atom.name]
+                    pos[slot] = atom.coord
+                    mask[slot] = 1.0
+                    res_b_factors[slot] = atom.bfactor
+            if np.sum(mask) < 0.5:
+                # No known atom was reported for this residue, so leave it out.
+                continue
+            all_aatype.append(type_idx)
+            all_positions.append(pos)
+            all_masks.append(mask)
+            all_res_numbers.append(res.id[1])
+            all_chain_ids.append(chain.id)
+            all_b_factors.append(res_b_factors)
+    # Chain IDs are usually characters; number them by their sorted order.
+    id_to_index = {}
+    for n, cid in enumerate(np.unique(all_chain_ids)):
+        id_to_index[cid] = n
+    chain_index = np.array([id_to_index[cid] for cid in all_chain_ids])
+    return Protein(
+        atom_positions=np.array(all_positions),
+        atom_mask=np.array(all_masks),
+        aatype=np.array(all_aatype),
+        residue_index=np.array(all_res_numbers),
+        chain_index=chain_index,
+        b_factors=np.array(all_b_factors),
+    )
+def from_proteinnet_string(proteinnet_str: str) -> Protein:
+    """Parses a ProteinNet text record into a `Protein`.
+    The record is split on tag lines of the form ``[TAG]``. Three sections are
+    read: ``[PRIMARY]`` (one-letter sequence), ``[TERTIARY]`` (three rows, one
+    per coordinate axis, holding the N, CA and C atoms of every residue
+    interleaved) and ``[MASK]`` (one ``+`` or ``-`` per residue).
+    Args:
+      proteinnet_str: Text of a single ProteinNet record.
+    Returns:
+      A single-chain `Protein` (``chain_index`` all zeros) with consecutive
+      0-based ``residue_index`` and ``b_factors`` set to None.
+      ``atom_positions`` / ``atom_mask`` are None when the ``[TERTIARY]`` /
+      ``[MASK]`` section is absent.
+    Raises:
+      ValueError: If the record has no ``[PRIMARY]`` section.
+    """
+    tag_re = r'(\[[A-Z]+\]\n)'
+    tags = [
+        tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0
+    ]
+    # After the split, tags and their bodies alternate: pair each tag with the
+    # lines of the section that follows it.
+    groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]])
+    atoms = ['N', 'CA', 'C']
+    aatype = None
+    atom_positions = None
+    atom_mask = None
+    for g in groups:
+        if("[PRIMARY]" == g[0]):
+            # Strings are immutable, so unknown symbols are replaced by
+            # building a new string rather than assigning into the old one.
+            seq = ''.join(
+                symbol if symbol in residue_constants.restypes else 'X'
+                for symbol in g[1][0].strip()
+            )
+            aatype = np.array([
+                residue_constants.restype_order.get(
+                    res_symbol, residue_constants.restype_num
+                ) for res_symbol in seq
+            ])
+        elif("[TERTIARY]" == g[0]):
+            tertiary = []
+            for axis in range(3):
+                tertiary.append(list(map(float, g[1][axis].split())))
+            tertiary_np = np.array(tertiary)
+            atom_positions = np.zeros(
+                (len(tertiary[0])//3, residue_constants.atom_type_num, 3)
+            ).astype(np.float32)
+            # Column i, i+3, i+6, ... of each row belongs to backbone atom i.
+            for i, atom in enumerate(atoms):
+                atom_positions[:, residue_constants.atom_order[atom], :] = (
+                    np.transpose(tertiary_np[:, i::3])
+                )
+            atom_positions *= PICO_TO_ANGSTROM
+        elif("[MASK]" == g[0]):
+            mask = np.array(list(map({'-': 0, '+': 1}.get, g[1][0].strip())))
+            atom_mask = np.zeros(
+                (len(mask), residue_constants.atom_type_num,)
+            ).astype(np.float32)
+            # Mark the three backbone atoms, then zero out masked residues.
+            for atom in atoms:
+                atom_mask[:, residue_constants.atom_order[atom]] = 1
+            atom_mask *= mask[..., None]
+    if aatype is None:
+        raise ValueError("ProteinNet record has no [PRIMARY] section.")
+    residue_index = np.arange(len(aatype))
+    return Protein(
+        atom_positions=atom_positions,
+        atom_mask=atom_mask,
+        aatype=aatype,
+        residue_index=residue_index,
+        # `chain_index` is a required field of `Protein`; a ProteinNet record
+        # describes one chain, so every residue gets chain 0.
+        chain_index=np.zeros_like(residue_index),
+        b_factors=None,
+    )
+def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
+    """Formats the TER record that closes a chain in a PDB file.
+    Args:
+      atom_index: Serial number written in the record (right-aligned, width 5).
+      end_resname: Residue name of the chain's last residue (width 3).
+      chain_name: Chain identifier (width 1).
+      residue_index: Residue number of the chain's last residue (width 4).
+    Returns:
+      The TER line, without padding to 80 columns and without a newline.
+    """
+    serial_part = f'{"TER":<6}{atom_index:>5}'
+    residue_part = f'{end_resname:>3} {chain_name:>1}{residue_index:>4}'
+    # Six blank columns separate the serial number from the residue name.
+    return serial_part + ' ' * 6 + residue_part
+def to_pdb(prot: Protein) -> str:
+    """Converts a `Protein` instance to a PDB string.
+    The output holds a single MODEL with one ATOM record per unmasked atom, a
+    TER record after every chain, and the closing ENDMDL / END records. Every
+    line is padded to 80 characters.
+    Args:
+      prot: The protein to convert to PDB.
+    Returns:
+      PDB string.
+    Raises:
+      ValueError: If an aatype exceeds `residue_constants.restype_num`, or if
+        a chain index is not below `PDB_MAX_CHAINS`.
+    """
+    # Index restype_num selects the extra "X" entry, which has no 3-letter
+    # name of its own and therefore falls back to "UNK".
+    restypes = residue_constants.restypes + ["X"]
+    res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], "UNK")
+    atom_types = residue_constants.atom_types
+    pdb_lines = []
+    atom_mask = prot.atom_mask
+    aatype = prot.aatype
+    atom_positions = prot.atom_positions
+    residue_index = prot.residue_index.astype(np.int32)
+    chain_index = prot.chain_index.astype(np.int32)
+    b_factors = prot.b_factors
+    if np.any(aatype > residue_constants.restype_num):
+        raise ValueError("Invalid aatypes.")
+    # Construct a mapping from chain integer indices to chain ID strings.
+    chain_ids = {}
+    for i in np.unique(chain_index): # np.unique gives sorted output.
+        if i >= PDB_MAX_CHAINS:
+            raise ValueError(
+                f"The PDB format supports at most {PDB_MAX_CHAINS} chains."
+            )
+        chain_ids[i] = PDB_CHAIN_IDS[i]
+    pdb_lines.append("MODEL     1")
+    # Running serial number shared by ATOM and TER records.
+    atom_index = 1
+    last_chain_index = chain_index[0]
+    # Add all atom sites.
+    for i in range(aatype.shape[0]):
+        # Close the previous chain if in a multichain PDB.
+        if last_chain_index != chain_index[i]:
+            pdb_lines.append(
+                _chain_end(
+                    atom_index, 
+                    res_1to3(aatype[i - 1]), 
+                    chain_ids[chain_index[i - 1]], 
+                    residue_index[i - 1]
+                )
+            )
+            last_chain_index = chain_index[i]
+            atom_index += 1 # Atom index increases at the TER symbol.
+        res_name_3 = res_1to3(aatype[i])
+        for atom_name, pos, mask, b_factor in zip(
+            atom_types, atom_positions[i], atom_mask[i], b_factors[i]
+        ):
+            # Atoms that are masked out are not written.
+            if mask < 0.5:
+                continue
+            record_type = "ATOM"
+            # Names shorter than 4 characters get one leading space.
+            name = atom_name if len(atom_name) == 4 else f" {atom_name}"
+            alt_loc = ""
+            insertion_code = ""
+            occupancy = 1.00
+            element = atom_name[
+                0
+            ]  # Protein supports only C, N, O, S, this works.
+            charge = ""
+            # PDB is a columnar format, every space matters here!
+            atom_line = (
+                f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
+                f"{res_name_3:>3} {chain_ids[chain_index[i]]:>1}"
+                f"{residue_index[i]:>4}{insertion_code:>1}   "
+                f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
+                f"{occupancy:>6.2f}{b_factor:>6.2f}          "
+                f"{element:>2}{charge:>2}"
+            )
+            pdb_lines.append(atom_line)
+            atom_index += 1
+    # Close the final chain.
+    pdb_lines.append(
+        _chain_end(
+            atom_index, 
+            res_1to3(aatype[-1]), 
+            chain_ids[chain_index[-1]], 
+            residue_index[-1]
+        )
+    )
+    pdb_lines.append("ENDMDL")
+    pdb_lines.append("END")
+    # Pad all lines to 80 characters
+    pdb_lines = [line.ljust(80) for line in pdb_lines]
+    return '\n'.join(pdb_lines) + '\n' # Add terminating newline.
+def ideal_atom_mask(prot: Protein) -> np.ndarray:
+    """Returns the atom mask expected from the protein's residue types.
+    `Protein.atom_mask` usually reflects which atoms a PDB actually reports.
+    This function instead looks up, per residue, the heavy atoms that should
+    be present for that amino acid.
+    Args:
+      prot: `Protein` whose fields are `numpy.ndarray` objects.
+    Returns:
+      The rows of `residue_constants.STANDARD_ATOM_MASK` selected by
+      `prot.aatype`.
+    """
+    standard_masks = residue_constants.STANDARD_ATOM_MASK
+    residue_types = prot.aatype
+    return standard_masks[residue_types]
+def from_prediction(
+    features: FeatureDict,
+    result: ModelOutput,
+    b_factors: Optional[np.ndarray] = None,
+    remove_leading_feature_dimension: bool = False,
+) -> Protein:
+    """Builds a `Protein` from model inputs and the matching model outputs.
+    Args:
+      features: Model input dictionary; reads `aatype`, `residue_index` and,
+        when present, `asym_id`.
+      result: Model output dictionary; reads `final_atom_positions` and
+        `final_atom_mask`.
+      b_factors: (Optional) B-factors for the protein. Defaults to zeros
+        shaped like `result["final_atom_mask"]`.
+      remove_leading_feature_dimension: If True, index 0 of the leading
+        dimension is taken from every value read from `features`.
+    Returns:
+      The assembled `Protein`. Its residue index is the input index plus 1.
+    """
+    def _strip_leading(arr: np.ndarray) -> np.ndarray:
+        if remove_leading_feature_dimension:
+            return arr[0]
+        return arr
+    # Without chain information every residue is assigned to chain 0.
+    if 'asym_id' not in features:
+        chain_index = np.zeros_like(_strip_leading(features["aatype"]))
+    else:
+        chain_index = _strip_leading(features["asym_id"])
+    if b_factors is None:
+        b_factors = np.zeros_like(result["final_atom_mask"])
+    fields = dict(
+        aatype=_strip_leading(features["aatype"]),
+        atom_positions=result["final_atom_positions"],
+        atom_mask=result["final_atom_mask"],
+        residue_index=_strip_leading(features["residue_index"]) + 1,
+        chain_index=chain_index,
+        b_factors=b_factors,
+    )
+    return Protein(**fields)
--- a/fastfold/common/residue_constants.py
+++ b/fastfold/common/residue_constants.py
--- a/fastfold/config.py
+++ b/fastfold/config.py
--- a/fastfold/data/__init__.py
+++ b/fastfold/data/__init__.py
--- a/fastfold/data/data_modules.py
+++ b/fastfold/data/data_modules.py
--- a/fastfold/data/data_pipeline.py
+++ b/fastfold/data/data_pipeline.py