Commit b14e47f4 authored by zhuwenwen

Merge branch 'main' of https://github.com/hpcaitech/FastFold

parents 490cb6f5 05681304
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the AlphaFold parameters.
#
# Usage: bash download_alphafold_params.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/params"
SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}" --preserve-permissions
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the BFD database for AlphaFold.
#
# Usage: bash download_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/bfd"
# Mirror of:
# https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz.
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the MGnify database for AlphaFold.
#
# Usage: bash download_mgnify.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
# Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the PDB70 database for AlphaFold.
#
# Usage: bash download_pdb70.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb70"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads, unzips and flattens the PDB database for AlphaFold.
#
# Usage: bash download_pdb_mmcif.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
if ! command -v rsync &> /dev/null ; then
echo "Error: rsync could not be found. Please install rsync."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb_mmcif"
RAW_DIR="${ROOT_DIR}/raw"
MMCIF_DIR="${ROOT_DIR}/mmcif_files"
echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..."
echo "If the download speed is too slow, try changing the mirror to:"
echo " * rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/mmCIF/ (Europe)"
echo " * ftp.pdbj.org::ftp_data/structures/divided/mmCIF/ (Asia)"
echo "or see https://www.wwpdb.org/ftp/pdb-ftp-sites for more download options."
mkdir --parents "${RAW_DIR}"
rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \
rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
"${RAW_DIR}"
echo "Unzipping all mmCIF files..."
find "${RAW_DIR}/" -type f -iname "*.gz" -exec gunzip {} +
echo "Flattening all mmCIF files..."
mkdir --parents "${MMCIF_DIR}"
find "${RAW_DIR}" -type d -empty -delete # Delete empty directories.
for subdir in "${RAW_DIR}"/*; do
mv "${subdir}/"*.cif "${MMCIF_DIR}"
done
# Delete empty download directory structure.
find "${RAW_DIR}" -type d -empty -delete
aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${ROOT_DIR}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the PDB SeqRes database for AlphaFold.
#
# Usage: bash download_pdb_seqres.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb_seqres"
SOURCE_URL="ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
# Keep only protein sequences.
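# Each pdb_seqres.txt entry is a one-line FASTA header (e.g. ">1abc_A mol:protein length:123")
# followed by a one-line sequence, so --after-context=1 keeps the sequence line of every
# protein header while dropping nucleic-acid entries.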
grep --after-context=1 --no-group-separator '>.* mol:protein' "${ROOT_DIR}/pdb_seqres.txt" > "${ROOT_DIR}/pdb_seqres_filtered.txt"
mv "${ROOT_DIR}/pdb_seqres_filtered.txt" "${ROOT_DIR}/pdb_seqres.txt"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the Small BFD database for AlphaFold.
#
# Usage: bash download_small_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/small_bfd"
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
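# trickle throttles the transfer (-s standalone mode, -u/-d limits in KB/s); drop the
# trickle prefix for an unthrottled download or if trickle is not installed.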
trickle -s -u 1024 -d 10240 aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads, unzips and merges the SwissProt and TrEMBL databases for
# AlphaFold-Multimer.
#
# Usage: bash download_uniprot.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniprot"
TREMBL_SOURCE_URL="ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
TREMBL_BASENAME=$(basename "${TREMBL_SOURCE_URL}")
TREMBL_UNZIPPED_BASENAME="${TREMBL_BASENAME%.gz}"
SPROT_SOURCE_URL="ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
SPROT_BASENAME=$(basename "${SPROT_SOURCE_URL}")
SPROT_UNZIPPED_BASENAME="${SPROT_BASENAME%.gz}"
mkdir --parents "${ROOT_DIR}"
aria2c "${TREMBL_SOURCE_URL}" --dir="${ROOT_DIR}"
aria2c "${SPROT_SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${TREMBL_BASENAME}"
gunzip "${ROOT_DIR}/${SPROT_BASENAME}"
# Concatenate TrEMBL and SwissProt, rename to uniprot and clean up.
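# (Appending the smaller SwissProt file into the TrEMBL file avoids writing a second full copy of TrEMBL to disk.)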
cat "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}" >> "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}"
mv "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}" "${ROOT_DIR}/uniprot.fasta"
rm "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}"
popd
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the uniref30 database for AlphaFold.
#
# Usage: bash download_uniref30.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniref30"
# Mirror of:
# https://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/UniRef30_2021_03.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the UniRef90 database for AlphaFold.
#
# Usage: bash download_uniref90.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniref90"
SOURCE_URL="ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
import os
import subprocess
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
# ninja builds do not work unless include_dirs are absolute paths
this_dir = os.path.dirname(os.path.abspath(__file__))
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
torch_binary_major = torch.version.cuda.split(".")[0]
torch_binary_minor = torch.version.cuda.split(".")[1]
print("\nCompiling cuda extensions with")
print(raw_output + "from " + cuda_dir + "/bin\n")
if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
raise RuntimeError(
"Cuda extensions are being compiled with a version of Cuda that does " +
"not match the version used to compile Pytorch binaries. " +
"Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
"In some cases, a minor-version mismatch will not cause later errors: " +
"https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
"You can try commenting out this check (at your own risk).")
def append_nvcc_threads(nvcc_extra_args):
    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
    # nvcc supports parallel compilation (--threads) from CUDA 11.2 onwards.
    if int(bare_metal_major) > 11 or (int(bare_metal_major) == 11 and int(bare_metal_minor) >= 2):
        return nvcc_extra_args + ["--threads", "4"]
    return nvcc_extra_args
if not torch.cuda.is_available():
print("======== NOTICE: torch.cuda.is_available == False")
# # https://github.com/NVIDIA/apex/issues/486
# # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
# # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command).
# print(
# '\nWarning: Torch did not find available GPUs on this system.\n',
# 'If your intention is to cross-compile, this is not an error.\n'
# 'By default, FastFold will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
# 'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
# 'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
# 'If you wish to cross-compile for a single specific architecture,\n'
# 'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
# if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
# _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
# if int(bare_metal_major) == 11:
# os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
# else:
# os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 10):
raise RuntimeError("FastFold requires Pytorch 1.10 or newer.\n" +
"The latest stable release can be obtained from https://pytorch.org/")
cmdclass = {}
ext_modules = []
# Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
# and
# https://github.com/NVIDIA/apex/issues/456
# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
if CUDA_HOME:
# check_cuda_torch_binary_vs_bare_metal(CUDA_HOME)
def cuda_ext_helper(name, sources, extra_cuda_flags):
return CUDAExtension(
name=name,
sources=[
os.path.join('fastfold/model/fastnn/kernel/cuda_native/csrc', path) for path in sources
],
include_dirs=[
os.path.join(this_dir, 'fastfold/model/fastnn/kernel/cuda_native/csrc/include')
],
extra_compile_args={
'cxx': ['-O3'] + version_dependent_macros,
'nvcc':
append_nvcc_threads(['-O3', '--use_fast_math'] + version_dependent_macros +
extra_cuda_flags)
})
cc_flag = ['-gencode', 'arch=compute_70,code=sm_70']
_, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append('-gencode')
cc_flag.append('arch=compute_80,code=sm_80')
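    # The gencode flags above target Volta (sm_70) and, when building with CUDA >= 11, Ampere (sm_80).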
extra_cuda_flags = [
'-std=c++14', '-maxrregcount=50', '-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr', '--expt-extended-lambda'
]
ext_modules.append(
cuda_ext_helper('fastfold_layer_norm_cuda',
['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
extra_cuda_flags + cc_flag))
ext_modules.append(
cuda_ext_helper('fastfold_softmax_cuda', ['softmax_cuda.cpp', 'softmax_cuda_kernel.cu'],
extra_cuda_flags + cc_flag))
else:
print("======== NOTICE: install without cuda kernel")
setup(
name='fastfold',
version='0.2.0',
packages=find_packages(exclude=(
'assets',
'benchmark',
'*.egg-info',
)),
description=
'Optimizing Protein Structure Prediction Model Training and Inference on GPU Clusters',
ext_modules=ext_modules,
package_data={'fastfold': ['model/fastnn/kernel/cuda_native/csrc/*']},
cmdclass={'build_ext': BuildExtension} if ext_modules else {},
install_requires=['einops', 'colossalai'],
)
import math
import pytest
import torch
from einops import rearrange
TEST_TRITON = True
try:
    from fastfold.model.fastnn.kernel import fused_attention_core
except ImportError:
    print("Skip triton attention test!")
    TEST_TRITON = False
def torch_core_attention(q, k, v, mask, bias):
scaling = 1. / math.sqrt(q.size(-1))
q = q * scaling
logits = torch.matmul(q.float(), k.float().transpose(-1, -2))
logits += bias.float()
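    # mask is 1 for valid positions and 0 for padding; (mask - 1) * 1e20 drives masked
    # logits to a large negative value so they contribute ~0 after the softmax.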
logits += (1e20 * (mask - 1))[..., :, None, None, :]
weights = torch.nn.functional.softmax(logits.float(), -1).to(dtype=q.dtype)
weighted_avg = torch.matmul(weights, v)
weighted_avg = rearrange(weighted_avg, 'b1 b2 h n d -> b1 b2 n (h d)')
return weighted_avg
@pytest.mark.skipif(not TEST_TRITON, reason="triton is not available")
def test_fused_attention_core():
if TEST_TRITON:
batch_, chunk_, head_, d_head = 1, 8, 4, 32
test_seq_ = [32, 256, 370, 500, 512, 700, 1024, 1600]
test_dtype = [torch.float16, torch.bfloat16]
test_device = torch.device("cuda")
tolerance_eps = {torch.float16: 1e-4, torch.bfloat16: 1e-4}
for seq_ in test_seq_:
for dtype in test_dtype:
q = torch.empty((batch_, chunk_, head_, seq_, d_head), dtype=dtype,
device="cuda").normal_(mean=0, std=.5).requires_grad_()
k = torch.empty((batch_, chunk_, head_, seq_, d_head), dtype=dtype,
device="cuda").normal_(mean=0, std=.5).requires_grad_()
v = torch.empty((batch_, chunk_, head_, seq_, d_head), dtype=dtype,
device="cuda").normal_(mean=0, std=.5).requires_grad_()
mask = torch.empty(
(batch_, chunk_, seq_), device="cuda").normal_(mean=0, std=.5) > 0
mask = mask.to(device=test_device, dtype=dtype).requires_grad_(False)
bias = torch.randn(batch_, head_, seq_, seq_).to(device=test_device,
dtype=dtype).requires_grad_(True)
ref_out = torch_core_attention(q, k, v, mask, bias)
tri_out = fused_attention_core(q, k, v, mask, bias)
                # compare: the fused kernel should match the reference implementation
                assert torch.allclose(ref_out, tri_out, atol=tolerance_eps[dtype])
if __name__ == "__main__":
test_fused_attention_core()
import torch
from fastfold.model.fastnn.ops import Linear as FastLinear
from fastfold.model.nn.primitives import Linear
def test_linear():
c_in = 3
c_out = 4
seq = 5
fast_linear = FastLinear(c_in, c_out).cuda()
linear = Linear(c_in, c_out).cuda()
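    # Share the weight and bias so both layers compute the same affine map.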
fast_linear.weight = linear.weight
fast_linear.bias = linear.bias
x = torch.randn((seq, c_in)).cuda()
out1 = fast_linear(x)
out2 = linear(x)
assert torch.allclose(out1, out2, atol=1e-8)
if __name__ == "__main__":
test_linear()
import torch
import pytest
import os
import copy
import torch.multiprocessing as mp
from functools import partial
import fastfold
from fastfold.config import model_config
from fastfold.model.fastnn.ops import set_chunk_size
from fastfold.model.hub import AlphaFold
from fastfold.utils.inject_fastnn import inject_fastnn
from fastfold.utils.import_weights import import_jax_weights_
from fastfold.utils.test_utils import get_param_path
@pytest.fixture(scope="module")
def get_module_and_output():
with torch.no_grad():
config = model_config('model_1')
config.globals.inplace = False
target_module = AlphaFold(config)
import_jax_weights_(target_module, get_param_path())
fast_module = copy.deepcopy(target_module)
fast_module = inject_fastnn(fast_module)
fast_module = fast_module.evoformer
fast_module_1 = fast_module.blocks[0].eval().cuda()
fast_module_2 = fast_module.blocks[-1].eval().cuda()
target_module = target_module.evoformer
target_module_1 = target_module.blocks[0].eval().cuda()
target_module_2 = target_module.blocks[-1].eval().cuda()
msa_len = 80
seq_len = 80
m = torch.randn((msa_len, seq_len, 256))
m_mask = torch.ones((msa_len, seq_len))
z = torch.randn((seq_len, seq_len, 128))
z_mask = torch.ones((seq_len, seq_len))
data = [m, z, m_mask, z_mask]
inputs = [copy.deepcopy(i).cuda() for i in data]
m_out, z_out = target_module_1(*inputs)
m_out, z_out = target_module_2(m_out, z_out, inputs[2], inputs[3])
return fast_module_1, fast_module_2, m_out, z_out, data
@pytest.mark.parametrize('world_size', [1, 2])
@pytest.mark.parametrize('chunk_size', [None, 32])
@pytest.mark.parametrize('inplace', [False, True])
def test_state_dict(world_size, chunk_size, inplace, get_module_and_output):
run_func = partial(_test_evoformer, world_size=world_size, chunk_size=chunk_size, inplace=inplace, get_module_and_output=get_module_and_output)
mp.spawn(run_func, nprocs=world_size)
def _test_evoformer(rank, world_size, chunk_size, inplace, get_module_and_output):
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(world_size)
# init distributed for Dynamic Axial Parallelism
fastfold.distributed.init_dap()
fast_module_1, fast_module_2, m_out, z_out, data = get_module_and_output
fast_module_1 = copy.deepcopy(fast_module_1).eval().cuda()
fast_module_2 = copy.deepcopy(fast_module_2).eval().cuda()
inputs = [copy.deepcopy(i).cuda() for i in data]
set_chunk_size(chunk_size)
with torch.no_grad():
if not inplace:
m_fast, z_fast = fast_module_1(*inputs)
m_fast, z_fast = fast_module_2(m_fast, z_fast, inputs[2], inputs[3])
else:
m_fast, z_fast = fast_module_1.inplace([inputs[0]], [inputs[1]], inputs[2], inputs[3])
m_fast, z_fast = fast_module_2.inplace(m_fast, z_fast, inputs[2], inputs[3])
m_fast = m_fast[0]
z_fast = z_fast[0]
error = torch.mean(torch.abs(m_out.cuda() - m_fast))
assert error < 5e-4, f"Test m failed at chunk size: {chunk_size}, inplace: {inplace}. The position dif is {error}"
error = torch.mean(torch.abs(z_out.cuda() - z_fast))
assert error < 5e-4, f"Test z failed at chunk size: {chunk_size}, inplace: {inplace}. The position dif is {error}"
import torch
import pytest
import os
import copy
import torch.multiprocessing as mp
from functools import partial
import fastfold
from fastfold.config import model_config
from fastfold.model.fastnn.ops import set_chunk_size
from fastfold.model.hub import AlphaFold
from fastfold.utils.inject_fastnn import inject_fastnn
from fastfold.utils.import_weights import import_jax_weights_
from fastfold.utils.test_utils import get_param_path
@pytest.fixture(scope="module")
def get_module_and_output():
with torch.no_grad():
config = model_config('model_1')
config.globals.inplace = False
model = AlphaFold(config)
import_jax_weights_(model, get_param_path())
fast_model = copy.deepcopy(model)
fast_model = inject_fastnn(fast_model)
fast_model = fast_model.evoformer
fast_model.eval().cuda()
model = model.evoformer
model.eval().cuda()
msa_len = 50
seq_len = 52
m = torch.randn((msa_len, seq_len, 256))
m_mask = torch.ones((msa_len, seq_len)).to(dtype=m.dtype)
z = torch.randn((seq_len, seq_len, 128))
z_mask = torch.ones((seq_len, seq_len)).to(dtype=z.dtype)
data = [m, z, m_mask, z_mask]
inputs = [copy.deepcopy(i).cuda() for i in data]
out = model(
*inputs, chunk_size=None, _mask_trans=config.model._mask_trans)
return fast_model, config, out, data
@pytest.mark.parametrize('world_size', [1, 2])
@pytest.mark.parametrize('chunk_size', [None, 1])
@pytest.mark.parametrize('inplace', [False, True])
def test_state_dict(world_size, chunk_size, inplace, get_module_and_output):
run_func = partial(_test_evoformer_stack, world_size=world_size, chunk_size=chunk_size,
inplace=inplace, get_module_and_output=get_module_and_output)
mp.spawn(run_func, nprocs=world_size)
def _test_evoformer_stack(rank, world_size, chunk_size, inplace, get_module_and_output):
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(world_size)
# init distributed for Dynamic Axial Parallelism
fastfold.distributed.init_dap()
fast_module, config, out, data = get_module_and_output
inputs = [copy.deepcopy(i).cuda() for i in data]
fast_module = copy.deepcopy(fast_module).eval().cuda()
with torch.no_grad():
set_chunk_size(chunk_size)
if not inplace:
m_fast, z_fast, s_fast = fast_module(
*inputs, chunk_size=chunk_size, _mask_trans=config.model._mask_trans)
else:
m_fast, z_fast, s_fast = fast_module.inplace(
[inputs[0]], [inputs[1]], inputs[2], inputs[3], chunk_size=chunk_size, _mask_trans=config.model._mask_trans)
m_fast = m_fast[0]
z_fast = z_fast[0]
error = torch.mean(torch.abs(out[0].cuda() - m_fast))
assert error < 2e-3, f"Test m failed at chunk size: {chunk_size}, inplace: {inplace}. The position dif is {error}"
error = torch.mean(torch.abs(out[1].cuda() - z_fast))
assert error < 2e-3, f"Test z failed at chunk size: {chunk_size}, inplace: {inplace}. The position dif is {error}"
error = torch.mean(torch.abs(out[2].cuda() - s_fast))
assert error < 2e-3, f"Test s failed at chunk size: {chunk_size}, inplace: {inplace}. The position dif is {error}"
import torch
import pytest
import os
import copy
import torch.multiprocessing as mp
from functools import partial
import fastfold
from fastfold.config import model_config
from fastfold.model.fastnn.ops import set_chunk_size
from fastfold.model.hub import AlphaFold
from fastfold.utils.inject_fastnn import inject_fastnn
from fastfold.utils.import_weights import import_jax_weights_
from fastfold.utils.test_utils import get_param_path
@pytest.fixture(scope="module")
def get_openfold_module_and_data():
with torch.no_grad():
config = model_config('model_1')
config.globals.inplace = False
target_module = AlphaFold(config)
import_jax_weights_(target_module, get_param_path())
fast_module = copy.deepcopy(target_module)
fast_module = inject_fastnn(fast_module)
fast_module = fast_module.extra_msa_stack
fast_module = fast_module.cuda().eval()
extra_msa_len = 300
seq_len = 64
m = torch.randn((extra_msa_len, seq_len, 64)).cuda()
m_mask = torch.ones((extra_msa_len, seq_len)).cuda().to(dtype=m.dtype)
m_mask[64:, :] = 0.
z = torch.randn((seq_len, seq_len, 128)).cuda()
z_mask = torch.ones((seq_len, seq_len)).cuda().to(dtype=z.dtype)
data = [m, z, m_mask, z_mask]
inputs = [copy.deepcopy(i).cuda() for i in data]
target_module = target_module.extra_msa_stack
target_module = target_module.eval().cuda()
z_out = target_module(
inputs[0], inputs[1], msa_mask=inputs[2], pair_mask=inputs[3], chunk_size=None, _mask_trans=config.model._mask_trans)
return z_out, config, fast_module, data
@pytest.mark.parametrize('world_size', [1, 2])
@pytest.mark.parametrize('chunk_size', [None, 32])
@pytest.mark.parametrize('inplace', [False, True])
def test_state_dict(world_size, chunk_size, inplace, get_openfold_module_and_data):
run_func = partial(_test_extramsa_stack, world_size=world_size, chunk_size=chunk_size, inplace=inplace,
get_openfold_module_and_data=get_openfold_module_and_data)
mp.spawn(run_func, nprocs=world_size)
def _test_extramsa_stack(rank, world_size, chunk_size, inplace, get_openfold_module_and_data):
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(world_size)
# init distributed for Dynamic Axial Parallelism
fastfold.distributed.init_dap()
z_out, config, fast_module, data = get_openfold_module_and_data
inputs = [copy.deepcopy(i).cuda() for i in data]
fast_module = copy.deepcopy(fast_module).eval().cuda()
with torch.no_grad():
set_chunk_size(chunk_size)
if not inplace:
z_fast = fast_module(
inputs[0], inputs[1], msa_mask=inputs[2], pair_mask=inputs[3], chunk_size=chunk_size, _mask_trans=config.model._mask_trans)
else:
z_fast = fast_module.inplace(
[inputs[0]], [inputs[1]], msa_mask=inputs[2], pair_mask=inputs[3], chunk_size=chunk_size, _mask_trans=config.model._mask_trans)
z_fast = z_fast[0]
error = torch.mean(torch.abs(z_out.cuda() - z_fast))
assert error < 1e-3, f"Test z failed at chunk size: {chunk_size}, inplace: {inplace}. The position dif is {error}"
import torch
from fastfold.model.fastnn.kernel import LayerNorm as FastLayerNorm
from fastfold.model.fastnn.kernel.layer_norm import FusedLayerNormAffineFunction
triton = True
try:
from fastfold.model.fastnn.kernel.layer_norm import LayerNormTritonFunc
except ImportError:
print("Skip triton layernorm test!")
triton = False
def test_layernorm():
# [batch, dim]
test_shape = [[64, 64], [64, 128], [64, 129], [64, 1024]]
test_dtype = [torch.float32, torch.float16, torch.bfloat16]
test_device = torch.device("cuda")
tolerance_eps = {torch.float32: 10e-5, torch.float16: 10e-2, torch.bfloat16: 10e-2}
for shape in test_shape:
for dtype in test_dtype:
sample_input = torch.rand(shape).to(device=test_device,
dtype=dtype).requires_grad_(False)
dim_ = sample_input.size()[-1]
torch_module = torch.nn.LayerNorm(normalized_shape=dim_).to(device=test_device,
dtype=dtype)
fastnn_cuda_module = FastLayerNorm(normalized_shape=dim_).to(device=test_device, dtype=dtype)
if triton:
fastnn_triton_module = FastLayerNorm(normalized_shape=dim_).to(device=test_device, dtype=dtype)
# Forward
torch_out = torch_module(sample_input)
fastnn_cuda_out = FusedLayerNormAffineFunction.apply(sample_input, fastnn_cuda_module.weight, fastnn_cuda_module.bias,
fastnn_cuda_module.normalized_shape, fastnn_cuda_module.eps)
forward_error = torch.max(torch.abs(torch_out - fastnn_cuda_out)).cpu().item()
assert forward_error < tolerance_eps[dtype], f"Error when {shape} {dtype}"
if triton:
fastnn_triton_out = LayerNormTritonFunc.apply(sample_input, fastnn_triton_module.normalized_shape, fastnn_triton_module.weight,
fastnn_triton_module.bias, fastnn_triton_module.eps)
forward_error = torch.max(torch.abs(torch_out - fastnn_triton_out)).cpu().item()
assert forward_error < tolerance_eps[dtype], f"Error when {shape} {dtype}"
# Backward
out_grad = torch.rand_like(torch_out).requires_grad_(False)
torch_out.backward(out_grad)
fastnn_cuda_out.backward(out_grad)
backward_weight_error = torch.max(
torch.abs(torch_module.weight.grad - fastnn_cuda_module.weight.grad)).cpu().item()
assert backward_weight_error < tolerance_eps[dtype], f"Error when {shape} {dtype}"
backward_bias_error = torch.max(
torch.abs(torch_module.bias.grad - fastnn_cuda_module.bias.grad)).cpu().item()
assert backward_bias_error < tolerance_eps[dtype], f"Error when {shape} {dtype}"
if triton:
fastnn_triton_out.backward(out_grad)
backward_weight_error = torch.max(
torch.abs(torch_module.weight.grad - fastnn_triton_module.weight.grad)).cpu().item()
assert backward_weight_error < tolerance_eps[dtype], f"Error when {shape} {dtype}"
backward_bias_error = torch.max(
torch.abs(torch_module.bias.grad - fastnn_triton_module.bias.grad)).cpu().item()
assert backward_bias_error < tolerance_eps[dtype], f"Error when {shape} {dtype}"
if __name__ == "__main__":
test_layernorm()
import torch
import pytest
import os
import copy
import torch.multiprocessing as mp
from functools import partial
import fastfold
from fastfold.config import model_config
from fastfold.model.fastnn.ops import set_chunk_size
from fastfold.model.hub import AlphaFold
from fastfold.utils.inject_fastnn import inject_fastnn
from fastfold.utils.import_weights import import_jax_weights_
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from fastfold.distributed.comm import gather, scatter, row_to_col
from fastfold.utils.test_utils import get_param_path
@pytest.fixture(scope="module")
def get_openfold_module_and_data():
with torch.no_grad():
config = model_config('model_1')
config.globals.inplace = False
target_module = AlphaFold(config)
import_jax_weights_(target_module, get_param_path())
fast_module = copy.deepcopy(target_module)
fast_module = inject_fastnn(fast_module)
fast_module = fast_module.evoformer.blocks[0].msa.MSAColumnAttention.eval().cuda()
target_module = target_module.evoformer.blocks[0].msa_att_col.eval().cuda()
msa_len = 300
seq_len = 300
m = torch.randn((msa_len, seq_len, 256)).cuda()
m_mask = torch.ones((msa_len, seq_len)).cuda().to(dtype=m.dtype)
m_out = m + target_module(m, mask=m_mask, chunk_size=None)
return m_out, m, m_mask, fast_module
@pytest.mark.parametrize('world_size', [1, 2])
@pytest.mark.parametrize('chunk_size', [None, 32])
def test_state_dict(world_size, chunk_size, get_openfold_module_and_data):
run_func = partial(_test_msa_att_col, world_size=world_size, chunk_size=chunk_size, get_openfold_module_and_data=get_openfold_module_and_data)
mp.spawn(run_func, nprocs=world_size)
def _test_msa_att_col(rank, world_size, chunk_size, get_openfold_module_and_data):
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(world_size)
# init distributed for Dynamic Axial Parallelism
fastfold.distributed.init_dap()
m_out, m, m_mask, fast_module = get_openfold_module_and_data
fast_module = copy.deepcopy(fast_module).cuda()
fast_m = copy.deepcopy(m.cuda()).unsqueeze(0)
dap_size = gpc.get_world_size(ParallelMode.TENSOR)
seq_length = m_mask.cuda().size(-1)
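    # Pad the sequence dimension so it divides evenly across the DAP ranks; note this formula
    # always adds at least one extra position, even when seq_length is already divisible.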
padding_size = (int(seq_length / dap_size) + 1) * dap_size - seq_length
fast_m = torch.nn.functional.pad(fast_m, (0, 0, 0, padding_size))
fast_m = scatter(fast_m, dim=1)
fast_m_mask = copy.deepcopy(m_mask.cuda()).unsqueeze(0)
fast_m_mask = torch.nn.functional.pad(fast_m_mask, (0, padding_size))
with torch.no_grad():
set_chunk_size(chunk_size)
fast_m = row_to_col(fast_m)
fast_m_mask = scatter(fast_m_mask, dim=2)
m_fast = fast_module(fast_m, fast_m_mask)
m_fast = m_fast.squeeze(0)
m_fast = gather(m_fast, dim=1)
m_fast = m_fast[:, :-padding_size, :]
error = torch.max(torch.abs(m_out.cuda() - m_fast))
assert error < 1e-4, f"Test m failed at chunk size: {chunk_size}. The position dif is {error}"
import torch
import pytest
import os
import copy
import torch.multiprocessing as mp
from functools import partial
import fastfold
from fastfold.config import model_config
from fastfold.model.fastnn.ops import set_chunk_size
from fastfold.model.hub import AlphaFold
from fastfold.utils.inject_fastnn import inject_fastnn
from fastfold.utils.import_weights import import_jax_weights_
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from fastfold.utils.test_utils import get_param_path
from fastfold.distributed.comm import gather, scatter
@pytest.fixture(scope="module")
def get_openfold_module_and_data():
with torch.no_grad():
config = model_config('model_1')
config.globals.inplace = False
target_module = AlphaFold(config)
import_jax_weights_(target_module, get_param_path())
fast_module = copy.deepcopy(target_module)
fast_module = inject_fastnn(fast_module)
fast_module = fast_module.evoformer.blocks[0].msa.MSARowAttentionWithPairBias.eval().cuda()
target_module1 = target_module.evoformer.blocks[0].msa_att_row.eval().cuda()
target_module2 = target_module.evoformer.blocks[0].msa_dropout_layer.eval().cuda()
msa_len = 300
seq_len = 300
m = torch.randn((msa_len, seq_len, 256)).cuda()
m_mask = torch.ones((msa_len, seq_len)).cuda().to(dtype=m.dtype)
z = torch.randn((seq_len, seq_len, 128)).cuda()
z_mask = torch.ones((seq_len, seq_len)).cuda().to(dtype=z.dtype)
m_out = m + target_module2(target_module1(m, z=z, mask=m_mask, chunk_size=None))
return m_out, m, z, m_mask, z_mask, fast_module
@pytest.mark.parametrize('world_size', [1, 2])
@pytest.mark.parametrize('chunk_size', [None, 32])
def test_state_dict(world_size, chunk_size, get_openfold_module_and_data):
run_func = partial(_test_msa_att_row, world_size=world_size, chunk_size=chunk_size, get_openfold_module_and_data=get_openfold_module_and_data)
mp.spawn(run_func, nprocs=world_size)
def _test_msa_att_row(rank, world_size, chunk_size, get_openfold_module_and_data):
os.environ['RANK'] = str(rank)
os.environ['LOCAL_RANK'] = str(rank)
os.environ['WORLD_SIZE'] = str(world_size)
# init distributed for Dynamic Axial Parallelism
fastfold.distributed.init_dap()
m_out, m, z, m_mask, z_mask, fast_module = get_openfold_module_and_data
fast_module = copy.deepcopy(fast_module).cuda()
fast_m = copy.deepcopy(m.cuda()).unsqueeze(0)
fast_z = copy.deepcopy(z.cuda()).unsqueeze(0)
dap_size = gpc.get_world_size(ParallelMode.TENSOR)
seq_length = z_mask.cuda().size(-1)
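    # Pad m and z so the sequence dimension is divisible by the DAP world size before scattering.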
padding_size = (int(seq_length / dap_size) + 1) * dap_size - seq_length
fast_m = torch.nn.functional.pad(fast_m, (0, 0, 0, padding_size))
fast_z = torch.nn.functional.pad(fast_z, (0, 0, 0, padding_size, 0, padding_size))
fast_m = scatter(fast_m, dim=1)
fast_z = scatter(fast_z, dim=1)
fast_m_mask = copy.deepcopy(m_mask.cuda()).unsqueeze(0)
fast_m_mask = torch.nn.functional.pad(fast_m_mask, (0, padding_size))
with torch.no_grad():
set_chunk_size(chunk_size)
fast_m_mask = scatter(fast_m_mask.cuda(), dim=1)
m_fast = fast_module(fast_m.cuda(), fast_z.cuda(), fast_m_mask)
m_fast = m_fast.squeeze(0)
m_fast = gather(m_fast, dim=0)
m_fast = m_fast[:, :-padding_size, :]
error = torch.max(torch.abs(m_out.cuda() - m_fast))
assert error < 5e-5, f"Test m failed at chunk size: {chunk_size}. The position dif is {error}"