"examples/multimodal/components/prefill_worker.py" did not exist on "cab65e1a721d09b7977c2929d0be8e6b02c26ee4"
test_model.py 7.81 KB
Newer Older
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
import pickle
import torch
import torch.nn as nn
import numpy as np
import unittest
from openfold.config import model_config
from openfold.data import data_transforms
from openfold.model.model import AlphaFold
from openfold.utils.tensor_utils import tensor_tree_map
import tests.compare_utils as compare_utils
from tests.config import consts
from tests.data_utils import (
    random_template_feats,
    random_extra_msa_feats,
)

if compare_utils.alphafold_is_installed():
    alphafold = compare_utils.import_alphafold()
    import jax
    import haiku as hk


class TestModel(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
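        # Cache the reference AlphaFold submodules (multimer or monomer
        # variants) once per class so the comparison tests can reach them
        # via cls/self.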
        if compare_utils.alphafold_is_installed():
            if consts.is_multimer:
                cls.am_atom = alphafold.model.all_atom_multimer
                cls.am_fold = alphafold.model.folding_multimer
                cls.am_modules = alphafold.model.modules_multimer
                cls.am_rigid = alphafold.model.geometry
            else:
                cls.am_atom = alphafold.model.all_atom
                cls.am_fold = alphafold.model.folding
                cls.am_modules = alphafold.model.modules
                cls.am_rigid = alphafold.model.r3

    def test_dry_run(self):
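        # Dry-run smoke test: build a batch of random features and run a
        # single forward pass on the GPU; nothing is checked beyond "it runs".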
        n_seq = consts.n_seq
        n_templ = consts.n_templ
        n_res = consts.n_res
        n_extra_seq = consts.n_extra

        c = model_config(consts.model)
        c.model.evoformer_stack.no_blocks = 4  # no need to go overboard here
        c.model.evoformer_stack.blocks_per_ckpt = None  # don't want to set up
        # deepspeed for this test

        model = AlphaFold(c).cuda()
        model.eval()

        batch = {}
        tf = torch.randint(c.model.input_embedder.tf_dim - 1, size=(n_res,))
        batch["target_feat"] = nn.functional.one_hot(
            tf, c.model.input_embedder.tf_dim
        ).float()
        batch["aatype"] = torch.argmax(batch["target_feat"], dim=-1)
        batch["residue_index"] = torch.arange(n_res)

        batch["msa_feat"] = torch.rand((n_seq, n_res, c.model.input_embedder.msa_dim))
        t_feats = random_template_feats(n_templ, n_res)
        batch.update({k: torch.tensor(v) for k, v in t_feats.items()})
        extra_feats = random_extra_msa_feats(n_extra_seq, n_res)
        batch.update({k: torch.tensor(v) for k, v in extra_feats.items()})
        batch["msa_mask"] = torch.randint(
            low=0, high=2, size=(n_seq, n_res)
        ).float()
        batch["seq_mask"] = torch.randint(low=0, high=2, size=(n_res,)).float()
        batch.update(data_transforms.make_atom14_masks(batch))
        batch["no_recycling_iters"] = torch.tensor(2.)

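        # Multimer-only features; torch.randint(0, 1, ...) is all zeros,
        # i.e. a single chain / entity / symmetry group.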
        if consts.is_multimer:
            batch["asym_id"] = torch.randint(0, 1, size=(n_res,))
            batch["entity_id"] = torch.randint(0, 1, size=(n_res,))
            batch["sym_id"] = torch.randint(0, 1, size=(n_res,))
            batch["extra_deletion_matrix"] = torch.randint(0, 2, size=(n_extra_seq, n_res))

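        # Every feature gets a trailing recycling dimension, tiled out to the
        # configured maximum number of recycling iterations.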
        add_recycling_dims = lambda t: (
            t.unsqueeze(-1).expand(*t.shape, c.data.common.max_recycling_iters)
        )
        batch = tensor_tree_map(add_recycling_dims, batch)

        to_cuda_device = lambda t: t.cuda()
        batch = tensor_tree_map(to_cuda_device, batch)

        with torch.no_grad():
            out = model(batch)

    def test_dry_run_seqemb_mode(self):
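        # Same dry run, but in single-sequence (sequence-embedding) mode: the
        # "MSA" is just the query sequence and a precomputed per-residue
        # embedding (ESM-1b-sized here) is supplied via "seq_embedding".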
        n_seq = 1
        n_templ = consts.n_templ
        n_res = consts.n_res
        msa_dim = 49

        c = model_config("seq_model_esm1b")
        c.model.evoformer_stack.no_blocks = 2
        c.model.evoformer_stack.blocks_per_ckpt = None
        model = AlphaFold(c)
        model.to(torch.device('cuda'))
        model.eval()

        batch = {}
        tf = torch.randint(c.model.preembedding_embedder.tf_dim - 1, size=(n_res,))
        batch["target_feat"] = nn.functional.one_hot(tf, c.model.preembedding_embedder.tf_dim).float()
        batch["aatype"] = torch.argmax(batch["target_feat"], dim=-1)
        batch["residue_index"] = torch.arange(n_res)
        batch["msa_feat"] = torch.rand((n_seq, n_res, msa_dim))
        batch["seq_embedding"] = torch.rand((n_res, c.model.preembedding_embedder.preembedding_dim))

        t_feats = random_template_feats(n_templ, n_res)
        batch.update({k: torch.tensor(v) for k, v in t_feats.items()})

        batch["seq_mask"] = torch.randint(low=0, high=2, size=(n_res,)).float()
        batch.update(data_transforms.make_atom14_masks(batch))
        batch["msa_mask"] = torch.randint(low=0, high=2, size=(n_seq, n_res)).float()

        batch["no_recycling_iters"] = torch.tensor(2.)
        add_recycling_dims = lambda t: (
            t.unsqueeze(-1).expand(*t.shape, c.data.common.max_recycling_iters)
        )
        batch = tensor_tree_map(add_recycling_dims, batch)

        to_cuda_device = lambda t: t.to(torch.device("cuda"))
        batch = tensor_tree_map(to_cuda_device, batch)

        with torch.no_grad():
            out = model(batch)

    @compare_utils.skip_unless_alphafold_installed()
    @unittest.skipIf(consts.is_multimer, "Additional changes required for multimer.")
    def test_compare(self):
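        # Run the reference (Haiku/JAX) AlphaFold and pretrained OpenFold on
        # the same pickled feature batch and require the final atom positions
        # to agree to within 1e-3.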
        # TODO: Fix test data for multimer MSA features
        def run_alphafold(batch):
            config = compare_utils.get_alphafold_config()

            model = self.am_modules.AlphaFold(config.model)

            return model(
                batch=batch,
                is_training=False,
                return_representations=True,
            )

        f = hk.transform(run_alphafold)

        params = compare_utils.fetch_alphafold_module_weights("")

        fpath = Path(__file__).parent.resolve() / "test_data/sample_feats.pickle"
        with open(str(fpath), "rb") as fp:
            batch = pickle.load(fp)

        out_gt = f.apply(params, jax.random.PRNGKey(42), batch)

        out_gt = out_gt["structure_module"]["final_atom_positions"]
        # atom37_to_atom14 doesn't like batches
        batch["residx_atom14_to_atom37"] = batch["residx_atom14_to_atom37"][0]
        batch["atom14_atom_exists"] = batch["atom14_atom_exists"][0]

        out_gt = self.am_atom.atom37_to_atom14(out_gt, batch)
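        # block_until_ready() forces the asynchronous JAX computation to
        # finish before the result is converted to a torch tensor.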
        out_gt = torch.as_tensor(np.array(out_gt.block_until_ready()))

        batch["no_recycling_iters"] = np.array([3., 3., 3., 3.,])
        batch = {k: torch.as_tensor(v).cuda() for k, v in batch.items()}

        batch["aatype"] = batch["aatype"].long()
        batch["template_aatype"] = batch["template_aatype"].long()
        batch["extra_msa"] = batch["extra_msa"].long()
        batch["residx_atom37_to_atom14"] = batch[
            "residx_atom37_to_atom14"
        ].long()
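        # The reference batch stores the mask as "template_all_atom_masks";
        # OpenFold's transforms expect "template_all_atom_mask" and need the
        # template torsion-angle features added explicitly.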
        batch["template_all_atom_mask"] = batch["template_all_atom_masks"]
        batch.update(
            data_transforms.atom37_to_torsion_angles("template_")(batch)
        )

        # Move the recycling dimension to the end
        move_dim = lambda t: t.permute(*range(len(t.shape))[1:], 0)
        batch = tensor_tree_map(move_dim, batch)

        with torch.no_grad():
            model = compare_utils.get_global_pretrained_openfold()
            out_repro = model(batch)

        out_repro = tensor_tree_map(lambda t: t.cpu(), out_repro)

        out_repro = out_repro["sm"]["positions"][-1]
        out_repro = out_repro.squeeze(0)

        self.assertTrue(torch.max(torch.abs(out_gt - out_repro)) < 1e-3)