data_utils.py 5.41 KB
Newer Older
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
from random import randint
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
16
import numpy as np
17
from scipy.spatial.transform import Rotation
Geoffrey Yu's avatar
Geoffrey Yu committed
18
import pickle,os
19
from tests.config import consts
Geoffrey Yu's avatar
Geoffrey Yu committed
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import gzip

def process_label(all_atom_positions: np.ndarray, operation) -> np.ndarray:
    """
    Apply a symmetry operation to a set of atom coordinates.

    Adapted from unifold
    https://github.com/dptech-corp/Uni-Fold/blob/b1c89a2cebd4e4ee4c47b4e443f92beeb9138fbb/unifold/dataset.py#L55-L61

    Args:
        all_atom_positions: coordinate array whose last axis has length 3.
        operation: either the string "I" (identity) or a ``(rot, trans)``
            pair; ``rot`` must hold 9 values and ``trans`` 3 values.

    Returns:
        The input array itself for the identity operation, otherwise a new
        array equal to ``all_atom_positions @ rot.T + trans``.
    """
    # The identity operation hands back the very same object, not a copy.
    if operation == "I":
        return all_atom_positions

    rotation_values, translation_values = operation
    rotation_matrix = np.array(rotation_values).reshape(3, 3)
    translation_vector = np.array(translation_values).reshape(3)

    rotated = all_atom_positions @ rotation_matrix.T
    return rotated + translation_vector

def load_single_label(
    label_id,label_dir,
    symmetry_operation=None,
):
    """
    Load one gzipped, pickled label file and keep only the core label fields.

    Adapted from unifold
    https://github.com/dptech-corp/Uni-Fold/blob/b1c89a2cebd4e4ee4c47b4e443f92beeb9138fbb/unifold/dataset.py#L101-L116

    Args:
        label_id: identifier used to build the file name
            ``{label_id}.label.pkl.gz``.
        label_dir: directory that contains the label file.
        symmetry_operation: optional operation forwarded to ``process_label``
            and applied to ``all_atom_positions``; skipped when ``None``.

    Returns:
        A dict restricted to the keys "aatype", "all_atom_positions",
        "all_atom_mask" and "resolution" (those present in the file).
    """
    label_path = os.path.join(label_dir, f"{label_id}.label.pkl.gz")
    # NOTE: unpickling can execute arbitrary code; only load label files
    # that come from a trusted source.
    # The context manager guarantees the gzip handle is closed even if
    # unpickling raises.
    with gzip.open(label_path, "rb") as label_file:
        label = pickle.load(label_file)

    if symmetry_operation is not None:
        label["all_atom_positions"] = process_label(
            label["all_atom_positions"], symmetry_operation
        )

    kept_keys = ("aatype", "all_atom_positions", "all_atom_mask", "resolution")
    return {k: v for k, v in label.items() if k in kept_keys}

def load_labels(label_dir,label_ids:list):
    """
    Load the label dict of every chain listed in ``label_ids``.

    Args:
        label_dir: directory that contains the ``*.label.pkl.gz`` files.
        label_ids: identifiers of the chains to load, one file per id.

    Returns:
        A list with one label dict per entry of ``label_ids``, in the same
        order, as produced by ``load_single_label``.
    """
    # for now suppose there are NO symmetry operations
    symmetry_operations = ["I" for _ in label_ids]
    all_chain_labels = [
        load_single_label(label_id, label_dir, operation)
        for label_id, operation in zip(label_ids, symmetry_operations)
    ]
    # Hand the loaded labels back to the caller; previously the list was
    # built and then dropped, so the function returned None.
    return all_chain_labels

65
66
67
68
69
70
71
72
73
74
75
76


def random_asym_ids(n_res, split_chains=True, min_chain_len=4):
    """
    Build random per-residue chain (asym) ids for ``n_res`` residues.

    Args:
        n_res: total number of residues.
        split_chains: when False, a plain list of ``n_res`` zeros is returned.
        min_chain_len: lower bound used when drawing each chain length.

    Returns:
        ``[0] * n_res`` (a list) if ``split_chains`` is False, otherwise an
        int64 numpy array of length ``n_res`` holding consecutive runs of
        chain indices.
    """
    # The chain count is drawn before the split_chains check, so the RNG is
    # consumed in multimer mode even when no split is requested.
    if consts.is_multimer:
        n_chain = randint(1, n_res // min_chain_len)
    else:
        n_chain = 1

    if not split_chains:
        return [0] * n_res

    assert n_res >= n_chain

    ids = []
    assigned = 0  # residues already handed to earlier chains
    last_chain = n_chain - 1
    for chain in range(n_chain - 1):
        upper = n_res - assigned - n_chain + chain - min_chain_len
        if upper <= min_chain_len:
            # Not enough room for another chain of random length: the
            # current index becomes the final chain and takes the remainder.
            last_chain = chain
            break
        length = randint(min_chain_len, upper)
        assigned += length
        ids += [chain] * length

    # Whatever is left goes to the final chain.
    ids += [last_chain] * (n_res - assigned)

    return np.array(ids).astype(np.int64)

Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
90
91
92

def random_template_feats(n_templ, n, batch_size=None):
    """
    Generate a dict of random template features.

    Args:
        n_templ: number of templates.
        n: number of residues.
        batch_size: optional leading batch dimension; omitted when ``None``.

    Returns:
        A dict of numpy arrays. Every entry is float32 except
        "template_aatype", which is int64. When ``consts.is_multimer`` is
        set, an "asym_id" entry of shape ``(*batch, n_templ, n)`` is added.
    """
    lead = () if batch_size is None else (batch_size,)
    templ_shape = (*lead, n_templ)
    res_shape = (*templ_shape, n)

    # The draws are kept in this exact order so that a seeded numpy RNG
    # yields the same values as before.
    raw = {
        "template_mask": np.random.randint(0, 2, templ_shape),
        "template_pseudo_beta_mask": np.random.randint(0, 2, res_shape),
        "template_pseudo_beta": np.random.rand(*res_shape, 3),
        "template_aatype": np.random.randint(0, 22, res_shape),
        "template_all_atom_mask": np.random.randint(0, 2, (*res_shape, 37)),
        "template_all_atom_positions": np.random.rand(*res_shape, 37, 3) * 10,
        "template_torsion_angles_sin_cos": np.random.rand(*res_shape, 7, 2),
        "template_alt_torsion_angles_sin_cos": np.random.rand(*res_shape, 7, 2),
        "template_torsion_angles_mask": np.random.rand(*res_shape, 7),
    }

    batch = {}
    for name, values in raw.items():
        batch[name] = values.astype(np.float32)
    # Residue types are the only integer-typed feature.
    batch["template_aatype"] = batch["template_aatype"].astype(np.int64)

    if consts.is_multimer:
        asym_ids = np.array(random_asym_ids(n))
        # Repeat the same chain assignment for every template (and batch).
        batch["asym_id"] = np.tile(asym_ids[np.newaxis, :], (*lead, n_templ, 1))

    return batch

121

Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
122
123
def random_extra_msa_feats(n_extra, n, batch_size=None):
    """
    Generate a dict of random extra-MSA features.

    Args:
        n_extra: number of extra MSA sequences.
        n: number of residues.
        batch_size: optional leading batch dimension; omitted when ``None``.

    Returns:
        A dict with "extra_msa" (int64, values in [0, 22)) and the float32
        arrays "extra_has_deletion", "extra_deletion_value" and
        "extra_msa_mask", each of shape ``(*batch, n_extra, n)``.
    """
    if batch_size is None:
        shape = (n_extra, n)
    else:
        shape = (batch_size, n_extra, n)

    # Draw order matches the key order of the returned dict.
    msa = np.random.randint(0, 22, shape).astype(np.int64)
    has_deletion = np.random.randint(0, 2, shape).astype(np.float32)
    deletion_value = np.random.rand(*shape).astype(np.float32)
    msa_mask = np.random.randint(0, 2, shape).astype(np.float32)

    return {
        "extra_msa": msa,
        "extra_has_deletion": has_deletion,
        "extra_deletion_value": deletion_value,
        "extra_msa_mask": msa_mask,
    }
141
142


143
def random_affines_vector(dim):
    """
    Generate random affine transforms in 7-vector form.

    Each transform is a unit quaternion (first 4 entries, as returned by
    ``Rotation.as_quat``) followed by a translation (last 3 entries) drawn
    uniformly from [0, 1).

    Args:
        dim: iterable of ints giving the leading (batch) dimensions.

    Returns:
        A float32 array of shape ``(*dim, 7)``.
    """
    prod_dim = int(np.prod(dim))

    affines = np.zeros((prod_dim, 7), dtype=np.float32)

    if prod_dim > 0:
        # Draw every rotation from a single seeded call. Seeding inside a
        # per-element loop restarted the generator each time, so all
        # elements received the identical rotation. This keeps the rotations
        # reproducible while making them differ from one another.
        affines[:, :4] = Rotation.random(prod_dim, random_state=42).as_quat()
        # Translations still come from the global numpy RNG.
        affines[:, 4:] = np.random.rand(prod_dim, 3)

    return affines.reshape(*dim, 7)


159
def random_affines_4x4(dim):
    """
    Generate random affine transforms as homogeneous 4x4 matrices.

    The upper-left 3x3 block is a rotation matrix, the first three entries
    of the last column are a translation drawn uniformly from [0, 1), and
    the bottom row is ``[0, 0, 0, 1]``.

    Args:
        dim: iterable of ints giving the leading (batch) dimensions.

    Returns:
        A float32 array of shape ``(*dim, 4, 4)``.
    """
    prod_dim = int(np.prod(dim))

    affines = np.zeros((prod_dim, 4, 4), dtype=np.float32)

    if prod_dim > 0:
        # Draw every rotation from a single seeded call. Seeding inside a
        # per-element loop restarted the generator each time, so all
        # elements received the identical rotation. This keeps the rotations
        # reproducible while making them differ from one another.
        affines[:, :3, :3] = Rotation.random(
            prod_dim, random_state=42
        ).as_matrix()
        # Translations still come from the global numpy RNG.
        affines[:, :3, 3] = np.random.rand(prod_dim, 3)

    affines[:, 3, 3] = 1

    return affines.reshape(*dim, 4, 4)