data_modules.py 24.4 KB
Newer Older
1
import copy
2
3
4
5
from functools import partial
import json
import logging
import os
6
import pickle
Gustaf Ahdritz's avatar
Fixes  
Gustaf Ahdritz committed
7
from typing import Optional, Sequence, List, Any
8
9

import ml_collections as mlc
Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
10
import numpy as np
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pytorch_lightning as pl
import torch
from torch.utils.data import RandomSampler

from openfold.data import (
    data_pipeline,
    feature_pipeline,
    mmcif_parsing,
    templates,
)
from openfold.utils.tensor_utils import tensor_tree_map, dict_multimap


class OpenFoldSingleDataset(torch.utils.data.Dataset):
    """
        A dataset serving featurized single chains from one data directory.

        In "train"/"eval" mode, samples are parsed from .cif/.core/.pdb
        files; in "predict" mode, from .fasta files.
    """
    def __init__(self,
        data_dir: str,
        alignment_dir: str, 
        template_mmcif_dir: str,
        max_template_date: str,
        config: mlc.ConfigDict,
        kalign_binary_path: str = '/usr/bin/kalign',
        max_template_hits: int = 4,
        obsolete_pdbs_file_path: Optional[str] = None,
        template_release_dates_cache_path: Optional[str] = None,
        shuffle_top_k_prefiltered: Optional[int] = None,
        treat_pdb_as_distillation: bool = True,
        mapping_path: Optional[str] = None,
        mode: str = "train", 
        _output_raw: bool = False,
        _alignment_index: Optional[Any] = None,
    ):
        """
            Args:
                data_dir:
                    A path to a directory containing mmCIF files (in train
                    mode) or FASTA files (in inference mode).
                alignment_dir:
                    A path to a directory containing only data in the format 
                    output by an AlignmentRunner 
                    (defined in openfold.features.alignment_runner).
                    I.e. a directory of directories named {PDB_ID}_{CHAIN_ID}
                    or simply {PDB_ID}, each containing .a3m, .sto, and .hhr
                    files.
                template_mmcif_dir:
                    Path to a directory containing template mmCIF files.
                config:
                    A dataset config object. See openfold.config
                kalign_binary_path:
                    Path to kalign binary.
                max_template_hits:
                    An upper bound on how many templates are considered. During
                    training, the templates ultimately used are subsampled
                    from this total quantity.
                template_release_dates_cache_path:
                    Path to the output of scripts/generate_mmcif_cache.
                obsolete_pdbs_file_path:
                    Path to the file containing replacements for obsolete PDBs.
                shuffle_top_k_prefiltered:
                    Whether to uniformly shuffle the top k template hits before
                    parsing max_template_hits of them. Can be used to
                    approximate DeepMind's training-time template subsampling
                    scheme much more performantly.
                treat_pdb_as_distillation:
                    Whether to assume that .pdb files in the data_dir are from
                    the self-distillation set (and should be subjected to
                    special distillation set preprocessing steps).
                mapping_path:
                    Optional path to a file listing one chain ID per line.
                    When omitted, chain IDs are taken from alignment_dir.
                mode:
                    "train", "eval", or "predict"
        """
        super(OpenFoldSingleDataset, self).__init__()
        self.data_dir = data_dir
        self.alignment_dir = alignment_dir
        self.config = config
        self.treat_pdb_as_distillation = treat_pdb_as_distillation
        self.mode = mode
        self._output_raw = _output_raw
        self._alignment_index = _alignment_index

        valid_modes = ["train", "eval", "predict"]
        if(mode not in valid_modes):
            raise ValueError(f'mode must be one of {valid_modes}')

        if(template_release_dates_cache_path is None):
            logging.warning(
                "Template release dates cache does not exist. Remember to run "
                "scripts/generate_mmcif_cache.py before running OpenFold"
            )

        # The alignment index, when given, doubles as the list of chain IDs
        if(_alignment_index is not None):
            self._chain_ids = list(_alignment_index.keys())
        elif(mapping_path is None):
            self._chain_ids = list(os.listdir(alignment_dir))
        else:
            with open(mapping_path, "r") as f:
                self._chain_ids = [l.strip() for l in f.readlines()]
        
        self._chain_id_to_idx_dict = {
            chain: i for i, chain in enumerate(self._chain_ids)
        }

        template_featurizer = templates.TemplateHitFeaturizer(
            mmcif_dir=template_mmcif_dir,
            max_template_date=max_template_date,
            max_hits=max_template_hits,
            kalign_binary_path=kalign_binary_path,
            release_dates_path=template_release_dates_cache_path,
            obsolete_pdbs_path=obsolete_pdbs_file_path,
            _shuffle_top_k_prefiltered=shuffle_top_k_prefiltered,
        )

        self.data_pipeline = data_pipeline.DataPipeline(
            template_featurizer=template_featurizer,
        )

        # When raw output is requested, featurization is deferred to the
        # collator, so no feature pipeline is needed here
        if(not self._output_raw):
            self.feature_pipeline = feature_pipeline.FeaturePipeline(config) 

    def _parse_mmcif(self, path, file_id, chain_id, alignment_dir, _alignment_index):
        """Parse one mmCIF file and run it through the data pipeline."""
        with open(path, 'r') as f:
            mmcif_string = f.read()

        mmcif_object = mmcif_parsing.parse(
            file_id=file_id, mmcif_string=mmcif_string
        )

        # Crash if an error is encountered. Any parsing errors should have
        # been dealt with at the alignment stage.
        if(mmcif_object.mmcif_object is None):
            raise list(mmcif_object.errors.values())[0]

        mmcif_object = mmcif_object.mmcif_object

        data = self.data_pipeline.process_mmcif(
            mmcif=mmcif_object,
            alignment_dir=alignment_dir,
            chain_id=chain_id,
            _alignment_index=_alignment_index,
        )

        return data

    def chain_id_to_idx(self, chain_id):
        return self._chain_id_to_idx_dict[chain_id]

    def idx_to_chain_id(self, idx):
        return self._chain_ids[idx]

    def __getitem__(self, idx):
        name = self.idx_to_chain_id(idx)
        alignment_dir = os.path.join(self.alignment_dir, name)

        _alignment_index = None
        if(self._alignment_index is not None):
            # All alignments live in one flat file; pick this chain's slice
            alignment_dir = self.alignment_dir
            _alignment_index = self._alignment_index[name]

        if(self.mode == 'train' or self.mode == 'eval'):
            # Chain names look like {file_id}_{chain_id} or just {file_id}
            spl = name.rsplit('_', 1)
            if(len(spl) == 2):
                file_id, chain_id = spl
            else:
                file_id, = spl
                chain_id = None

            path = os.path.join(self.data_dir, file_id)
            if(os.path.exists(path + ".cif")):
                data = self._parse_mmcif(
                    path + ".cif", file_id, chain_id, alignment_dir, _alignment_index,
                )
            elif(os.path.exists(path + ".core")):
                data = self.data_pipeline.process_core(
                    path + ".core", alignment_dir, _alignment_index,
                )
            elif(os.path.exists(path + ".pdb")):
                data = self.data_pipeline.process_pdb(
                    pdb_path=path + ".pdb",
                    alignment_dir=alignment_dir,
                    is_distillation=self.treat_pdb_as_distillation,
                    chain_id=chain_id,
                    _alignment_index=_alignment_index,
                )
            else:
                raise ValueError("Invalid file type")
        else:
            # BUG FIX: the FASTA lives in data_dir; the previous code joined
            # the chain name with itself, producing a nonexistent path
            path = os.path.join(self.data_dir, name + ".fasta")
            data = self.data_pipeline.process_fasta(
                fasta_path=path,
                alignment_dir=alignment_dir,
                _alignment_index=_alignment_index,
            )

        if(self._output_raw):
            return data

        feats = self.feature_pipeline.process_features(
            data, self.mode 
        )

        return feats

    def __len__(self):
        return len(self._chain_ids) 


Gustaf Ahdritz's avatar
Gustaf Ahdritz committed
215
def deterministic_train_filter(
    prot_data_cache_entry: Any,
    max_resolution: float = 9.,
    max_single_aa_prop: float = 0.8,
) -> bool:
    """
        Applies AlphaFold's deterministic (hard) training filters.

        Args:
            prot_data_cache_entry:
                A dict-like cache entry with at least a "seq" key and an
                optional "resolution" key.
            max_resolution:
                Entries with a larger (worse) resolution are rejected.
            max_single_aa_prop:
                Entries in which a single amino acid accounts for more than
                this fraction of the sequence are rejected.
        Returns:
            True if the entry survives all hard filters.
    """
    from collections import Counter

    # Hard filters
    resolution = prot_data_cache_entry.get("resolution", None)
    if resolution is not None and resolution > max_resolution:
        return False

    # Reject low-complexity sequences dominated by a single residue type
    seq = prot_data_cache_entry["seq"]
    largest_single_aa_prop = max(Counter(seq).values()) / len(seq)
    if largest_single_aa_prop > max_single_aa_prop:
        return False

    return True


def get_stochastic_train_filter_prob(
    prot_data_cache_entry: Any,
) -> float:
    """
        Computes the probability with which a chain is accepted by
        AlphaFold's stochastic training filters.

        Args:
            prot_data_cache_entry:
                A dict-like cache entry with a "seq" key and an optional
                "cluster_size" key.
        Returns:
            The product of the individual filter probabilities.
            (BUG FIX: the return annotation previously claimed List[float],
            but a scalar product has always been returned.)
    """
    # Stochastic filters
    probabilities = []
    
    # Inverse cluster-size weighting
    cluster_size = prot_data_cache_entry.get("cluster_size", None)
    if(cluster_size is not None and cluster_size > 0):
        probabilities.append(1 / cluster_size)
    
    # Length-based weighting, clamped to the [256, 512] residue window
    chain_length = len(prot_data_cache_entry["seq"])
    probabilities.append((1 / 512) * (max(min(chain_length, 512), 256)))

    # Risk of underflow here?
    out = 1
    for p in probabilities:
        out *= p

    return out
257
258


Gustaf Ahdritz's avatar
Fixes  
Gustaf Ahdritz committed
259
class OpenFoldDataset(torch.utils.data.Dataset):
    """
        Implements the stochastic filters applied during AlphaFold's training.
        Because samples are selected from constituent datasets randomly, the
        length of an OpenFoldFilteredDataset is arbitrary. Samples are selected
        and filtered once at initialization.
    """
    def __init__(self,
        datasets: Sequence[OpenFoldSingleDataset],
        probabilities: Sequence[int],
        epoch_len: int,
        prot_data_cache_paths: List[str],
        generator: torch.Generator = None,
        _roll_at_init: bool = True,
    ):
        self.datasets = datasets
        self.probabilities = probabilities
        self.epoch_len = epoch_len
        self.generator = generator

        # One protein data cache (JSON) per constituent dataset
        self.prot_data_caches = []
        for path in prot_data_cache_paths:
            with open(path, "r") as fp:
                self.prot_data_caches.append(json.load(fp))

        def shuffled_idx_stream(dataset_len):
            # Endlessly yields indices of the dataset, one uniformly
            # shuffled permutation at a time
            while True:
                uniform = [1. for _ in range(dataset_len)]
                permutation = torch.multinomial(
                    torch.tensor(uniform),
                    num_samples=dataset_len,
                    replacement=False,
                    generator=self.generator,
                )
                yield from permutation

        def filtered_sample_stream(dataset_idx):
            # Endlessly yields dataset indices that survive both the
            # deterministic and the stochastic training filters
            max_cache_len = int(epoch_len * probabilities[dataset_idx])
            dataset = self.datasets[dataset_idx]
            idx_iter = shuffled_idx_stream(len(dataset))
            prot_data_cache = self.prot_data_caches[dataset_idx]
            while True:
                bernoulli_weights = []
                candidates = []
                for _ in range(max_cache_len):
                    candidate_idx = next(idx_iter)
                    chain_id = dataset.idx_to_chain_id(candidate_idx)
                    entry = prot_data_cache[chain_id]
                    # Hard filters first
                    if not deterministic_train_filter(entry):
                        continue

                    # Each survivor is kept with probability p
                    p = get_stochastic_train_filter_prob(
                        entry,
                    )
                    bernoulli_weights.append([1. - p, p])
                    candidates.append(candidate_idx)

                # One Bernoulli draw per candidate (column 1 == "keep")
                keep = torch.multinomial(
                    torch.tensor(bernoulli_weights),
                    num_samples=1,
                    generator=self.generator,
                )
                keep = keep.squeeze()

                accepted = [i for i, kept in zip(candidates, keep) if kept]

                yield from accepted

        self._samples = [
            filtered_sample_stream(i) for i in range(len(self.datasets))
        ]

        if _roll_at_init:
            self.reroll()

    def __getitem__(self, idx):
        dataset_idx, datapoint_idx = self.datapoints[idx]
        return self.datasets[dataset_idx][datapoint_idx]

    def __len__(self):
        return self.epoch_len

    def reroll(self):
        """Re-draws the epoch's samples from the constituent datasets."""
        # For each slot in the epoch, choose which dataset it comes from
        dataset_choices = torch.multinomial(
            torch.tensor(self.probabilities),
            num_samples=self.epoch_len,
            replacement=True,
            generator=self.generator,
        )

        self.datapoints = [
            (dataset_idx, next(self._samples[dataset_idx]))
            for dataset_idx in dataset_choices
        ]

356
357

class OpenFoldBatchCollator:
    """Featurizes raw protein dicts and stacks them into a single batch."""
    def __init__(self, config, stage="train"):
        self.stage = stage
        self.feature_pipeline = feature_pipeline.FeaturePipeline(config)

    def __call__(self, raw_prots):
        # Featurize each raw protein individually...
        processed_prots = [
            self.feature_pipeline.process_features(prot, self.stage)
            for prot in raw_prots
        ]

        # ...then stack corresponding tensors along a new batch dimension
        stack_fn = partial(torch.stack, dim=0)
        return dict_multimap(stack_fn, processed_prots) 


class OpenFoldDataLoader(torch.utils.data.DataLoader):
    """
        A DataLoader that stamps stochastic per-batch properties
        (clamped-FAPE flag, number of recycling iterations) onto each batch.
    """
    def __init__(self, *args, config, stage="train", generator=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.config = config
        self.stage = stage    

        if generator is None:
            generator = torch.Generator()
        
        self.generator = generator
        self._prep_batch_properties_probs()

    def _prep_batch_properties_probs(self):
        """Builds the categorical distributions for the batch properties."""
        keyed_probs = []
        stage_cfg = self.config[self.stage]

        max_iters = self.config.common.max_recycling_iters

        # FAPE clamping is only relevant when labels are available
        if stage_cfg.supervised:
            clamp_prob = self.config.supervised.clamp_prob
            keyed_probs.append(
                ("use_clamped_fape", [1 - clamp_prob, clamp_prob])
            )
        
        if stage_cfg.uniform_recycling:
            # Each recycling count in [0, max_iters] is equally likely
            recycling_probs = [
                1. / (max_iters + 1) for _ in range(max_iters + 1)
            ]
        else:
            # Always run the maximum number of recycling iterations
            recycling_probs = [0. for _ in range(max_iters + 1)]
            recycling_probs[-1] = 1.
        
        keyed_probs.append(
            ("no_recycling_iters", recycling_probs)
        )

        # Pad the rows to a common length so they fit one tensor
        keys, probs = zip(*keyed_probs)
        longest = max(len(p) for p in probs)
        padding = [[0.] * (longest - len(p)) for p in probs] 
        
        self.prop_keys = keys
        self.prop_probs_tensor = torch.tensor(
            [p + pad for p, pad in zip(probs, padding)],
            dtype=torch.float32,
        )

    def _add_batch_properties(self, batch):
        """Samples the batch properties and writes them into the batch."""
        samples = torch.multinomial(
            self.prop_probs_tensor,
            num_samples=1, # 1 per row
            replacement=True,
            generator=self.generator
        )

        aatype = batch["aatype"]
        batch_dims = aatype.shape[:-2]
        recycling_dim = aatype.shape[-1]
        no_recycling = recycling_dim
        for i, key in enumerate(self.prop_keys):
            sample = int(samples[i][0])
            sample_tensor = torch.tensor(
                sample, 
                device=aatype.device, 
                requires_grad=False
            )
            # Broadcast the scalar over the batch and recycling dimensions
            orig_shape = sample_tensor.shape
            sample_tensor = sample_tensor.view(
                (1,) * len(batch_dims) + sample_tensor.shape + (1,)
            )
            sample_tensor = sample_tensor.expand(
                batch_dims + orig_shape + (recycling_dim,)
            )
            batch[key] = sample_tensor

            if key == "no_recycling_iters":
                no_recycling = sample 
        
        # Trim the recycling dimension down to the sampled iteration count
        resample_recycling = lambda t: t[..., :no_recycling + 1]
        batch = tensor_tree_map(resample_recycling, batch)

        return batch

    def __iter__(self):
        it = super().__iter__()

        def _batch_prop_gen(iterator):
            for batch in iterator:
                yield self._add_batch_properties(batch)

        return _batch_prop_gen(it)
465
466
467
468
469
470
471
472
473


class OpenFoldDataModule(pl.LightningDataModule):
    """
        Builds the training/validation/prediction datasets and wraps them
        in OpenFoldDataLoaders.
    """
    def __init__(self,
        config: mlc.ConfigDict,
        template_mmcif_dir: str,
        max_template_date: str,
        train_data_dir: Optional[str] = None,
        train_alignment_dir: Optional[str] = None,
        train_prot_data_cache_path: Optional[str] = None,
        distillation_data_dir: Optional[str] = None,
        distillation_alignment_dir: Optional[str] = None,
        distillation_prot_data_cache_path: Optional[str] = None,
        val_data_dir: Optional[str] = None,
        val_alignment_dir: Optional[str] = None,
        predict_data_dir: Optional[str] = None,
        predict_alignment_dir: Optional[str] = None,
        kalign_binary_path: str = '/usr/bin/kalign',
        train_mapping_path: Optional[str] = None,
        distillation_mapping_path: Optional[str] = None,
        obsolete_pdbs_file_path: Optional[str] = None,
        template_release_dates_cache_path: Optional[str] = None,
        batch_seed: Optional[int] = None,
        train_epoch_len: int = 50000, 
        _alignment_index_path: Optional[str] = None,
        **kwargs
    ):
        """
            Training mode is enabled by providing train_data_dir; otherwise
            the module runs in inference mode and requires predict_data_dir.
            Most arguments are forwarded verbatim to OpenFoldSingleDataset.
        """
        super(OpenFoldDataModule, self).__init__()

        self.config = config
        self.template_mmcif_dir = template_mmcif_dir
        self.max_template_date = max_template_date
        self.train_data_dir = train_data_dir
        self.train_alignment_dir = train_alignment_dir
        self.train_prot_data_cache_path = train_prot_data_cache_path
        self.distillation_data_dir = distillation_data_dir
        self.distillation_alignment_dir = distillation_alignment_dir
        self.distillation_prot_data_cache_path = (
            distillation_prot_data_cache_path
        )
        self.val_data_dir = val_data_dir
        self.val_alignment_dir = val_alignment_dir
        self.predict_data_dir = predict_data_dir
        self.predict_alignment_dir = predict_alignment_dir
        self.kalign_binary_path = kalign_binary_path
        self.train_mapping_path = train_mapping_path
        self.distillation_mapping_path = distillation_mapping_path
        self.template_release_dates_cache_path = (
            template_release_dates_cache_path
        )
        self.obsolete_pdbs_file_path = obsolete_pdbs_file_path
        self.batch_seed = batch_seed
        self.train_epoch_len = train_epoch_len

        if(self.train_data_dir is None and self.predict_data_dir is None):
            raise ValueError(
                'At least one of train_data_dir or predict_data_dir must be '
                'specified'
            )

        self.training_mode = self.train_data_dir is not None

        if(self.training_mode and train_alignment_dir is None):
            raise ValueError(
                'In training mode, train_alignment_dir must be specified'
            )
        elif(not self.training_mode and predict_alignment_dir is None):
            raise ValueError(
                'In inference mode, predict_alignment_dir must be specified'
            )      
        elif(val_data_dir is not None and val_alignment_dir is None):
            raise ValueError(
                'If val_data_dir is specified, val_alignment_dir must '
                'be specified as well'
            )

        # An ad-hoc measure for our particular filesystem restrictions
        self._alignment_index = None
        if(_alignment_index_path is not None):
            with open(_alignment_index_path, "r") as fp:
                self._alignment_index = json.load(fp)

    def setup(self, stage: Optional[str] = None):
        """
            Instantiates the datasets. The stage argument (added for
            compatibility with the Lightning setup(stage) signature) is
            unused: all datasets for the active mode are built at once.
        """
        # Most of the arguments are the same for the three datasets 
        dataset_gen = partial(OpenFoldSingleDataset,
            template_mmcif_dir=self.template_mmcif_dir,
            max_template_date=self.max_template_date,
            config=self.config,
            kalign_binary_path=self.kalign_binary_path,
            template_release_dates_cache_path=
                self.template_release_dates_cache_path,
            obsolete_pdbs_file_path=
                self.obsolete_pdbs_file_path,
        )

        if(self.training_mode):
            train_dataset = dataset_gen(
                data_dir=self.train_data_dir,
                alignment_dir=self.train_alignment_dir,
                mapping_path=self.train_mapping_path,
                max_template_hits=self.config.train.max_template_hits,
                shuffle_top_k_prefiltered=
                    self.config.train.shuffle_top_k_prefiltered,
                treat_pdb_as_distillation=False,
                mode="train",
                _output_raw=True,
                _alignment_index=self._alignment_index,
            )

            distillation_dataset = None
            if(self.distillation_data_dir is not None):
                distillation_dataset = dataset_gen(
                    data_dir=self.distillation_data_dir,
                    alignment_dir=self.distillation_alignment_dir,
                    mapping_path=self.distillation_mapping_path,
                    # BUG FIX: was self.train.max_template_hits, which
                    # raised AttributeError (no such attribute on the module)
                    max_template_hits=self.config.train.max_template_hits,
                    treat_pdb_as_distillation=True,
                    mode="train",
                    _output_raw=True,
                )

            if(distillation_dataset is not None):
                datasets = [train_dataset, distillation_dataset]
                d_prob = self.config.train.distillation_prob
                probabilities = [1 - d_prob, d_prob]
                prot_data_cache_paths = [
                    self.train_prot_data_cache_path,
                    self.distillation_prot_data_cache_path,
                ]
            else:
                datasets = [train_dataset]
                probabilities = [1.]   
                prot_data_cache_paths = [
                    self.train_prot_data_cache_path,
                ]

            # Rolling is deferred to _gen_dataloader so each epoch can
            # re-draw its samples
            self.train_dataset = OpenFoldDataset(
                datasets=datasets,
                probabilities=probabilities,
                epoch_len=self.train_epoch_len,
                prot_data_cache_paths=prot_data_cache_paths,
                _roll_at_init=False,
            )
    
            if(self.val_data_dir is not None):
                self.eval_dataset = dataset_gen(
                    data_dir=self.val_data_dir,
                    alignment_dir=self.val_alignment_dir,
                    mapping_path=None,
                    max_template_hits=self.config.eval.max_template_hits,
                    mode="eval",
                    _output_raw=True,
                )
            else:
                self.eval_dataset = None
        else:           
            self.predict_dataset = dataset_gen(
                data_dir=self.predict_data_dir,
                alignment_dir=self.predict_alignment_dir,
                mapping_path=None,
                max_template_hits=self.config.predict.max_template_hits,
                mode="predict",
            )

    def _gen_dataloader(self, stage):
        """Builds an OpenFoldDataLoader for the given stage."""
        generator = torch.Generator()
        if(self.batch_seed is not None):
            generator = generator.manual_seed(self.batch_seed)

        if(stage == "train"):
            dataset = self.train_dataset
            # Filter the dataset, if necessary
            dataset.reroll()
        elif(stage == "eval"):
            dataset = self.eval_dataset
        elif(stage == "predict"):
            dataset = self.predict_dataset
        else:
            raise ValueError("Invalid stage")

        batch_collator = OpenFoldBatchCollator(self.config, stage)

        dl = OpenFoldDataLoader(
            dataset,
            config=self.config,
            stage=stage,
            generator=generator,
            batch_size=self.config.data_module.data_loaders.batch_size,
            num_workers=self.config.data_module.data_loaders.num_workers,
            collate_fn=batch_collator,
        )

        return dl

    def train_dataloader(self):
        return self._gen_dataloader("train") 

    def val_dataloader(self):
        if(self.eval_dataset is not None):
            return self._gen_dataloader("eval")
        return None

    def predict_dataloader(self):
        return self._gen_dataloader("predict") 
673
674
675
676
677


class DummyDataset(torch.utils.data.Dataset):
    """Serves deep copies of a single pickled batch, e.g. for debugging."""
    def __init__(self, batch_path):
        # NOTE(review): pickle.load is unsafe on untrusted files; batch_path
        # is assumed to be a locally generated artifact
        with open(batch_path, "rb") as fp:
            self.batch = pickle.load(fp)

    def __getitem__(self, idx):
        # Deep copies protect the cached batch from in-place mutation
        return copy.deepcopy(self.batch)

    def __len__(self):
        # Arbitrary fixed epoch length
        return 1000


class DummyDataLoader(pl.LightningDataModule):
    """A LightningDataModule that endlessly serves one pickled batch."""
    def __init__(self, batch_path):
        super().__init__()
        # Every "sample" is a copy of the same pre-pickled batch
        self.dataset = DummyDataset(batch_path)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.dataset)