#!/usr/bin/env python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

""""
This file is the entry point for launching experiments with Implicitron.

Launch Training
---------------
Experiment config .yaml files are located in the
`projects/implicitron_trainer/configs` folder. To launch an experiment,
specify the name of the file. Specific config values can also be overridden
from the command line, for example:

```
./experiment.py --config-name base_config.yaml override.param.one=42 override.param.two=84
```

To run an experiment on a specific GPU, specify the `gpu_idx` key in the
config file / CLI. To run on a different device, change the device selection
in `Experiment.run`.
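For instance, to pin the run to a particular GPU via a command-line override
(a sketch: it assumes `gpu_idx` is exposed at the top level of the config;
if it is nested, use the full dotted path of the component that defines it):

```
./experiment.py --config-name base_config.yaml gpu_idx=0
```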

Main functions
---------------
- The Experiment class defines `run` which creates the model, optimizer, and other
  objects used in training, then starts TrainingLoop's `run` function.
- TrainingLoop takes care of the actual training logic: forward and backward passes,
  evaluation and testing, as well as model checkpointing, visualization, and metric
  printing.

Outputs
--------
The outputs of the experiment are saved and logged in multiple ways:
  - Checkpoints:
        Model, optimizer and stats are stored in the directory
        named by the `exp_dir` key from the config file / CLI parameters.
  - Stats:
        Stats are logged and plotted to the file "train_stats.pdf" in the
        same directory. The stats are also saved as part of the checkpoint file.
  - Visualizations:
        Predictions are plotted to a visdom server running at the address and
        port specified by the `visdom_server` and `visdom_port` keys in the
        config file.
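
For instance, outputs can be redirected to a fresh directory by overriding
`exp_dir` from the command line (an illustrative path):

```
./experiment.py --config-name base_config.yaml exp_dir=./data/my_experiment/
```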

"""
import logging
import os
import warnings

from dataclasses import field

import hydra
import torch
from accelerate import Accelerator
from omegaconf import DictConfig, OmegaConf
from packaging import version

from pytorch3d.implicitron.dataset.data_source import (
    DataSourceBase,
    ImplicitronDataSource,
)
from pytorch3d.implicitron.models.generic_model import ImplicitronModelBase

from pytorch3d.implicitron.models.renderer.multipass_ea import (
    MultiPassEmissionAbsorptionRenderer,
)
from pytorch3d.implicitron.models.renderer.ray_sampler import AdaptiveRaySampler
from pytorch3d.implicitron.tools.config import (
    Configurable,
    expand_args_fields,
    remove_unused_components,
    run_auto_creation,
)

from .impl.model_factory import ModelFactoryBase
from .impl.optimizer_factory import OptimizerFactoryBase
from .impl.training_loop import TrainingLoopBase

logger = logging.getLogger(__name__)

if version.parse(hydra.__version__) < version.Version("1.1"):
    raise ValueError(
        f"Hydra version {hydra.__version__} is too old."
        " (Implicitron requires version 1.1 or later.)"
    )

try:
    # only makes sense in FAIR cluster
    import pytorch3d.implicitron.fair_cluster.slurm  # noqa: F401
except ModuleNotFoundError:
    pass

# If PYTORCH3D_NO_ACCELERATE is set in the environment, the Accelerate wrapper
# is skipped and training runs directly on a single device (see Experiment.run).
no_accelerate = os.environ.get("PYTORCH3D_NO_ACCELERATE") is not None


class Experiment(Configurable):  # pyre-ignore: 13
    """
    This class is at the top level of Implicitron's config hierarchy. Its
    members are high-level components necessary for training an implicit
    rendering network.

    Members:
        data_source: An object that produces datasets and dataloaders.
        model_factory: An object that produces an implicit rendering model as
            well as its corresponding Stats object.
        optimizer_factory: An object that produces the optimizer and lr
            scheduler.
        training_loop: An object that runs training given the outputs produced
            by the data_source, model_factory and optimizer_factory.
        detect_anomaly: Whether torch.autograd should detect anomalies. Useful
            for debugging, but might slow down the training.
        exp_dir: Root experimentation directory. Checkpoints and training stats
            will be saved here.
    """

    data_source: DataSourceBase
    data_source_class_type: str = "ImplicitronDataSource"
    model_factory: ModelFactoryBase
    model_factory_class_type: str = "ImplicitronModelFactory"
    optimizer_factory: OptimizerFactoryBase
    optimizer_factory_class_type: str = "ImplicitronOptimizerFactory"
    training_loop: TrainingLoopBase
    training_loop_class_type: str = "ImplicitronTrainingLoop"

    detect_anomaly: bool = False
    exp_dir: str = "./data/default_experiment/"

    hydra: dict = field(
        default_factory=lambda: {
            "run": {"dir": "."},  # Make hydra not change the working dir.
            "output_subdir": None,  # disable storing the .hydra logs
        }
    )

    def __post_init__(self):
        run_auto_creation(self)

    def run(self) -> None:
        # Make sure the config settings are self-consistent.
        self._check_config_consistent()

        # Initialize the accelerator if desired.
        if no_accelerate:
            accelerator = None
            device = torch.device("cuda:0")
        else:
            accelerator = Accelerator(device_placement=False)
            logger.info(accelerator.state)
            device = accelerator.device

        logger.info(f"Running experiment on device: {device}")
        os.makedirs(self.exp_dir, exist_ok=True)

        # set the debug mode
        if self.detect_anomaly:
            logger.info("Anomaly detection!")
        torch.autograd.set_detect_anomaly(self.detect_anomaly)

        # Initialize the datasets and dataloaders.
        datasets, dataloaders = self.data_source.get_datasets_and_dataloaders()

        # Init the model and the corresponding Stats object.
        model = self.model_factory(
            accelerator=accelerator,
            exp_dir=self.exp_dir,
        )

        stats = self.model_factory.load_stats(
            exp_dir=self.exp_dir,
            log_vars=model.log_vars,
        )
        start_epoch = stats.epoch + 1

        model.to(device)

        # Init the optimizer and LR scheduler.
        optimizer, scheduler = self.optimizer_factory(
            accelerator=accelerator,
            exp_dir=self.exp_dir,
            last_epoch=start_epoch,
            model=model,
        )

        # Wrap all modules in the distributed library
        # Note: we don't pass the scheduler to prepare as it
        # doesn't need to be stepped at each optimizer step
        train_loader = dataloaders.train
        val_loader = dataloaders.val
        test_loader = dataloaders.test
        if accelerator is not None:
            (
                model,
                optimizer,
                train_loader,
                val_loader,
            ) = accelerator.prepare(model, optimizer, train_loader, val_loader)

        task = self.data_source.get_task()
        all_train_cameras = self.data_source.all_train_cameras

        # Enter the main training loop.
        self.training_loop.run(
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            all_train_cameras=all_train_cameras,
            accelerator=accelerator,
            device=device,
            exp_dir=self.exp_dir,
            stats=stats,
            task=task,
        )

    def _check_config_consistent(self) -> None:
        if hasattr(self.optimizer_factory, "resume") and hasattr(
            self.model_factory, "resume"
        ):
            assert (
                # pyre-ignore [16]
                not self.optimizer_factory.resume
                # pyre-ignore [16]
                or self.model_factory.resume
            ), "Cannot resume the optimizer without resuming the model."
        if hasattr(self.optimizer_factory, "resume_epoch") and hasattr(
            self.model_factory, "resume_epoch"
        ):
            assert (
                # pyre-ignore [16]
                self.optimizer_factory.resume_epoch
                # pyre-ignore [16]
                == self.model_factory.resume_epoch
            ), "Optimizer and model must resume from the same epoch."


def _setup_envvars_for_cluster() -> bool:
    """
    Prepares to run on cluster if relevant.
    Returns whether FAIR cluster in use.
    """
    # TODO: How much of this is needed in general?

    try:
        import submitit
    except ImportError:
        return False

    try:
        # Only needed when launching on cluster with slurm and submitit
        job_env = submitit.JobEnvironment()
    except RuntimeError:
        return False

    os.environ["LOCAL_RANK"] = str(job_env.local_rank)
    os.environ["RANK"] = str(job_env.global_rank)
    os.environ["WORLD_SIZE"] = str(job_env.num_tasks)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "42918"
    logger.info(
        "Num tasks %s, global_rank %s"
        % (str(job_env.num_tasks), str(job_env.global_rank))
    )

    return True


def dump_cfg(cfg: DictConfig) -> None:
    remove_unused_components(cfg)
    # dump the exp config to the exp dir
    os.makedirs(cfg.exp_dir, exist_ok=True)
    try:
        cfg_filename = os.path.join(cfg.exp_dir, "expconfig.yaml")
        OmegaConf.save(config=cfg, f=cfg_filename)
    except PermissionError:
        warnings.warn("Can't dump config due to insufficient permissions!")


expand_args_fields(Experiment)
cs = hydra.core.config_store.ConfigStore.instance()
cs.store(name="default_config", node=Experiment)


@hydra.main(config_path="./configs/", config_name="default_config")
def experiment(cfg: DictConfig) -> None:
    # CUDA_VISIBLE_DEVICES must have been set.

    if "CUDA_DEVICE_ORDER" not in os.environ:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

    if not _setup_envvars_for_cluster():
        logger.info("Running locally")

    # TODO: The following may be needed for hydra/submitit to work
    expand_args_fields(ImplicitronModelBase)
    expand_args_fields(AdaptiveRaySampler)
    expand_args_fields(MultiPassEmissionAbsorptionRenderer)
    expand_args_fields(ImplicitronDataSource)

    experiment = Experiment(**cfg)
    dump_cfg(cfg)
    experiment.run()


if __name__ == "__main__":
    experiment()