[DGL-Go] Inference for Node Prediction Pipeline (full & ns) (#4095)

* Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update

[DGL-Go] Inference for Node Prediction Pipeline (full & ns) (#4095)
* Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update
31e4a89b · Mufei Li · GitHub · 69226588 · 31e4a89b · 31e4a89b
Unverified Commit 31e4a89b authored Jun 21, 2022 by Mufei Li Committed by GitHub Jun 21, 2022
20 changed files
--- a/dglgo/dglgo/pipeline/nodepred_sample/nodepred-ns.jinja-py
+++ b/dglgo/dglgo/pipeline/nodepred_sample/nodepred-ns.jinja-py
@@ -3,8 +3,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import dgl
-from dgl.data import AsNodePredDataset
+import os
+from dgl.data import AsNodePredDataset
 {{ data_import_code }}
 {{ model_code }}
@@ -13,7 +14,7 @@ from dgl.data import AsNodePredDataset
 class EarlyStopping:
    def __init__(self,
                 patience: int = -1,
-                 checkpoint_path: str = 'checkpoint.pt'):
+                 checkpoint_path: str = 'checkpoint.pth'):
        self.patience = patience
        self.checkpoint_path = checkpoint_path
        self.counter = 0
@@ -42,6 +43,9 @@ class EarlyStopping:
    def load_checkpoint(self, model):
        model.load_state_dict(torch.load(self.checkpoint_path))
+    def close(self):
+        os.remove(self.checkpoint_path)
 {% endif %}
@@ -134,39 +138,50 @@ def train(cfg, pipeline_cfg, device, data, model, optimizer, loss_fcn):
    {% if user_cfg.early_stop %}
    stopper.load_checkpoint(model)
+    stopper.close()
    {% endif %}
    model.eval()
    with torch.no_grad():
        test_acc = evaluate(model, test_g, test_nfeat, test_labels, test_nid, cfg["eval_device"])
    return test_acc
-def main():
+def main(run, cfg, data):
-    {{ user_cfg_str }}
    device = cfg['device']
    pipeline_cfg = cfg["general_pipeline"]
+    model = {{ model_class_name }}(**cfg["model"])
+    model = model.to(device)
+    loss = torch.nn.{{ user_cfg.general_pipeline.loss }}()
+    optimizer = torch.optim.{{ user_cfg.general_pipeline.optimizer.name }}(model.parameters(), **pipeline_cfg["optimizer"])
+    test_acc = train(cfg, pipeline_cfg, device, data, model, optimizer, loss)
+    cpt_path = os.path.join(pipeline_cfg["save_path"], 'run_{}.pth'.format(run))
+    torch.save({'cfg': cfg, 'model': model.state_dict()}, cpt_path)
+    print('Saved training checkpoint to {}'.format(cpt_path))
+    return test_acc
+if __name__ == '__main__':
+    {{ user_cfg_str }}
+    if not torch.cuda.is_available():
+        cfg['device'] = 'cpu'
    # load data
    data = AsNodePredDataset({{data_initialize_code}})
-    # create model
    model_cfg = cfg["model"]
    cfg["model"]["data_info"] = {
        "in_size": model_cfg['embed_size'] if model_cfg['embed_size'] > 0 else data[0].ndata['feat'].shape[1],
        "out_size": data.num_classes,
        "num_nodes": data[0].num_nodes()
    }
-    model = {{ model_class_name }}(**cfg["model"])
-    model = model.to(device)
-    loss = torch.nn.{{ user_cfg.general_pipeline.loss }}()
-    optimizer = torch.optim.{{ user_cfg.general_pipeline.optimizer.name }}(model.parameters(), **pipeline_cfg["optimizer"])
-    test_acc = train(cfg, pipeline_cfg, device, data, model, optimizer, loss)
-    torch.save(model.state_dict(), pipeline_cfg["save_path"])
-    return test_acc
-if __name__ == '__main__':
+    os.makedirs(cfg['general_pipeline']["save_path"])
    all_acc = []
    num_runs = {{ user_cfg.general_pipeline.num_runs }}
    for run in range(num_runs):
        print(f'Run experiment #{run}')
-        test_acc = main()
+        test_acc = main(run, cfg, data)
        print("Test Accuracy {:.4f}".format(test_acc))
        all_acc.append(test_acc)
    avg_acc = np.round(np.mean(all_acc), 6)

--- a/dglgo/dglgo/utils/early_stop.py
+++ b/dglgo/dglgo/utils/early_stop.py
@@ -3,7 +3,7 @@ import torch
 class EarlyStopping:
    def __init__(self,
                 patience: int = -1,
-                 checkpoint_path: str = 'checkpoint.pt'):
+                 checkpoint_path: str = 'checkpoint.pth'):
        self.patience = patience
        self.checkpoint_path = checkpoint_path
        self.counter = 0

--- a/dglgo/dglgo/utils/enter_config.py
+++ b/dglgo/dglgo/utils/enter_config.py
@@ -14,7 +14,7 @@ from .base_model import DGLBaseModel
-class PipelineConfig(DGLBaseModel):    
+class PipelineConfig(DGLBaseModel):
    node_embed_size: Optional[int] = -1
    early_stop: Optional[dict]
    num_epochs: int = 200
@@ -25,5 +25,5 @@ class PipelineConfig(DGLBaseModel):
 class UserConfig(DGLBaseModel):
    version: Optional[str] = "0.0.1"
    pipeline_name: PipelineFactory.get_pipeline_enum()
+    pipeline_mode: str
    device: str = "cpu"
-    # general_pipeline: PipelineConfig = PipelineConfig()
\ No newline at end of file
--- a/dglgo/dglgo/utils/factory.py
+++ b/dglgo/dglgo/utils/factory.py
@@ -265,6 +265,23 @@ class PipelineFactory:
            "PipelineName", {k: k for k, v in cls.registry.items()})
        return enum_class
+class ApplyPipelineFactory:
+    """The factory class for creating executors for inference"""
+    registry: Dict[str, PipelineBase] = {}
+    """ Internal registry for available executors """
+    @classmethod
+    def register(cls, name: str) -> Callable:
+        def inner_wrapper(wrapped_class) -> Callable:
+            if name in cls.registry:
+                logger.warning(
+                    'Executor %s already exists. Will replace it', name)
+            cls.registry[name] = wrapped_class()
+            return wrapped_class
+        return inner_wrapper
 model_dir = Path(__file__).parent.parent / "model"

--- a/dglgo/recipes/graphpred_hiv_gin.yaml
+++ b/dglgo/recipes/graphpred_hiv_gin.yaml
 version: 0.0.1
 pipeline_name: graphpred
+pipeline_mode: train
 device: cuda:0                # Torch device name, e.g. cpu or cuda or cuda:0
 data:
  name: ogbg-molhiv
@@ -26,4 +27,4 @@ general_pipeline:
  loss: BCEWithLogitsLoss
  metric: roc_auc_score
  num_epochs: 100             # Number of training epochs
-  save_path: model.pth        # Path to save the model
+  save_path: "results"        # Directory to save the experiment results
--- a/dglgo/recipes/graphpred_hiv_pna.yaml
+++ b/dglgo/recipes/graphpred_hiv_pna.yaml
 version: 0.0.1
 pipeline_name: graphpred
+pipeline_mode: train
 device: cuda:0                # Torch device name, e.g. cpu or cuda or cuda:0
 data:
  name: ogbg-molhiv
@@ -33,4 +34,4 @@ general_pipeline:
  loss: BCEWithLogitsLoss
  metric: roc_auc_score
  num_epochs: 200             # Number of training epochs
-  save_path: model.pth        # Path to save the model
+  save_path: "results"        # Directory to save the experiment results
--- a/dglgo/recipes/graphpred_pcba_gin.yaml
+++ b/dglgo/recipes/graphpred_pcba_gin.yaml
 version: 0.0.1
 pipeline_name: graphpred
+pipeline_mode: train
 device: cuda:0                # Torch device name, e.g. cpu or cuda or cuda:0
 data:
  name: ogbg-molpcba
@@ -26,4 +27,4 @@ general_pipeline:
  loss: BCEWithLogitsLoss
  metric: average_precision_score
  num_epochs: 100             # Number of training epochs
-  save_path: model.pth        # Path to save the model
+  save_path: "results"        # Directory to save the experiment results
--- a/dglgo/recipes/linkpred_citation2_sage.yaml
+++ b/dglgo/recipes/linkpred_citation2_sage.yaml
 version: 0.0.1
 pipeline_name: linkpred
+pipeline_mode: train
 device: cpu
 data:
  name: ogbl-citation2
@@ -15,7 +16,7 @@ node_model:
  aggregator_type: gcn        # Aggregator type to use (``mean``, ``gcn``, ``pool``, ``lstm``).
 edge_model:
  name: ele
-  hidden_size: 64             # Hidden size.        
+  hidden_size: 64             # Hidden size.
  num_layers: 2               # Number of hidden layers.
  bias: true                  # Whether to use bias in the linear layer.
 neg_sampler:
@@ -31,5 +32,5 @@ general_pipeline:
    name: Adam
    lr: 0.005
  loss: BCELoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 1                 # Number of experiments to run
--- a/dglgo/recipes/linkpred_collab_sage.yaml
+++ b/dglgo/recipes/linkpred_collab_sage.yaml
 version: 0.0.1
 pipeline_name: linkpred
+pipeline_mode: train
 device: cpu
 data:
  name: ogbl-collab
@@ -15,7 +16,7 @@ node_model:
  aggregator_type: gcn        # Aggregator type to use (``mean``, ``gcn``, ``pool``, ``lstm``).
 edge_model:
  name: ele
-  hidden_size: 64             # Hidden size.        
+  hidden_size: 64             # Hidden size.
  num_layers: 2               # Number of hidden layers.
  bias: true                  # Whether to use bias in the linear layer.
 neg_sampler:
@@ -31,5 +32,5 @@ general_pipeline:
    name: Adam
    lr: 0.005
  loss: BCELoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 1                 # Number of experiments to run
--- a/dglgo/recipes/linkpred_cora_sage.yaml
+++ b/dglgo/recipes/linkpred_cora_sage.yaml
 version: 0.0.1
 pipeline_name: linkpred
+pipeline_mode: train
 device: cuda
 data:
  name: cora
@@ -15,7 +16,7 @@ node_model:
  aggregator_type: gcn        # Aggregator type to use (``mean``, ``gcn``, ``pool``, ``lstm``).
 edge_model:
  name: ele
-  hidden_size: 64             # Hidden size.        
+  hidden_size: 64             # Hidden size.
  num_layers: 2               # Number of hidden layers.
  bias: true                  # Whether to use bias in the linear layer.
 neg_sampler:
@@ -31,5 +32,5 @@ general_pipeline:
    name: Adam
    lr: 0.005
  loss: BCELoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 1                 # Number of experiments to run
--- a/dglgo/recipes/nodepred-ns_arxiv_gcn.yaml
+++ b/dglgo/recipes/nodepred-ns_arxiv_gcn.yaml
 # Accuracy across 5 runs: 0.593288 ± 0.006103
 version: 0.0.1
 pipeline_name: nodepred-ns
+pipeline_mode: train
 device: 'cuda:0'
 eval_device: 'cpu'
 data:
@@ -31,5 +32,5 @@ general_pipeline:
    lr: 0.005
    weight_decay: 0.0
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 5
--- a/dglgo/recipes/nodepred-ns_product_sage.yaml
+++ b/dglgo/recipes/nodepred-ns_product_sage.yaml
 # Accuracy across 1 runs: 0.796911
 version: 0.0.1
 pipeline_name: nodepred-ns
+pipeline_mode: train
 device: cuda
 eval_device: cpu
 data:
@@ -35,5 +36,5 @@ general_pipeline:
    lr: 0.005
    weight_decay: 0.0
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 5                 # Number of experiments to run
--- a/dglgo/recipes/nodepred_citeseer_gat.yaml
+++ b/dglgo/recipes/nodepred_citeseer_gat.yaml
 # Accuracy across 10 runs: 0.7097 ± 0.006914
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: citeseer
@@ -28,5 +29,5 @@ general_pipeline:
    lr: 0.005
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"       # Directory to save the experiment results
  num_runs: 10               # Number of experiments to run
--- a/dglgo/recipes/nodepred_citeseer_gcn.yaml
+++ b/dglgo/recipes/nodepred_citeseer_gcn.yaml
 # Accuracy across 10 runs: 0.6852 ± 0.008875
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: citeseer
@@ -24,5 +25,5 @@ general_pipeline:
    lr: 0.01
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run
--- a/dglgo/recipes/nodepred_citeseer_sage.yaml
+++ b/dglgo/recipes/nodepred_citeseer_sage.yaml
 # Accuracy across 10 runs: 0.6994 ± 0.004005
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: citeseer
@@ -23,5 +24,5 @@ general_pipeline:
    lr: 0.01
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run
--- a/dglgo/recipes/nodepred_cora_gat.yaml
+++ b/dglgo/recipes/nodepred_cora_gat.yaml
 # Accuracy across 10 runs: 0.8208 ± 0.00663
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: cora
@@ -28,5 +29,5 @@ general_pipeline:
    lr: 0.005
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run
--- a/dglgo/recipes/nodepred_cora_gcn.yaml
+++ b/dglgo/recipes/nodepred_cora_gcn.yaml
 # Accuracy across 10 runs: 0.802 ± 0.005329
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: cora
@@ -24,5 +25,5 @@ general_pipeline:
    lr: 0.01
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run
--- a/dglgo/recipes/nodepred_cora_sage.yaml
+++ b/dglgo/recipes/nodepred_cora_sage.yaml
 # Accuracy across 10 runs: 0.8163 ± 0.006856
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: cora
@@ -23,5 +24,5 @@ general_pipeline:
    lr: 0.01
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run
--- a/dglgo/recipes/nodepred_pubmed_gat.yaml
+++ b/dglgo/recipes/nodepred_pubmed_gat.yaml
 # Accuracy across 10 runs: 0.7788 ± 0.002227
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: pubmed
@@ -28,5 +29,5 @@ general_pipeline:
    lr: 0.005
    weight_decay: 0.001
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run
--- a/dglgo/recipes/nodepred_pubmed_gcn.yaml
+++ b/dglgo/recipes/nodepred_pubmed_gcn.yaml
 # Accuracy across 10 runs: 0.7826 ± 0.004317
 version: 0.0.1
 pipeline_name: nodepred
+pipeline_mode: train
 device: cuda:0
 data:
  name: pubmed
@@ -24,5 +25,5 @@ general_pipeline:
    lr: 0.01
    weight_decay: 0.0005
  loss: CrossEntropyLoss
-  save_path: "model.pth"
+  save_path: "results"        # Directory to save the experiment results
  num_runs: 10                # Number of experiments to run