Unverified commit 494d4cd7 authored by Rhett Ying, committed by GitHub

[DistGB] enable graphbolt for homograph distDGL (#7182)

parent 09e48552
@@ -138,3 +138,63 @@ python3 ~/workspace/dgl/tools/launch.py \
--ip_config ip_config.txt \
"python3 node_classification.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 30 --batch_size 1000 --num_gpus 4"
```
### Running with GraphBolt
In order to run with `GraphBolt`, we need to partition the graph into the `GraphBolt` data format. Please note that both `DGL` and `GraphBolt` partitions are saved together.
```
python3 partition_graph.py --dataset ogbn-products --num_parts 2 --balance_train --balance_edges --use_graphbolt
```
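Under the hood, `partition_graph.py` simply forwards this flag to `dgl.distributed.partition_graph` (see the diff further down). A minimal sketch of that call, assuming a homogeneous `ogbn-products` graph `g` with a `train_mask` node feature has already been loaded:
```
import dgl

# Sketch only: partition a homogeneous graph into 2 parts, balance train
# nodes and edges, and additionally save GraphBolt partitions
# (fused_csc_sampling_graph.pt) next to the DGL ones.
# `g` is assumed to be loaded and preprocessed as in partition_graph.py.
dgl.distributed.partition_graph(
    g,
    graph_name="ogbn-products",
    num_parts=2,
    out_path="data",
    balance_ntypes=g.ndata["train_mask"],
    balance_edges=True,
    use_graphbolt=True,  # also emit GraphBolt-format partitions
)
```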
#### Partition sizes compared to DGL
Compared to `DGL`, `GraphBolt` partitions are much smaller (reduced to **16%** and **19%** of the original size for `ogbn-products` and `ogbn-papers100M`, respectively).

`ogbn-products`:

| Data Formats | File Name | Part 0 | Part 1 |
| ------------ | ---------------------------- | ------ | ------ |
| DGL | graph.dgl | 1.5GB | 1.6GB |
| GraphBolt | fused_csc_sampling_graph.pt | 255MB | 265MB |
`ogbn-papers100M`:

| Data Formats | File Name | Part 0 | Part 1 |
| ------------ | ---------------------------- | ------ | ------ |
| DGL | graph.dgl | 23GB | 22GB |
| GraphBolt | fused_csc_sampling_graph.pt | 4.4GB | 4.1GB |

Then run the example with `--use_graphbolt`.
```
python3 ~/workspace/dgl/tools/launch.py \
--workspace ~/workspace/dgl/examples/pytorch/graphsage/dist/ \
--num_trainers 4 \
--num_samplers 0 \
--num_servers 2 \
--part_config data/ogbn-products.json \
--ip_config ip_config.txt \
"python3 node_classification.py --graph_name ogbn-products --ip_config ip_config.txt --num_epochs 10 --use_graphbolt"
```
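On the trainer side, the only change is forwarding the flag to `dgl.distributed.initialize` (see the `node_classification.py` diff below). A minimal sketch of the initialization order, where the argument defaults are illustrative assumptions matching the launch command above:
```
import argparse

import dgl
import torch as th

# Sketch only: initialization order used by the distributed example.
parser = argparse.ArgumentParser()
parser.add_argument("--ip_config", type=str, default="ip_config.txt")
parser.add_argument("--graph_name", type=str, default="ogbn-products")
parser.add_argument("--part_config", type=str, default="data/ogbn-products.json")
parser.add_argument("--use_graphbolt", action="store_true")
args = parser.parse_args()

# 1. Initialize DistDGL services; with use_graphbolt=True the servers load
#    the GraphBolt partitions. Must be called before init_process_group().
dgl.distributed.initialize(args.ip_config, use_graphbolt=args.use_graphbolt)
# 2. Initialize the PyTorch process group for trainer communication.
th.distributed.init_process_group(backend="gloo")
# 3. Create the DistGraph handle; sampling then uses the GraphBolt
#    partitions transparently.
g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
```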
#### Performance compared to `DGL`
Compared to `DGL`, `GraphBolt`'s sampler works faster (per-epoch sample time reduced to **80%** and **77%** for `ogbn-products` and `ogbn-papers100M`, respectively). `Min` and `Max` are statistics across all trainers on all nodes (machines).
As for RAM usage, shared memory usage (measured by the **shared** field of the `free` command) decreases thanks to the smaller graph partitions in `GraphBolt`, though the peak memory used by processes (measured by the **used** field of the `free` command) does not decrease.

`ogbn-products`:

| Data Formats | Sample Time Per Epoch (CPU) | Test Accuracy (10 epochs) | shared | used (peak) |
| ------------ | --------------------------- | -------------------------------- | ----- | ---- |
| DGL | Min: 1.2884s, Max: 1.4159s | Min: 64.38%, Max: 70.42% | 2.4GB | 7.8GB|
| GraphBolt | Min: 1.0589s, Max: 1.1400s | Min: 61.68%, Max: 71.23% | 1.1GB | 7.8GB|

`ogbn-papers100M`:

| Data Formats | Sample Time Per Epoch (CPU) | Test Accuracy (10 epochs) | shared | used (peak) |
| ------------ | --------------------------- | -------------------------------- | ----- | ---- |
| DGL | Min: 5.5570s, Max: 6.1900s | Min: 29.12%, Max: 34.33% | 84GB | 43GB |
| GraphBolt | Min: 4.5046s, Max: 4.7718s | Min: 29.11%, Max: 33.49% | 67GB | 43GB |
@@ -340,7 +340,7 @@ def main(args):
""" """
host_name = socket.gethostname() host_name = socket.gethostname()
print(f"{host_name}: Initializing DistDGL.") print(f"{host_name}: Initializing DistDGL.")
dgl.distributed.initialize(args.ip_config) dgl.distributed.initialize(args.ip_config, use_graphbolt=args.use_graphbolt)
print(f"{host_name}: Initializing PyTorch process group.") print(f"{host_name}: Initializing PyTorch process group.")
th.distributed.init_process_group(backend=args.backend) th.distributed.init_process_group(backend=args.backend)
print(f"{host_name}: Initializing DistGraph.") print(f"{host_name}: Initializing DistGraph.")
@@ -457,6 +457,11 @@ if __name__ == "__main__":
        help="Pad train nid to the same length across machine, to ensure num "
        "of batches to be the same.",
    )
    parser.add_argument(
        "--use_graphbolt",
        action="store_true",
        help="Use GraphBolt for distributed train.",
    )
    args = parser.parse_args()
    print(f"Arguments: {args}")
    main(args)
@@ -87,6 +87,11 @@ if __name__ == "__main__":
        default="data",
        help="Output path of partitioned graph.",
    )
    argparser.add_argument(
        "--use_graphbolt",
        action="store_true",
        help="Use GraphBolt for distributed train.",
    )
    args = argparser.parse_args()
    start = time.time()
@@ -127,4 +132,5 @@ if __name__ == "__main__":
        balance_ntypes=balance_ntypes,
        balance_edges=args.balance_edges,
        num_trainers_per_machine=args.num_trainers_per_machine,
        use_graphbolt=args.use_graphbolt,
    )
@@ -241,6 +241,9 @@ def initialize(
    distributed API. For example, when used with Pytorch, users have to invoke this function
    before Pytorch's `pytorch.distributed.init_process_group`.
    """
    print(
        f"Initialize the distributed services with graphbolt: {use_graphbolt}"
    )
    if net_type is not None:
        dgl_warning(
            "net_type is deprecated and will be removed in future release."
...