[DistDGL] remove use of tensorpipe from examples and docs (#5837)

c9778b55 · Rhett Ying · GitHub · 44bcb3f8 · c9778b55 · c9778b55
Unverified Commit c9778b55 authored Jun 09, 2023 by Rhett Ying Committed by GitHub Jun 09, 2023
3 changed files
--- a/docs/source/guide/distributed-tools.rst
+++ b/docs/source/guide/distributed-tools.rst
@@ -49,62 +49,3 @@ files should be specified as relative paths to the workspace.
 The launch script creates a specified number of training jobs
 (``--num_trainers``) on each machine.  In addition, users need to specify the
 number of sampler processes for each trainer (``--num_samplers``).
-
-Launching a Persistent Graph Server
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. warning::
-
-    Persistent graph server is an experimental feature. It is only available
-    when the ``net_type`` argument of :func:`dgl.distributed.initialize`
-    is ``"tensorpipe"``.
-
-Normally, all the server and trainer processes will be killed after the training is done.
-However, sometimes users may wish to try out different models or training configurations
-against the *same* graph data. Repetitively loading the same graph data
-could be costly. To avoid that, DGL
-allows users to launch a persistent graph server to be shared across multiple training
-jobs. A persistent graph server will stay alive even after all training workers have
-finished and exited. Below is an example of launching a persistent graph server:
-
-We first launch the graph server together with the first group of training workers.
-
-.. code:: bash
-
-    python3 tools/launch.py               \
-      --workspace /my/workspace/          \
-      --num_trainers 2                    \
-      --num_samplers 4                    \
-      --num_servers 1                     \
-      --part_config data/mygraph.json     \
-      --ip_config ip_config.txt           \
-      --keep_alive                        \
-      --server_name long_live             \
-      "python3 my_train_script.py"
-
-Pay attention to the ``--keep_alive`` option, which indicates the server should
-stay alive after workers have finished. ``--server_name`` is the name given to
-the server, which will be referred to when launching new training jobs.
-
-Then launch trainers as usual; they will automatically connect to the existing
-persistent server.
-
-.. code:: bash
-
-    python3 tools/launch.py               \
-      --workspace /my/workspace/          \
-      --num_trainers 2                    \
-      --num_samplers 4                    \
-      --num_servers 1                     \
-      --part_config data/mygraph.json     \
-      --ip_config ip_config.txt           \
-      "python3 my_train_script.py"
-
-There are several restrictions when using persistent graph servers:
-
-* All the arguments for ``launch.py`` should be kept the same as in the previous launch. The
-  following arguments of the training script should be kept the same as well: ``--graph-name``,
-  ``--ip_config``.
-* There is no data consistency control on the server side, so data updates must be handled
-  carefully. For example, it is recommended to avoid having multiple groups of trainers
-  update node/edge embeddings at the same time.
--- a/examples/distributed/graphsage/node_classification.py
+++ b/examples/distributed/graphsage/node_classification.py
@@ -340,7 +340,7 @@ def main(args):
    """
    host_name = socket.gethostname()
    print(f"{host_name}: Initializing DistDGL.")
-    dgl.distributed.initialize(args.ip_config, net_type=args.net_type)
+    dgl.distributed.initialize(args.ip_config)
    print(f"{host_name}: Initializing PyTorch process group.")
    th.distributed.init_process_group(backend=args.backend)
    print(f"{host_name}: Initializing DistGraph.")
@@ -457,12 +457,6 @@ if __name__ == "__main__":
        help="Pad train nid to the same length across machine, to ensure num "
        "of batches to be the same.",
    )
-    parser.add_argument(
-        "--net_type",
-        type=str,
-        default="socket",
-        help="backend net type, 'socket' or 'tensorpipe'",
-    )
    args = parser.parse_args()
    print(f"Arguments: {args}")
    main(args)
--- a/examples/pytorch/graphsage/dist/train_dist.py
+++ b/examples/pytorch/graphsage/dist/train_dist.py
@@ -292,7 +292,7 @@ def run(args, device, data):

 def main(args):
    print(socket.gethostname(), "Initializing DGL dist")
-    dgl.distributed.initialize(args.ip_config, net_type=args.net_type)
+    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        print(socket.gethostname(), "Initializing DGL process group")
        th.distributed.init_process_group(backend=args.backend)
@@ -411,12 +411,6 @@ if __name__ == "__main__":
        help="Pad train nid to the same length across machine, to ensure num "
        "of batches to be the same.",
    )
-    parser.add_argument(
-        "--net_type",
-        type=str,
-        default="socket",
-        help="backend net type, 'socket' or 'tensorpipe'",
-    )
    args = parser.parse_args()

    print(args)