"tests/vscode:/vscode.git/clone" did not exist on "1ff5f09fe63e7915f4b43b55560440fba1db8e2f"
Unverified Commit dccf1f16 authored by peizhou001's avatar peizhou001 Committed by GitHub
Browse files

[Dist] remove dependency of load_partition_book in change tool (#4802)



* remove dependency of load_partition_book in change tool

* fix issue

* fix issue
Co-authored-by: Ubuntu <ubuntu@ip-172-31-16-19.ap-northeast-1.compute.internal>
parent 8ae50c42
......@@ -9,11 +9,13 @@ import torch
import dgl
from dgl._ffi.base import DGLError
from dgl.data.utils import load_graphs
from dgl.distributed import load_partition_book
from dgl.utils import toindex
# NOTE(review): this span is diff residue from the scraped page — it shows
# both the pre-change lowercase names (removed by the commit) and the
# post-change UPPER_SNAKE_CASE constants (added).  Only the uppercase names
# are referenced by the updated code further down.
etypes_key = "etypes"  # pre-change name, superseded by ETYPES_KEY
edge_map_key = "edge_map"  # pre-change name, superseded by EDGE_MAP_KEY
canonical_etypes_delimiter = ":"  # pre-change name, superseded by CANONICAL_ETYPE_DELIMITER
# JSON keys of the partition configuration file.
ETYPES_KEY = "etypes"
EDGE_MAP_KEY = "edge_map"
NTYPES_KEY = "ntypes"
NUM_PARTS_KEY = "num_parts"
# Separator used to build canonical edge type names ("src:etype:dst").
CANONICAL_ETYPE_DELIMITER = ":"
def convert_conf(part_config):
# NOTE(review): the next line is a diff hunk marker from the scraped page.
# The code that opens the config file — the `f` handle and the `config`
# dict used below — lives in the elided lines and is not visible here.
......@@ -22,95 +24,105 @@ def convert_conf(part_config):
logging.info("Checking if the provided json file need to be changed.")
if is_old_version(config):
logging.info("Changing the partition configuration file.")
# Pre-change line (removed by this commit): the old helper derived the
# canonical etypes from the partition book.
canonical_etypes = etype2canonical_etype(part_config)
# convert edge_map key from etype -> c_etype
# Post-change logic (added): with a single node type every edge is
# trivially (ntype, etype, ntype), so no graph data needs loading.
canonical_etypes = {}
if len(config[NTYPES_KEY]) == 1:
ntype = list(config[NTYPES_KEY].keys())[0]
canonical_etypes = {
CANONICAL_ETYPE_DELIMITER.join((ntype, etype, ntype)): eid
for etype, eid in config[ETYPES_KEY].items()
}
else:
# Multiple node types: endpoints must be read from partition data.
canonical_etypes = etype2canonical_etype(part_config, config)
# eid -> canonical etype, for constant-time reverse lookup below.
reverse_c_etypes = {v: k for k, v in canonical_etypes.items()}
# Convert edge_map keys from etype -> c_etype.
new_edge_map = {}
# Pre-change loop (removed): linear scan over canonical_etypes per etype.
for e_type, range in config[edge_map_key].items():
eid = config[etypes_key][e_type]
c_etype = [
key
for key in canonical_etypes
if canonical_etypes[key] == eid
][0]
# Post-change loop (added): O(1) lookup via reverse_c_etypes.  NOTE(review):
# the loop variable `range` shadows the builtin — harmless here but worth
# renaming upstream.
for e_type, range in config[EDGE_MAP_KEY].items():
eid = config[ETYPES_KEY][e_type]
c_etype = reverse_c_etypes[eid]
new_edge_map[c_etype] = range
# Pre-change assignments (removed) followed by their post-change twins.
config[edge_map_key] = new_edge_map
config[etypes_key] = canonical_etypes
config[EDGE_MAP_KEY] = new_edge_map
config[ETYPES_KEY] = canonical_etypes
logging.info("Dumping the content to disk.")
# Rewrite the (already open) config file in place with the new content.
f.seek(0)
json.dump(config, f, indent=4)
f.truncate()
def etype2canonical_etype(part_config, config):
    """Map plain edge-type names to canonical ``src:etype:dst`` names.

    Post-change implementation reconstructed from the diff: the scraped
    span interleaved the removed (partition-book based) and the added
    (config based) versions; only the added version is kept here.

    Parameters
    ----------
    part_config : str
        Path to the partition configuration JSON file; passed down so the
        per-partition graph files can be located next to it.
    config : dict
        The already-loaded content of that JSON file.

    Returns
    -------
    dict
        Canonical edge type name -> edge type id.
    """
    num_parts = config[NUM_PARTS_KEY]
    edge_map = config[EDGE_MAP_KEY]
    etypes = list(edge_map.keys())
    # For each edge type, record one partition that owns at least one edge
    # of that type (a "seed" edge).  NOTE(review): an edge type with no
    # edges in any partition appends nothing, which would misalign
    # partition_ids with etypes below — presumably every etype has at least
    # one edge; confirm with the partitioning pipeline.
    partition_ids = []
    for _, bound in edge_map.items():
        for i in range(num_parts):
            if bound[i][1] > bound[i][0]:
                partition_ids.append(i)
                break
    partition_ids = torch.tensor(partition_ids)
    # Global edge-id offset at which each partition starts, taken from the
    # first edge type's per-partition ranges.
    shifts = []
    for i in range(num_parts):
        shifts.append(edge_map[etypes[0]][i][0])
    shifts = torch.tensor(shifts)
    canonical_etypes = {}
    # Visit only partitions that actually hold a seed edge.
    part_ids = [
        part_id for part_id in range(num_parts) if part_id in partition_ids
    ]
    for part_id in part_ids:
        # Edge types whose seed edge lives in this partition.
        seed_etypes = [
            etypes[i] for i in range(len(etypes)) if partition_ids[i] == part_id
        ]
        c_etype = _find_c_etypes_in_partition(
            part_id,
            seed_etypes,
            config[ETYPES_KEY],
            config[NTYPES_KEY],
            edge_map,
            shifts,
            part_config,
        )
        canonical_etypes.update(c_etype)
    return canonical_etypes
def _find_c_etypes_in_partition(
    part_id, seed_etypes, etypes, ntypes, edge_map, shifts, config_path
):
    """Resolve canonical edge types for ``seed_etypes`` in one partition.

    Post-change implementation reconstructed from the diff (the removed
    partition-book based lines are dropped).  Loads the partition's local
    graph from disk, looks up the first edge of each seed edge type, and
    names the edge type after its endpoints' node types.

    Parameters
    ----------
    part_id : int
        Partition holding the seed edges.
    seed_etypes : list[str]
        Edge-type names whose seed edge lives in this partition.
    etypes : dict
        Edge-type name -> edge type id (from the config).
    ntypes : dict
        Node-type name -> node type id (from the config).
    edge_map : dict
        Edge-type name -> per-partition global edge-id ranges.
    shifts : torch.Tensor
        Global edge-id offset of each partition.
    config_path : str
        Path to the partition config; partition data sits beside it.

    Returns
    -------
    dict
        Canonical edge type name ("src:etype:dst") -> edge type id.

    Raises
    ------
    DGLError
        If the partition's graph file cannot be loaded.
    """
    try:
        folder = os.path.dirname(os.path.realpath(config_path))
        local_g = load_graphs(f"{folder}/part{part_id}/graph.dgl")[0][0]
        # First global edge id of each seed edge type, shifted into the
        # partition-local id space.
        local_eids = [
            edge_map[etype][part_id][0] - shifts[part_id]
            for etype in seed_etypes
        ]
        local_eids = toindex(torch.tensor(local_eids))
        local_eids = local_eids.tousertensor()
        local_src, local_dst = local_g.find_edges(local_eids)
        src_ntids, dst_ntids = (
            local_g.ndata[dgl.NTYPE][local_src],
            local_g.ndata[dgl.NTYPE][local_dst],
        )
        # Invert ntypes (name -> id) so endpoint type ids map back to names.
        ntypes = {v: k for k, v in ntypes.items()}
        src_ntypes = [ntypes[ntid.item()] for ntid in src_ntids]
        dst_ntypes = [ntypes[ntid.item()] for ntid in dst_ntids]
        c_etypes = list(zip(src_ntypes, seed_etypes, dst_ntypes))
        c_etypes = [
            CANONICAL_ETYPE_DELIMITER.join(c_etype) for c_etype in c_etypes
        ]
        return {k: etypes[v] for (k, v) in zip(c_etypes, seed_etypes)}
    except DGLError as e:
        print(e)
        logging.fatal(
            f"Graph data of partition {part_id} is requested but not found."
        )
        raise e
def _find_edges(local_g, partition_book, seed_edges):
    """Translate global seed edge ids into global endpoint node ids.

    Legacy helper of the partition-book based implementation: looks the
    edges up in the local partition graph and maps the local endpoints
    back to global node ids through the graph's ``dgl.NID`` mapping.
    """
    local_ids = partition_book.eid2localeid(seed_edges, partition_book.partid)
    src, dst = local_g.find_edges(local_ids)
    nid_map = local_g.ndata[dgl.NID]
    return nid_map[src], nid_map[dst]
def is_old_version(config):
    """Return True if the partition config still uses plain edge-type keys.

    The converted format keys ``etypes`` by canonical edge types joined
    with CANONICAL_ETYPE_DELIMITER ("src:etype:dst"); a first key that does
    not split into multiple parts therefore marks an old-format file.
    (The duplicate pre-change lines referencing the removed lowercase
    constants were diff residue and are dropped.)
    """
    first_etype = list(config[ETYPES_KEY].keys())[0]
    etype_tuple = first_etype.split(CANONICAL_ETYPE_DELIMITER)
    return len(etype_tuple) == 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment