OpenDAS / dgl / Commits / 08fd6cf8

Commit 08fd6cf8 (unverified)
[Feature] Add parquet support for node/edge features in chunked data (#4933)

Authored Nov 28, 2022 by peizhou001; committed by GitHub on Nov 28, 2022.
Parent: e0fc038d

Changes: 15 changed files, with 130 additions and 46 deletions (+130 / -46).
Files changed:

  tests/tools/create_chunked_dataset.py                     +2   -1
  tests/tools/test_array_readwriter.py                      +25  -0
  tests/tools/test_dist_part.py                             +22  -13
  tools/chunk_graph.py                                      +9   -8
  tools/distpartitioning/array_readwriter/__init__.py       +1   -1
  tools/distpartitioning/array_readwriter/csv.py            +0   -0
  tools/distpartitioning/array_readwriter/numpy_array.py    +0   -0
  tools/distpartitioning/array_readwriter/parquet.py        +41  -0
  tools/distpartitioning/array_readwriter/registry.py       +0   -0
  tools/distpartitioning/constants.py                       +1   -0
  tools/distpartitioning/dataset_utils.py                   +23  -13
  tools/distpartitioning/utils.py                           +4   -6
  tools/files.py                                            +0   -0
  tools/partition_algo/random_partition.py                  +2   -2
  tools/utils/__init__.py                                   +0   -2
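The feature is driven by a single new keyword argument, data_fmt, which is threaded from chunk_graph down to the array writers. A minimal sketch of how a caller might request parquet feature chunks, based only on the chunk_graph signature change in tools/chunk_graph.py below; the dataset name and feature paths are hypothetical placeholders:

from chunk_graph import chunk_graph

def chunk_to_parquet(g, num_chunks, output_path):
    # Hypothetical homogeneous-graph example; data_fmt='parquet' is the part
    # introduced by this commit (the previous behaviour is data_fmt='numpy').
    chunk_graph(
        g,
        'sample-dataset',                              # placeholder dataset name
        ndata_paths={'feat': '/path/to/node_feat.npy'},
        edata_paths={'count': '/path/to/edge_count.npy'},
        num_chunks=num_chunks,
        output_path=output_path,
        data_fmt='parquet',                            # write feature chunks as .parquet instead of .npy
    )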
tests/tools/create_chunked_dataset.py

@@ -15,7 +15,7 @@ from dgl.data.utils import load_graphs, load_tensors
 def create_chunked_dataset(
-    root_dir, num_chunks, include_masks=False
+    root_dir, num_chunks, include_masks=False, data_fmt='numpy'
 ):
     """
     This function creates a sample dataset, based on MAG240 dataset.
@@ -233,6 +233,7 @@ def create_chunked_dataset(
         edge_data,
         num_chunks=num_chunks,
         output_path=output_dir,
+        data_fmt=data_fmt,
     )
     print('Done with creating chunked graph')
tests/tools/test_array_readwriter.py (new file, mode 100644)

import os
import tempfile

import numpy as np
import pytest

from distpartitioning import array_readwriter


@pytest.mark.parametrize(
    "shape", [[500], [300, 10], [200, 5, 5], [100, 5, 5, 5]]
)
@pytest.mark.parametrize("format", ["numpy", "parquet"])
def test_array_readwriter(format, shape):
    original_array = np.random.rand(*shape)
    fmt_meta = {"name": format}

    with tempfile.TemporaryDirectory() as test_dir:
        path = os.path.join(test_dir, f"nodes.{format}")
        array_readwriter.get_array_parser(**fmt_meta).write(
            path, original_array
        )
        array = array_readwriter.get_array_parser(**fmt_meta).read(path)

    assert original_array.shape == array.shape
    assert np.array_equal(original_array, array)
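The registry exercised by this test can also be used directly, for example to convert an existing .npy feature chunk to parquet. A minimal sketch, assuming the tools directory (which contains the distpartitioning package) is importable, as the test itself requires; npy_to_parquet is a hypothetical helper, not part of this commit:

from distpartitioning import array_readwriter

def npy_to_parquet(npy_path, parquet_path):
    # Read with the registered "numpy" parser, rewrite with the "parquet" one.
    arr = array_readwriter.get_array_parser(name="numpy").read(npy_path)
    array_readwriter.get_array_parser(name="parquet").write(parquet_path, arr)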
tests/tools/test_dist_part.py

@@ -8,16 +8,14 @@ import pytest
 import torch
 from chunk_graph import chunk_graph
 from create_chunked_dataset import create_chunked_dataset
+from distpartitioning import array_readwriter

 import dgl
 from dgl.data.utils import load_graphs, load_tensors
 from dgl.distributed.partition import (
     RESERVED_FIELD_DTYPE,
     _etype_tuple_to_str,
     _get_inner_edge_mask,
     _get_inner_node_mask,
     load_partition,
 )


 def _verify_partition_data_types(part_g):
@@ -80,11 +78,12 @@ def _verify_graph_feats(
 @pytest.mark.parametrize("num_chunks", [1, 8])
-def test_chunk_graph(num_chunks):
+@pytest.mark.parametrize("data_fmt", ['numpy', 'parquet'])
+def test_chunk_graph(num_chunks, data_fmt):
     with tempfile.TemporaryDirectory() as root_dir:
-        g = create_chunked_dataset(root_dir, num_chunks)
+        g = create_chunked_dataset(root_dir, num_chunks, data_fmt=data_fmt)

         # check metadata.json
         output_dir = os.path.join(root_dir, "chunked-data")
@@ -111,12 +110,16 @@ def test_chunk_graph(num_chunks):
         assert isinstance(int(num2), int)

         # check node/edge_data
+        suffix = 'npy' if data_fmt == 'numpy' else 'parquet'
+        reader_fmt_meta = {"name": data_fmt}
         def test_data(sub_dir, feat, expected_data, expected_shape):
             data = []
             for i in range(num_chunks):
-                fname = os.path.join(sub_dir, f'{feat}-{i}.npy')
+                fname = os.path.join(sub_dir, f'{feat}-{i}.{suffix}')
                 assert os.path.isfile(fname)
-                feat_array = np.load(fname)
+                feat_array = array_readwriter.get_array_parser(
+                    **reader_fmt_meta
+                ).read(fname)
                 assert feat_array.shape[0] == expected_shape
                 data.append(feat_array)
             data = np.concatenate(data, 0)
@@ -136,7 +139,7 @@ def test_chunk_graph(num_chunks):
             test_data(sub_dir, feat, data, g.num_edges(c_etype) // num_chunks)


-def _test_pipeline(num_chunks, num_parts, world_size, graph_formats=None):
+def _test_pipeline(num_chunks, num_parts, world_size, graph_formats=None, data_fmt='numpy'):
     if num_chunks < num_parts:
         # num_parts should less/equal than num_chunks
         return
@@ -147,7 +150,7 @@ def _test_pipeline(num_chunks, num_parts, world_size, graph_formats=None):
     with tempfile.TemporaryDirectory() as root_dir:
-        g = create_chunked_dataset(root_dir, num_chunks)
+        g = create_chunked_dataset(root_dir, num_chunks, data_fmt=data_fmt)

         # Step1: graph partition
         in_dir = os.path.join(root_dir, "chunked-data")
@@ -224,3 +227,9 @@ def test_pipeline_basics(num_chunks, num_parts, world_size):
 def test_pipeline_formats(graph_formats):
     _test_pipeline(4, 4, 4, graph_formats)
+
+
+@pytest.mark.parametrize("data_fmt", ['numpy', "parquet"])
+def test_pipeline_feature_format(data_fmt):
+    _test_pipeline(4, 4, 4, data_fmt=data_fmt)
tools/chunk_graph.py

@@ -6,7 +6,8 @@ import pathlib
 from contextlib import contextmanager

 import torch
-from utils import array_readwriter, setdir
+from distpartitioning import array_readwriter
+from files import setdir

 import dgl
@@ -25,7 +26,7 @@ def chunk_numpy_array(arr, fmt_meta, chunk_sizes, path_fmt):
     return paths


-def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
+def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path, data_fmt):
     # First deal with ndata and edata that are homogeneous (i.e. not a dict-of-dict)
     if len(g.ntypes) == 1 and not isinstance(
         next(iter(ndata_paths.values())), dict
@@ -94,6 +95,8 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
         metadata["edges"][etypestr] = edges_meta

     # Chunk node data
+    reader_fmt_meta, writer_fmt_meta = {"name": "numpy"}, {"name": data_fmt}
+    file_suffix = 'npy' if data_fmt == 'numpy' else 'parquet'
     metadata["node_data"] = {}
     with setdir("node_data"):
         for ntype, ndata_per_type in ndata_paths.items():
@@ -104,7 +107,6 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
                     "Chunking node data for type %s key %s" % (ntype, key)
                 )
                 ndata_key_meta = {}
-                reader_fmt_meta = writer_fmt_meta = {"name": "numpy"}
                 arr = array_readwriter.get_array_parser(
                     **reader_fmt_meta
                 ).read(path)
@@ -113,7 +115,7 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
                     arr,
                     writer_fmt_meta,
                     num_nodes_per_chunk_dict[ntype],
-                    key + "-%d.npy",
+                    key + "-%d." + file_suffix,
                 )
                 ndata_meta[key] = ndata_key_meta
@@ -131,7 +133,6 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
                     % (etypestr, key)
                 )
                 edata_key_meta = {}
-                reader_fmt_meta = writer_fmt_meta = {"name": "numpy"}
                 arr = array_readwriter.get_array_parser(
                     **reader_fmt_meta
                 ).read(path)
@@ -141,7 +142,7 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
                     arr,
                     writer_fmt_meta,
                     num_edges_per_chunk_dict[etype],
-                    key + "-%d.npy",
+                    key + "-%d." + file_suffix,
                 )
                 edata_meta[key] = edata_key_meta
@@ -153,7 +154,7 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
     logging.info("Saved metadata in %s" % os.path.abspath(metadata_path))


-def chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
+def chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path, data_fmt='numpy'):
     """
     Split the graph into multiple chunks.
@@ -184,7 +185,7 @@ def chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
     for key in edata.keys():
         edata[key] = os.path.abspath(edata[key])
     with setdir(output_path):
-        _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path)
+        _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path, data_fmt)


 if __name__ == "__main__":
tools/utils/array_readwriter/__init__.py → tools/distpartitioning/array_readwriter/__init__.py

-from . import csv, numpy_array
+from . import csv, numpy_array, parquet
 from .registry import get_array_parser, register_array_parser

tools/utils/array_readwriter/csv.py → tools/distpartitioning/array_readwriter/csv.py
File moved.

tools/utils/array_readwriter/numpy_array.py → tools/distpartitioning/array_readwriter/numpy_array.py
File moved.
tools/distpartitioning/array_readwriter/parquet.py (new file, mode 100644)

import logging

import pandas as pd
import pyarrow
import pyarrow.parquet

from .registry import register_array_parser


@register_array_parser("parquet")
class ParquetArrayParser(object):
    def __init__(self):
        pass

    def read(self, path):
        logging.info("Reading from %s using parquet format" % path)
        metadata = pyarrow.parquet.read_metadata(path)
        metadata = metadata.schema.to_arrow_schema().metadata
        # As parquet data are tabularized, we assume the dim of ndarray is 2.
        # If not, it should be explictly specified in the file as metadata.
        shape = metadata.get(b"shape", None)
        table = pyarrow.parquet.read_table(path, memory_map=True)
        logging.info("Done reading from %s" % path)
        arr = table.to_pandas().to_numpy()
        if not shape:
            logging.warning(
                "Shape information not found in the metadata, read the data as "
                "a 2 dim array."
            )
        shape = tuple(eval(shape.decode())) if shape else arr.shape
        return arr.reshape(shape)

    def write(self, path, array):
        logging.info("Writing to %s using parquet format" % path)
        shape = array.shape
        if len(shape) > 2:
            array = array.reshape(shape[0], -1)
        table = pyarrow.Table.from_pandas(pd.DataFrame(array))
        table = table.replace_schema_metadata({"shape": str(shape)})
        pyarrow.parquet.write_table(table, path)
        logging.info("Done writing to %s" % path)
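Because parquet files are tabular, the parser flattens anything beyond two dimensions on write and records the original shape as Arrow schema metadata, which read() then uses to restore the array. A small illustration of that round trip; the file name is arbitrary and the snippet assumes the tools directory is importable:

import numpy as np

from distpartitioning import array_readwriter

arr = np.random.rand(100, 5, 5)                      # 3-d feature array
parser = array_readwriter.get_array_parser(name="parquet")
parser.write("feat-0.parquet", arr)                  # stored as (100, 25) columns plus shape metadata
restored = parser.read("feat-0.parquet")
assert restored.shape == (100, 5, 5)                 # shape recovered from the metadata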
tools/utils/array_readwriter/registry.py → tools/distpartitioning/array_readwriter/registry.py
File moved.
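registry.py is only moved in this commit, so its body does not appear in the diff. For orientation, a rough sketch of what a name-keyed parser registry with this decorator/factory interface typically looks like; this is an assumption for readability, not the file's actual contents:

# Assumed sketch only; the real registry.py may differ.
_PARSERS = {}

def register_array_parser(name):
    """Class decorator that registers a parser class under a format name."""
    def decorate(cls):
        _PARSERS[name] = cls
        return cls
    return decorate

def get_array_parser(name, **kwargs):
    """Instantiate the parser registered for `name` (e.g. "numpy", "parquet")."""
    return _PARSERS[name](**kwargs)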
tools/distpartitioning/constants.py

@@ -30,6 +30,7 @@ STR_NODE_DATA = "node_data"
 STR_EDGE_DATA = "edge_data"
 STR_NUMPY = "numpy"
+STR_PARQUET = "parquet"
 STR_CSV = "csv"
 STR_NAME = "name"
tools/distpartitioning/dataset_utils.py

@@ -5,6 +5,7 @@ import numpy as np
 import pyarrow
 import torch
 from pyarrow import csv

+import array_readwriter
 import constants
 from utils import get_idranges, map_partid_rank
@@ -106,7 +107,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
         to mention all the files present for this given node feature.
         Data read from each of the node features file is a multi-dimensional tensor data and is read
-        in numpy format, which is also the storage format of node features on the permanent storage.
+        in numpy or parquet format, which is also the storage format of node features on the permanent storage.
         "node_type" : ["ntype0-name", "ntype1-name", ....], #m node types
         "num_nodes_per_chunk" : [
@@ -143,12 +144,13 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
     if ((dataset_features is not None) and (len(dataset_features) > 0)):
         for ntype_name, ntype_feature_data in dataset_features.items():
             for feat_name, feat_data in ntype_feature_data.items():
-                assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
+                assert (feat_data[constants.STR_FORMAT][constants.STR_NAME] in [constants.STR_NUMPY, constants.STR_PARQUET])
                 # It is guaranteed that num_chunks is always greater
                 # than num_partitions.
                 num_chunks = len(feat_data[constants.STR_DATA])
                 read_list = np.array_split(np.arange(num_chunks), num_parts)
+                reader_fmt_meta = {"name": feat_data[constants.STR_FORMAT][constants.STR_NAME]}
                 for local_part_id in range(num_parts):
                     if map_partid_rank(local_part_id, world_size) == rank:
                         nfeat = []
@@ -158,7 +160,11 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
                            if not os.path.isabs(nfeat_file):
                                nfeat_file = os.path.join(input_dir, nfeat_file)
                            logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
-                           nfeat.append(np.load(nfeat_file))
+                           nfeat.append(
+                               array_readwriter.get_array_parser(
+                                   **reader_fmt_meta
+                               ).read(nfeat_file)
+                           )
                        nfeat = np.concatenate(nfeat) if len(nfeat) != 0 else np.array([])
                        node_features[ntype_name + "/" + feat_name + "/" + str(local_part_id // world_size)] = torch.from_numpy(nfeat)
                        nfeat_tids.append(node_tids[ntype_name][local_part_id])
@@ -241,7 +247,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
     if dataset_features and (len(dataset_features) > 0):
         for etype_name, etype_feature_data in dataset_features.items():
             for feat_name, feat_data in etype_feature_data.items():
-                assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
+                assert feat_data[constants.STR_FORMAT][constants.STR_NAME] in [constants.STR_NUMPY, constants.STR_PARQUET]
+                reader_fmt_meta = {"name": feat_data[constants.STR_FORMAT][constants.STR_NAME]}
                 num_chunks = len(feat_data[constants.STR_DATA])
                 read_list = np.array_split(np.arange(num_chunks), num_parts)
                 for local_part_id in range(num_parts):
@@ -249,14 +256,17 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
                        efeats = []
                        efeat_tids = []
                        for idx in read_list[local_part_id]:
-                           feature_fname = feat_data[constants.STR_DATA][idx]
-                           if (os.path.isabs(feature_fname)):
-                               logging.info(f'Loading numpy from {feature_fname}')
-                               efeats.append(torch.from_numpy(np.load(feature_fname)))
-                           else:
-                               numpy_path = os.path.join(input_dir, feature_fname)
-                               logging.info(f'Loading numpy from {numpy_path}')
-                               efeats.append(torch.from_numpy(np.load(numpy_path)))
+                           efeat_file = feat_data[constants.STR_DATA][idx]
+                           if not os.path.isabs(efeat_file):
+                               efeat_file = os.path.join(input_dir, efeat_file)
+                           logging.info(
+                               f'Loading edge feature[{feat_name}] of etype[{etype_name}] from {efeat_file}'
+                           )
+                           efeats.append(
+                               array_readwriter.get_array_parser(**reader_fmt_meta).read(efeat_file)
+                           )
                            efeat_tids.append(edge_tids[etype_name][local_part_id])
                        edge_features[etype_name + '/' + feat_name + "/" + str(local_part_id // world_size)] = torch.from_numpy(np.concatenate(efeats))
                        edge_feature_tids[etype_name + "/" + feat_name + "/" + str(local_part_id // world_size)] = efeat_tids
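With this change the loader dispatches on the per-feature format recorded in metadata.json instead of assuming numpy. A sketch of the metadata fragment it now accepts; the key names mirror the STR_FORMAT / STR_NAME / STR_DATA constants used above (assumed to resolve to "format", "name" and "data"), and the type, feature and file names are placeholders:

# Hypothetical per-feature entry in metadata.json for a parquet-backed node feature.
node_data_entry = {
    "paper": {                                       # node type (placeholder)
        "feat": {                                    # feature name (placeholder)
            "format": {"name": "parquet"},           # previously required to be "numpy"
            "data": [
                "node_data/paper/feat-0.parquet",
                "node_data/paper/feat-1.parquet",
            ],
        }
    }
}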
tools/distpartitioning/utils.py

@@ -2,17 +2,15 @@ import json
 import logging
 import os

-import dgl
-import constants
 import numpy as np
 import psutil
 import pyarrow
-import torch
 from pyarrow import csv

-from dgl.distributed.partition import (
-    _dump_part_config
-)
+import constants
+import dgl
+from dgl.distributed.partition import _dump_part_config


 def read_ntype_partition_files(schema_map, input_dir):
     """
tools/utils/files.py → tools/files.py
File moved.
tools/partition_algo/random_partition.py

@@ -7,8 +7,8 @@ import sys
 import numpy as np
 from base import PartitionMeta, dump_partition_meta
-from utils import array_readwriter, setdir
+from distpartitioning import array_readwriter
+from files import setdir


 def _random_partition(metadata, num_parts):
     num_nodes_per_type = [sum(_) for _ in metadata["num_nodes_per_chunk"]]
tools/utils/__init__.py (deleted, mode 100644 → 0)

-from . import array_readwriter
-from .files import *