"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "480510ada99a8fd7cae8de47bb202382250d6873"
Unverified commit 9890201d, authored by Theodore Vasiloudis, committed by GitHub

[Dist] Allow reading and writing single-column vector Parquet files. (#5098)

* Allow reading and writing single-column vector Parquet files.

These files are commonly produced by Spark ML's feature processing code.

* [Dist] Only write single-column vector files for Parquet in tests.
parent 7ee550f0
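For context, a minimal sketch (not part of this commit) of the file layout the change targets: Spark ML feature pipelines commonly store each feature vector as one row of a single list-typed column rather than one scalar column per feature. The file name "features.parquet" and column name "features" below are illustrative.

# Sketch only: shows the single-column "vector" Parquet layout that
# Spark ML feature pipelines typically emit. Names are illustrative.
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

features = np.arange(12, dtype=np.float64).reshape(4, 3)  # 4 rows, 3 features

# One list<double> column holding a whole vector per row, instead of one
# scalar column per feature dimension.
table = pa.table({"features": pa.array(features.tolist())})
pq.write_table(table, "features.parquet")

# Reading the table back yields list-valued rows; stacking them into a 2-D
# ndarray is the manual step that the parser change below automates.
rows = pq.read_table("features.parquet").to_pandas()["features"].to_list()
assert np.array(rows).shape == (4, 3)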
@@ -82,6 +82,7 @@ def _test_chunk_graph(
     num_chunks,
     data_fmt = 'numpy',
     edges_fmt = 'csv',
+    vector_rows = False,
     num_chunks_nodes = None,
     num_chunks_edges = None,
     num_chunks_node_data = None,
@@ -91,6 +92,7 @@ def _test_chunk_graph(
     g = create_chunked_dataset(root_dir, num_chunks,
                                data_fmt=data_fmt, edges_fmt=edges_fmt,
                                vector_rows=vector_rows,
                                num_chunks_nodes=num_chunks_nodes,
                                num_chunks_edges=num_chunks_edges,
                                num_chunks_node_data=num_chunks_node_data,
@@ -191,6 +193,11 @@ def _test_chunk_graph(
 def test_chunk_graph_basics(num_chunks, data_fmt, edges_fmt):
     _test_chunk_graph(num_chunks, data_fmt=data_fmt, edges_fmt=edges_fmt)
 
 
+@pytest.mark.parametrize("num_chunks", [1, 8])
+@pytest.mark.parametrize("vector_rows", [True, False])
+def test_chunk_graph_vector_rows(num_chunks, vector_rows):
+    _test_chunk_graph(num_chunks, data_fmt='parquet', edges_fmt='parquet', vector_rows=vector_rows)
+
 @pytest.mark.parametrize(
     "num_chunks, "
@@ -6,18 +6,25 @@ import torch
 import dgl
 
 from distpartitioning import array_readwriter
+from distpartitioning.array_readwriter.parquet import ParquetArrayParser
 from files import setdir
 
 
-def _chunk_numpy_array(arr, fmt_meta, chunk_sizes, path_fmt):
+def _chunk_numpy_array(arr, fmt_meta, chunk_sizes, path_fmt, vector_rows=False):
     paths = []
     offset = 0
 
     for j, n in enumerate(chunk_sizes):
         path = os.path.abspath(path_fmt % j)
         arr_chunk = arr[offset: offset + n]
+        shape = arr_chunk.shape
         logging.info("Chunking %d-%d" % (offset, offset + n))
-        array_readwriter.get_array_parser(**fmt_meta).write(path, arr_chunk)
+        # If requested we write multi-column arrays as single-column vector Parquet files
+        array_parser = array_readwriter.get_array_parser(**fmt_meta)
+        if isinstance(array_parser, ParquetArrayParser) and len(shape) > 1 and shape[1] > 1:
+            array_parser.write(path, arr_chunk, vector_rows=vector_rows)
+        else:
+            array_parser.write(path, arr_chunk)
         offset += n
         paths.append(path)
 
@@ -76,7 +83,8 @@ def _initialize_num_chunks(g, num_chunks, kwargs=None):
 
 
 def _chunk_graph(
-    g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_format, **kwargs
+    g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_format,
+    vector_rows=False, **kwargs
 ):
     # First deal with ndata and edata that are homogeneous
     # (i.e. not a dict-of-dict)
@@ -190,6 +198,7 @@ def _chunk_graph(
                 writer_fmt_meta,
                 chunk_sizes,
                 key + "-%d." + file_suffix,
+                vector_rows=vector_rows,
             )
             ndata_meta[key] = ndata_key_meta
@@ -230,6 +239,7 @@ def _chunk_graph(
                 writer_fmt_meta,
                 chunk_sizes,
                 key + "-%d." + file_suffix,
+                vector_rows=vector_rows,
             )
             edata_meta[key] = edata_key_meta
@@ -250,6 +260,7 @@ def chunk_graph(
     output_path,
     data_fmt="numpy",
     edges_fmt='csv',
+    vector_rows=False,
     **kwargs,
 ):
     """
@@ -276,6 +287,10 @@ def chunk_graph(
         The output directory saving the chunked graph.
     data_fmt : str
         Format of node/edge data: 'numpy' or 'parquet'.
+    edges_fmt : str
+        Format of the edges files: 'csv' or 'parquet'.
+    vector_rows : bool
+        When True, write Parquet node/edge data as single-column vector row files.
     kwargs : dict
         Key word arguments to control chunk details.
     """
@@ -287,12 +302,14 @@ def chunk_graph(
         edata[key] = os.path.abspath(edata[key])
 
     with setdir(output_path):
         _chunk_graph(
-            g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_fmt, **kwargs
+            g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_fmt,
+            vector_rows, **kwargs
         )
 
 
 def create_chunked_dataset(
-    root_dir, num_chunks, data_fmt="numpy", edges_fmt='csv', **kwargs):
+    root_dir, num_chunks, data_fmt="numpy", edges_fmt='csv',
+    vector_rows=False, **kwargs):
     """
     This function creates a sample dataset, based on MAG240 dataset.
@@ -531,6 +548,7 @@ def create_chunked_dataset(
         output_path=output_dir,
         data_fmt=data_fmt,
         edges_fmt=edges_fmt,
+        vector_rows=vector_rows,
         **kwargs,
     )
     print("Done with creating chunked graph")
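A hedged usage sketch of the extended test utility above: create_chunked_dataset now forwards vector_rows down to chunk_graph, so Parquet node/edge features come out as single-column vector files. The module path in the import is an assumption about how the test environment exposes this file.

# Usage sketch; the import path is assumed, and the output directory is temporary.
import tempfile
from utils import create_chunked_dataset  # assumed module name for the file edited above

with tempfile.TemporaryDirectory() as root_dir:
    g = create_chunked_dataset(
        root_dir,
        num_chunks=4,
        data_fmt="parquet",
        edges_fmt="parquet",
        vector_rows=True,  # new flag: write features as single-column vector rows
    )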
 import logging
 
+import numpy as np
 import pandas as pd
 import pyarrow
 import pyarrow.parquet
@@ -16,26 +17,44 @@ class ParquetArrayParser(object):
         logging.info("Reading from %s using parquet format" % path)
         metadata = pyarrow.parquet.read_metadata(path)
         metadata = metadata.schema.to_arrow_schema().metadata
+
         # As parquet data are tabularized, we assume the dim of ndarray is 2.
         # If not, it should be explictly specified in the file as metadata.
-        shape = metadata.get(b"shape", None)
+        if metadata:
+            shape = metadata.get(b"shape", None)
+        else:
+            shape = None
+
         table = pyarrow.parquet.read_table(path, memory_map=True)
-        logging.info("Done reading from %s" % path)
-        arr = table.to_pandas().to_numpy()
+        data_types = table.schema.types
+        # Spark ML feature processing produces single-column parquet files where each row is a vector object
+        if len(data_types) == 1 and isinstance(data_types[0], pyarrow.ListType):
+            arr = np.array(table.to_pandas().iloc[:, 0].to_list())
+            logging.debug(f"Parquet data under {path} converted from single vector per row to ndarray")
+        else:
+            arr = table.to_pandas().to_numpy()
+
         if not shape:
             logging.warning(
                 "Shape information not found in the metadata, read the data as "
                 "a 2 dim array."
             )
+        logging.info("Done reading from %s" % path)
         shape = tuple(eval(shape.decode())) if shape else arr.shape
         return arr.reshape(shape)
 
-    def write(self, path, array):
+    def write(self, path, array, vector_rows=False):
         logging.info("Writing to %s using parquet format" % path)
         shape = array.shape
         if len(shape) > 2:
             array = array.reshape(shape[0], -1)
-        table = pyarrow.Table.from_pandas(pd.DataFrame(array))
-        table = table.replace_schema_metadata({"shape": str(shape)})
+        if vector_rows:
+            table = pyarrow.table(
+                [pyarrow.array(array.tolist())],
+                names=["vector"])
+            logging.info("Writing to %s using single-vector rows..." % path)
+        else:
+            table = pyarrow.Table.from_pandas(pd.DataFrame(array))
+            table = table.replace_schema_metadata({"shape": str(shape)})
         pyarrow.parquet.write_table(table, path)
         logging.info("Done writing to %s" % path)