[Distributed][Feature] New distributed partitioning pipeline (#4439)

ad7be8be · Minjie Wang · GitHub · ee672c0b · 7e2ed9f8 · ad7be8be
Unverified Commit ad7be8be authored Aug 22, 2022 by Minjie Wang Committed by GitHub Aug 22, 2022
4 changed files
--- a/tools/utils/array_readwriter/csv.py
+++ b/tools/utils/array_readwriter/csv.py
+import logging
+import pandas as pd
+import pyarrow
+import pyarrow.csv
+from .registry import register_array_parser
+@register_array_parser("csv")
+class CSVArrayParser(object):
+    """Array parser for header-less delimited text files, backed by pyarrow.
+
+    Parameters
+    ----------
+    delimiter : str, optional
+        Field separator handed to pyarrow for both parsing and writing.
+        Defaults to ``','``.
+    """
+    def __init__(self, delimiter=','):
+        self.delimiter = delimiter
+    def read(self, path):
+        """Read the delimited file at ``path`` and return it as a NumPy array.
+
+        The table parsed by pyarrow is converted through pandas, so the result
+        is whatever ``DataFrame.to_numpy()`` produces for the parsed columns.
+        """
+        # Pass the values as logging arguments so formatting is deferred to the
+        # logging machinery and cannot fail on unusual ``path`` objects.
+        logging.info('Reading from %s using CSV format with configuration %s',
+                     path, self.__dict__)
+        # Auto-generating column names keeps pyarrow from treating the first
+        # data row as a header.
+        read_options = pyarrow.csv.ReadOptions(autogenerate_column_names=True)
+        parse_options = pyarrow.csv.ParseOptions(delimiter=self.delimiter)
+        table = pyarrow.csv.read_csv(
+            path, read_options=read_options, parse_options=parse_options)
+        logging.info('Done reading from %s', path)
+        return table.to_pandas().to_numpy()
+    def write(self, path, arr):
+        """Write ``arr`` to ``path`` as delimited text without a header row.
+
+        ``arr`` is wrapped in a ``pandas.DataFrame`` before being handed to
+        pyarrow, so it must be something ``pd.DataFrame`` accepts.
+        """
+        logging.info('Writing to %s using CSV format with configuration %s',
+                     path, self.__dict__)
+        write_options = pyarrow.csv.WriteOptions(include_header=False, delimiter=self.delimiter)
+        # Use a separate name for the table instead of rebinding the ``arr``
+        # parameter.
+        table = pyarrow.Table.from_pandas(pd.DataFrame(arr))
+        pyarrow.csv.write_csv(table, path, write_options=write_options)
+        logging.info('Done writing to %s', path)
--- a/tools/utils/array_readwriter/numpy_array.py
+++ b/tools/utils/array_readwriter/numpy_array.py
+import logging
+import numpy as np
+from numpy.lib.format import open_memmap
+from .registry import register_array_parser
+@register_array_parser("numpy")
+class NumpyArrayParser(object):
+    """Array parser for ``.npy`` files that works through memory maps."""
+    def __init__(self):
+        pass
+    def read(self, path):
+        """Open the ``.npy`` file at ``path`` as a read-only memory map.
+
+        The returned array is backed by the file (``mmap_mode='r'``) rather
+        than being loaded eagerly.
+        """
+        logging.info('Reading from %s using numpy format', path)
+        arr = np.load(path, mmap_mode='r')
+        logging.info('Done reading from %s', path)
+        return arr
+    def write(self, path, arr):
+        """Write ``arr`` to ``path`` in ``.npy`` format through a memory map.
+
+        ``arr`` must expose ``dtype`` and ``shape`` and support ``arr[:]``.
+        """
+        logging.info('Writing to %s using numpy format', path)
+        # np.save would load the entire memmap array up into CPU.  So we manually open
+        # an empty npy file with memmap mode and manually flush it instead.
+        new_arr = open_memmap(path, mode='w+', dtype=arr.dtype, shape=arr.shape)
+        new_arr[:] = arr[:]
+        # Push the data to disk explicitly and drop the mapping right away
+        # instead of relying on garbage collection to do it at some later point.
+        new_arr.flush()
+        del new_arr
+        logging.info('Done writing to %s', path)
--- a/tools/utils/array_readwriter/registry.py
+++ b/tools/utils/array_readwriter/registry.py
+# Maps a format name to the parser class registered under that name.
+REGISTRY = {}
+def register_array_parser(name):
+    """Return a class decorator that records the class in ``REGISTRY`` under ``name``.
+
+    The decorated class is returned unchanged. Registering a second class
+    under the same name replaces the earlier entry.
+    """
+    def _register(parser_cls):
+        REGISTRY.update({name: parser_cls})
+        return parser_cls
+    return _register
+def get_array_parser(**fmt_meta):
+    """Instantiate the parser registered under ``fmt_meta['name']``.
+
+    Parameters
+    ----------
+    **fmt_meta
+        Must contain ``name``, the key of a registered parser class. Every
+        other entry is forwarded to that class's constructor as a keyword
+        argument.
+
+    Returns
+    -------
+    An instance of the registered parser class.
+
+    Raises
+    ------
+    KeyError
+        If ``name`` is missing or no parser is registered under it.
+    """
+    try:
+        name = fmt_meta.pop('name')
+    except KeyError:
+        raise KeyError("array format metadata must contain a 'name' entry") from None
+    try:
+        cls = REGISTRY[name]
+    except KeyError:
+        # Still a KeyError, but one that tells the caller what is available.
+        raise KeyError('unknown array format %r; registered formats: %r'
+                       % (name, list(REGISTRY))) from None
+    return cls(**fmt_meta)
--- a/tools/utils/files.py
+++ b/tools/utils/files.py
+import os
+from contextlib import contextmanager
+import logging
+from numpy.lib.format import open_memmap
+@contextmanager
+def setdir(path):
+    """Context manager that runs its body with ``path`` as the working directory.
+
+    ``path`` is created if it does not exist. The previous working directory
+    is restored when the body exits, whether normally or with an exception.
+
+    Raises
+    ------
+    OSError
+        If the directory cannot be created or entered. The working directory
+        is left unchanged in that case.
+    """
+    # Do the setup outside the try block: if it fails there is nothing to
+    # restore, and the cleanup must not touch an unassigned ``cwd`` and mask
+    # the real error.
+    os.makedirs(path, exist_ok=True)
+    cwd = os.getcwd()
+    logging.info('Changing directory to %s', path)
+    logging.info('Previously: %s', cwd)
+    os.chdir(path)
+    try:
+        yield
+    finally:
+        logging.info('Restoring directory to %s', cwd)
+        os.chdir(cwd)