Unverified commit ad7be8be, authored by Minjie Wang and committed by GitHub
Browse files

[Distributed][Feature] New distributed partitioning pipeline (#4439)

parents ee672c0b 7e2ed9f8
import logging
import pandas as pd
import pyarrow
import pyarrow.csv
from .registry import register_array_parser
@register_array_parser("csv")
class CSVArrayParser(object):
    """Array reader/writer for headerless delimited text files via pyarrow.

    Registered with the array-parser registry under the name ``"csv"``.
    """

    def __init__(self, delimiter=','):
        # Field separator applied to both reads and writes.
        self.delimiter = delimiter

    def read(self, path):
        """Parse the delimited file at ``path`` and return it as a NumPy array.

        Every line, including the first, is treated as data.
        """
        start_msg = 'Reading from %s using CSV format with configuration %s' % (
            path, self.__dict__)
        logging.info(start_msg)
        # Auto-generated column names stop pyarrow from consuming the first
        # line as a header row.
        table = pyarrow.csv.read_csv(
            path,
            read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
            parse_options=pyarrow.csv.ParseOptions(delimiter=self.delimiter),
        )
        logging.info('Done reading from %s' % path)
        frame = table.to_pandas()
        return frame.to_numpy()

    def write(self, path, arr):
        """Write ``arr`` to ``path`` as delimited text without a header row."""
        start_msg = 'Writing to %s using CSV format with configuration %s' % (
            path, self.__dict__)
        logging.info(start_msg)
        options = pyarrow.csv.WriteOptions(
            include_header=False, delimiter=self.delimiter)
        # pyarrow writes tables, so route the array through a DataFrame first.
        frame = pd.DataFrame(arr)
        table = pyarrow.Table.from_pandas(frame)
        pyarrow.csv.write_csv(table, path, write_options=options)
        logging.info('Done writing to %s' % path)
import logging
import numpy as np
from numpy.lib.format import open_memmap
from .registry import register_array_parser
@register_array_parser("numpy")
class NumpyArrayParser(object):
    """Array reader/writer for ``.npy`` files that goes through memory maps.

    Registered with the array-parser registry under the name ``"numpy"``.
    """

    def __init__(self):
        pass

    def read(self, path):
        """Open the ``.npy`` file at ``path`` as a read-only memory map.

        Returns the memory-mapped array produced by ``np.load``; the data is
        not copied into memory up front.
        """
        logging.info('Reading from %s using numpy format' % path)
        arr = np.load(path, mmap_mode='r')
        logging.info('Done reading from %s' % path)
        return arr

    def write(self, path, arr):
        """Write ``arr`` to ``path`` in ``.npy`` format through a memory map.

        ``arr`` must expose ``dtype`` and ``shape``; the destination file is
        created (or overwritten) with a matching header and then filled.
        """
        logging.info('Writing to %s using numpy format' % path)
        # np.save would load the entire memmap array into memory.  So we
        # manually open an empty npy file in memmap mode, copy into it and
        # flush it ourselves instead.
        new_arr = open_memmap(path, mode='w+', dtype=arr.dtype, shape=arr.shape)
        new_arr[:] = arr[:]
        # Flush explicitly: without this the data only reaches disk when the
        # memmap object happens to be garbage collected, which may be after
        # the "Done writing" message below.
        new_arr.flush()
        logging.info('Done writing to %s' % path)
# Maps a format name (e.g. "csv") to the parser class registered for it.
REGISTRY = dict()


def register_array_parser(name):
    """Return a class decorator that files the class in ``REGISTRY``.

    The decorated class is stored under ``name`` (replacing any earlier
    entry with the same name) and handed back unchanged.
    """
    def _register(parser_cls):
        REGISTRY.update({name: parser_cls})
        return parser_cls

    return _register
def get_array_parser(**fmt_meta):
    """Build a parser instance from format metadata.

    ``fmt_meta`` must contain a ``name`` key selecting the registered parser
    class; all remaining keys are forwarded to that class's constructor.
    Raises ``KeyError`` if ``name`` is missing or not registered.
    """
    parser_name = fmt_meta.pop('name')
    parser_cls = REGISTRY[parser_name]
    return parser_cls(**fmt_meta)
import os
from contextlib import contextmanager
import logging
from numpy.lib.format import open_memmap
@contextmanager
def setdir(path):
    """Temporarily make ``path`` the current working directory.

    ``path`` is created if it does not exist (including parents).  On exit
    from the ``with`` block, normally or via an exception, the previous
    working directory is restored.  Note that ``os.chdir`` affects the whole
    process.

    Raises:
        OSError: if ``path`` cannot be created or entered.  The error is
            raised before anything has changed, so nothing needs restoring.
    """
    # Do all the setup outside the try block: if any of it fails there is no
    # previous directory recorded yet, and a ``finally`` clause referencing it
    # would replace the real error with an UnboundLocalError.
    os.makedirs(path, exist_ok=True)
    cwd = os.getcwd()
    logging.info('Changing directory to %s' % path)
    logging.info('Previously: %s' % cwd)
    os.chdir(path)
    try:
        yield
    finally:
        logging.info('Restoring directory to %s' % cwd)
        os.chdir(cwd)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment