Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
ad7be8be
Unverified
Commit
ad7be8be
authored
Aug 22, 2022
by
Minjie Wang
Committed by
GitHub
Aug 22, 2022
Browse files
[Distributed][Feature] New distributed partitioning pipeline (#4439)
parents
ee672c0b
7e2ed9f8
Changes
24
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
79 additions
and
0 deletions
+79
-0
tools/utils/array_readwriter/csv.py
tools/utils/array_readwriter/csv.py
+28
-0
tools/utils/array_readwriter/numpy_array.py
tools/utils/array_readwriter/numpy_array.py
+23
-0
tools/utils/array_readwriter/registry.py
tools/utils/array_readwriter/registry.py
+11
-0
tools/utils/files.py
tools/utils/files.py
+17
-0
No files found.
tools/utils/array_readwriter/csv.py
0 → 100644
View file @
ad7be8be
import
logging
import
pandas
as
pd
import
pyarrow
import
pyarrow.csv
from
.registry
import
register_array_parser
@register_array_parser("csv")
class CSVArrayParser:
    """Array parser for header-less delimited text files.

    Reading and writing are delegated to ``pyarrow.csv``.

    Parameters
    ----------
    delimiter : str
        Field separator handed to pyarrow for both parsing and writing.
    """

    def __init__(self, delimiter=','):
        self.delimiter = delimiter

    def read(self, path):
        """Parse the CSV file at ``path`` and return its rows as a numpy array."""
        logging.info(
            'Reading from %s using CSV format with configuration %s'
            % (path, self.__dict__)
        )
        # Column names are auto-generated so that the first line of the file
        # is treated as data instead of being consumed as a header.
        table = pyarrow.csv.read_csv(
            path,
            read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
            parse_options=pyarrow.csv.ParseOptions(delimiter=self.delimiter),
        )
        logging.info('Done reading from %s' % path)
        frame = table.to_pandas()
        return frame.to_numpy()

    def write(self, path, arr):
        """Write ``arr`` to ``path`` as CSV without a header line."""
        logging.info(
            'Writing to %s using CSV format with configuration %s'
            % (path, self.__dict__)
        )
        options = pyarrow.csv.WriteOptions(
            include_header=False, delimiter=self.delimiter
        )
        # pyarrow writes tables, so route the array through a DataFrame.
        frame = pd.DataFrame(arr)
        table = pyarrow.Table.from_pandas(frame)
        pyarrow.csv.write_csv(table, path, write_options=options)
        logging.info('Done writing to %s' % path)
tools/utils/array_readwriter/numpy_array.py
0 → 100644
View file @
ad7be8be
import
logging
import
numpy
as
np
from
numpy.lib.format
import
open_memmap
from
.registry
import
register_array_parser
@register_array_parser("numpy")
class NumpyArrayParser(object):
    """Array parser for the ``.npy`` format backed by memory-mapped files."""

    def __init__(self):
        pass

    def read(self, path):
        """Open the ``.npy`` file at ``path`` as a read-only memory map.

        Parameters
        ----------
        path : str
            Location of the ``.npy`` file.

        Returns
        -------
        The array returned by ``np.load(path, mmap_mode='r')``.
        """
        logging.info('Reading from %s using numpy format', path)
        arr = np.load(path, mmap_mode='r')
        logging.info('Done reading from %s', path)
        return arr

    def write(self, path, arr):
        """Write ``arr`` to ``path`` in ``.npy`` format through a memory map.

        Parameters
        ----------
        path : str
            Destination file; it is created or overwritten.
        arr : numpy.ndarray
            Array to store. Only its ``dtype``, ``shape`` and contents are used.
        """
        logging.info('Writing to %s using numpy format', path)
        # np.save would load the entire memmap array up into CPU. So we manually
        # open an empty npy file with memmap mode and manually flush it instead.
        new_arr = open_memmap(path, mode='w+', dtype=arr.dtype, shape=arr.shape)
        # Ellipsis indexing copies every element and, unlike ``[:]``, does not
        # raise IndexError when the array is 0-dimensional.
        new_arr[...] = arr
        # Push the written pages to the file before reporting completion.
        new_arr.flush()
        logging.info('Done writing to %s', path)
tools/utils/array_readwriter/registry.py
0 → 100644
View file @
ad7be8be
# Maps a format name to the parser class registered for it.
REGISTRY = {}


def register_array_parser(name):
    """Return a class decorator that records the class in ``REGISTRY``.

    Parameters
    ----------
    name
        Key under which the decorated class is stored.

    Returns
    -------
    callable
        Decorator that stores the class under ``name`` and returns the class
        unchanged.
    """

    def _register(parser_cls):
        REGISTRY[name] = parser_cls
        return parser_cls

    return _register
def get_array_parser(**fmt_meta):
    """Instantiate the array parser selected by ``fmt_meta['name']``.

    Parameters
    ----------
    **fmt_meta
        Format description. The ``name`` entry selects the parser class in
        ``REGISTRY``; every remaining entry is forwarded to that class's
        constructor as a keyword argument.

    Returns
    -------
    object
        A new instance of the selected parser class.

    Raises
    ------
    KeyError
        If ``name`` is missing or no parser is registered under it.
    """
    try:
        name = fmt_meta.pop('name')
    except KeyError:
        raise KeyError(
            "array format metadata must contain a 'name' entry"
        ) from None
    try:
        parser_cls = REGISTRY[name]
    except KeyError:
        # Keep the KeyError type for callers, but say what was asked for and
        # what is actually available.
        raise KeyError(
            'unknown array format %r; registered formats: %r'
            % (name, list(REGISTRY))
        ) from None
    return parser_cls(**fmt_meta)
tools/utils/files.py
0 → 100644
View file @
ad7be8be
import
os
from
contextlib
import
contextmanager
import
logging
from
numpy.lib.format
import
open_memmap
@contextmanager
def setdir(path):
    """Temporarily switch the current working directory to ``path``.

    The directory (and any missing parents) is created first. When the
    ``with`` body finishes, normally or by raising, the working directory
    that was current on entry is restored.

    Parameters
    ----------
    path : str
        Directory to create if needed and change into.

    Raises
    ------
    OSError
        If the directory cannot be created or entered.
    """
    # Setup happens before the ``try`` so that a failure here propagates as-is
    # instead of being masked by the cleanup code referencing an unset ``cwd``.
    os.makedirs(path, exist_ok=True)
    cwd = os.getcwd()
    logging.info('Changing directory to %s', path)
    logging.info('Previously: %s', cwd)
    os.chdir(path)
    try:
        yield
    finally:
        logging.info('Restoring directory to %s', cwd)
        os.chdir(cwd)
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment