ModelZoo / ResNet50_tensorflow / Commits / 98dd890b

Commit 98dd890b, authored May 08, 2020 by A. Unique TensorFlower

refactors some preprocessing code.

PiperOrigin-RevId: 310658964
parent 0fc994b6
Showing 4 changed files with 101 additions and 69 deletions:

  official/recommendation/data_preprocessing.py   +82  -55
  official/recommendation/data_test.py             +1   -2
  official/recommendation/movielens.py            +14   -5
  official/recommendation/ncf_common.py            +4   -7
official/recommendation/data_preprocessing.py  (view file @ 98dd890b)
@@ -16,18 +16,21 @@

 from __future__ import absolute_import
 from __future__ import division
+# from __future__ import google_type_annotations
 from __future__ import print_function

 import os
 import pickle
 import time
 import timeit

 # pylint: disable=wrong-import-order
 from absl import logging
 import numpy as np
 import pandas as pd
 import tensorflow as tf
 import typing
+from typing import Dict, Text, Tuple
 # pylint: enable=wrong-import-order

 from official.recommendation import constants as rconst
@@ -35,19 +38,88 @@ from official.recommendation import data_pipeline

 from official.recommendation import movielens

-DATASET_TO_NUM_USERS_AND_ITEMS = {
-    "ml-1m": (6040, 3706),
-    "ml-20m": (138493, 26744)
-}

 _EXPECTED_CACHE_KEYS = (
     rconst.TRAIN_USER_KEY, rconst.TRAIN_ITEM_KEY, rconst.EVAL_USER_KEY,
     rconst.EVAL_ITEM_KEY, rconst.USER_MAP, rconst.ITEM_MAP)


-def _filter_index_sort(raw_rating_path, cache_path):
-  # type: (str, str, bool) -> (dict, bool)
+def read_dataframe(
+    raw_rating_path: Text
+) -> Tuple[Dict[int, int], Dict[int, int], pd.DataFrame]:
+  """Read in data CSV, and output DataFrame for downstream processing.
+
+  This function reads in the raw CSV of positive items, and performs three
+  preprocessing transformations:
+
+  1)  Filter out all users who have not rated at least a certain number
+      of items. (Typically 20 items)
+
+  2)  Zero index the users and items such that the largest user_id is
+      `num_users - 1` and the largest item_id is `num_items - 1`
+
+  3)  Sort the dataframe by user_id, with timestamp as a secondary sort key.
+      This allows the dataframe to be sliced by user in-place, and for the
+      last item to be selected simply by calling the `-1` index of a user's
+      slice.
+
+  Args:
+    raw_rating_path: The path to the CSV which contains the raw dataset.
+
+  Returns:
+    A dict mapping raw user IDs to regularized user IDs, a dict mapping raw
+    item IDs to regularized item IDs, and a filtered, zero-index remapped,
+    sorted dataframe.
+  """
+  with tf.io.gfile.GFile(raw_rating_path) as f:
+    df = pd.read_csv(f)
+
+  # Get the info of users who have more than 20 ratings on items
+  grouped = df.groupby(movielens.USER_COLUMN)
+  df = grouped.filter(
+      lambda x: len(x) >= rconst.MIN_NUM_RATINGS)  # type: pd.DataFrame
+
+  original_users = df[movielens.USER_COLUMN].unique()
+  original_items = df[movielens.ITEM_COLUMN].unique()
+
+  # Map the ids of user and item to 0 based index for following processing
+  logging.info("Generating user_map and item_map...")
+  user_map = {user: index for index, user in enumerate(original_users)}
+  item_map = {item: index for index, item in enumerate(original_items)}
+
+  df[movielens.USER_COLUMN] = df[movielens.USER_COLUMN].apply(
+      lambda user: user_map[user])
+  df[movielens.ITEM_COLUMN] = df[movielens.ITEM_COLUMN].apply(
+      lambda item: item_map[item])
+
+  num_users = len(original_users)
+  num_items = len(original_items)
+
+  assert num_users <= np.iinfo(rconst.USER_DTYPE).max
+  assert num_items <= np.iinfo(rconst.ITEM_DTYPE).max
+  assert df[movielens.USER_COLUMN].max() == num_users - 1
+  assert df[movielens.ITEM_COLUMN].max() == num_items - 1
+
+  # This sort is used to shard the dataframe by user, and later to select
+  # the last item for a user to be used in validation.
+  logging.info("Sorting by user, timestamp...")
+
+  # This sort is equivalent to
+  #   df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
+  #                  inplace=True)
+  # except that the order of items with the same user and timestamp are
+  # sometimes different. For some reason, this sort results in a better
+  # hit-rate during evaluation, matching the performance of the MLPerf
+  # reference implementation.
+  df.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
+  df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
+                 inplace=True, kind="mergesort")
+
+  # The dataframe does not reconstruct indices in the sort or filter steps.
+  return user_map, item_map, df.reset_index()
+
+
+def _filter_index_sort(raw_rating_path: Text,
+                       cache_path: Text) -> Tuple[pd.DataFrame, bool]:
   """Read in data CSV, and output structured data.

   This function reads in the raw CSV of positive items, and performs three
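Note: the comment above the two `sort_values` calls is the subtle part of the new `read_dataframe`. Pandas' default sort is not stable, so a single two-key sort can order rows that tie on user and timestamp differently from the MLPerf reference; the second, stable mergesort pass preserves the order produced by the first. A minimal, self-contained sketch of the filter / zero-index / stable-sort sequence on toy data, where `user_id`, `item_id`, `timestamp`, and the threshold of 2 are stand-ins for `movielens.USER_COLUMN`, `movielens.ITEM_COLUMN`, `movielens.TIMESTAMP_COLUMN`, and `rconst.MIN_NUM_RATINGS`:

import pandas as pd

MIN_NUM_RATINGS = 2  # stand-in; the real pipeline typically requires 20

df = pd.DataFrame({
    "user_id":   [7, 7, 7, 42, 42, 99],
    "item_id":   [10, 30, 20, 30, 10, 10],
    "timestamp": [3, 1, 2, 5, 4, 9],
})

# 1) Filter out users with too few ratings (user 99 has only one).
df = df.groupby("user_id").filter(lambda x: len(x) >= MIN_NUM_RATINGS)

# 2) Zero-index users and items so the largest id is count - 1.
user_map = {u: i for i, u in enumerate(df["user_id"].unique())}
item_map = {m: i for i, m in enumerate(df["item_id"].unique())}
df["user_id"] = df["user_id"].map(user_map)
df["item_id"] = df["item_id"].map(item_map)

# 3) Sort by timestamp, then stably by (user, timestamp). Because
# mergesort is stable, rows that tie on both keys keep the order the
# first sort gave them.
df.sort_values(by="timestamp", inplace=True)
df.sort_values(["user_id", "timestamp"], inplace=True, kind="mergesort")

# Each user's last row is now their most recent rating.
print(df.reset_index(drop=True))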
@@ -100,52 +172,7 @@ def _filter_index_sort(raw_rating_path, cache_path):

   if valid_cache:
     data = cached_data
   else:
-    with tf.io.gfile.GFile(raw_rating_path) as f:
-      df = pd.read_csv(f)
-
-    # Get the info of users who have more than 20 ratings on items
-    grouped = df.groupby(movielens.USER_COLUMN)
-    df = grouped.filter(
-        lambda x: len(x) >= rconst.MIN_NUM_RATINGS)  # type: pd.DataFrame
-
-    original_users = df[movielens.USER_COLUMN].unique()
-    original_items = df[movielens.ITEM_COLUMN].unique()
-
-    # Map the ids of user and item to 0 based index for following processing
-    logging.info("Generating user_map and item_map...")
-    user_map = {user: index for index, user in enumerate(original_users)}
-    item_map = {item: index for index, item in enumerate(original_items)}
-
-    df[movielens.USER_COLUMN] = df[movielens.USER_COLUMN].apply(
-        lambda user: user_map[user])
-    df[movielens.ITEM_COLUMN] = df[movielens.ITEM_COLUMN].apply(
-        lambda item: item_map[item])
-
-    num_users = len(original_users)
-    num_items = len(original_items)
-
-    assert num_users <= np.iinfo(rconst.USER_DTYPE).max
-    assert num_items <= np.iinfo(rconst.ITEM_DTYPE).max
-    assert df[movielens.USER_COLUMN].max() == num_users - 1
-    assert df[movielens.ITEM_COLUMN].max() == num_items - 1
-
-    # This sort is used to shard the dataframe by user, and later to select
-    # the last item for a user to be used in validation.
-    logging.info("Sorting by user, timestamp...")
-
-    # This sort is equivalent to
-    #   df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
-    #                  inplace=True)
-    # except that the order of items with the same user and timestamp are
-    # sometimes different. For some reason, this sort results in a better
-    # hit-rate during evaluation, matching the performance of the MLPerf
-    # reference implementation.
-    df.sort_values(by=movielens.TIMESTAMP_COLUMN, inplace=True)
-    df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
-                   inplace=True, kind="mergesort")
-
-    # The dataframe does not reconstruct indices in the sort or filter steps.
-    df = df.reset_index()
+    user_map, item_map, df = read_dataframe(raw_rating_path)

     grouped = df.groupby(movielens.USER_COLUMN, group_keys=False)
     eval_df, train_df = grouped.tail(1), grouped.apply(lambda x: x.iloc[:-1])
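Note: the `eval_df, train_df` context line above is the leave-one-out split the docstring alludes to. Once rows are sorted per user, `tail(1)` takes each user's most recent rating for evaluation and `iloc[:-1]` keeps the rest for training. A small sketch with toy data, where `user_id` stands in for `movielens.USER_COLUMN`:

import pandas as pd

df = pd.DataFrame({
    "user_id":   [0, 0, 0, 1, 1],
    "item_id":   [5, 9, 2, 4, 8],
    "timestamp": [1, 2, 3, 1, 2],
})  # already sorted by (user, timestamp)

grouped = df.groupby("user_id", group_keys=False)
eval_df = grouped.tail(1)                        # last row per user
train_df = grouped.apply(lambda x: x.iloc[:-1])  # all but the last row

print(eval_df["item_id"].tolist())   # [2, 8]: each user's newest item
print(train_df["item_id"].tolist())  # [5, 9, 4]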
@@ -201,7 +228,7 @@ def instantiate_pipeline(dataset,

   raw_data, _ = _filter_index_sort(raw_rating_path, cache_path)
   user_map, item_map = raw_data["user_map"], raw_data["item_map"]
-  num_users, num_items = DATASET_TO_NUM_USERS_AND_ITEMS[dataset]
+  num_users, num_items = movielens.DATASET_TO_NUM_USERS_AND_ITEMS[dataset]

   if num_users != len(user_map):
     raise ValueError("Expected to find {} users, but found {}".format(
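Note: moving the table into `movielens` gives `data_preprocessing`, `data_test`, and `ncf_common` a single source of truth for the expected dataset sizes, and the guard above then has only one constant to disagree with. A compressed sketch of the lookup-plus-check, with the table inlined here purely for illustration:

# In the real code the table now lives in
# movielens.DATASET_TO_NUM_USERS_AND_ITEMS.
DATASET_TO_NUM_USERS_AND_ITEMS = {"ml-1m": (6040, 3706),
                                  "ml-20m": (138493, 26744)}

# Placeholder mapping standing in for the user_map built by preprocessing.
user_map = {raw_id: idx for idx, raw_id in enumerate(range(1, 6041))}

num_users, num_items = DATASET_TO_NUM_USERS_AND_ITEMS["ml-1m"]
if num_users != len(user_map):
  raise ValueError("Expected to find {} users, but found {}".format(
      num_users, len(user_map)))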
official/recommendation/data_test.py  (view file @ 98dd890b)
@@ -95,8 +95,7 @@ class BaseTest(tf.test.TestCase):

     movielens.download = mock_download
     movielens.NUM_RATINGS[DATASET] = NUM_PTS
-    data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS,
-                                                                  NUM_ITEMS)
+    movielens.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS, NUM_ITEMS)

   def make_params(self, train_epochs=1):
     return {
official/recommendation/movielens.py  (view file @ 98dd890b)
@@ -84,6 +84,8 @@ NUM_RATINGS = {

     ML_20M: 20000263
 }

+DATASET_TO_NUM_USERS_AND_ITEMS = {ML_1M: (6040, 3706),
+                                  ML_20M: (138493, 26744)}

 def _download_and_clean(dataset, data_dir):
   """Download MovieLens dataset in a standard format.

@@ -284,17 +286,24 @@ def integerize_genres(dataframe):

   return dataframe


+def define_flags():
+  """Add flags specifying data usage arguments."""
+  flags.DEFINE_enum(
+      name="dataset", default=None, enum_values=DATASETS,
+      case_sensitive=False,
+      help=flags_core.help_wrap("Dataset to be trained and evaluated."))


 def define_data_download_flags():
-  """Add flags specifying data download arguments."""
+  """Add flags specifying data download and usage arguments."""
   flags.DEFINE_string(
       name="data_dir", default="/tmp/movielens-data/",
       help=flags_core.help_wrap(
           "Directory to download and extract data."))
-  flags.DEFINE_enum(
-      name="dataset", default=None, enum_values=DATASETS,
-      case_sensitive=False,
-      help=flags_core.help_wrap("Dataset to be trained and evaluated."))
+  define_flags()


 def main(_):
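Note: this movielens.py change is the hub of the refactor. The `--dataset` flag and the users/items table now live next to the data they describe, and each entry point calls `movielens.define_flags()` instead of redefining the flag. A hedged sketch of that pattern with absl, with `DATASETS` inlined and plain help text in place of `flags_core.help_wrap`:

from absl import app, flags

FLAGS = flags.FLAGS
DATASETS = ["ml-1m", "ml-20m"]  # stand-in for movielens.DATASETS


def define_flags():
  """Define the shared --dataset flag once, in the owning module."""
  flags.DEFINE_enum(
      name="dataset", default=None, enum_values=DATASETS,
      case_sensitive=False, help="Dataset to be trained and evaluated.")


def main(_):
  print("dataset:", FLAGS.dataset)


if __name__ == "__main__":
  define_flags()
  # A caller may still override the default, as define_ncf_flags does
  # via flags_core.set_defaults(dataset=movielens.ML_1M).
  FLAGS.set_default("dataset", "ml-1m")
  app.run(main)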
official/recommendation/ncf_common.py  (view file @ 98dd890b)
@@ -50,7 +50,7 @@ def get_inputs(params):

   if FLAGS.use_synthetic_data:
     producer = data_pipeline.DummyConstructor()
-    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
+    num_users, num_items = movielens.DATASET_TO_NUM_USERS_AND_ITEMS[
         FLAGS.dataset]
     num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
     num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH

@@ -163,21 +163,18 @@ def define_ncf_flags():

   flags.adopt_module_key_flags(flags_core)
+  movielens.define_flags()

   flags_core.set_defaults(
       model_dir="/tmp/ncf/",
       data_dir="/tmp/movielens-data/",
+      dataset=movielens.ML_1M,
       train_epochs=2,
       batch_size=99000,
       tpu=None
   )

   # Add ncf-specific flags
-  flags.DEFINE_enum(
-      name="dataset", default="ml-1m",
-      enum_values=["ml-1m", "ml-20m"], case_sensitive=False,
-      help=flags_core.help_wrap("Dataset to be trained and evaluated."))
   flags.DEFINE_boolean(
       name="download_if_missing", default=True,
       help=flags_core.help_wrap(
           "Download data to data_dir if it is not already present."))