sys.exit("ERROR: Criteo Kaggle Display Ad Challenge Dataset path is invalid; please download from https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset")
else:
# WARNING: The raw data consist of day_0.gz,... ,day_23.gz text files
# Each line in the file is a sample, consisting of 13 continuous and
# 26 categorical features (an extra space indicates that feature is
# missing and will be interpreted as 0).
foriinrange(days):
datafile_i=datafile+"_"+str(i)# + ".gz"
ifpath.exists(str(datafile_i)):
print("Reading data from path=%s"%(str(datafile_i)))
# file day_<number>
total_per_file_count=0
withopen(str(datafile_i))asf:
for_inf:
total_per_file_count+=1
total_per_file.append(total_per_file_count)
total_count+=total_per_file_count
else:
sys.exit("ERROR: Criteo Terabyte Dataset path is invalid; please download from https://labs.criteo.com/2013/12/download-terabyte-click-logs")
# process a file worth of data and reinitialize data
# note that a file may contain a single or multiple splits
defprocess_one_file(
datfile,
npzfile,
split,
num_data_in_split,
dataset_multiprocessing,
convertDictsDay=None,
resultDay=None
):
ifdataset_multiprocessing:
convertDicts_day=[{}for_inrange(26)]
withopen(str(datfile))asf:
y=np.zeros(num_data_in_split,dtype="i4")# 4 byte int
X_int=np.zeros((num_data_in_split,13),dtype="i4")# 4 byte int
X_cat=np.zeros((num_data_in_split,26),dtype="i4")# 4 byte int