from torch.utils.data import Dataset
import sys
import dbop
from sklearn.model_selection import train_test_split
import numpy as np
from tool import load_joblib, save_joblib, print_label_sta, linear_soomth
import torch
# import torch_mlu
import re
from torch.nn.utils.rnn import pad_packed_sequence, pad_sequence, pack_padded_sequence
import datetime
import copy
import random


def check_abnormal_in_mat(xmat, desiresize=[697, 3, 16, 8]):
    """Print the indices of any nested entry whose length deviates from desiresize."""
    for x in range(0, len(xmat)):
        if len(xmat[x]) != desiresize[1]:
            print("%d" % x)
        for y in range(0, len(xmat[x])):
            if len(xmat[x][y]) != desiresize[2]:
                print("%d, %d" % (x, y))
            for z in range(0, len(xmat[x][y])):
                if len(xmat[x][y][z]) != desiresize[3]:
                    print("%d, %d, %d" % (x, y, z))


class HourDataset(Dataset):
    def __init__(self, least_hourlen, random_seed):
        self.mode = None
        self.least_hourlen = least_hourlen
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.spots_aday = 16
        self.days_2_include = 3
        self.person_day_mode = False
        self.random_seed = random_seed

    def _case_pass(self, rec):
        """Keep a record only if it has enough hourly predictions and enough scale-usage time."""
        used_time = int(re.findall(r"\d+", rec['scale_usd_tm'])[0])
        if len(rec['pred_iars']) > self.least_hourlen and used_time > 250:
            return True
        else:
            return False

    def _putin_timeslots(self, rec, begdate, enddate):
        """Distribute a person's records into hourly slots between begdate and enddate."""
        gap_days = (enddate - begdate).days
        vec = []
        for i in range(0, gap_days):
            for x in range(0, self.spots_aday):  # 16 hour slots in a day
                vec.append([])
        for i in range(0, len(rec['begs'])):
            cur_time = datetime.datetime.fromtimestamp(rec['begs'][i])
            # print("beg:%s, this:%s, end:%s" % (str(begdate), str(cur_time), str(enddate)))
            if begdate < cur_time < enddate:
                # slot index within the window; hour 8 maps to the first slot of a day
                idx = ((cur_time - begdate).days * self.spots_aday) + (cur_time.hour - 8)
                if rec['pred_ival'][i] is not None:
                    vec[idx].extend(rec['pred_ival'][i])  # 3 values
                    vec[idx].extend(rec['pred_pval'][i])  # 3 values
                else:
                    vec[idx].extend([None] * 3)
                    vec[idx].extend([None] * 3)
                vec[idx].append(rec['pred_iars'][i])  # 1 value
                vec[idx].append(rec['pred_pars'][i])  # 1 value
        return vec

    def _get_lab(self, rec):
        """Map a PHQ score to a binary label; return -4 for cases in the excluded middle range."""
        score = rec['phq']
        if score > 15:
        # if score >= 10:
            return 1
        elif score < 5:
            return 0
        else:
            return -4
        # score = rec['dass'][0]
        # if score >= 21:
        #     return 1
        # elif score <= 9:
        #     return 0
        # else:
        #     return -4

    def get_type_a_ftlab(self, rec, day_range):
        lab = self._get_lab(rec)
        if lab != -4 and self._case_pass(rec):
            scale_day = datetime.datetime.strptime(rec['rating_daystr'], "%Y%m%d")
            scale_day = datetime.datetime(scale_day.year, scale_day.month, scale_day.day, 0, 0, 0)
            beg_day = scale_day + datetime.timedelta(days=(-1 * day_range))
            beg_day = datetime.datetime(beg_day.year, beg_day.month, beg_day.day, 0, 0, 0)
            person_vec = self._putin_timeslots(rec, beg_day, scale_day)
            return person_vec, lab, rec['uid'], beg_day, scale_day
        else:
            return None, None, None, None, None

    def under_sample(self, X, y):
        """Randomly under-sample every class down to the size of the smallest class."""
        # npX = np.array(X)
        # npy = np.array(y)
        # origin_shape = npX.shape
        # npX = npX.reshape(origin_shape[0], origin_shape[1]*origin_shape[2]*origin_shape[3])
        # feats, labs = RandomUnderSampler().fit_resample(npX, npy)
        # feats = feats.reshape(-1, origin_shape[1]*origin_shape[2], origin_shape[3])
        sta_dict = {}
        indices_dict = {}
        for i in range(0, len(y)):
            if y[i] not in sta_dict:
                sta_dict[y[i]] = 1
                indices_dict[y[i]] = [i]
            else:
                sta_dict[y[i]] += 1
                indices_dict[y[i]].append(i)
        n_sample = min(sta_dict.values())
        balanced_indeces_collection = []
        random.seed(self.random_seed)
        for one in indices_dict:
            balanced_indeces_collection.extend(random.sample(indices_dict[one], n_sample))
        feats = []
        labs = []
        for idx in balanced_indeces_collection:
            # idx is already an index into X/y (the original double indexing was a bug)
            feats.append(X[idx])
            labs.append(y[idx])
        return feats, labs

    def _check_day_buf(self, daybuf):
        """Count the hour slots in a day buffer that are empty or contain None."""
        n_invalid = 0
        for hour in daybuf:
            if (len(hour) == 0) or (hour.count(None) > 0):
                n_invalid += 1
        return n_invalid

    def _feats_back_selection(self, allfeats, alllabs, alluids, miss_tole=6):
        """Walk backward from the rating day and keep the first days_2_include valid days per person."""
        ret_feats = []
        ret_labs = []
        ret_uids = []
        # person, hour, feat
        for i in range(0, len(allfeats)):  # person
            person_buf = []
            nday = 0
            nrec = 0
            daybuf = []
            for j in range(len(allfeats[i]) - 1, -1, -1):  # hour slots, walking backward
                daybuf.append(allfeats[i][j])
                nrec += 1
                if nrec % self.spots_aday == 0:  # went through a day
                    n_invalid = self._check_day_buf(daybuf)
                    if n_invalid < miss_tole:
                        person_buf.append(daybuf)
                        nday += 1
                        if nday >= self.days_2_include:
                            break
                    daybuf = []
            if nday >= self.days_2_include:
                ret_feats.append(person_buf)
                ret_labs.append(alllabs[i])
                ret_uids.append(alluids[i])
        return ret_feats, ret_labs, ret_uids

    def _fill_empty_with_mask(self, day_data, ftsetlen=8, mask=-1):
        """Replace empty hour slots and None features with the mask value, in place."""
        for i in range(0, len(day_data)):  # hour
            if len(day_data[i]) == 0:
                day_data[i] = [mask] * ftsetlen
            for j in range(0, len(day_data[i])):  # feature
                if day_data[i][j] is None:
                    day_data[i][j] = mask

    def load(self, split=True):
        print('begins to load!')
        day_range = 30
        src_coll = dbop.GetMongoCollection('FLP', 'nj_2021_hour')
        recs = src_coll.find({})
        allfeats = []
        alllabs = []
        alluids = []
        all_dayrange = []
        for rec in recs:
            feat, lab, uid, begday, endday = self.get_type_a_ftlab(rec, day_range)
            if feat is not None:
                allfeats.append(feat)
                alllabs.append(lab)
                alluids.append(uid)
                all_dayrange.append([begday, endday])
        print_label_sta(alllabs)
        print("data loaded!")
        recs.close()

        fbs_feats, fbs_labs, fbs_uids = \
            self._feats_back_selection(allfeats, alllabs, alluids)
        print('After backward selection')
        print_label_sta(fbs_labs)
        print("feat shape: " + str(np.array(fbs_feats, dtype=object).shape))

        # fill empty slots with the mask value
        for p1 in fbs_feats:
            for p1day1 in p1:
                self._fill_empty_with_mask(p1day1)

        # linear smooth each feature channel within a day
        for i in range(0, len(fbs_feats)):  # i = person
            for j in range(0, len(fbs_feats[i])):  # j = day
                buf = copy.deepcopy(fbs_feats[i][j])
                buf_t = np.array(buf).T.tolist()
                for k in range(0, len(buf_t)):
                    buf_t[k] = linear_soomth(buf_t[k])
                fbs_feats[i][j] = np.array(buf_t).T.tolist()
        # check_abnormal_in_mat(fbs_feats)

        us_feats, us_labs = self.under_sample(fbs_feats, fbs_labs)
        print('After under sample')
        print_label_sta(us_labs)

        if split:
            self.X_train, self.X_test, self.y_train, self.y_test = \
                train_test_split(us_feats, us_labs, test_size=0.3, shuffle=False)
            print("Dataset: train: %s, test: %s" % (str(np.array(self.X_train).shape),
                                                    str(np.array(self.X_test).shape)))
            if self.person_day_mode:
                # expand each person-level label to one label per included day
                new_y_train = []
                for oneytr in self.y_train:
                    new_y_train.extend([oneytr] * self.days_2_include)
                self.y_train = new_y_train
                new_y_test = []
                for oneyte in self.y_test:
                    new_y_test.extend([oneyte] * self.days_2_include)
                self.y_test = new_y_test
                self.X_train = np.array(self.X_train)
                self.X_train = self.X_train.reshape(-1, self.spots_aday, self.get_input_size())
                self.X_test = np.array(self.X_test)
                self.X_test = self.X_test.reshape(-1, self.spots_aday, self.get_input_size())
                print("p_d_mode: train: %d-%d, test: %d-%d" %
                      (len(self.X_train), len(self.y_train), len(self.X_test), len(self.y_test)))

    def get_input_size(self):
        shape = self.X_train.shape
        return shape[-1]

    def get_time_step(self):
        shape = self.X_train.shape
        return shape[-2]

    def savefile(self, path):
        save_dict = {'X_train': self.X_train, 'X_test': self.X_test,
                     'y_train': self.y_train,
                     'y_test': self.y_test}
        save_joblib(save_dict, path)

    def loadfile(self, path):
        load_dict = load_joblib(path)
        x_test = load_dict['X_test']
        y_test = load_dict['y_test']
        X_train = load_dict['X_train']
        y_train = load_dict['y_train']
        # replicate the training split 20 times (same effect as the original hand-written vstack/hstack)
        self.X_train = np.vstack([X_train] * 20)
        self.y_train = np.hstack([y_train] * 20)
        # self.X_train = X_train
        # self.y_train = y_train
        self.X_test = x_test
        self.y_test = y_test
        print("train shape:" + str(np.array(self.X_train).shape))

    def set_mode(self, mode):
        if mode == "train":
            self.mode = "train"
        elif mode == "test":
            self.mode = "test"

    def __getitem__(self, index):
        ftensor = None
        ltensor = None
        if self.mode == "train":
            npfeat = np.array(self.X_train[index], dtype=np.float32)
            ftensor = torch.from_numpy(npfeat).to(torch.float32)
            nplab = np.array(self.y_train[index])
            ltensor = torch.from_numpy(nplab).to(torch.int64)
        elif self.mode == "test":
            npfeat = np.array(self.X_test[index], dtype=np.float32)
            ftensor = torch.from_numpy(npfeat).to(torch.float32)
            nplab = np.array(self.y_test[index])
            ltensor = torch.from_numpy(nplab).to(torch.int64)
        return ftensor, ltensor

    def __len__(self):
        if self.mode == "train":
            return len(self.X_train)
        elif self.mode == "test":
            return len(self.X_test)


if __name__ == "__main__":
    # __init__ requires a random_seed; 0 here is an arbitrary choice
    hd = HourDataset(100, 0)
    hd.load()
    hd.savefile('dat_3day_pcase_miss3_2')
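
# Example usage (a minimal sketch, not part of the original pipeline): once the
# joblib file has been produced by the __main__ block above, the dataset can be
# reloaded and iterated with a standard torch DataLoader. The file name matches
# the savefile() call above; random_seed=0 and batch_size=16 are arbitrary
# illustrative values.
#
# from torch.utils.data import DataLoader
#
# ds = HourDataset(least_hourlen=100, random_seed=0)
# ds.loadfile('dat_3day_pcase_miss3_2')
# ds.set_mode("train")
# loader = DataLoader(ds, batch_size=16, shuffle=True)
# for feats, labs in loader:
#     # feats: float32 batch of per-person day/hour feature slots, labs: int64 labels
#     pass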