Commit 5fd039c4 authored by mashun

particlenet

parent 1e030ab7
@@ -2,3 +2,4 @@ tf-keras/converted
tf-keras/original
*pycache*
*.h5
\ No newline at end of file
@@ -52,7 +52,7 @@ original/
└── val.h5
```
Run `convert_dataset.ipynb` to process the data; after processing:
Run `convert_dataset.py` to process the data; after processing:
```
converted/
@@ -61,15 +61,21 @@ converted/
└── val_file_0.awkd
```
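To run the conversion:
```bash
python convert_dataset.py
```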
Tip: start the service with the command `jupyter notebook --no-browser --ip=0.0.0.0 --allow-root`
<!-- Tip: start the service with the command `jupyter notebook --no-browser --ip=0.0.0.0 --allow-root` -->
## Training
`keras_train.ipynb`
```bash
python keras_train.py
```
## Inference
`keras_train.ipynb`
```bash
python predict_demo.py
```
Note: this code only provides a simple inference test; modify it as needed for your use case.
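For instance, the raw scores can be reduced to per-jet class predictions (a minimal sketch; the two score columns follow the `(signal, background)` label layout built in `convert_dataset.py`):
```python
import numpy as np

# predictions has shape (n_jets, 2); column 0 is the signal score.
pred_labels = np.argmax(predictions, axis=1)  # 0 = signal, 1 = background
signal_scores = predictions[:, 0]
```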
## Result
File mode changed from 100644 to 100755
import os
import pandas as pd
import numpy as np
import awkward
import uproot_methods
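# NOTE: this script relies on the awkward 0.x JaggedArray API and on
# uproot_methods; it will not run with awkward 1.x or later.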
import logging
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')
def _transform(dataframe, start=0, stop=-1, jet_size=0.8):
from collections import OrderedDict
v = OrderedDict()
df = dataframe.iloc[start:stop]
def _col_list(prefix, max_particles=200):
return ['%s_%d'%(prefix,i) for i in range(max_particles)]
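    # The HDF5 table stores up to 200 particles per jet as flat columns
    # (PX_0 ... PX_199, etc.), zero-padded beyond the true multiplicity.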
_px = df[_col_list('PX')].values
_py = df[_col_list('PY')].values
_pz = df[_col_list('PZ')].values
_e = df[_col_list('E')].values
mask = _e>0
n_particles = np.sum(mask, axis=1)
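    # Keep only real particles (E > 0) and rebuild per-jet variable-length
    # (jagged) arrays from the flat, padded columns.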
px = awkward.JaggedArray.fromcounts(n_particles, _px[mask])
py = awkward.JaggedArray.fromcounts(n_particles, _py[mask])
pz = awkward.JaggedArray.fromcounts(n_particles, _pz[mask])
energy = awkward.JaggedArray.fromcounts(n_particles, _e[mask])
p4 = uproot_methods.TLorentzVectorArray.from_cartesian(px, py, pz, energy)
pt = p4.pt
jet_p4 = p4.sum()
# outputs
_label = df['is_signal_new'].values
v['label'] = np.stack((_label, 1-_label), axis=-1)
v['train_val_test'] = df['ttv'].values
v['jet_pt'] = jet_p4.pt
v['jet_eta'] = jet_p4.eta
v['jet_phi'] = jet_p4.phi
v['jet_mass'] = jet_p4.mass
v['n_parts'] = n_particles
v['part_px'] = px
v['part_py'] = py
v['part_pz'] = pz
v['part_energy'] = energy
v['part_pt_log'] = np.log(pt)
v['part_ptrel'] = pt/v['jet_pt']
v['part_logptrel'] = np.log(v['part_ptrel'])
v['part_e_log'] = np.log(energy)
v['part_erel'] = energy/jet_p4.energy
v['part_logerel'] = np.log(v['part_erel'])
v['part_raw_etarel'] = (p4.eta - v['jet_eta'])
_jet_etasign = np.sign(v['jet_eta'])
_jet_etasign[_jet_etasign==0] = 1
v['part_etarel'] = v['part_raw_etarel'] * _jet_etasign
v['part_phirel'] = p4.delta_phi(jet_p4)
v['part_deltaR'] = np.hypot(v['part_etarel'], v['part_phirel'])
def _make_image(var_img, rec, n_pixels = 64, img_ranges = [[-0.8, 0.8], [-0.8, 0.8]]):
wgt = rec[var_img]
x = rec['part_etarel']
y = rec['part_phirel']
img = np.zeros(shape=(len(wgt), n_pixels, n_pixels))
for i in range(len(wgt)):
hist2d, xedges, yedges = np.histogram2d(x[i], y[i], bins=[n_pixels, n_pixels], range=img_ranges, weights=wgt[i])
img[i] = hist2d
return img
# v['img'] = _make_image('part_ptrel', v)
return v
def convert(source, destdir, basename, step=None, limit=None):
df = pd.read_hdf(source, key='table')
logging.info('Total events: %s' % str(df.shape[0]))
if limit is not None:
df = df.iloc[0:limit]
        logging.info('Restricting to the first %s events' % str(df.shape[0]))
if step is None:
step = df.shape[0]
idx=-1
while True:
idx+=1
start=idx*step
if start>=df.shape[0]: break
if not os.path.exists(destdir):
os.makedirs(destdir)
output = os.path.join(destdir, '%s_%d.awkd'%(basename, idx))
logging.info(output)
if os.path.exists(output):
            logging.warning('... file already exists: continue ...')
continue
v=_transform(df, start=start, stop=start+step)
awkward.save(output, v, mode='x')
srcDir = 'original'
destDir = 'converted'
# convert training file
convert(os.path.join(srcDir, 'train.h5'), destdir=destDir, basename='train_file')
# convert validation file
convert(os.path.join(srcDir, 'val.h5'), destdir=destDir, basename='val_file')
# convert testing file
convert(os.path.join(srcDir, 'test.h5'), destdir=destDir, basename='test_file')
\ No newline at end of file
File mode changed from 100644 to 100755
import numpy as np
import awkward
import tensorflow as tf
from tensorflow import keras
from tf_keras_model import get_particle_net, get_particle_net_lite
import os
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
def stack_arrays(a, keys, axis=-1):
flat_arr = np.stack([a[k].flatten() for k in keys], axis=axis)
return awkward.JaggedArray.fromcounts(a[keys[0]].counts, flat_arr)
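# Pad or truncate each variable-length entry to a fixed length `maxlen`
# so the jagged arrays can be stacked into regular numpy arrays.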
def pad_array(a, maxlen, value=0., dtype='float32'):
x = (np.ones((len(a), maxlen)) * value).astype(dtype)
for idx, s in enumerate(a):
if not len(s):
continue
trunc = s[:maxlen].astype(dtype)
x[idx, :len(trunc)] = trunc
return x
class Dataset(object):
def __init__(self, filepath, feature_dict = {}, label='label', pad_len=100, data_format='channel_first'):
self.filepath = filepath
self.feature_dict = feature_dict
if len(feature_dict)==0:
feature_dict['points'] = ['part_etarel', 'part_phirel']
feature_dict['features'] = ['part_pt_log', 'part_e_log', 'part_etarel', 'part_phirel']
feature_dict['mask'] = ['part_pt_log']
self.label = label
self.pad_len = pad_len
assert data_format in ('channel_first', 'channel_last')
self.stack_axis = 1 if data_format=='channel_first' else -1
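        # channel_first stacks features as (batch, channels, length);
        # channel_last stacks them as (batch, length, channels).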
self._values = {}
self._label = None
self._load()
def _load(self):
logging.info('Start loading file %s' % self.filepath)
counts = None
with awkward.load(self.filepath) as a:
self._label = a[self.label]
for k in self.feature_dict:
cols = self.feature_dict[k]
if not isinstance(cols, (list, tuple)):
cols = [cols]
arrs = []
for col in cols:
if counts is None:
counts = a[col].counts
else:
assert np.array_equal(counts, a[col].counts)
arrs.append(pad_array(a[col], self.pad_len))
self._values[k] = np.stack(arrs, axis=self.stack_axis)
logging.info('Finished loading file %s' % self.filepath)
def __len__(self):
return len(self._label)
def __getitem__(self, key):
if key==self.label:
return self._label
else:
return self._values[key]
@property
def X(self):
return self._values
@property
def y(self):
return self._label
def shuffle(self, seed=None):
if seed is not None:
np.random.seed(seed)
shuffle_indices = np.arange(self.__len__())
np.random.shuffle(shuffle_indices)
for k in self._values:
self._values[k] = self._values[k][shuffle_indices]
self._label = self._label[shuffle_indices]
def lr_schedule(epoch):
    lr = 1e-3
    # Check the larger epoch threshold first so both decay steps are reachable.
    if epoch > 20:
        lr *= 0.01
    elif epoch > 10:
        lr *= 0.1
    logging.info('Learning rate: %f' % lr)
    return lr
if __name__ == "__main__":
train_dataset = Dataset('converted/train_file_0.awkd', data_format='channel_last')
val_dataset = Dataset('converted/val_file_0.awkd', data_format='channel_last')
model_type = 'particle_net_lite' # choose between 'particle_net' and 'particle_net_lite'
num_classes = train_dataset.y.shape[1]
input_shapes = {k:train_dataset[k].shape[1:] for k in train_dataset.X}
if 'lite' in model_type:
model = get_particle_net_lite(num_classes, input_shapes)
else:
model = get_particle_net(num_classes, input_shapes)
# Training parameters
batch_size = 1024 if 'lite' in model_type else 384
epochs = 30
model.compile(loss='categorical_crossentropy',
optimizer=keras.optimizers.Adam(learning_rate=lr_schedule(0)),
metrics=['accuracy'])
model.summary()
save_dir = 'model_checkpoints'
model_name = '%s_model.{epoch:03d}.h5' % model_type
if not os.path.isdir(save_dir):
os.makedirs(save_dir)
filepath = os.path.join(save_dir, model_name)
# Prepare callbacks for model saving and for learning rate adjustment.
checkpoint = keras.callbacks.ModelCheckpoint(filepath=filepath,
monitor='val_accuracy',
verbose=1,
save_best_only=False)
lr_scheduler = keras.callbacks.LearningRateScheduler(lr_schedule)
progress_bar = keras.callbacks.ProgbarLogger()
callbacks = [checkpoint, lr_scheduler, progress_bar]
train_dataset.shuffle()
model.fit(train_dataset.X, train_dataset.y,
batch_size=batch_size,
# epochs=epochs,
epochs=1, # --- train only for 1 epoch here for demonstration ---
validation_data=(val_dataset.X, val_dataset.y),
shuffle=True,
callbacks=callbacks)
\ No newline at end of file
import tensorflow as tf
import numpy as np
from keras_train import Dataset
# Load the saved Keras model (.h5)
model = tf.keras.models.load_model('model_checkpoints/particle_net_lite_model.001.h5')
val_dataset = Dataset('converted/val_file_0.awkd', data_format='channel_last')
predictions = model.predict(val_dataset.X)
print(predictions)
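# A sanity check one might add (assuming one-hot labels with column 0 =
# signal, as written by convert_dataset.py):
accuracy = (predictions.argmax(axis=1) == val_dataset.y.argmax(axis=1)).mean()
print('accuracy: %.4f' % accuracy)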
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755