Commit 5fd039c4 authored by mashun

particlenet

parent 1e030ab7
@@ -2,3 +2,4 @@ tf-keras/converted
tf-keras/original
*pycache*
*.h5
\ No newline at end of file
@@ -52,7 +52,7 @@ original/
└── val.h5
```
Run `convert_dataset.ipynb` to process the data; after processing:
Run `convert_dataset.py` to process the data; after processing:
```
converted/
@@ -61,15 +61,21 @@ converted/
└── val_file_0.awkd
```
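To run the conversion:
```bash
python convert_dataset.py
```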
Tip: start the service with the command `jupyter notebook --no-browser --ip=0.0.0.0 --allow-root`
<!-- Tip: start the service with the command `jupyter notebook --no-browser --ip=0.0.0.0 --allow-root` -->
## Training
`keras_train.ipynb`
```bash
python keras_train.py
```
## Inference
`keras_train.ipynb`
```bash
python predict_demo.py
```
Note: this code only provides a simple inference test; modify it as needed for your use case.
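For instance, the raw scores can be reduced to per-jet class predictions (a minimal sketch; the two score columns follow the `(signal, background)` label layout built in `convert_dataset.py`):
```python
import numpy as np

# predictions has shape (n_jets, 2); column 0 is the signal score.
pred_labels = np.argmax(predictions, axis=1)  # 0 = signal, 1 = background
signal_scores = predictions[:, 0]
```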
## Result
File mode changed from 100644 to 100755
import os
import pandas as pd
import numpy as np
import awkward
import uproot_methods
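# NOTE: this script relies on the awkward 0.x JaggedArray API and on
# uproot_methods; it will not run with awkward 1.x or later.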
import logging
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')
def _transform(dataframe, start=0, stop=-1, jet_size=0.8):
from collections import OrderedDict
v = OrderedDict()
df = dataframe.iloc[start:stop]
def _col_list(prefix, max_particles=200):
return ['%s_%d'%(prefix,i) for i in range(max_particles)]
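    # The HDF5 table stores up to 200 particles per jet as flat columns
    # (PX_0 ... PX_199, etc.), zero-padded beyond the true multiplicity.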
_px = df[_col_list('PX')].values
_py = df[_col_list('PY')].values
_pz = df[_col_list('PZ')].values
_e = df[_col_list('E')].values
mask = _e>0
n_particles = np.sum(mask, axis=1)
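    # Keep only real particles (E > 0) and rebuild per-jet variable-length
    # (jagged) arrays from the flat, padded columns.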
px = awkward.JaggedArray.fromcounts(n_particles, _px[mask])
py = awkward.JaggedArray.fromcounts(n_particles, _py[mask])
pz = awkward.JaggedArray.fromcounts(n_particles, _pz[mask])
energy = awkward.JaggedArray.fromcounts(n_particles, _e[mask])
p4 = uproot_methods.TLorentzVectorArray.from_cartesian(px, py, pz, energy)
pt = p4.pt
jet_p4 = p4.sum()
# outputs
_label = df['is_signal_new'].values
v['label'] = np.stack((_label, 1-_label), axis=-1)
v['train_val_test'] = df['ttv'].values
v['jet_pt'] = jet_p4.pt
v['jet_eta'] = jet_p4.eta
v['jet_phi'] = jet_p4.phi
v['jet_mass'] = jet_p4.mass
v['n_parts'] = n_particles
v['part_px'] = px
v['part_py'] = py
v['part_pz'] = pz
v['part_energy'] = energy
v['part_pt_log'] = np.log(pt)
v['part_ptrel'] = pt/v['jet_pt']
v['part_logptrel'] = np.log(v['part_ptrel'])
v['part_e_log'] = np.log(energy)
v['part_erel'] = energy/jet_p4.energy
v['part_logerel'] = np.log(v['part_erel'])
v['part_raw_etarel'] = (p4.eta - v['jet_eta'])
_jet_etasign = np.sign(v['jet_eta'])
_jet_etasign[_jet_etasign==0] = 1
v['part_etarel'] = v['part_raw_etarel'] * _jet_etasign
v['part_phirel'] = p4.delta_phi(jet_p4)
v['part_deltaR'] = np.hypot(v['part_etarel'], v['part_phirel'])
def _make_image(var_img, rec, n_pixels = 64, img_ranges = [[-0.8, 0.8], [-0.8, 0.8]]):
wgt = rec[var_img]
x = rec['part_etarel']
y = rec['part_phirel']
img = np.zeros(shape=(len(wgt), n_pixels, n_pixels))
for i in range(len(wgt)):
hist2d, xedges, yedges = np.histogram2d(x[i], y[i], bins=[n_pixels, n_pixels], range=img_ranges, weights=wgt[i])
img[i] = hist2d
return img
# v['img'] = _make_image('part_ptrel', v)
return v
def convert(source, destdir, basename, step=None, limit=None):
df = pd.read_hdf(source, key='table')
logging.info('Total events: %s' % str(df.shape[0]))
if limit is not None:
df = df.iloc[0:limit]
        logging.info('Restricting to the first %s events' % str(df.shape[0]))
if step is None:
step = df.shape[0]
idx=-1
while True:
idx+=1
start=idx*step
if start>=df.shape[0]: break
if not os.path.exists(destdir):
os.makedirs(destdir)
output = os.path.join(destdir, '%s_%d.awkd'%(basename, idx))
logging.info(output)
if os.path.exists(output):
            logging.warning('... file already exists: continue ...')
continue
v=_transform(df, start=start, stop=start+step)
awkward.save(output, v, mode='x')
srcDir = 'original'
destDir = 'converted'
# convert training file
convert(os.path.join(srcDir, 'train.h5'), destdir=destDir, basename='train_file')
# convert validation file
convert(os.path.join(srcDir, 'val.h5'), destdir=destDir, basename='val_file')
# convert testing file
convert(os.path.join(srcDir, 'test.h5'), destdir=destDir, basename='test_file')
\ No newline at end of file
File mode changed from 100644 to 100755
import numpy as np
import awkward
import tensorflow as tf
from tensorflow import keras
from tf_keras_model import get_particle_net, get_particle_net_lite
import os
import logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
def stack_arrays(a, keys, axis=-1):
flat_arr = np.stack([a[k].flatten() for k in keys], axis=axis)
return awkward.JaggedArray.fromcounts(a[keys[0]].counts, flat_arr)
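# Pad or truncate each variable-length entry to a fixed length `maxlen`
# so the jagged arrays can be stacked into regular numpy arrays.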
def pad_array(a, maxlen, value=0., dtype='float32'):
x = (np.ones((len(a), maxlen)) * value).astype(dtype)
for idx, s in enumerate(a):
if not len(s):
continue
trunc = s[:maxlen].astype(dtype)
x[idx, :len(trunc)] = trunc
return x
class Dataset(object):
def __init__(self, filepath, feature_dict = {}, label='label', pad_len=100, data_format='channel_first'):
self.filepath = filepath
self.feature_dict = feature_dict
if len(feature_dict)==0:
feature_dict['points'] = ['part_etarel', 'part_phirel']
feature_dict['features'] = ['part_pt_log', 'part_e_log', 'part_etarel', 'part_phirel']
feature_dict['mask'] = ['part_pt_log']
self.label = label
self.pad_len = pad_len
assert data_format in ('channel_first', 'channel_last')
self.stack_axis = 1 if data_format=='channel_first' else -1
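        # channel_first stacks features as (batch, channels, length);
        # channel_last stacks them as (batch, length, channels).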
self._values = {}
self._label = None
self._load()
def _load(self):
logging.info('Start loading file %s' % self.filepath)
counts = None
with awkward.load(self.filepath) as a:
self._label = a[self.label]
for k in self.feature_dict:
cols = self.feature_dict[k]
if not isinstance(cols, (list, tuple)):
cols = [cols]
arrs = []
for col in cols:
if counts is None:
counts = a[col].counts
else:
assert np.array_equal(counts, a[col].counts)
arrs.append(pad_array(a[col], self.pad_len))
self._values[k] = np.stack(arrs, axis=self.stack_axis)
logging.info('Finished loading file %s' % self.filepath)
def __len__(self):
return len(self._label)
def __getitem__(self, key):
if key==self.label:
return self._label
else:
return self._values[key]
@property
def X(self):
return self._values
@property
def y(self):
return self._label
def shuffle(self, seed=None):
if seed is not None:
np.random.seed(seed)
shuffle_indices = np.arange(self.__len__())
np.random.shuffle(shuffle_indices)
for k in self._values:
self._values[k] = self._values[k][shuffle_indices]
self._label = self._label[shuffle_indices]
def lr_schedule(epoch):
    lr = 1e-3
    # Check the larger epoch threshold first so both decay steps are reachable.
    if epoch > 20:
        lr *= 0.01
    elif epoch > 10:
        lr *= 0.1
    logging.info('Learning rate: %f' % lr)
    return lr
if __name__ == "__main__":
train_dataset = Dataset('converted/train_file_0.awkd', data_format='channel_last')
val_dataset = Dataset('converted/val_file_0.awkd', data_format='channel_last')
model_type = 'particle_net_lite' # choose between 'particle_net' and 'particle_net_lite'
num_classes = train_dataset.y.shape[1]
input_shapes = {k:train_dataset[k].shape[1:] for k in train_dataset.X}
if 'lite' in model_type:
model = get_particle_net_lite(num_classes, input_shapes)
else:
model = get_particle_net(num_classes, input_shapes)
# Training parameters
batch_size = 1024 if 'lite' in model_type else 384
epochs = 30
model.compile(loss='categorical_crossentropy',
optimizer=keras.optimizers.Adam(learning_rate=lr_schedule(0)),
metrics=['accuracy'])
model.summary()
save_dir = 'model_checkpoints'
model_name = '%s_model.{epoch:03d}.h5' % model_type
if not os.path.isdir(save_dir):
os.makedirs(save_dir)
filepath = os.path.join(save_dir, model_name)
# Prepare callbacks for model saving and for learning rate adjustment.
checkpoint = keras.callbacks.ModelCheckpoint(filepath=filepath,
monitor='val_accuracy',
verbose=1,
save_best_only=False)
lr_scheduler = keras.callbacks.LearningRateScheduler(lr_schedule)
progress_bar = keras.callbacks.ProgbarLogger()
callbacks = [checkpoint, lr_scheduler, progress_bar]
train_dataset.shuffle()
model.fit(train_dataset.X, train_dataset.y,
batch_size=batch_size,
# epochs=epochs,
epochs=1, # --- train only for 1 epoch here for demonstration ---
validation_data=(val_dataset.X, val_dataset.y),
shuffle=True,
callbacks=callbacks)
\ No newline at end of file
import tensorflow as tf
import numpy as np
from keras_train import Dataset
# Load the saved Keras model (.h5)
model = tf.keras.models.load_model('model_checkpoints/particle_net_lite_model.001.h5')
val_dataset = Dataset('converted/val_file_0.awkd', data_format='channel_last')
predictions = model.predict(val_dataset.X)
print(predictions)
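# A sanity check one might add (assuming one-hot labels with column 0 =
# signal, as written by convert_dataset.py):
accuracy = (predictions.argmax(axis=1) == val_dataset.y.argmax(axis=1)).mean()
print('accuracy: %.4f' % accuracy)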
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755