{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert jet HDF5 tables to awkward-array files\n",
    "\n",
    "Reads the `table` key of each input `.h5` file, builds jet-level and particle-level kinematic features, and writes them in chunks as `.awkd` files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "from collections import OrderedDict\n",
    "\n",
    "import awkward\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import uproot_methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature construction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _col_list(prefix, max_particles=200):\n",
    "    \"\"\"Return the column names '<prefix>_0' ... '<prefix>_<max_particles - 1>'.\"\"\"\n",
    "    return ['%s_%d' % (prefix, i) for i in range(max_particles)]\n",
    "\n",
    "\n",
    "def _make_image(var_img, rec, n_pixels=64, img_ranges=((-0.8, 0.8), (-0.8, 0.8))):\n",
    "    \"\"\"Histogram the particles of every jet into an (n_pixels, n_pixels) image.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    var_img : str\n",
    "        Key in `rec` of the per-particle quantity used as histogram weight.\n",
    "    rec : mapping\n",
    "        Must provide `var_img`, 'part_etarel' (x axis) and 'part_phirel' (y axis),\n",
    "        each indexable per jet.\n",
    "    n_pixels : int\n",
    "        Number of bins along each axis.\n",
    "    img_ranges : 2x2 sequence\n",
    "        ((xmin, xmax), (ymin, ymax)) passed to `numpy.histogram2d` as `range`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        Array of shape (number of jets, n_pixels, n_pixels).\n",
    "    \"\"\"\n",
    "    wgt = rec[var_img]\n",
    "    x = rec['part_etarel']\n",
    "    y = rec['part_phirel']\n",
    "    img = np.zeros(shape=(len(wgt), n_pixels, n_pixels))\n",
    "    for i in range(len(wgt)):\n",
    "        # Bin edges are identical for every jet, so only the counts are kept.\n",
    "        img[i], _, _ = np.histogram2d(x[i], y[i], bins=[n_pixels, n_pixels],\n",
    "                                      range=img_ranges, weights=wgt[i])\n",
    "    return img\n",
    "\n",
    "\n",
    "def _transform(dataframe, start=0, stop=None, jet_size=0.8, make_image=False):\n",
    "    \"\"\"Build jet-level and particle-level features for rows [start:stop) of `dataframe`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dataframe : pandas.DataFrame\n",
    "        Must contain the columns PX_i, PY_i, PZ_i, E_i for i in 0..199 as well as\n",
    "        'is_signal_new' and 'ttv'.\n",
    "    start, stop : int or None\n",
    "        Positional slice bounds handed to `DataFrame.iloc`. With `stop=None`\n",
    "        (the default) the slice runs to the end of the frame; the former default\n",
    "        of -1 silently dropped the last row.\n",
    "    jet_size : float\n",
    "        Half-width of the (part_etarel, part_phirel) window used for the jet\n",
    "        image. Only used when `make_image` is true.\n",
    "    make_image : bool\n",
    "        If true, additionally store a 'part_ptrel'-weighted 2D histogram per jet\n",
    "        under the key 'img'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    collections.OrderedDict\n",
    "        Feature name -> array; one entry per jet for the 'jet_*' keys, one\n",
    "        variable-length list per jet for the 'part_*' keys.\n",
    "    \"\"\"\n",
    "    v = OrderedDict()\n",
    "    df = dataframe.iloc[start:stop]\n",
    "\n",
    "    _px = df[_col_list('PX')].values\n",
    "    _py = df[_col_list('PY')].values\n",
    "    _pz = df[_col_list('PZ')].values\n",
    "    _e = df[_col_list('E')].values\n",
    "\n",
    "    # Only slots with strictly positive energy are kept as particles.\n",
    "    mask = _e > 0\n",
    "    n_particles = np.sum(mask, axis=1)\n",
    "\n",
    "    px = awkward.JaggedArray.fromcounts(n_particles, _px[mask])\n",
    "    py = awkward.JaggedArray.fromcounts(n_particles, _py[mask])\n",
    "    pz = awkward.JaggedArray.fromcounts(n_particles, _pz[mask])\n",
    "    energy = awkward.JaggedArray.fromcounts(n_particles, _e[mask])\n",
    "\n",
    "    p4 = uproot_methods.TLorentzVectorArray.from_cartesian(px, py, pz, energy)\n",
    "    pt = p4.pt\n",
    "\n",
    "    # The jet four-vector is the sum over the particles of each row.\n",
    "    jet_p4 = p4.sum()\n",
    "\n",
    "    # outputs\n",
    "    _label = df['is_signal_new'].values\n",
    "    v['label'] = np.stack((_label, 1 - _label), axis=-1)\n",
    "    v['train_val_test'] = df['ttv'].values\n",
    "\n",
    "    v['jet_pt'] = jet_p4.pt\n",
    "    v['jet_eta'] = jet_p4.eta\n",
    "    v['jet_phi'] = jet_p4.phi\n",
    "    v['jet_mass'] = jet_p4.mass\n",
    "    v['n_parts'] = n_particles\n",
    "\n",
    "    v['part_px'] = px\n",
    "    v['part_py'] = py\n",
    "    v['part_pz'] = pz\n",
    "    v['part_energy'] = energy\n",
    "\n",
    "    v['part_pt_log'] = np.log(pt)\n",
    "    v['part_ptrel'] = pt / v['jet_pt']\n",
    "    v['part_logptrel'] = np.log(v['part_ptrel'])\n",
    "\n",
    "    v['part_e_log'] = np.log(energy)\n",
    "    v['part_erel'] = energy / jet_p4.energy\n",
    "    v['part_logerel'] = np.log(v['part_erel'])\n",
    "\n",
    "    v['part_raw_etarel'] = (p4.eta - v['jet_eta'])\n",
    "    # Flip the eta offset by the sign of the jet eta; a jet eta of exactly 0\n",
    "    # counts as positive so the offset is never zeroed out.\n",
    "    _jet_etasign = np.sign(v['jet_eta'])\n",
    "    _jet_etasign[_jet_etasign == 0] = 1\n",
    "    v['part_etarel'] = v['part_raw_etarel'] * _jet_etasign\n",
    "\n",
    "    v['part_phirel'] = p4.delta_phi(jet_p4)\n",
    "    v['part_deltaR'] = np.hypot(v['part_etarel'], v['part_phirel'])\n",
    "\n",
    "    if make_image:\n",
    "        window = (-jet_size, jet_size)\n",
    "        v['img'] = _make_image('part_ptrel', v, img_ranges=(window, window))\n",
    "\n",
    "    return v"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chunked conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert(source, destdir, basename, step=None, limit=None):\n",
    "    \"\"\"Convert one HDF5 table into one or more `.awkd` files.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    source : str\n",
    "        Path of the input file; its 'table' key is read with `pandas.read_hdf`.\n",
    "    destdir : str\n",
    "        Output directory, created if it does not exist.\n",
    "    basename : str\n",
    "        Output files are named '<basename>_<chunk index>.awkd'.\n",
    "    step : int or None\n",
    "        Number of rows per output file. `None` writes everything to one file.\n",
    "    limit : int or None\n",
    "        If given, only the first `limit` rows are converted.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If `step` is not positive (this previously looped forever).\n",
    "    \"\"\"\n",
    "    df = pd.read_hdf(source, key='table')\n",
    "    logging.info('Total events: %s', df.shape[0])\n",
    "    if limit is not None:\n",
    "        df = df.iloc[0:limit]\n",
    "        logging.info('Restricting to the first %s events:', df.shape[0])\n",
    "\n",
    "    n_events = df.shape[0]\n",
    "    if n_events == 0:\n",
    "        # Nothing to write; as before, no directory or file is created.\n",
    "        logging.warning('No events to convert in %s', source)\n",
    "        return\n",
    "    if step is None:\n",
    "        step = n_events\n",
    "    if step <= 0:\n",
    "        raise ValueError('step must be a positive integer, got %r' % (step,))\n",
    "\n",
    "    os.makedirs(destdir, exist_ok=True)\n",
    "    for idx, start in enumerate(range(0, n_events, step)):\n",
    "        output = os.path.join(destdir, '%s_%d.awkd' % (basename, idx))\n",
    "        logging.info(output)\n",
    "        if os.path.exists(output):\n",
    "            # Existing chunks are left untouched so an interrupted run can resume.\n",
    "            logging.warning('... file already exist: continue ...')\n",
    "            continue\n",
    "        v = _transform(df, start=start, stop=start + step)\n",
    "        # mode='x' refuses to overwrite a file that appeared in the meantime.\n",
    "        awkward.save(output, v, mode='x')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "srcDir = 'original'\n",
    "destDir = 'converted'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert training file\n",
    "convert(os.path.join(srcDir, 'train.h5'), destdir=destDir, basename='train_file')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert validation file\n",
    "convert(os.path.join(srcDir, 'val.h5'), destdir=destDir, basename='val_file')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert testing file\n",
    "convert(os.path.join(srcDir, 'test.h5'), destdir=destDir, basename='test_file')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}