Commit 524a1b6e authored by mashun

particle

import os
import numpy as np
import awkward as ak
import argparse
'''
Datasets introduction:
https://energyflow.network/docs/datasets/#quark-and-gluon-jets
Download:
- Pythia8 Quark and Gluon Jets for Energy Flow:
- https://zenodo.org/record/3164691
- Herwig7.1 Quark and Gluon Jets:
- https://zenodo.org/record/3066475
Versions:
- awkward==2.6.4
- vector==1.4.0
'''
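# A minimal usage sketch (hypothetical paths and script name, since this file does not
# fix its own filename): after downloading the .npz files from the Zenodo records above
# into ./downloads, the conversion could be run as
#   python convert_qg_datasets.py -i ./downloads -o ./datasets/QuarkGluon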
def _p4_from_ptetaphim(pt, eta, phi, mass):
import vector
vector.register_awkward()
return vector.zip({'pt': pt, 'eta': eta, 'phi': phi, 'mass': mass})
def _transform(X, y, start=0, stop=None):
# X: (num_data, max_num_particles, 4); per-particle features are (pt, y, phi, pid).
# Zero-padded entries have pt == 0. The rapidity y is used as the pseudorapidity eta
# below, consistent with setting the particle mass to zero.
X = X[start:stop].astype(np.float32)
y = y[start:stop]
origPT = X[:, :, 0]
indices = np.argsort(-origPT, axis=1)
_pt = np.take_along_axis(X[:, :, 0], indices, axis=1)
_eta = np.take_along_axis(X[:, :, 1], indices, axis=1)
_phi = np.take_along_axis(X[:, :, 2], indices, axis=1)
_pid = np.take_along_axis(X[:, :, 3], indices, axis=1)
mask = _pt > 0
n_particles = np.sum(mask, axis=1)
pt = ak.unflatten(_pt[mask], n_particles)
eta = ak.unflatten(_eta[mask], n_particles)
phi = ak.unflatten(_phi[mask], n_particles)
mass = ak.zeros_like(pt)
PID = ak.unflatten(_pid[mask], n_particles)
p4 = _p4_from_ptetaphim(pt, eta, phi, mass)
px = p4.x
py = p4.y
pz = p4.z
energy = p4.energy
jet_p4 = ak.sum(p4, axis=1)
# outputs
v = {}
v['label'] = y
v['jet_pt'] = jet_p4.pt
v['jet_eta'] = jet_p4.eta
v['jet_phi'] = jet_p4.phi
v['jet_energy'] = jet_p4.energy
v['jet_mass'] = jet_p4.mass
v['jet_nparticles'] = n_particles
v['part_px'] = px
v['part_py'] = py
v['part_pz'] = pz
v['part_energy'] = energy
_jet_etasign = ak.to_numpy(np.sign(v['jet_eta']))
_jet_etasign[_jet_etasign == 0] = 1
v['part_deta'] = (p4.eta - v['jet_eta']) * _jet_etasign
v['part_dphi'] = p4.deltaphi(jet_p4)
v['part_pid'] = PID
v['part_isCHPlus'] = ak.values_astype((PID == 211) + (PID == 321) + (PID == 2212), 'float32')
v['part_isCHMinus'] = ak.values_astype((PID == -211) + (PID == -321) + (PID == -2212), 'float32')
v['part_isNeutralHadron'] = ak.values_astype((PID == 130) + (PID == 2112) + (PID == -2112), 'float32')
v['part_isPhoton'] = ak.values_astype(PID == 22, 'float32')
v['part_isEPlus'] = ak.values_astype(PID == -11, 'float32')
v['part_isEMinus'] = ak.values_astype(PID == 11, 'float32')
v['part_isMuPlus'] = ak.values_astype(PID == -13, 'float32')
v['part_isMuMinus'] = ak.values_astype(PID == 13, 'float32')
v['part_isChargedHadron'] = v['part_isCHPlus'] + v['part_isCHMinus']
v['part_isElectron'] = v['part_isEPlus'] + v['part_isEMinus']
v['part_isMuon'] = v['part_isMuPlus'] + v['part_isMuMinus']
v['part_charge'] = (v['part_isCHPlus'] + v['part_isEPlus'] + v['part_isMuPlus']
) - (v['part_isCHMinus'] + v['part_isEMinus'] + v['part_isMuMinus'])
for k in list(v.keys()):
if k.endswith('Plus') or k.endswith('Minus'):
del v[k]
return v
def convert(sources, destdir, basename):
if not os.path.exists(destdir):
os.makedirs(destdir)
for idx, sourcefile in enumerate(sources):
npfile = np.load(sourcefile)
output = os.path.join(destdir, '%s_%d.parquet' % (basename, idx))
print(sourcefile)
print(str(npfile['X'].shape))
print(output)
if os.path.exists(output):
os.remove(output)
v = _transform(npfile['X'], npfile['y'])
arr = ak.Array(v)
ak.to_parquet(arr, output, compression='LZ4', compression_level=4)
def natural_sort(l):
import re
def convert(text): return int(text) if text.isdigit() else text.lower()
def alphanum_key(key): return [convert(c) for c in re.split('([0-9]+)', key)]
return sorted(l, key=alphanum_key)
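# For example, natural_sort(['QG_jets_10.npz', 'QG_jets_2.npz']) returns
# ['QG_jets_2.npz', 'QG_jets_10.npz'], because the digit runs are compared as
# integers rather than lexicographically.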
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Convert qg benchmark datasets')
parser.add_argument('-i', '--inputdir', required=True, help='Directory of input numpy files.')
parser.add_argument('-o', '--outputdir', required=True, help='Output directory.')
parser.add_argument('--train-test-split', default=0.9, type=float, help='Fraction of input files used for training; the rest is used for testing.')
args = parser.parse_args()
import glob
sources = natural_sort(glob.glob(os.path.join(args.inputdir, 'QG_jets*.npz')))
n_train = int(args.train_test_split * len(sources))
train_sources = sources[:n_train]
test_sources = sources[n_train:]
convert(train_sources, destdir=args.outputdir, basename='train_file')
convert(test_sources, destdir=args.outputdir, basename='test_file')
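# A minimal sketch of inspecting one converted file with awkward (hypothetical path;
# assumes a train_file_0.parquet produced by the convert() calls above):
#   import awkward as ak
#   jets = ak.from_parquet('datasets/QuarkGluon/train_file_0.parquet')
#   print(jets.fields)         # ['label', 'jet_pt', ..., 'part_charge']
#   print(jets['jet_pt'][:5])  # transverse momenta of the first five jets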
import os
import pandas as pd
import numpy as np
import awkward as ak
import argparse
'''
Datasets introduction:
- The Machine Learning landscape of top taggers:
- https://scipost.org/SciPostPhys.7.1.014
Download:
- https://zenodo.org/record/2603256
Versions:
- awkward==2.6.4
- vector==1.4.0
- pandas==2.2.2
- tables==3.9.2
'''
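# A minimal usage sketch (hypothetical paths and script name): after downloading
# train.h5, val.h5 and test.h5 from the Zenodo record above into ./downloads,
# the conversion could be run as
#   python convert_top_datasets.py -i ./downloads -o ./datasets/TopLandscape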
def _p4_from_pxpypze(px, py, pz, energy):
import vector
vector.register_awkward()
return vector.zip({'px': px, 'py': py, 'pz': pz, 'energy': energy})
def _transform(dataframe, start=0, stop=None):
df = dataframe.iloc[start:stop]
def _col_list(prefix, max_particles=200):
return ['%s_%d' % (prefix, i) for i in range(max_particles)]
_px = df[_col_list('PX')].values
_py = df[_col_list('PY')].values
_pz = df[_col_list('PZ')].values
_e = df[_col_list('E')].values
mask = _e > 0
n_particles = np.sum(mask, axis=1)
px = ak.unflatten(_px[mask], n_particles)
py = ak.unflatten(_py[mask], n_particles)
pz = ak.unflatten(_pz[mask], n_particles)
energy = ak.unflatten(_e[mask], n_particles)
p4 = _p4_from_pxpypze(px, py, pz, energy)
jet_p4 = ak.sum(p4, axis=1)
# outputs
v = {}
v['label'] = df['is_signal_new'].values
v['jet_pt'] = jet_p4.pt
v['jet_eta'] = jet_p4.eta
v['jet_phi'] = jet_p4.phi
v['jet_energy'] = jet_p4.energy
v['jet_mass'] = jet_p4.mass
v['jet_nparticles'] = n_particles
v['part_px'] = px
v['part_py'] = py
v['part_pz'] = pz
v['part_energy'] = energy
_jet_etasign = ak.to_numpy(np.sign(v['jet_eta']))
_jet_etasign[_jet_etasign == 0] = 1
v['part_deta'] = (p4.eta - v['jet_eta']) * _jet_etasign
v['part_dphi'] = p4.deltaphi(jet_p4)
return v
def convert(source, destdir, basename):
df = pd.read_hdf(source, key='table')
print('Total events: %s' % str(df.shape[0]))
if not os.path.exists(destdir):
os.makedirs(destdir)
output = os.path.join(destdir, '%s.parquet' % basename)
print(output)
if os.path.exists(output):
os.remove(output)
v = _transform(df)
arr = ak.Array(v)
ak.to_parquet(arr, output, compression='LZ4', compression_level=4)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Convert top benchmark h5 datasets')
parser.add_argument('-i', '--inputdir', required=True, help='Directory of input h5 files.')
parser.add_argument('-o', '--outputdir', required=True, help='Output directory.')
args = parser.parse_args()
# convert training file
convert(os.path.join(args.inputdir, 'train.h5'), destdir=args.outputdir, basename='train_file')
# convert validation file
convert(os.path.join(args.inputdir, 'val.h5'), destdir=args.outputdir, basename='val_file')
# convert testing file
convert(os.path.join(args.inputdir, 'test.h5'), destdir=args.outputdir, basename='test_file')
'''
Adapted from:
https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py
'''
import hashlib
import os
import shutil
import zipfile
import tarfile
import urllib
import requests
from tqdm import tqdm
def _download(url, fname, chunk_size=1024):
'''https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51'''
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
with open(fname, 'wb') as file, tqdm(
desc=fname,
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in resp.iter_content(chunk_size=chunk_size):
size = file.write(data)
bar.update(size)
def extract_archive(file_path, path='.', archive_format='auto'):
"""Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats.
Args:
file_path: path to the archive file
path: path to extract the archive file
archive_format: Archive format to try for extracting the file.
Options are 'auto', 'tar', 'zip', and None.
'tar' includes tar, tar.gz, and tar.bz files.
The default 'auto' is ['tar', 'zip'].
None or an empty list will return no matches found.
Returns:
True if a match was found and an archive extraction was completed,
False otherwise.
"""
if archive_format is None:
return False
if archive_format == 'auto':
archive_format = ['tar', 'zip']
if isinstance(archive_format, str):
archive_format = [archive_format]
for archive_type in archive_format:
if archive_type == 'tar':
open_fn = tarfile.open
is_match_fn = tarfile.is_tarfile
if archive_type == 'zip':
open_fn = zipfile.ZipFile
is_match_fn = zipfile.is_zipfile
if is_match_fn(file_path):
with open_fn(file_path) as archive:
try:
archive.extractall(path)
except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
if os.path.exists(path):
if os.path.isfile(path):
os.remove(path)
else:
shutil.rmtree(path)
raise
return True
return False
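# A minimal usage sketch (hypothetical archive name):
#   if extract_archive('QG_jets.tar.gz', path='datasets'):
#       print('archive extracted into datasets/')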
def _hash_file(fpath, algorithm='md5', chunk_size=131071):
"""Calculates a file sha256 or md5 hash.
# Example
```python
>>> _hash_file('/path/to/file.zip')
'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
```
# Arguments
fpath: path to the file being validated
algorithm: hash algorithm, one of 'auto', 'sha256', or 'md5'.
The default is 'md5'; 'auto' is treated as sha256.
chunk_size: Bytes to read at a time, important for large files.
# Returns
The file hash
"""
if (algorithm == 'sha256') or (algorithm == 'auto'):
hasher = hashlib.sha256()
else:
hasher = hashlib.md5()
with open(fpath, 'rb') as fpath_file:
for chunk in iter(lambda: fpath_file.read(chunk_size), b''):
hasher.update(chunk)
return hasher.hexdigest()
def validate_file(fpath, file_hash, algorithm='md5', chunk_size=131071):
"""Validates a file against a sha256 or md5 hash.
# Arguments
fpath: path to the file being validated
file_hash: The expected hash string of the file.
The sha256 and md5 hash algorithms are both supported.
algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'.
The default is 'md5'; 'auto' selects sha256 for 64-character hashes and md5 otherwise.
chunk_size: Bytes to read at a time, important for large files.
# Returns
Whether the file is valid
"""
if ((algorithm == 'sha256') or (algorithm == 'auto' and len(file_hash) == 64)):
hasher = 'sha256'
else:
hasher = 'md5'
return str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash)
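# For example, with algorithm='auto' a 64-character hex digest is checked as sha256 and
# any other length as md5 (hypothetical file name, digest shown is the sha256 of an empty file):
#   validate_file('QG_jets.npz', 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', algorithm='auto')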
def get_file(origin=None,
fname=None,
file_hash=None,
datadir='datasets',
hash_algorithm='md5',
extract=False,
force_download=False,
archive_format='auto'):
"""Downloads a file from a URL if it is not already in the cache.
By default the file at the url `origin` is downloaded to the directory
`datadir` (default `datasets`) and given the filename `fname`. The final
location of a file `example.txt` would therefore be `datasets/example.txt`.
Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
Passing a hash will verify the file after download. The command line
programs `shasum` and `sha256sum` can compute the hash.
Args:
origin: Original URL of the file.
fname: Name of the file. If an absolute path `/path/to/file.txt` is
specified the file will be saved at that location. If `None`, the
name of the file at `origin` will be used.
file_hash: The expected hash string of the file after download.
The sha256 and md5 hash algorithms are both supported.
datadir: Directory where the downloaded file is saved. If an absolute
path `/path/to/folder` is specified the file will be saved at that
location. Defaults to `datasets`.
hash_algorithm: Select the hash algorithm to verify the file.
Options are `'md5'`, `'sha256'`, and `'auto'`.
The default is `'md5'`; `'auto'` detects the algorithm from the hash length.
extract: If True, tries extracting the file as an archive, like tar or zip.
force_download: If True, the file is re-downloaded even if a local copy
already exists.
archive_format: Archive format to try for extracting the file.
Options are `'auto'`, `'tar'`, `'zip'`, and `None`.
`'tar'` includes tar, tar.gz, and tar.bz files.
The default `'auto'` corresponds to `['tar', 'zip']`.
None or an empty list will return no matches found.
Returns:
A tuple `(fpath, download)`: the local path to the file and whether it
was downloaded by this call.
"""
if origin is None:
raise ValueError('Please specify the "origin" argument (URL of the file '
'to download).')
os.makedirs(datadir, exist_ok=True)
if not fname:
fname = os.path.basename(urllib.parse.urlsplit(origin).path)
if not fname:
raise ValueError(
f"Can't parse the file name from the origin provided: '{origin}'."
"Please specify the `fname` as the input param.")
fpath = os.path.join(datadir, fname)
download = False
if os.path.exists(fpath) and not force_download:
# File found; verify its integrity if a hash was provided.
print(f'A local file was already found at {fpath}, checking hash...')
if file_hash is not None:
if validate_file(fpath, file_hash, algorithm=hash_algorithm):
print('Local file hash matches, no need to download.')
else:
print(
'A local file was found, but it seems to be '
f'incomplete or outdated because the {hash_algorithm} '
f'file hash does not match the original value of {file_hash}, '
'so we will re-download the data.')
download = True
else:
download = True
if download:
print(f'Downloading data from {origin} to {fpath}')
error_msg = 'URL fetch failure on {}: {}'
try:
try:
_download(origin, fpath)
except requests.exceptions.RequestException as e:
raise Exception(error_msg.format(origin, e))
except (Exception, KeyboardInterrupt) as e:
if os.path.exists(fpath):
os.remove(fpath)
raise
if file_hash is not None:
if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
if os.path.exists(fpath):
os.remove(fpath)
raise RuntimeError(f'Checksum does not match for file {fpath}')
if extract:
extract_archive(fpath, datadir, archive_format)
return fpath, download
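# A minimal usage sketch (hypothetical URL and local directory; pass the published
# md5/sha256 digest as file_hash to enable verification):
#   fpath, downloaded = get_file(
#       origin='https://zenodo.org/record/3164691/files/QG_jets.npz',
#       datadir='datasets/QuarkGluon',
#       extract=False)
#   print(fpath, '(new download)' if downloaded else '(cached copy)')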