Commit 524a1b6e authored by mashun

particle

import os
import numpy as np
import awkward as ak
import argparse
'''
Datasets introduction:
https://energyflow.network/docs/datasets/#quark-and-gluon-jets
Download:
- Pythia8 Quark and Gluon Jets for Energy Flow:
- https://zenodo.org/record/3164691
- Herwig7.1 Quark and Gluon Jets:
- https://zenodo.org/record/3066475
Versions:
- awkward==2.6.4
- vector==1.4.0
'''
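# A minimal usage sketch (hypothetical paths and script name, since this file does not
# fix its own filename): after downloading the .npz files from the Zenodo records above
# into ./downloads, the conversion could be run as
#   python convert_qg_datasets.py -i ./downloads -o ./datasets/QuarkGluon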
def _p4_from_ptetaphim(pt, eta, phi, mass):
import vector
vector.register_awkward()
return vector.zip({'pt': pt, 'eta': eta, 'phi': phi, 'mass': mass})
def _transform(X, y, start=0, stop=None):
# X: (num_data, max_num_particles, 4); per-particle features are (pt, y, phi, pid).
# Zero-padded entries have pt == 0. The rapidity y is used as the pseudorapidity eta
# below, consistent with setting the particle mass to zero.
X = X[start:stop].astype(np.float32)
y = y[start:stop]
origPT = X[:, :, 0]
indices = np.argsort(-origPT, axis=1)
_pt = np.take_along_axis(X[:, :, 0], indices, axis=1)
_eta = np.take_along_axis(X[:, :, 1], indices, axis=1)
_phi = np.take_along_axis(X[:, :, 2], indices, axis=1)
_pid = np.take_along_axis(X[:, :, 3], indices, axis=1)
mask = _pt > 0
n_particles = np.sum(mask, axis=1)
pt = ak.unflatten(_pt[mask], n_particles)
eta = ak.unflatten(_eta[mask], n_particles)
phi = ak.unflatten(_phi[mask], n_particles)
mass = ak.zeros_like(pt)
PID = ak.unflatten(_pid[mask], n_particles)
p4 = _p4_from_ptetaphim(pt, eta, phi, mass)
px = p4.x
py = p4.y
pz = p4.z
energy = p4.energy
jet_p4 = ak.sum(p4, axis=1)
# outputs
v = {}
v['label'] = y
v['jet_pt'] = jet_p4.pt
v['jet_eta'] = jet_p4.eta
v['jet_phi'] = jet_p4.phi
v['jet_energy'] = jet_p4.energy
v['jet_mass'] = jet_p4.mass
v['jet_nparticles'] = n_particles
v['part_px'] = px
v['part_py'] = py
v['part_pz'] = pz
v['part_energy'] = energy
_jet_etasign = ak.to_numpy(np.sign(v['jet_eta']))
_jet_etasign[_jet_etasign == 0] = 1
v['part_deta'] = (p4.eta - v['jet_eta']) * _jet_etasign
v['part_dphi'] = p4.deltaphi(jet_p4)
v['part_pid'] = PID
v['part_isCHPlus'] = ak.values_astype((PID == 211) + (PID == 321) + (PID == 2212), 'float32')
v['part_isCHMinus'] = ak.values_astype((PID == -211) + (PID == -321) + (PID == -2212), 'float32')
v['part_isNeutralHadron'] = ak.values_astype((PID == 130) + (PID == 2112) + (PID == -2112), 'float32')
v['part_isPhoton'] = ak.values_astype(PID == 22, 'float32')
v['part_isEPlus'] = ak.values_astype(PID == -11, 'float32')
v['part_isEMinus'] = ak.values_astype(PID == 11, 'float32')
v['part_isMuPlus'] = ak.values_astype(PID == -13, 'float32')
v['part_isMuMinus'] = ak.values_astype(PID == 13, 'float32')
v['part_isChargedHadron'] = v['part_isCHPlus'] + v['part_isCHMinus']
v['part_isElectron'] = v['part_isEPlus'] + v['part_isEMinus']
v['part_isMuon'] = v['part_isMuPlus'] + v['part_isMuMinus']
v['part_charge'] = (v['part_isCHPlus'] + v['part_isEPlus'] + v['part_isMuPlus']
) - (v['part_isCHMinus'] + v['part_isEMinus'] + v['part_isMuMinus'])
for k in list(v.keys()):
if k.endswith('Plus') or k.endswith('Minus'):
del v[k]
return v
def convert(sources, destdir, basename):
if not os.path.exists(destdir):
os.makedirs(destdir)
for idx, sourcefile in enumerate(sources):
npfile = np.load(sourcefile)
output = os.path.join(destdir, '%s_%d.parquet' % (basename, idx))
print(sourcefile)
print(str(npfile['X'].shape))
print(output)
if os.path.exists(output):
os.remove(output)
v = _transform(npfile['X'], npfile['y'])
arr = ak.Array(v)
ak.to_parquet(arr, output, compression='LZ4', compression_level=4)
def natural_sort(l):
import re
def convert(text): return int(text) if text.isdigit() else text.lower()
def alphanum_key(key): return [convert(c) for c in re.split('([0-9]+)', key)]
return sorted(l, key=alphanum_key)
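# For example, natural_sort(['QG_jets_10.npz', 'QG_jets_2.npz']) returns
# ['QG_jets_2.npz', 'QG_jets_10.npz'], because the digit runs are compared as
# integers rather than lexicographically.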
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Convert qg benchmark datasets')
parser.add_argument('-i', '--inputdir', required=True, help='Directory of input numpy files.')
parser.add_argument('-o', '--outputdir', required=True, help='Output directory.')
parser.add_argument('--train-test-split', default=0.9, type=float, help='Fraction of input files used for training; the rest is used for testing.')
args = parser.parse_args()
import glob
sources = natural_sort(glob.glob(os.path.join(args.inputdir, 'QG_jets*.npz')))
n_train = int(args.train_test_split * len(sources))
train_sources = sources[:n_train]
test_sources = sources[n_train:]
convert(train_sources, destdir=args.outputdir, basename='train_file')
convert(test_sources, destdir=args.outputdir, basename='test_file')
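# A minimal sketch of inspecting one converted file with awkward (hypothetical path;
# assumes a train_file_0.parquet produced by the convert() calls above):
#   import awkward as ak
#   jets = ak.from_parquet('datasets/QuarkGluon/train_file_0.parquet')
#   print(jets.fields)         # ['label', 'jet_pt', ..., 'part_charge']
#   print(jets['jet_pt'][:5])  # transverse momenta of the first five jets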
import os
import pandas as pd
import numpy as np
import awkward as ak
import argparse
'''
Datasets introduction:
- The Machine Learning landscape of top taggers:
- https://scipost.org/SciPostPhys.7.1.014
Download:
- https://zenodo.org/record/2603256
Versions:
- awkward==2.6.4
- vector==1.4.0
- pandas==2.2.2
- tables==3.9.2
'''
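# A minimal usage sketch (hypothetical paths and script name): after downloading
# train.h5, val.h5 and test.h5 from the Zenodo record above into ./downloads,
# the conversion could be run as
#   python convert_top_datasets.py -i ./downloads -o ./datasets/TopLandscape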
def _p4_from_pxpypze(px, py, pz, energy):
import vector
vector.register_awkward()
return vector.zip({'px': px, 'py': py, 'pz': pz, 'energy': energy})
def _transform(dataframe, start=0, stop=None):
df = dataframe.iloc[start:stop]
def _col_list(prefix, max_particles=200):
return ['%s_%d' % (prefix, i) for i in range(max_particles)]
_px = df[_col_list('PX')].values
_py = df[_col_list('PY')].values
_pz = df[_col_list('PZ')].values
_e = df[_col_list('E')].values
mask = _e > 0
n_particles = np.sum(mask, axis=1)
px = ak.unflatten(_px[mask], n_particles)
py = ak.unflatten(_py[mask], n_particles)
pz = ak.unflatten(_pz[mask], n_particles)
energy = ak.unflatten(_e[mask], n_particles)
p4 = _p4_from_pxpypze(px, py, pz, energy)
jet_p4 = ak.sum(p4, axis=1)
# outputs
v = {}
v['label'] = df['is_signal_new'].values
v['jet_pt'] = jet_p4.pt
v['jet_eta'] = jet_p4.eta
v['jet_phi'] = jet_p4.phi
v['jet_energy'] = jet_p4.energy
v['jet_mass'] = jet_p4.mass
v['jet_nparticles'] = n_particles
v['part_px'] = px
v['part_py'] = py
v['part_pz'] = pz
v['part_energy'] = energy
_jet_etasign = ak.to_numpy(np.sign(v['jet_eta']))
_jet_etasign[_jet_etasign == 0] = 1
v['part_deta'] = (p4.eta - v['jet_eta']) * _jet_etasign
v['part_dphi'] = p4.deltaphi(jet_p4)
return v
def convert(source, destdir, basename):
df = pd.read_hdf(source, key='table')
print('Total events: %s' % str(df.shape[0]))
if not os.path.exists(destdir):
os.makedirs(destdir)
output = os.path.join(destdir, '%s.parquet' % basename)
print(output)
if os.path.exists(output):
os.remove(output)
v = _transform(df)
arr = ak.Array(v)
ak.to_parquet(arr, output, compression='LZ4', compression_level=4)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Convert top benchmark h5 datasets')
parser.add_argument('-i', '--inputdir', required=True, help='Directory of input h5 files.')
parser.add_argument('-o', '--outputdir', required=True, help='Output directory.')
args = parser.parse_args()
# convert training file
convert(os.path.join(args.inputdir, 'train.h5'), destdir=args.outputdir, basename='train_file')
# convert validation file
convert(os.path.join(args.inputdir, 'val.h5'), destdir=args.outputdir, basename='val_file')
# convert testing file
convert(os.path.join(args.inputdir, 'test.h5'), destdir=args.outputdir, basename='test_file')
'''
Adapted from:
https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py
'''
import hashlib
import os
import shutil
import zipfile
import tarfile
import urllib
import requests
from tqdm import tqdm
def _download(url, fname, chunk_size=1024):
'''https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51'''
resp = requests.get(url, stream=True)
total = int(resp.headers.get('content-length', 0))
with open(fname, 'wb') as file, tqdm(
desc=fname,
total=total,
unit='iB',
unit_scale=True,
unit_divisor=1024,
) as bar:
for data in resp.iter_content(chunk_size=chunk_size):
size = file.write(data)
bar.update(size)
def extract_archive(file_path, path='.', archive_format='auto'):
"""Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats.
Args:
file_path: path to the archive file
path: path to extract the archive file
archive_format: Archive format to try for extracting the file.
Options are 'auto', 'tar', 'zip', and None.
'tar' includes tar, tar.gz, and tar.bz files.
The default 'auto' is ['tar', 'zip'].
None or an empty list will return no matches found.
Returns:
True if a match was found and an archive extraction was completed,
False otherwise.
"""
if archive_format is None:
return False
if archive_format == 'auto':
archive_format = ['tar', 'zip']
if isinstance(archive_format, str):
archive_format = [archive_format]
for archive_type in archive_format:
if archive_type == 'tar':
open_fn = tarfile.open
is_match_fn = tarfile.is_tarfile
if archive_type == 'zip':
open_fn = zipfile.ZipFile
is_match_fn = zipfile.is_zipfile
if is_match_fn(file_path):
with open_fn(file_path) as archive:
try:
archive.extractall(path)
except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
if os.path.exists(path):
if os.path.isfile(path):
os.remove(path)
else:
shutil.rmtree(path)
raise
return True
return False
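# A minimal usage sketch (hypothetical archive name):
#   if extract_archive('QG_jets.tar.gz', path='datasets'):
#       print('archive extracted into datasets/')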
def _hash_file(fpath, algorithm='md5', chunk_size=131071):
"""Calculates a file sha256 or md5 hash.
# Example
```python
>>> _hash_file('/path/to/file.zip')
'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
```
# Arguments
fpath: path to the file being validated
algorithm: hash algorithm, one of 'auto', 'sha256', or 'md5'.
The default is 'md5'; 'auto' is treated as sha256.
chunk_size: Bytes to read at a time, important for large files.
# Returns
The file hash
"""
if (algorithm == 'sha256') or (algorithm == 'auto'):
hasher = hashlib.sha256()
else:
hasher = hashlib.md5()
with open(fpath, 'rb') as fpath_file:
for chunk in iter(lambda: fpath_file.read(chunk_size), b''):
hasher.update(chunk)
return hasher.hexdigest()
def validate_file(fpath, file_hash, algorithm='md5', chunk_size=131071):
"""Validates a file against a sha256 or md5 hash.
# Arguments
fpath: path to the file being validated
file_hash: The expected hash string of the file.
The sha256 and md5 hash algorithms are both supported.
algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'.
The default is 'md5'; 'auto' selects sha256 for 64-character hashes and md5 otherwise.
chunk_size: Bytes to read at a time, important for large files.
# Returns
Whether the file is valid
"""
if ((algorithm == 'sha256') or (algorithm == 'auto' and len(file_hash) == 64)):
hasher = 'sha256'
else:
hasher = 'md5'
return str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash)
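# For example, with algorithm='auto' a 64-character hex digest is checked as sha256 and
# any other length as md5 (hypothetical file name, digest shown is the sha256 of an empty file):
#   validate_file('QG_jets.npz', 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', algorithm='auto')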
def get_file(origin=None,
fname=None,
file_hash=None,
datadir='datasets',
hash_algorithm='md5',
extract=False,
force_download=False,
archive_format='auto'):
"""Downloads a file from a URL if it is not already in the cache.
By default the file at the url `origin` is downloaded to the directory
`datadir` (default `datasets`) and given the filename `fname`. The final
location of a file `example.txt` would therefore be `datasets/example.txt`.
Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
Passing a hash will verify the file after download. The command line
programs `shasum` and `sha256sum` can compute the hash.
Args:
origin: Original URL of the file.
fname: Name of the file. If an absolute path `/path/to/file.txt` is
specified the file will be saved at that location. If `None`, the
name of the file at `origin` will be used.
file_hash: The expected hash string of the file after download.
The sha256 and md5 hash algorithms are both supported.
datadir: Directory where the downloaded file is saved. If an absolute
path `/path/to/folder` is specified the file will be saved at that
location. Defaults to `datasets`.
hash_algorithm: Select the hash algorithm to verify the file.
Options are `'md5'`, `'sha256'`, and `'auto'`.
The default is `'md5'`; `'auto'` detects the algorithm from the hash length.
extract: If True, tries extracting the file as an archive, like tar or zip.
force_download: If True, the file is re-downloaded even if a local copy
already exists.
archive_format: Archive format to try for extracting the file.
Options are `'auto'`, `'tar'`, `'zip'`, and `None`.
`'tar'` includes tar, tar.gz, and tar.bz files.
The default `'auto'` corresponds to `['tar', 'zip']`.
None or an empty list will return no matches found.
Returns:
A tuple `(fpath, download)`: the local path to the file and whether it
was downloaded by this call.
"""
if origin is None:
raise ValueError('Please specify the "origin" argument (URL of the file '
'to download).')
os.makedirs(datadir, exist_ok=True)
if not fname:
fname = os.path.basename(urllib.parse.urlsplit(origin).path)
if not fname:
raise ValueError(
f"Can't parse the file name from the origin provided: '{origin}'."
"Please specify the `fname` as the input param.")
fpath = os.path.join(datadir, fname)
download = False
if os.path.exists(fpath) and not force_download:
# File found; verify its integrity if a hash was provided.
print(f'A local file was already found at {fpath}, checking hash...')
if file_hash is not None:
if validate_file(fpath, file_hash, algorithm=hash_algorithm):
print('Local file hash matches, no need to download.')
else:
print(
'A local file was found, but it seems to be '
f'incomplete or outdated because the {hash_algorithm} '
f'file hash does not match the original value of {file_hash}, '
'so we will re-download the data.')
download = True
else:
download = True
if download:
print(f'Downloading data from {origin} to {fpath}')
error_msg = 'URL fetch failure on {}: {}'
try:
try:
_download(origin, fpath)
except requests.exceptions.RequestException as e:
raise Exception(error_msg.format(origin, e))
except (Exception, KeyboardInterrupt) as e:
if os.path.exists(fpath):
os.remove(fpath)
raise
if file_hash is not None:
if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
if os.path.exists(fpath):
os.remove(fpath)
raise RuntimeError(f'Checksum does not match for file {fpath}')
if extract:
extract_archive(fpath, datadir, archive_format)
return fpath, download
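# A minimal usage sketch (hypothetical URL and local directory; pass the published
# md5/sha256 digest as file_hash to enable verification):
#   fpath, downloaded = get_file(
#       origin='https://zenodo.org/record/3164691/files/QG_jets.npz',
#       datadir='datasets/QuarkGluon',
#       extract=False)
#   print(fpath, '(new download)' if downloaded else '(cached copy)')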