ModelZoo / particle_transformer_pytorch · Commits

Commit 524a1b6e
Authored Nov 21, 2024 by mashun
Commit message: particle
Pipeline #1943: failed (0 seconds)

Showing 3 changed files with 471 additions and 0 deletions (+471 / -0):
- utils/convert_qg_datasets.py (+146, -0)
- utils/convert_top_datasets.py (+106, -0)
- utils/dataset_utils.py (+219, -0)

utils/convert_qg_datasets.py (new file, mode 100644)

import os
import numpy as np
import awkward as ak
import argparse

'''
Datasets introduction:
    https://energyflow.network/docs/datasets/#quark-and-gluon-jets

Download:
- Pythia8 Quark and Gluon Jets for Energy Flow:
  - https://zenodo.org/record/3164691
- Herwig7.1 Quark and Gluon Jets:
  - https://zenodo.org/record/3066475

Versions:
- awkward==2.6.4
- vector==1.4.0
'''


def _p4_from_ptetaphim(pt, eta, phi, mass):
    import vector
    vector.register_awkward()
    return vector.zip({'pt': pt, 'eta': eta, 'phi': phi, 'mass': mass})


def _transform(X, y, start=0, stop=-1):
    # source_array: (num_data, max_num_particles, 4)
    # (pt,y,phi,pid)
    X = X[start:stop].astype(np.float32)
    y = y[start:stop]

    origPT = X[:, :, 0]
    indices = np.argsort(-origPT, axis=1)
    _pt = np.take_along_axis(X[:, :, 0], indices, axis=1)
    _eta = np.take_along_axis(X[:, :, 1], indices, axis=1)
    _phi = np.take_along_axis(X[:, :, 2], indices, axis=1)
    _pid = np.take_along_axis(X[:, :, 3], indices, axis=1)

    mask = _pt > 0
    n_particles = np.sum(mask, axis=1)

    pt = ak.unflatten(_pt[mask], n_particles)
    eta = ak.unflatten(_eta[mask], n_particles)
    phi = ak.unflatten(_phi[mask], n_particles)
    mass = ak.zeros_like(pt)
    PID = ak.unflatten(_pid[mask], n_particles)

    p4 = _p4_from_ptetaphim(pt, eta, phi, mass)
    px = p4.x
    py = p4.y
    pz = p4.z
    energy = p4.energy

    jet_p4 = ak.sum(p4, axis=1)

    # outputs
    v = {}
    v['label'] = y

    v['jet_pt'] = jet_p4.pt
    v['jet_eta'] = jet_p4.eta
    v['jet_phi'] = jet_p4.phi
    v['jet_energy'] = jet_p4.energy
    v['jet_mass'] = jet_p4.mass
    v['jet_nparticles'] = n_particles

    v['part_px'] = px
    v['part_py'] = py
    v['part_pz'] = pz
    v['part_energy'] = energy

    _jet_etasign = ak.to_numpy(np.sign(v['jet_eta']))
    _jet_etasign[_jet_etasign == 0] = 1
    v['part_deta'] = (p4.eta - v['jet_eta']) * _jet_etasign
    v['part_dphi'] = p4.deltaphi(jet_p4)

    v['part_pid'] = PID
    v['part_isCHPlus'] = ak.values_astype((PID == 211) + (PID == 321) + (PID == 2212), 'float32')
    v['part_isCHMinus'] = ak.values_astype((PID == -211) + (PID == -321) + (PID == -2212), 'float32')
    v['part_isNeutralHadron'] = ak.values_astype((PID == 130) + (PID == 2112) + (PID == -2112), 'float32')
    v['part_isPhoton'] = ak.values_astype(PID == 22, 'float32')
    v['part_isEPlus'] = ak.values_astype(PID == -11, 'float32')
    v['part_isEMinus'] = ak.values_astype(PID == 11, 'float32')
    v['part_isMuPlus'] = ak.values_astype(PID == -13, 'float32')
    v['part_isMuMinus'] = ak.values_astype(PID == 13, 'float32')

    v['part_isChargedHadron'] = v['part_isCHPlus'] + v['part_isCHMinus']
    v['part_isElectron'] = v['part_isEPlus'] + v['part_isEMinus']
    v['part_isMuon'] = v['part_isMuPlus'] + v['part_isMuMinus']
    v['part_charge'] = (v['part_isCHPlus'] + v['part_isEPlus'] + v['part_isMuPlus']) - (v['part_isCHMinus'] + v['part_isEMinus'] + v['part_isMuMinus'])

    for k in list(v.keys()):
        if k.endswith('Plus') or k.endswith('Minus'):
            del v[k]

    return v


def convert(sources, destdir, basename):
    if not os.path.exists(destdir):
        os.makedirs(destdir)
    for idx, sourcefile in enumerate(sources):
        npfile = np.load(sourcefile)
        output = os.path.join(destdir, '%s_%d.parquet' % (basename, idx))
        print(sourcefile)
        print(str(npfile['X'].shape))
        print(output)
        if os.path.exists(output):
            os.remove(output)
        v = _transform(npfile['X'], npfile['y'])
        arr = ak.Array(v)
        ak.to_parquet(arr, output, compression='LZ4', compression_level=4)


def natural_sort(l):
    import re

    def convert(text):
        return int(text) if text.isdigit() else text.lower()

    def alphanum_key(key):
        return [convert(c) for c in re.split('([0-9]+)', key)]

    return sorted(l, key=alphanum_key)


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Convert qg benchmark datasets')
    parser.add_argument('-i', '--inputdir', required=True, help='Directory of input numpy files.')
    parser.add_argument('-o', '--outputdir', required=True, help='Output directory.')
    # parse as float so a value passed on the command line is not left as a string
    parser.add_argument('--train-test-split', default=0.9, type=float, help='Training / testing split fraction.')
    args = parser.parse_args()

    import glob
    sources = natural_sort(glob.glob(os.path.join(args.inputdir, 'QG_jets*.npz')))
    n_train = int(args.train_test_split * len(sources))
    train_sources = sources[:n_train]
    test_sources = sources[n_train:]

    convert(train_sources, destdir=args.outputdir, basename='train_file')
    convert(test_sources, destdir=args.outputdir, basename='test_file')
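
For orientation, a minimal usage sketch (not part of the commit): the converter would typically be run on the downloaded `QG_jets*.npz` files and the resulting parquet files read back with awkward. The paths below are placeholders; the command-line form follows from the argparse options above.

```python
# Hypothetical invocation (paths are placeholders):
#   python utils/convert_qg_datasets.py -i ./downloads/QuarkGluon -o ./datasets/QuarkGluon
#
# Each output file (train_file_0.parquet, ..., test_file_*.parquet) holds one jet
# per entry with jagged per-particle fields, and can be inspected like this:
import awkward as ak

jets = ak.from_parquet('./datasets/QuarkGluon/train_file_0.parquet')  # placeholder path
print(jets.fields)                   # label, jet_* summaries, jagged part_* features
print(jets['jet_nparticles'][:5])    # particle multiplicity of the first five jets
print(jets['part_px'][0])            # px of every particle in the first jet
```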

utils/convert_top_datasets.py (new file, mode 100644)

import os
import pandas as pd
import numpy as np
import awkward as ak
import argparse

'''
Datasets introduction:
- The Machine Learning landscape of top taggers:
  - https://scipost.org/SciPostPhys.7.1.014

Download:
- https://zenodo.org/record/2603256

Versions:
- awkward==2.6.4
- vector==1.4.0
- pandas==2.2.2
- tables==3.9.2
'''


def _p4_from_pxpypze(px, py, pz, energy):
    import vector
    vector.register_awkward()
    return vector.zip({'px': px, 'py': py, 'pz': pz, 'energy': energy})


def _transform(dataframe, start=0, stop=-1):
    df = dataframe.iloc[start:stop]

    def _col_list(prefix, max_particles=200):
        return ['%s_%d' % (prefix, i) for i in range(max_particles)]

    _px = df[_col_list('PX')].values
    _py = df[_col_list('PY')].values
    _pz = df[_col_list('PZ')].values
    _e = df[_col_list('E')].values

    mask = _e > 0
    n_particles = np.sum(mask, axis=1)

    px = ak.unflatten(_px[mask], n_particles)
    py = ak.unflatten(_py[mask], n_particles)
    pz = ak.unflatten(_pz[mask], n_particles)
    energy = ak.unflatten(_e[mask], n_particles)

    p4 = _p4_from_pxpypze(px, py, pz, energy)
    jet_p4 = ak.sum(p4, axis=1)

    # outputs
    v = {}
    v['label'] = df['is_signal_new'].values

    v['jet_pt'] = jet_p4.pt
    v['jet_eta'] = jet_p4.eta
    v['jet_phi'] = jet_p4.phi
    v['jet_energy'] = jet_p4.energy
    v['jet_mass'] = jet_p4.mass
    v['jet_nparticles'] = n_particles

    v['part_px'] = px
    v['part_py'] = py
    v['part_pz'] = pz
    v['part_energy'] = energy

    _jet_etasign = ak.to_numpy(np.sign(v['jet_eta']))
    _jet_etasign[_jet_etasign == 0] = 1
    v['part_deta'] = (p4.eta - v['jet_eta']) * _jet_etasign
    v['part_dphi'] = p4.deltaphi(jet_p4)

    return v


def convert(source, destdir, basename):
    df = pd.read_hdf(source, key='table')
    print('Total events: %s' % str(df.shape[0]))
    if not os.path.exists(destdir):
        os.makedirs(destdir)
    output = os.path.join(destdir, '%s.parquet' % basename)
    print(output)
    if os.path.exists(output):
        os.remove(output)
    v = _transform(df)
    arr = ak.Array(v)
    ak.to_parquet(arr, output, compression='LZ4', compression_level=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser('Convert top benchmark h5 datasets')
    parser.add_argument('-i', '--inputdir', required=True, help='Directory of input h5 files.')
    parser.add_argument('-o', '--outputdir', required=True, help='Output directory.')
    args = parser.parse_args()

    # convert training file
    convert(os.path.join(args.inputdir, 'train.h5'), destdir=args.outputdir, basename='train_file')
    # convert validation file
    convert(os.path.join(args.inputdir, 'val.h5'), destdir=args.outputdir, basename='val_file')
    # convert testing file
    convert(os.path.join(args.inputdir, 'test.h5'), destdir=args.outputdir, basename='test_file')
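
As with the quark/gluon script, a hedged usage sketch (placeholder paths, not part of the commit): the converter expects `train.h5`, `val.h5`, and `test.h5` from the Zenodo record in the input directory and writes one parquet file per split.

```python
# Hypothetical invocation (paths are placeholders):
#   python utils/convert_top_datasets.py -i ./downloads/TopLandscape -o ./datasets/TopLandscape
#
# Reading a converted split back and selecting the signal (top) jets:
import awkward as ak

jets = ak.from_parquet('./datasets/TopLandscape/train_file.parquet')  # placeholder path
signal = jets[jets['label'] == 1]   # entries with 'is_signal_new' == 1
print(len(signal), 'signal jets, mean jet mass:', ak.mean(signal['jet_mass']))
```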

utils/dataset_utils.py (new file, mode 100644)

'''
Adapted from:
https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py
'''
import hashlib
import os
import shutil
import zipfile
import tarfile
import urllib
import requests
from tqdm import tqdm


def _download(url, fname, chunk_size=1024):
    '''https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51'''
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with open(fname, 'wb') as file, tqdm(
        desc=fname,
        total=total,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in resp.iter_content(chunk_size=chunk_size):
            size = file.write(data)
            bar.update(size)


def extract_archive(file_path, path='.', archive_format='auto'):
    """Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats.

    Args:
        file_path: path to the archive file
        path: path to extract the archive file
        archive_format: Archive format to try for extracting the file.
            Options are 'auto', 'tar', 'zip', and None.
            'tar' includes tar, tar.gz, and tar.bz files.
            The default 'auto' is ['tar', 'zip'].
            None or an empty list will return no matches found.

    Returns:
        True if a match was found and an archive extraction was completed,
        False otherwise.
    """
    if archive_format is None:
        return False
    if archive_format == 'auto':
        archive_format = ['tar', 'zip']
    if isinstance(archive_format, str):
        archive_format = [archive_format]

    for archive_type in archive_format:
        if archive_type == 'tar':
            open_fn = tarfile.open
            is_match_fn = tarfile.is_tarfile
        if archive_type == 'zip':
            open_fn = zipfile.ZipFile
            is_match_fn = zipfile.is_zipfile

        if is_match_fn(file_path):
            with open_fn(file_path) as archive:
                try:
                    archive.extractall(path)
                except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
                    if os.path.exists(path):
                        if os.path.isfile(path):
                            os.remove(path)
                        else:
                            shutil.rmtree(path)
                    raise
            return True
    return False


def _hash_file(fpath, algorithm='md5', chunk_size=131071):
    """Calculates a file sha256 or md5 hash.

    # Example
    ```python
    >>> from keras.data_utils import _hash_file
    >>> _hash_file('/path/to/file.zip')
    'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
    ```

    # Arguments
        fpath: path to the file being validated
        algorithm: hash algorithm, one of 'auto', 'sha256', or 'md5'.
            The default 'auto' detects the hash algorithm in use.
        chunk_size: Bytes to read at a time, important for large files.

    # Returns
        The file hash
    """
    if (algorithm == 'sha256') or (algorithm == 'auto'):
        hasher = hashlib.sha256()
    else:
        hasher = hashlib.md5()

    with open(fpath, 'rb') as fpath_file:
        for chunk in iter(lambda: fpath_file.read(chunk_size), b''):
            hasher.update(chunk)

    return hasher.hexdigest()


def validate_file(fpath, file_hash, algorithm='md5', chunk_size=131071):
    """Validates a file against a sha256 or md5 hash.

    # Arguments
        fpath: path to the file being validated
        file_hash: The expected hash string of the file.
            The sha256 and md5 hash algorithms are both supported.
        algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'.
            The default 'auto' detects the hash algorithm in use.
        chunk_size: Bytes to read at a time, important for large files.

    # Returns
        Whether the file is valid
    """
    if (algorithm == 'sha256') or (algorithm == 'auto' and len(file_hash) == 64):
        hasher = 'sha256'
    else:
        hasher = 'md5'

    return str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash)


def get_file(origin=None,
             fname=None,
             file_hash=None,
             datadir='datasets',
             hash_algorithm='md5',
             extract=False,
             force_download=False,
             archive_format='auto'):
"""Downloads a file from a URL if it not already in the cache.
By default the file at the url `origin` is downloaded to the
cache_dir `~/.keras`, placed in the cache_subdir `datasets`,
and given the filename `fname`. The final location of a file
`example.txt` would therefore be `~/.keras/datasets/example.txt`.
Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
Passing a hash will verify the file after download. The command line
programs `shasum` and `sha256sum` can compute the hash.
Args:
fname: Name of the file. If an absolute path `/path/to/file.txt` is
specified the file will be saved at that location. If `None`, the
name of the file at `origin` will be used.
origin: Original URL of the file.
file_hash: The expected hash string of the file after download.
The sha256 and md5 hash algorithms are both supported.
cache_subdir: Subdirectory under the Keras cache dir where the file is
saved. If an absolute path `/path/to/folder` is
specified the file will be saved at that location.
hash_algorithm: Select the hash algorithm to verify the file.
options are `'md5'`, `'sha256'`, and `'auto'`.
The default 'auto' detects the hash algorithm in use.
extract: True tries extracting the file as an Archive, like tar or zip.
archive_format: Archive format to try for extracting the file.
Options are `'auto'`, `'tar'`, `'zip'`, and `None`.
`'tar'` includes tar, tar.gz, and tar.bz files.
The default `'auto'` corresponds to `['tar', 'zip']`.
None or an empty list will return no matches found.
cache_dir: Location to store cached files, when None it
defaults to the default directory `datasets/`.
Returns:
Path to the downloaded file
"""
if
origin
is
None
:
raise
ValueError
(
'Please specify the "origin" argument (URL of the file '
'to download).'
)
os
.
makedirs
(
datadir
,
exist_ok
=
True
)
if
not
fname
:
fname
=
os
.
path
.
basename
(
urllib
.
parse
.
urlsplit
(
origin
).
path
)
if
not
fname
:
raise
ValueError
(
f
"Can't parse the file name from the origin provided: '
{
origin
}
'."
"Please specify the `fname` as the input param."
)
fpath
=
os
.
path
.
join
(
datadir
,
fname
)
download
=
False
if
os
.
path
.
exists
(
fpath
)
and
not
force_download
:
# File found; verify integrity if a hash was provided.
print
(
f
'A local file already found at
{
fpath
}
, checking hash...'
)
if
file_hash
is
not
None
:
if
validate_file
(
fpath
,
file_hash
,
algorithm
=
hash_algorithm
):
print
(
'Local file hash matches, no need to download.'
)
else
:
print
(
'A local file was found, but it seems to be '
f
'incomplete or outdated because the
{
hash_algorithm
}
'
f
'file hash does not match the original value of
{
file_hash
}
'
'so we will re-download the data.'
)
download
=
True
else
:
download
=
True
if
download
:
print
(
f
'Downloading data from
{
origin
}
to
{
fpath
}
'
)
error_msg
=
'URL fetch failure on {}: {}'
try
:
try
:
_download
(
origin
,
fpath
)
except
requests
.
exceptions
.
RequestException
as
e
:
raise
Exception
(
error_msg
.
format
(
origin
,
e
.
msg
))
except
(
Exception
,
KeyboardInterrupt
)
as
e
:
if
os
.
path
.
exists
(
fpath
):
os
.
remove
(
fpath
)
raise
if
file_hash
is
not
None
:
if
not
validate_file
(
fpath
,
file_hash
,
algorithm
=
hash_algorithm
):
if
os
.
path
.
exists
(
fpath
):
os
.
remove
(
fpath
)
raise
RuntimeError
(
f
'Checksum does not match for file
{
fpath
}
'
)
if
extract
:
extract_archive
(
fpath
,
datadir
,
archive_format
)
return
fpath
,
download
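
A minimal sketch of how `get_file` could be called, assuming `utils` is importable as a package; the URL, hash, and paths are placeholders, not real dataset values.

```python
from utils.dataset_utils import get_file  # assumes utils/ is on the import path

fpath, downloaded = get_file(
    origin='https://example.org/TopLandscape.tar',    # placeholder URL
    datadir='datasets/TopLandscape',
    file_hash='0123456789abcdef0123456789abcdef',     # placeholder md5
    hash_algorithm='md5',
    extract=True,          # unpack the archive into datadir after download
)
print(fpath, '(newly downloaded)' if downloaded else '(cached)')
```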