ModelZoo / qwen2.5-coder_pytorch · Commits

Commit 53b3977b, authored Jul 11, 2025 by dongchy920

Initial commit

Pipeline #2841 failed in 0 seconds. Changes: 350 · Pipelines: 1

Showing 20 changed files with 1086 additions and 0 deletions (+1086 / -0).
finetuning/sft/utils/decont.py (+315 / -0)
finetuning/sft/utils/multiple_metrics/.gitkeep (+0 / -0)
finetuning/sft/utils/multiple_metrics/__init__.py (+0 / -0)
finetuning/sft/utils/multiple_metrics/containerized_eval.py (+77 / -0)
finetuning/sft/utils/multiple_metrics/eval_cpp.py (+41 / -0)
finetuning/sft/utils/multiple_metrics/eval_cs.py (+83 / -0)
finetuning/sft/utils/multiple_metrics/eval_dlang.py (+73 / -0)
finetuning/sft/utils/multiple_metrics/eval_go.py (+45 / -0)
finetuning/sft/utils/multiple_metrics/eval_java.py (+53 / -0)
finetuning/sft/utils/multiple_metrics/eval_javascript.py (+52 / -0)
finetuning/sft/utils/multiple_metrics/eval_julia.py (+23 / -0)
finetuning/sft/utils/multiple_metrics/eval_lua.py (+19 / -0)
finetuning/sft/utils/multiple_metrics/eval_php.py (+22 / -0)
finetuning/sft/utils/multiple_metrics/eval_pl.py (+22 / -0)
finetuning/sft/utils/multiple_metrics/eval_python.py (+22 / -0)
finetuning/sft/utils/multiple_metrics/eval_r.py (+50 / -0)
finetuning/sft/utils/multiple_metrics/eval_racket.py (+49 / -0)
finetuning/sft/utils/multiple_metrics/eval_ruby.py (+45 / -0)
finetuning/sft/utils/multiple_metrics/eval_rust.py (+56 / -0)
finetuning/sft/utils/multiple_metrics/eval_scala.py (+39 / -0)
finetuning/sft/utils/decont.py (new file, mode 100644)
import argparse
import collections
import copy
import hashlib
import itertools
import json
import math
import multiprocessing as mp
import os
import re
import subprocess
import sys
import tempfile
import time
import traceback
from pathlib import Path

import ahocorasick
import datasets
import jsonlines
import nltk  # used by the if_tokenize branches of the n-gram helpers below
import numpy as np
import pandas as pd
import requests
import tqdm
import xlsxwriter
from datasketch import MinHash, MinHashLSH
from func_timeout import func_set_timeout
from sacrebleu import metrics

from utils import utils

DATA_DIR = "./test_data/"

def has_n_gram_overlap(string1, string2, n_gram=10, if_tokenize=False):
    if if_tokenize:
        string1 = nltk.tokenize.word_tokenize(string1)
        string2 = nltk.tokenize.word_tokenize(string2)
        string1 = " ".join(string1)
        string2 = " ".join(string2)
    tokens1 = string1.split()
    tokens2 = string2.split()
    grams1 = set([" ".join(tokens1[i:i + n_gram]) for i in range(len(tokens1) - (n_gram - 1))])
    grams2 = set([" ".join(tokens2[i:i + n_gram]) for i in range(len(tokens2) - (n_gram - 1))])
    overlap = grams1.intersection(grams2)
    return len(overlap) > 0
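
# Example (hypothetical strings, for illustration only; relies on
# has_n_gram_overlap defined above):
# shared = "def add ( a , b ) : return a + b  # simple helper used in the tests"
# has_n_gram_overlap("prefix text " + shared, shared + " trailing text")  # -> True (10-gram shared)
# has_n_gram_overlap(shared, "completely different text")                 # -> False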

def has_n_gram_overlap_with_testset(string1, testset, n_gram=10, if_tokenize=False, overlaps=None, verbose=False):
    # `overlaps` collects the matching n-grams for the caller; default to a
    # fresh list rather than a shared mutable default.
    if overlaps is None:
        overlaps = []
    if if_tokenize:
        string1 = nltk.tokenize.word_tokenize(string1)
        string1 = " ".join(string1)
    tokens1 = string1.split()
    grams1 = set([" ".join(tokens1[i:i + n_gram]) for i in range(len(tokens1) - (n_gram - 1))])
    overlap = grams1.intersection(testset)
    overlaps.extend(list(overlap))
    if len(overlap) > 0 and verbose:
        print(overlap)
    return len(overlap) > 0

def get_n_gram(string, n_gram=10, if_tokenize=False):
    # Optionally tokenize first, then build whitespace-delimited n-grams.
    if if_tokenize:
        string = " ".join(nltk.tokenize.word_tokenize(string))
    tokens1 = string.split()
    return set([" ".join(tokens1[i:i + n_gram]) for i in range(len(tokens1) - (n_gram - 1))])

def load_leetcode_test_data():
    data = utils.read_jsonl_file("livecodebench.jsonl")
    samples = []
    for obj in data:
        samples.append({"prompt": obj["prompt"]})
    return samples


def load_humanevalpack_test_data(data_path=f"{DATA_DIR}/humanevalpack"):
    ds1 = datasets.load_dataset(data_path, "python", trust_remote_code=True)["test"]
    ds2 = datasets.load_dataset(data_path, "js", trust_remote_code=True)["test"]
    ds3 = datasets.load_dataset(data_path, "java", trust_remote_code=True)["test"]
    ds4 = datasets.load_dataset(data_path, "go", trust_remote_code=True)["test"]
    ds5 = datasets.load_dataset(data_path, "cpp", trust_remote_code=True)["test"]
    ds6 = datasets.load_dataset(data_path, "rust", trust_remote_code=True)["test"]
    combined_dataset = datasets.concatenate_datasets([ds1, ds2, ds3, ds4, ds5, ds6])
    data = []
    for j, sample in enumerate(combined_dataset):
        data.append(sample)
    return data


def load_multiply_e():
    data = {}
    samples = []
    for lg in ["sh", "ts", "cs", "php", "java", "cpp", "js", "go", "rs"]:
        objs = utils.read_jsonl_file(f"{DATA_DIR}/multiple/data/humaneval-{lg}.jsonl")
        for j, sample in enumerate(objs):
            samples.append({"prompt": sample["prompt"]})
    return samples


def load_mbpp_test_data(data_path=f"{DATA_DIR}/mbpp/"):
    ds = utils.read_jsonl_file(f"{data_path}/mbpp.jsonl")
    data = []
    for j, sample in enumerate(ds):
        data.append(sample)
    return data

def load_ds1000_test_data(data_path="data/ds1000_data/"):
    def extract_ds_1000_prompt(prompt: str):
        if "SOLUTION START" in prompt:
            assert prompt.count("SOLUTION START") == 1
            return prompt.split("SOLUTION START")[0]
        elif "BEGIN SOLUTION" in prompt:
            assert prompt.count("BEGIN SOLUTION") == 1
            return prompt.split("BEGIN SOLUTION")[0]
        else:
            return prompt

    def load_ds_1000(data_path):
        data = []
        for prompt_file in Path(data_path).glob("*/Insertion/q*/prompt.txt"):
            with open(prompt_file) as f:
                # Keep only the prompt portion; downstream reads it via obj["insertion"].
                data.append({"insertion": extract_ds_1000_prompt(f.read())})
        return data

    return load_ds_1000(data_path)

def load_codeapex_data():
    ds = utils.read_jsonl_file("data/eval/eval_codeapex_v1.jsonl")
    data = []
    for j, sample in enumerate(ds):
        data.append(sample)
    return data


def get_testset_n_gram(n_gram=10, test_set=["mbpp", "multiple", "humanevalpack"]):
    print("Start Loading decont test set")
    mbpp_data = load_mbpp_test_data()
    humaneval_data = load_humanevalpack_test_data()
    multiply_e_data = load_multiply_e()
    ds1000_data = load_ds1000_test_data()
    codeapex_data = load_codeapex_data()
    # leetcode_data = load_leetcode_test_data()
    print("Successfully Loading decont test set")
    all_grams = set([])
    if "mbpp" in test_set:
        for obj in mbpp_data:
            n_grams = get_n_gram(obj["text"] + "\n" + obj["code"] + "\n".join(obj["test_list"]), n_gram=n_gram)
            all_grams.update(n_grams)
    if "humanevalpack" in test_set:
        for obj in humaneval_data:
            n_grams = get_n_gram(obj["instruction"] + obj["prompt"] + obj["canonical_solution"] + obj["test"], n_gram=n_gram)
            all_grams.update(n_grams)
    if "multiple" in test_set:
        for obj in multiply_e_data:
            n_grams = get_n_gram(obj["prompt"], n_gram=n_gram)
            all_grams.update(n_grams)
    if "ds1000" in test_set:
        for obj in ds1000_data:
            n_grams = get_n_gram(obj["insertion"], n_gram=n_gram)
            all_grams.update(n_grams)
    if "codeapex" in test_set:
        for obj in codeapex_data:
            n_grams = get_n_gram(obj["prompt"], n_gram=n_gram)
            all_grams.update(n_grams)
    # for obj in leetcode_data:
    #     n_grams = get_n_gram(obj["prompt"], n_gram=n_gram)
    #     all_grams.update(n_grams)
    return all_grams

def decontaminate_for_cpt(text, testset_n_gram, testset_func_names, n_gram=10, if_tokenize=False, verbose=False):
    """
    True denotes contamination
    """
    if has_n_gram_overlap_with_testset(text, testset_n_gram, n_gram=n_gram, if_tokenize=if_tokenize, verbose=verbose):
        return True
    if contain_func_name(text, testset_func_names):
        return True
    return False


def contain_func_name(text, testset_func_names):
    if f'{extract_func_name(text)}' in testset_func_names:
        return True
    else:
        return False


def extract_func_name(text):
    if re.search(r"def (.*?)\(().*?\)", text) is not None:
        return re.search(r"def (.*?)\(().*?\)", text).group(1).strip()
    if re.search(r"public \w+ \w+ (.*?)\(().*?\)", text) is not None:
        return re.search(r"public \w+ \w+ (.*?)\(().*?\)", text).group(1).strip()
    else:
        return None


def extract_class_name(text):
    if re.search(r"public\w+(\s+)\w*{", text) is not None:
        return re.search(r"def (.*?)\(().*?\)", text).group(1).strip()
    else:
        return None

def get_testset_func_name(datasets=["humaneval", "mbpp"]):
    test_func_names = set()
    if "humaneval" in datasets:
        humaneval_data = load_humanevalpack_test_data()
        test_func_names.update(set([obj["entry_point"] for obj in humaneval_data]))
    if "mbpp" in datasets:
        mbpp_data = load_mbpp_test_data()
        test_func_names.update(set([extract_func_name(obj["code"]) for obj in mbpp_data]))
    return test_func_names

def deduplicate_similar_strings(objs, num_perm=512, jaccard_threshold=0.8):
    """
    # # Example usage
    # strings = ["hello", "h3llo", "helloo", "world", "w0rld", "word", "whirled"]
    # deduplicated = deduplicate_similar_strings(strings, jaccard_threshold=0.8)
    # print(deduplicated)
    """
    # Create an LSH index with a given Jaccard similarity threshold
    lsh = MinHashLSH(threshold=jaccard_threshold, num_perm=num_perm)
    # Create MinHash objects for each string and add to the LSH index
    signatures = {}
    for i, obj in tqdm.tqdm(enumerate(objs)):
        minhash = MinHash(num_perm=num_perm)
        for word in obj["text"].split():
            minhash.update(word.encode('utf8'))
        lsh.insert(f'string_{i}', minhash)
        signatures[f'string_{i}'] = minhash
    unique_strings = []
    processed = set()
    for i, obj in enumerate(objs):
        key = f'string_{i}'
        if key in processed:
            continue
        similar_keys = lsh.query(signatures[key])
        for sim_key in similar_keys:
            processed.add(sim_key)
        unique_strings.append(obj)
    print(f"{len(objs)} -> {len(unique_strings)}")
    return unique_strings
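
# Example (hypothetical records, for illustration): unlike the docstring above,
# the implementation expects dicts carrying a "text" field rather than raw strings.
# records = [
#     {"text": "def add(a, b): return a + b  # adds two numbers together"},
#     {"text": "def add(a, b): return a + b  # adds two  numbers together"},
#     {"text": "def greet(name): return 'hello ' + name"},
# ]
# deduplicate_similar_strings(records, num_perm=128, jaccard_threshold=0.8)
# # -> keeps 2 records: the two near-identical snippets collapse to one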

def deduplicate_similar_strings_chatml(objs, num_perm=512, jaccard_threshold=0.6):
    """
    # # Example usage
    # strings = ["hello", "h3llo", "helloo", "world", "w0rld", "word", "whirled"]
    # deduplicated = deduplicate_similar_strings(strings, jaccard_threshold=0.8)
    # print(deduplicated)
    """
    # Create an LSH index with a given Jaccard similarity threshold
    lsh = MinHashLSH(threshold=jaccard_threshold, num_perm=num_perm)
    # Create MinHash objects for each string and add to the LSH index
    signatures = {}
    for i, obj in tqdm.tqdm(enumerate(objs)):
        minhash = MinHash(num_perm=num_perm)
        for word in (obj["messages"][1]["content"] + "\n" + obj["messages"][1]["content"]).split():
            minhash.update(word.encode('utf8'))
        lsh.insert(f'string_{i}', minhash)
        signatures[f'string_{i}'] = minhash
    unique_strings = []
    processed = set()
    for i, obj in enumerate(objs):
        key = f'string_{i}'
        if key in processed:
            continue
        similar_keys = lsh.query(signatures[key])
        for sim_key in similar_keys:
            processed.add(sim_key)
        unique_strings.append(obj)
    return unique_strings

def multi_tasks(objs, workers=64, path="data/system_role/log_gpt.jsonl", task=None, prompt_template=None, chunk_size=None, language=None):
    p = mp.Pool(workers)
    if chunk_size:
        results = []
        job_num = math.ceil(len(objs) / chunk_size)
        print(f"job num: {job_num}")
        for worker_id in range(job_num):
            results.append(
                p.apply_async(
                    MPLogExceptions(task),
                    args=(
                        objs[worker_id * chunk_size:(worker_id + 1) * chunk_size],
                        worker_id,
                        workers,
                        None,
                        path,
                        prompt_template,
                        language,
                    ),
                )
            )
    else:
        chunk_size = math.ceil(len(objs) / float(workers))
        results = []
        for worker_id in range(workers):
            results.append(
                p.apply_async(
                    MPLogExceptions(task),
                    args=(
                        objs[worker_id * chunk_size:(worker_id + 1) * chunk_size],
                        worker_id,
                        workers,
                        None,
                        path,
                        prompt_template,
                        language,
                    ),
                )
            )
    p.close()
    p.join()
    output_objs = []
    for result in results:
        output_objs.extend(result.get())
    return output_objs

if __name__ == "__main__":
    test_n_grams = get_testset_n_gram()
    objs = utils.read_jsonl_file("./sft.jsonl")
    cnt = 0
    for obj in tqdm.tqdm(objs):
        overlaps = []
        dialog = "\n".join([obj["messages"][i]["content"] for i in range(1, len(obj["messages"]))])
        if has_n_gram_overlap_with_testset(obj["text"], test_n_grams, n_gram=10, if_tokenize=False, overlaps=overlaps):
            print(obj["text"])
            cnt += 1
    print(cnt)
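
# Sketch of a possible end-to-end pass built from the helpers above, assuming the
# test data paths configured earlier are available locally:
# test_n_grams = get_testset_n_gram(test_set=["mbpp", "multiple", "humanevalpack"])
# test_func_names = get_testset_func_name()
# kept = [obj for obj in utils.read_jsonl_file("./sft.jsonl")
#         if not decontaminate_for_cpt(obj["text"], test_n_grams, test_func_names, n_gram=10)]
# print(f"kept {len(kept)} uncontaminated samples")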

finetuning/sft/utils/multiple_metrics/.gitkeep (new empty file, mode 100644)

finetuning/sft/utils/multiple_metrics/__init__.py (new empty file, mode 100644)

finetuning/sft/utils/multiple_metrics/containerized_eval.py (new file, mode 100644)

"""
NOTE: Nothing containerized about this any more. This is just a helper
for problem_evaluator.py.
"""
import tempfile
from pathlib import Path

from . import (
    eval_cpp,
    eval_dlang,
    eval_java,
    eval_javascript,
    eval_julia,
    eval_lua,
    eval_php,
    eval_python,
    eval_r,
    eval_racket,
    eval_ruby,
    eval_rust,
    eval_swift,
    eval_ts,
    eval_go,
    eval_pl,
    eval_sh,
    eval_scala,
    eval_cs,
)

EVALUATORS = {
    "rb": (eval_ruby.eval_script, ".rb"),
    "lua": (eval_lua.eval_script, ".lua"),
    "python": (eval_python.eval_script, ".py"),
    "py": (eval_python.eval_script, ".py"),
    "notypes.py": (eval_python.eval_script, ".py"),
    "julia": (eval_julia.eval_script, ".jl"),
    "java": (eval_java.eval_script, ".java"),
    "rust": (eval_rust.eval_script, ".rs"),
    "rs": (eval_rust.eval_script, ".rs"),
    "swift": (eval_swift.eval_script, ".swift"),
    "racket": (eval_racket.eval_script, ".rkt"),
    "rkt": (eval_racket.eval_script, ".rkt"),
    "javascript": (eval_javascript.eval_script, ".js"),
    "js": (eval_javascript.eval_script, ".js"),
    "cpp": (eval_cpp.eval_script, ".cpp"),
    "cs": (eval_cs.eval_script, ".cs"),
    "php": (eval_php.eval_script, ".php"),
    "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"),
    "d": (eval_dlang.eval_script, ".d"),
    "r": (eval_r.eval_script, ".r"),
    "humaneval_to_r.py": (eval_r.eval_script, ".r"),
    "jl": (eval_julia.eval_script, ".jl"),
    "ts": (eval_ts.eval_script, ".ts"),
    "go": (eval_go.eval_script, ".go"),
    "pl": (eval_pl.eval_script, ".pl"),
    "sh": (eval_sh.eval_script, ".sh"),
    "scala": (eval_scala.eval_script, ".scala"),
}

def eval_string_script(language, program):
    if language in EVALUATORS:
        (eval_script, file_ext) = EVALUATORS[language]
    else:
        eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
        eval_script = eval_module.eval_script
        file_ext = f".{language}" if language != "go_test.go" else "_test.go"
    with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
        f.write(program.encode("utf-8"))
        f.flush()
        result = eval_script(Path(f.name))
        # Only save the first 2K of output from the running program. Any further
        # output is very likely an exceptionally long stack trace or a long
        # series of prints.
        if type(result["stdout"]) == bytes:
            result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
        if result["stdout"] is None:
            result["stdout"] = ""
        if result["stderr"] is None:
            result["stderr"] = ""
        if type(result["stderr"]) == bytes:
            result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
        assert type(result["stdout"]) == str
        assert type(result["stderr"]) == str
        return {
            "program": program,
            "stdout": result["stdout"].replace("!!int", "")[:2048],
            "stderr": result["stderr"][:2048],
            "exit_code": result["exit_code"],
            "status": result["status"],
        }
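
# Example (sketch): dispatch a small Python program through the table above;
# assumes a `python` interpreter is available on PATH.
# result = eval_string_script("py", "assert 1 + 1 == 2\nprint('ok')\n")
# result["status"], result["exit_code"]   # expected ("OK", 0) when the assert passes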

finetuning/sft/utils/multiple_metrics/eval_cpp.py (new file, mode 100644)

from pathlib import Path

from .generic_eval import main
from .safe_subprocess import run

LANG_NAME = "C++"
LANG_EXT = ".cpp"


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    build_result = run(["g++", path, "-o", basename, "-std=c++17"])
    if build_result.exit_code != 0:
        return {
            "status": "SyntaxError",
            "exit_code": build_result.exit_code,
            "stdout": build_result.stdout,
            "stderr": build_result.stderr,
        }
    run_result = run([basename])
    if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr:
        raise Exception("Skylake bug encountered")
    if "/4.8.2" in run_result.stderr:
        raise Exception("Ancient compiler encountered")
    if run_result.timeout:
        status = "Timeout"
    elif run_result.exit_code != 0:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": run_result.exit_code,
        "stdout": run_result.stdout,
        "stderr": run_result.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_cs.py (new file, mode 100644)

import os
import subprocess

from .generic_eval import main

LANG_NAME = "CSharp"
LANG_EXT = ".cs"

# Following files have problems:
# 137,
# 22: Any
# 148: Elipsis
def eval_script(path: str):
    if ".cs" not in path.name:
        return
    basename = ".".join(str(path).split(".")[:-1])
    binaryname = basename + ".exe"
    try:
        # build = subprocess.run(
        #     ["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"],
        #     capture_output=True,
        # )
        build = subprocess.run(
            ["mcs", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"],
            capture_output=True,
        )
    except:
        subprocess.run("apt-get install mono-devel")
    status = None
    returncode = -1
    output = None
    if build.returncode != 0:
        # Well, it's a compile error. May be a type error or
        # something. But, why break the set convention
        status = "SyntaxError"
        returncode = build.returncode
        output = build
    else:
        try:
            output = subprocess.run(
                ["mono", binaryname],
                env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER": "Console.Error"},
                capture_output=True,
                timeout=10,
            )
            returncode = output.returncode
            output.stderr = str(output.stderr, "utf-8")
            # mono return 0 even when failing
            fail = (
                "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr
                or "Unhandled Exception" in output.stderr
            )
            output.returncode = 1 if fail else 0
            if output.returncode == 0:
                status = "OK"
            else:
                # Well, it's a panic
                status = "Exception"
        except subprocess.TimeoutExpired as exc:
            status = "Timeout"
            output = exc
        os.remove(binaryname)
    if output.stdout is not None:
        output.stdout = output.stdout.decode("utf-8")
    else:
        output.stdout = "None"
    if output.stderr == "":
        output.stderr = "None"
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": output.stdout,
        "stderr": output.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_dlang.py (new file, mode 100644)

import os
import re
from pathlib import Path

from .safe_subprocess import run

ENABLE_SYNTAX_CHECK = False


def eval_script(path: Path):
    result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15)
    if "might not be correctly installed" in result.stderr:
        raise Exception("D is not correctly installed")
    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0:
        status = "OK"
    elif "Error:" in result.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


DIR = "d-keep-code_davinci_001_temp_0.2"


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve()
    count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0}
    for filename in os.listdir(directory):
        path = Path.joinpath(directory, filename)
        r = eval_script(path)
        status = r["status"]
        count[status] += 1
        if ENABLE_SYNTAX_CHECK and status == "SyntaxError":
            error_msgs = r["stderr"].split("\n")
            with open(path) as source_file:
                lines = source_file.readlines()
                unittest_line_start = lines.index("unittest\n")
                unittest_line_end = len(lines)
                for err_msg_line in error_msgs:
                    matched_parts = re.match(
                        r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)",
                        err_msg_line[2:-1],
                    )
                    _file, line_num = matched_parts[1], int(matched_parts[2])
                    if (unittest_line_start <= line_num and line_num <= unittest_line_end):
                        print("===============")
                        print(path, "contains error in unit test part")
                        print(error_msgs)
                        print("===============")
        filename = filename.split(".")[0]
        print(f"Dlang, {filename}, {status}")
    print(DIR + ":" + str(count))


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_go.py (new file, mode 100644)

import argparse
import subprocess
from pathlib import Path
from sys import exit

from .generic_eval import main as gmain


def eval_script(path: Path):
    status = None
    stdout = None
    stderr = None
    exit_code = None
    try:
        build = subprocess.run(
            ["go", "test", path],
            timeout=30,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout = build.stdout.decode("utf-8", errors="ignore")
        stderr = build.stderr.decode("utf-8", errors="ignore")
        exit_code = build.returncode
        # write to stderr just so that we can redirect stdout to a csv
        if "[setup failed]" in stdout or "[build failed]" in stdout:
            status = "SyntaxError"
        elif "FAIL" in stdout:
            status = "Exception"
        else:
            status = "OK"
    except subprocess.TimeoutExpired:
        status = "Timeout"
    return {
        "status": status,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
    }


if __name__ == "__main__":
    gmain(eval_script, "Go", ".go")

finetuning/sft/utils/multiple_metrics/eval_java.py (new file, mode 100644)

import os
import tempfile
from pathlib import Path

from .generic_eval import main
from .safe_subprocess import run

LANG_NAME = "Java"
LANG_EXT = ".java"

# Following files have problems:
# 137,
# 22: Any
# 148: Elipsis
def eval_script(path: Path):
    sys_env = os.environ.copy()
    javatuples_path = Path(f"{os.path.dirname(__file__)}/javatuples-1.2.jar")
    sys_env["CLASSPATH"] = f"{javatuples_path}"
    with tempfile.TemporaryDirectory() as outdir:
        # Each Java file contains the class with same name `JAVA_CLASS_NAME`
        # Hence, javac will write the same JAVA_CLASS_NAME.class file for each problem
        # Write class for each problem to a different temp dir
        # Use UTF8 encoding with javac
        result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env)
        if result.exit_code != 0:
            # Well, it's a compile error. May be a type error or
            # something. But, why break the set convention
            status = "SyntaxError"
        else:
            result = run(["java", "-ea", "-cp", f"{outdir}", "Problem"], env=sys_env)
            if result.timeout:
                status = "Timeout"
            elif result.exit_code == 0:
                status = "OK"
            else:
                status = "Exception"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_javascript.py (new file, mode 100644)

import os
import subprocess
from pathlib import Path


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        output = subprocess.run(["node", str(path)], capture_output=True, timeout=5)
        if output.returncode == 0:
            status = "OK"
        else:
            outmessage = str(output)
            if "ERR_ASSERTION" in outmessage:
                status = "AssertionError"
            elif "SyntaxError" in outmessage:
                status = "SyntaxError"
            elif "ReferenceError" in outmessage:
                status = "ReferenceError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    except subprocess.CalledProcessError as exc:
        status = "Exception"
        returncode = exc.returncode
        output = exc
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2").resolve()
    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"JavaScript, {filename}, {r['status']}")


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_julia.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    result = run(["julia", str(path)], timeout_seconds=5)
    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0:
        status = "OK"
    # TODO(arjun): I would like this to be reviewed more carefully by John.
    elif len(result.stderr) < 1:
        status = "Exception"
    else:
        status = "SyntaxError"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_lua.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    r = run(["lua", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_php.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run

LANG_NAME = "PHP"
LANG_EXT = ".php"


def eval_script(path: Path):
    r = run(["php", path])
    if "PHP Parse error" in r.stdout:
        status = "SyntaxError"
    elif r.exit_code != 0:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_pl.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    r = run(["perl", path])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code != 0:
        status = "Exception"
    elif "ERROR" in r.stdout or "ERROR" in r.stderr:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_python.py (new file, mode 100644)

import os
from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    os.environ["PATH"] = "miniconda3/envs/qwen/bin/:" + os.environ["PATH"]
    r = run(["python", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif "SyntaxError" in r.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_r.py (new file, mode 100644)

import os
import subprocess
from pathlib import Path


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        # Run R on the file, capturing stderr
        output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5)
        if output.returncode == 0:
            status = "OK"
        else:
            outmessage = str(output)
            if "unexpected" in outmessage:
                status = "SyntaxError"
            elif "err=b''" in outmessage:
                status = "AssertionError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    except subprocess.CalledProcessError as exc:
        status = "Exception"
        returncode = exc.returncode
        output = exc
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": output.stdout,
        "stderr": output.stderr,
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2").resolve()
    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"R, {filename}, {r['status']}")


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_racket.py (new file, mode 100644)

"""
Evaluates a generated Racket program (.rkt).
"""
import os
from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    result = run(["racket", str(path)])
    if (
        "standard-module-name-resolver: collection not found\nfor module path: rackunit"
        in result.stderr
    ):
        print(f"Failed to run evaluation for {path}: rackunit is not installed")
        return None
    # rackunit produces exit code 0 even if tests fail.
    if len(result.stderr) > 0 or result.exit_code != 0:
        if "read-syntax" in result.stderr:
            status = "SyntaxError"
        else:
            status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2").resolve()
    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"Racket, {filename}, {r['status']}")


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_ruby.py (new file, mode 100644)

import argparse
import subprocess
from pathlib import Path

from .generic_eval import main as gmain


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        # Need check=True for Ruby to pass errors to CalledProcessError
        output = subprocess.run(
            ["ruby", path], check=True, capture_output=True, timeout=5
        )
        if output.returncode == 0:
            status = "OK"
            out = output.stderr
            error = output.stdout
            returncode = 0
        else:
            raise Exception("there's an issue with check = True for Ruby, INVESTIGATE!")
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        out = exc.stdout
        error = exc.stderr
        returncode = -1
    except subprocess.CalledProcessError as exc:
        returncode = exc.returncode
        out = exc.stdout
        error = exc.stderr
        # failure with code 1 but no error message is an Exception from Failed tests
        if len(error) < 1:
            status = "Exception"
        else:
            # everything that prints out an error message is a SyntaxError
            status = "SyntaxError"
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": out,
        "stderr": error,
    }


if __name__ == "__main__":
    gmain(eval_script, "Ruby", ".rb")

finetuning/sft/utils/multiple_metrics/eval_rust.py (new file, mode 100644)

import os
import subprocess
from pathlib import Path

from .generic_eval import main

LANG_NAME = "Rust"
LANG_EXT = ".rs"


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    try:
        build = subprocess.run(
            ["rustc", path, "-o", basename], capture_output=True, timeout=15
        )
    except subprocess.TimeoutExpired as exc:
        return {
            "status": "Timeout",
            "exit_code": -1,
            "stdout": "Compiler timeout",
            "stderr": "Compiler timeout",
        }
    status = None
    returncode = -1
    output = None
    if build.returncode != 0:
        # Well, it's a compile error. May be a type error or
        # something. But, why break the set convention
        status = "SyntaxError"
        returncode = build.returncode
        output = build
    else:
        try:
            # Assumes exit-code 0 is all okay
            output = subprocess.run([basename], capture_output=True, timeout=5)
            returncode = output.returncode
            if output.returncode == 0:
                status = "OK"
            else:
                # Well, it's a panic
                status = "Exception"
        except subprocess.TimeoutExpired as exc:
            status = "Timeout"
            output = exc
        os.remove(basename)
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_scala.py (new file, mode 100644)

import tempfile
from pathlib import Path

from .safe_subprocess import run

LANG_NAME = "Scala"
LANG_EXT = ".scala"


def eval_script(path: Path):
    with tempfile.TemporaryDirectory() as outdir:
        # Each Scala file contains the class with same name `JAVA_CLASS_NAME`
        # Hence, scalac will write the same JAVA_CLASS_NAME.class file for each problem
        # Write class for each problem to a different temp dir
        build = run(["scalac", "-d", outdir, path], timeout_seconds=45)
        if build.exit_code != 0:
            # Well, it's a compile error. May be a type error or
            # something. But, why break the set convention
            return {
                "status": "SyntaxError",
                "exit_code": build.exit_code,
                "stdout": build.stdout,
                "stderr": build.stderr,
            }
        # "Problem" is the name of the class we emit.
        r = run(["scala", "-cp", f"{outdir}", "Problem"])
        if r.timeout:
            status = "Timeout"
        elif r.exit_code == 0 and r.stderr == "":
            status = "OK"
        else:
            # Well, it's a panic
            status = "Exception"
        return {
            "status": status,
            "exit_code": r.exit_code,
            "stdout": r.stdout,
            "stderr": r.stderr,
        }