OpenDAS / ColossalAI · Commits · Commit 727c4598 (unverified)

Authored Jun 19, 2023 by digger yu; committed by GitHub on Jun 19, 2023.

[nfc] fix dim not defined and fix typo (#3991)

Parent: ca768eb6

Showing 4 changed files with 8 additions and 8 deletions (+8 -8)
applications/Chat/evaluate/gpt_evaluate.py       +3 -3
applications/Chat/evaluate/unieval/evaluator.py  +2 -2
applications/Chat/evaluate/unieval/utils.py      +1 -1
applications/Chat/tests/test_data.py             +2 -2
applications/Chat/evaluate/gpt_evaluate.py

@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.

     Args:
         prompt: a dictionary including prompt template, CoT and metrics.

@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.

     Args:
         prompt: a dictionary including prompt template, CoT and metrics.

@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
 def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
     """
     Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
-    Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
     Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.

     Args:
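For context on the two evaluation paths whose docstrings are corrected above, here is a minimal illustration (not the repository's code) of how they differ, assuming the legacy openai<1.0 Python SDK that was current at the time of this commit; the prompt string and model choices are placeholders.

# Illustration only: chat models return plain text, completion models can return logprobs.
import openai

prompt = "Rate the answer on a scale of 1-5. Answer: ..."  # hypothetical prompt

# Chat models (gpt-3.5-turbo / gpt-4): temperature=0 for determinism,
# the score is parsed from the plain text reply.
chat = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
)
reply_text = chat["choices"][0]["message"]["content"]

# Completion model (text-davinci-003): the only path that returns log
# probabilities, which the logprobs-based scoring consumes.
completion = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0,
    max_tokens=1,
    logprobs=5,
)
top_logprobs = completion["choices"][0]["logprobs"]["top_logprobs"]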
applications/Chat/evaluate/unieval/evaluator.py

@@ -277,7 +277,7 @@ class FactEvaluator:
         n_data = len(data)
         eval_scores = [{} for _ in range(n_data)]

-        # Calculate average sentence-level scores for facutal consistency
+        # Calculate average sentence-level scores for factual consistency
         src_list, output_list = [], []
         n_sents = []    # the number of sentences in the claim
         for i in range(n_data):

@@ -288,7 +288,7 @@ class FactEvaluator:
             src_list.append(source)
             output_list.append(system_outputs[j])
         input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
-        sent_score = self.scorer.score(input_list, self.task, category, dim)
+        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

         # Get average score for each sample
         start_idx = 0
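The second hunk above is the "dim not defined" fix named in the commit title: inside the method there is no local variable dim, only the instance attribute self.dim, so the old call would raise NameError when reached. A minimal sketch of the situation, with class internals simplified and attribute values assumed for illustration:

# Simplified sketch, not the repository's full class.
class FactEvaluator:
    def __init__(self, scorer):
        self.scorer = scorer
        self.task = 'fact'          # assumed value for illustration
        self.dim = 'consistency'    # assumed value for illustration

    def evaluate_batch(self, input_list, category):
        # Old: self.scorer.score(input_list, self.task, category, dim)
        #      -> NameError: name 'dim' is not defined
        # New: pass the instance attribute instead.
        return self.scorer.score(input_list, self.task, category, self.dim)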
applications/Chat/evaluate/unieval/utils.py

@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
         src: source input for different NLG tasks. For example, source document for summarization
              and dialogue history for dialogue response generation.
         output: output text generated by the models
-        ref: human-annotataed groundtruth
+        ref: human-annotated groundtruth
         context: the context needed to evaluate several specific dimension. For example,
                  additional factual information when evaluating engagingness and groundedness in dialogues.
     """
applications/Chat/tests/test_data.py

@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:
 def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+    EXPERIENCE_BATCH_SIZE = 4
     SAMPLE_BATCH_SIZE = 2
     if strategy == 'ddp':

@@ -54,7 +54,7 @@ def run_test_data(strategy):
     # experience of all ranks should be the same
     for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
         assert gather_and_equal(data['input_ids'])
         assert gather_and_equal(data['attention_mask'])
         experience = experience_maker.make_experience(**data,
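The hunk header above references gather_and_equal, which the test uses to check that every rank produced the same batch. A hypothetical sketch of such a helper, assuming a torch.distributed process group is already initialized (this is an assumption, not the repository's exact implementation):

# Hypothetical helper: all-gather a tensor and verify all ranks hold identical copies.
import torch
import torch.distributed as dist

def gather_and_equal(tensor: torch.Tensor) -> bool:
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor)
    return all(torch.equal(gathered[0], t) for t in gathered)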