Unverified Commit 727c4598 authored by digger yu, committed by GitHub

[nfc] fix dim not defined and fix typo (#3991)

parent ca768eb6
@@ -361,7 +361,7 @@ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
     """
     Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
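For context on the docstring being fixed: it describes calling a chat model with temperature 0 so the grading is as repeatable as possible. Below is a minimal sketch of that kind of request, assuming the pre-1.0 `openai` Python client; the prompt template and retry handling used in the repository are not reproduced here.

```python
import openai

def evaluate_answer_deterministically(system_prompt: str, user_prompt: str) -> str:
    # Temperature 0 makes the chat model pick the most likely tokens,
    # so repeated evaluations of the same answer stay (mostly) stable.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]
```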
@@ -435,7 +435,7 @@ def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
     Use completion model(text-davinci-003) to evaluate one model answer.
     Only completion models can return log probabilities.
-    Temprature is set to 0 to make the model more deterministic.
+    Temperature is set to 0 to make the model more deterministic.
     Args:
         prompt: a dictionary including prompt template, CoT and metrics.
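The second docstring fix concerns the completion-model path, where log probabilities are requested. A rough sketch of such a call, again assuming the pre-1.0 `openai` client; how the repository post-processes the returned logprobs is not shown here.

```python
import openai

def evaluate_answer_with_logprobs(prompt: str) -> dict:
    # Completion models such as text-davinci-003 can return per-token
    # log probabilities; chat models at the time could not.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,  # deterministic scoring
        logprobs=5,     # top-5 alternatives per generated token
        max_tokens=1,
    )
    return response["choices"][0]["logprobs"]
```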
@@ -593,7 +593,7 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
 def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
     """
     Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
-    Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
+    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
     Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
     Args:
...
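Since `calculate_scores_form_response` works on the plain text returned by the chat model rather than on logprobs, the idea can be illustrated with a hypothetical parser (the actual parsing rules in the repository may differ):

```python
import re

def extract_score(response: str) -> int:
    # Hypothetical illustration: take the first integer that appears in
    # the model's reply and treat it as the score; fail loudly otherwise.
    match = re.search(r"\d+", response)
    if match is None:
        raise ValueError(f"no score found in response: {response!r}")
    return int(match.group())

# e.g. extract_score("Score: 4 out of 5") == 4
```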
@@ -277,7 +277,7 @@ class FactEvaluator:
         n_data = len(data)
         eval_scores = [{} for _ in range(n_data)]
-        # Calculate average sentence-level scores for facutal consistency
+        # Calculate average sentence-level scores for factual consistency
         src_list, output_list = [], []
         n_sents = []  # the number of sentences in the claim
         for i in range(n_data):
@@ -288,7 +288,7 @@ class FactEvaluator:
                 src_list.append(source)
                 output_list.append(system_outputs[j])
         input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
-        sent_score = self.scorer.score(input_list, self.task, category, dim)
+        sent_score = self.scorer.score(input_list, self.task, category, self.dim)
         # Get average score for each sample
         start_idx = 0
...
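The code around this hunk (not fully shown) folds sentence-level scores back into per-sample scores using `n_sents` and `start_idx`. A small, self-contained sketch of that bookkeeping pattern, with hypothetical names, in case the intent of the hunk is unclear:

```python
def average_per_sample(sent_scores: list, n_sents: list) -> list:
    # sent_scores holds one score per (sample, sentence) pair, flattened;
    # n_sents[i] says how many of those entries belong to sample i.
    sample_scores = []
    start_idx = 0
    for count in n_sents:
        chunk = sent_scores[start_idx:start_idx + count]
        sample_scores.append(sum(chunk) / len(chunk) if chunk else 0.0)
        start_idx += count
    return sample_scores

# e.g. average_per_sample([1.0, 0.0, 1.0], [2, 1]) == [0.5, 1.0]
```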
@@ -37,7 +37,7 @@ def add_question(dimension, output, src=None, ref=None, context=None, task=None)
         src: source input for different NLG tasks. For example, source document for summarization
             and dialogue history for dialogue response generation.
         output: output text generated by the models
-        ref: human-annotataed groundtruth
+        ref: human-annotated groundtruth
         context: the context needed to evaluate several specific dimension. For example,
             additional factual information when evaluating engagingness and groundedness in dialogues.
     """
...
@@ -33,7 +33,7 @@ def gather_and_equal(tensor: torch.Tensor) -> bool:
 def run_test_data(strategy):
-    EXPERINCE_BATCH_SIZE = 4
+    EXPERIENCE_BATCH_SIZE = 4
     SAMPLE_BATCH_SIZE = 2
     if strategy == 'ddp':
@@ -54,7 +54,7 @@ def run_test_data(strategy):
     # experience of all ranks should be the same
     for _ in range(2):
-        data = get_data(EXPERINCE_BATCH_SIZE)
+        data = get_data(EXPERIENCE_BATCH_SIZE)
         assert gather_and_equal(data['input_ids'])
         assert gather_and_equal(data['attention_mask'])
         experience = experience_maker.make_experience(**data,
...
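This test uses `gather_and_equal`, whose signature appears in the hunk header, to check that every rank produced identical tensors. A sketch of how such a helper is typically written with `torch.distributed`; the repository's exact implementation is not shown in this diff.

```python
import torch
import torch.distributed as dist

def gather_and_equal(tensor: torch.Tensor) -> bool:
    # Gather the tensor from every rank (requires an initialized process
    # group, as in the test) and check that all copies match rank 0's.
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(gathered, tensor)
    return all(torch.equal(gathered[0], t) for t in gathered)
```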