"csrc/vscode:/vscode.git/clone" did not exist on "fa87c1aeea17a8f1db6f7c93300bb96e2b6c1c72"
Unverified Commit 7c9fbcf8 authored by PabloAgustin, committed by GitHub

New healthcare benchmark: careqa (#2714)



* New healthcare benchmark: careqa

* LAUNCH_MN5_ACC <python main.py --config config/mn5.yml --models Llama-3.2-1B-Instruct --tasks careqa_open --num_fewshot 0>

* Add fixes, READMEs, and remove task_list.txt

* pre-commit passed, add formatting updates; add nanmean agg_metric

* Fix import error.

* Wrapped imports in try/except blocks

* Wrapped imports in try/except blocks; also wrapped metrics to catch bert_score import errors

* Try/except to catch ImportErrors as well

* use np.nan

* pre-commit

---------
Co-authored-by: PabloAgustin <pablo.martin@bsc.es>
Co-authored-by: Baber <baber@hey.com>
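For a quick run outside the MN5 launcher referenced above, a minimal sketch using the harness's Python entry point; the Hugging Face model id is an assumption inferred from the launch command, and any supported model works:

```python
# Hedged sketch: run the new careqa_open task via lm-eval's Python API.
# The Hugging Face model id is an assumption based on the launch command above.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-3.2-1B-Instruct",
    tasks=["careqa_open"],
    num_fewshot=0,
)
print(results["results"]["careqa_open"])
```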
import re

from lm_eval.tasks.mimic_repsum.utils import doc_to_target


def process_results(doc, results):
    (loglikelihood,) = results
    _words = len(re.split(r"\s+", doc_to_target(doc)))
    _bytes = len(doc_to_target(doc).encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, _words),
        "byte_perplexity": (loglikelihood, _bytes),
        "bits_per_byte": (loglikelihood, _bytes),
    }
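The (loglikelihood, weight) tuples above are consumed by the harness's corpus-level perplexity aggregations. As a rough sketch of the arithmetic, assuming the usual weighted-mean definitions rather than quoting the harness code verbatim:

```python
import math


# Rough sketch, under the usual definitions:
# perplexity = exp(-sum(loglikelihoods) / sum(weights)), with the weight being the
# word count (word_perplexity) or byte count (byte_perplexity), and
# bits_per_byte = -sum(loglikelihoods) / (sum(byte_counts) * ln 2).
def weighted_perplexity(items):
    loglikelihoods, weights = zip(*items)
    return math.exp(-sum(loglikelihoods) / sum(weights))


def bits_per_byte(items):
    loglikelihoods, byte_counts = zip(*items)
    return -sum(loglikelihoods) / (sum(byte_counts) * math.log(2))
```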
# MTS-Dialog
### Paper
Title: `An Empirical Study of Clinical Note Generation from Doctor-Patient Encounters`
Abstract: [https://aclanthology.org/2023.eacl-main.168/](https://aclanthology.org/2023.eacl-main.168/)
MTS-Dialog is a collection of 1,700 doctor-patient dialogues and corresponding clinical notes.
This task implements open-ended Question Answering (QA) on MTS-Dialog.
#### Tasks
* `mts_dialog`: Open-ended QA in English.
* `mts_dialog_perplexity`: Open-ended QA in English, evaluated with perplexity.
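The generative task feeds the `dialogue` field to the model and scores it against the `section_text` note (see the `doc_to_text` / `doc_to_target` helpers in `utils.py` further down). A quick, hedged way to inspect one document, assuming the `datasets` library is installed:

```python
from datasets import load_dataset

# Hedged sketch: peek at one MTS-Dialog document (field names as used in utils.py).
ds = load_dataset("har1/MTS_Dialogue-Clinical_Note", split="train")
example = ds[0]
print(example["dialogue"][:200])      # model input (doc_to_text)
print(example["section_text"][:200])  # reference clinical note (doc_to_target)
```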
### Citation
```bibtex
@inproceedings{ben-abacha-etal-2023-empirical,
title = "An Empirical Study of Clinical Note Generation from Doctor-Patient Encounters",
author = "Ben Abacha, Asma and
Yim, Wen-wai and
Fan, Yadan and
Lin, Thomas",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eacl-main.168/",
doi = "10.18653/v1/2023.eacl-main.168",
pages = "2291--2302",
abstract = "Medical doctors spend on average 52 to 102 minutes per day writing clinical notes from their patient encounters (Hripcsak et al., 2011). Reducing this workload calls for relevant and efficient summarization methods. In this paper, we introduce new resources and empirical investigations for the automatic summarization of doctor-patient conversations in a clinical setting. In particular, we introduce the MTS-Dialog dataset; a new collection of 1,700 doctor-patient dialogues and corresponding clinical notes. We use this new dataset to investigate the feasibility of this task and the relevance of existing language models, data augmentation, and guided summarization techniques. We compare standard evaluation metrics based on n-gram matching, contextual embeddings, and Fact Extraction to assess the accuracy and the factual consistency of the generated summaries. To ground these results, we perform an expert-based evaluation using relevant natural language generation criteria and task-specific criteria such as critical omissions, and study the correlation between the automatic metrics and expert judgments. To the best of our knowledge, this study is the first attempt to introduce an open dataset of doctor-patient conversations and clinical notes, with detailed automated and manual evaluations of clinical note generation."
}
```
task: mts_dialog
dataset_path: har1/MTS_Dialogue-Clinical_Note
description: >
  Instructions: The following text is from a collection of medical dialogs between doctor and patient. Extract all relevant information to compose a note that summarizes the relevant content of the dialog.
output_type: generate_until
training_split: train
validation_split: train
test_split: train
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
process_results: !function utils.process_results
generation_kwargs:
  until:
    - "\n\n"
metric_list:
  - metric: bleu
    aggregation: nanmean
    higher_is_better: true
  - metric: rouge1
    aggregation: nanmean
    higher_is_better: true
  - metric: rouge2
    aggregation: nanmean
    higher_is_better: true
  - metric: rougeL
    aggregation: nanmean
    higher_is_better: true
  - metric: bert_score
    aggregation: nanmean
    higher_is_better: true
  - metric: bleurt
    aggregation: nanmean
    higher_is_better: true
metadata:
  version: 1.2
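Per-document scores, including the NaN placeholders returned for skipped or failed documents, are averaged with the `nanmean` aggregation added in this commit; conceptually it is no more than:

```python
import numpy as np


# Conceptual sketch of the nanmean aggregation: average per-document scores,
# ignoring NaNs produced when a metric fails or a document is skipped.
def nanmean(scores):
    return np.nanmean(scores)


print(nanmean([0.42, float("nan"), 0.58]))  # -> 0.5
```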
include: mts_dialog.yaml
task: mts_dialog_perplexity
output_type: loglikelihood_rolling
doc_to_text: ""
process_results: !function utils_perplexity.process_results
metric_list:
  - metric: word_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    higher_is_better: false
metadata:
  version: 1.0
import numpy as np


try:
    import evaluate

    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    bertscore = evaluate.load("bertscore")
    bleurt = evaluate.load("bleurt", "bleurt-base-512", module_type="metric")
except (ModuleNotFoundError, ImportError):
    raise ModuleNotFoundError(
        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
    )
except Exception as e:
    raise RuntimeError(
        f"Error loading evaluation metrics: {str(e)}. Please check your installation."
    )


def doc_eval(pred, refs):
    # Compute each metric independently; a failure in one metric yields NaN for
    # that metric only, which the nanmean aggregation later ignores.
    try:
        bleu_results = bleu.compute(predictions=pred, references=refs)
    except Exception as e:
        print(f"Bleu error: {e}")
        bleu_results = {"bleu": np.nan}
    try:
        rouge_results = rouge.compute(predictions=pred, references=refs)
    except Exception as e:
        print(f"Rouge error: {e}")
        rouge_results = {"rouge1": np.nan, "rouge2": np.nan, "rougeL": np.nan}
    try:
        bleurt_scores = bleurt.compute(predictions=pred, references=refs)["scores"]
    except Exception as e:
        print(f"Bleurt error: {e}")
        bleurt_scores = [np.nan]
    try:
        bert_scores = bertscore.compute(predictions=pred, references=refs, lang="en")[
            "f1"
        ]
    except Exception as e:
        print(f"Bert error: {e}")
        bert_scores = [np.nan]

    if bleu_results["bleu"] == 0:
        # Sometimes bleu is 0.0 and this breaks the stderr computation.
        bleu_results["bleu"] += 1e-5

    results = {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "bleurt": np.mean(bleurt_scores),
        "bert_score": np.mean(bert_scores),
    }
    return results


def doc_to_text(doc) -> str:
    return doc["dialogue"]


def doc_to_target(doc) -> str:
    return doc["section_text"]


def process_results(doc, results):
    pred, refs = [results[0]], [doc_to_target(doc)]
    # Skip documents with degenerate predictions or references (< 5 characters);
    # the returned NaNs are dropped by the nanmean aggregation.
    if len(refs[0]) < 5 or len(pred[0]) < 5:
        return {
            "bleu": np.nan,
            "rouge1": np.nan,
            "rouge2": np.nan,
            "rougeL": np.nan,
            "bleurt": np.nan,
            "bert_score": np.nan,
        }
    results = doc_eval(pred, refs)
    return {
        "bleu": results["bleu"],
        "rouge1": results["rouge1"],
        "rouge2": results["rouge2"],
        "rougeL": results["rougeL"],
        "bleurt": results["bleurt"],
        "bert_score": results["bert_score"],
    }
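For illustration, a toy call showing the inputs `process_results` expects; the document and generation are hypothetical, and running it requires the `evaluate`, `bert-score`, and BLEURT dependencies loaded above:

```python
# Hypothetical document and model output, only to illustrate the interface of
# the process_results function defined above.
doc = {
    "dialogue": "Doctor: What brings you in today? Patient: I've had a cough for two weeks.",
    "section_text": "Patient presents with a two-week history of cough.",
}
generated = ["Patient reports a cough lasting two weeks."]

per_doc_scores = process_results(doc, generated)
# per_doc_scores maps each metric name (bleu, rouge1, rouge2, rougeL, bleurt,
# bert_score) to a per-document value, which the harness aggregates with nanmean.
```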
import re

from lm_eval.tasks.mts_dialog.utils import doc_to_target


def process_results(doc, results):
    (loglikelihood,) = results
    _words = len(re.split(r"\s+", doc_to_target(doc)))
    _bytes = len(doc_to_target(doc).encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, _words),
        "byte_perplexity": (loglikelihood, _bytes),
        "bits_per_byte": (loglikelihood, _bytes),
    }
# OLAPH
### Paper
Title: `OLAPH: Improving Factuality in Biomedical Long-form Question Answering`
Abstract: [https://arxiv.org/abs/2405.12701](https://arxiv.org/abs/2405.12701)
This task implements open-ended Question Answering (QA) on MedLFQA, the long-form biomedical QA benchmark used in the OLAPH paper.
#### Tasks
* `olaph`: Open-ended QA in English.
* `olaph_perplexity`: Open-ended QA in English, evaluated with perplexity.
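For the generative task, the prompt is the `description` from the YAML followed by the `Question` field, and the `Free_form_answer` field is the reference (field names as used in the task's `utils.py` further down); roughly:

```python
# Hypothetical MedLFQA-style document, only to show how the prompt is assembled.
doc = {
    "Question": "What are common side effects of ibuprofen?",
    "Free_form_answer": "Common side effects include stomach upset, heartburn, and nausea.",
}

description = (
    "Instructions: You are a helpful healthcare assistant. Answer the following "
    "question as concisely as possible without omitting relevant information.\n"
)
prompt = description + doc["Question"]  # roughly what the model is asked to complete
reference = doc["Free_form_answer"]     # scored against the model's generation
```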
### Citation
```bibtex
@misc{jeong2024olaphimprovingfactualitybiomedical,
title={OLAPH: Improving Factuality in Biomedical Long-form Question Answering},
author={Minbyul Jeong and Hyeon Hwang and Chanwoong Yoon and Taewhoo Lee and Jaewoo Kang},
year={2024},
eprint={2405.12701},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2405.12701},
}
```
task: olaph
dataset_path: dmis-lab/MedLFQA
description: >
  Instructions: You are a helpful healthcare assistant. Answer the following question as concisely as possible without omitting relevant information.
output_type: generate_until
training_split: test
validation_split: test
test_split: test
process_docs: !function utils.process_docs
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
process_results: !function utils.process_results
generation_kwargs:
  until:
    - "\n\n"
metric_list:
  - metric: bleu
    aggregation: nanmean
    higher_is_better: true
  - metric: rouge1
    aggregation: nanmean
    higher_is_better: true
  - metric: rouge2
    aggregation: nanmean
    higher_is_better: true
  - metric: rougeL
    aggregation: nanmean
    higher_is_better: true
  - metric: bert_score
    aggregation: nanmean
    higher_is_better: true
  - metric: bleurt
    aggregation: nanmean
    higher_is_better: true
metadata:
  version: 1.2
task: olaph_perplexity
#include: olaph.yaml
dataset_path: dmis-lab/MedLFQA
description: >
  Instructions: You are a helpful healthcare assistant. Answer the following question as concisely as possible without omitting relevant information.
training_split: test
validation_split: test
test_split: test
output_type: loglikelihood_rolling
doc_to_text: ""
process_docs: !function utils.process_docs
doc_to_target: !function utils.doc_to_target
process_results: !function utils_perplexity.process_results
generation_kwargs:
  until:
    - "\n\n"
metric_list:
  - metric: word_perplexity
    higher_is_better: false
  - metric: byte_perplexity
    higher_is_better: false
  - metric: bits_per_byte
    higher_is_better: false
metadata:
  version: 1.0
import datasets
import numpy as np


try:
    import evaluate

    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    bertscore = evaluate.load("bertscore")
    bleurt = evaluate.load("bleurt", "bleurt-base-512", module_type="metric")
except (ModuleNotFoundError, ImportError):
    raise ModuleNotFoundError(
        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
    )
except Exception as e:
    raise RuntimeError(
        f"Error loading evaluation metrics: {str(e)}. Please check your installation."
    )


def doc_eval(pred, refs):
    # Compute each metric independently; a failure in one metric yields NaN for
    # that metric only, which the nanmean aggregation later ignores.
    try:
        bleu_results = bleu.compute(predictions=pred, references=refs)
    except Exception as e:
        print(f"Bleu error: {e}")
        bleu_results = {"bleu": np.nan}
    try:
        rouge_results = rouge.compute(predictions=pred, references=refs)
    except Exception as e:
        print(f"Rouge error: {e}")
        rouge_results = {"rouge1": np.nan, "rouge2": np.nan, "rougeL": np.nan}
    try:
        bleurt_scores = bleurt.compute(predictions=pred, references=refs)["scores"]
    except Exception as e:
        print(f"Bleurt error: {e}")
        bleurt_scores = [np.nan]
    try:
        bert_scores = bertscore.compute(predictions=pred, references=refs, lang="en")[
            "f1"
        ]
    except Exception as e:
        print(f"Bert error: {e}")
        bert_scores = [np.nan]

    if bleu_results["bleu"] == 0:
        # Sometimes bleu is 0.0 and this breaks the stderr computation.
        bleu_results["bleu"] += 1e-5

    results = {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "bleurt": np.mean(bleurt_scores),
        "bert_score": np.mean(bert_scores),
    }
    return results


def doc_to_text(doc) -> str:
    return doc["Question"]


def doc_to_target(doc) -> str:
    return doc["Free_form_answer"]


def process_docs(dataset: datasets.Dataset):
    def _helper(doc):
        # No per-document transformation is needed; map() leaves docs unchanged.
        return doc

    num_entries = len(dataset)
    # Select the first 10% of instances to bound evaluation cost.
    subset_index = int(0.1 * num_entries)
    filtered_dataset = dataset.select(range(subset_index))
    return filtered_dataset.map(_helper)


def process_results(doc, results):
    pred, refs = [results[0]], [doc_to_target(doc)]
    # Skip documents with degenerate predictions or references (< 10 characters);
    # the returned NaNs are dropped by the nanmean aggregation.
    if len(refs[0]) < 10 or len(pred[0]) < 10:
        return {
            "bleu": np.nan,
            "rouge1": np.nan,
            "rouge2": np.nan,
            "rougeL": np.nan,
            "bleurt": np.nan,
            "bert_score": np.nan,
        }
    results = doc_eval(pred, refs)
    return {
        "bleu": results["bleu"],
        "rouge1": results["rouge1"],
        "rouge2": results["rouge2"],
        "rougeL": results["rougeL"],
        "bleurt": results["bleurt"],
        "bert_score": results["bert_score"],
    }
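A toy check of the subsetting in `process_docs`: the dataset below is hypothetical and exists only to show that the function keeps the first 10% of rows (it uses the function defined above and the `datasets` package):

```python
import datasets

# Hypothetical 20-question dataset: process_docs keeps the first int(0.1 * 20) = 2 rows.
toy = datasets.Dataset.from_dict(
    {
        "Question": [f"Q{i}" for i in range(20)],
        "Free_form_answer": [f"A{i}" for i in range(20)],
    }
)
print(len(process_docs(toy)))  # -> 2
```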
import re

from lm_eval.tasks.olaph.utils import doc_to_target


def process_results(doc, results):
    (loglikelihood,) = results
    _words = len(re.split(r"\s+", doc_to_target(doc)))
    _bytes = len(doc_to_target(doc).encode("utf-8"))
    return {
        "word_perplexity": (loglikelihood, _words),
        "byte_perplexity": (loglikelihood, _bytes),
        "bits_per_byte": (loglikelihood, _bytes),
    }