Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
cc88a0cf
Unverified
Commit
cc88a0cf
authored
May 20, 2022
by
Taekyoon
Committed by
GitHub
May 20, 2022
Browse files
Add nsmc task (Korean language task)
- Add nsmc dataset and task modules
parents
4887d9d3
5e7738d7
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
10 deletions
+177
-10
lm_eval/datasets/nsmc/nsmc.py
lm_eval/datasets/nsmc/nsmc.py
+98
-0
lm_eval/models/gpt2.py
lm_eval/models/gpt2.py
+2
-2
lm_eval/tasks/__init__.py
lm_eval/tasks/__init__.py
+7
-8
lm_eval/tasks/nsmc.py
lm_eval/tasks/nsmc.py
+70
-0
No files found.
lm_eval/datasets/nsmc/nsmc.py
0 → 100644
View file @
cc88a0cf
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Naver movie review corpus for binary sentiment classification"""
import
csv
import
datasets
# BibTeX entry for the corpus.
_CITATION = """\
@InProceedings{Park:2016,
title = "Naver Sentiment Movie Corpus",
author = "Lucy Park",
year = "2016",
howpublished = {\\url{https://github.com/e9t/nsmc}}
}
"""

# Human-readable summary of the dataset.
_DESCRIPTION = """\
This is a movie review dataset in the Korean language. Reviews were scraped from Naver movies. The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011.
"""

_HOMEPAGE = "https://github.com/e9t/nsmc/"

_LICENSE = "CC0 1.0 Universal (CC0 1.0)"

# Base location of the raw rating files.
_URL = "https://raw.githubusercontent.com/e9t/nsmc/master/"

# Split name -> download URL; each split lives in "ratings_<split>.txt".
_URLs = {
    split_name: _URL + "ratings_" + split_name + ".txt"
    for split_name in ("train", "test")
}
# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
class NSMC(datasets.GeneratorBasedBuilder):
    """Korean Naver movie review dataset."""

    VERSION = datasets.Version("1.1.0")

    def _info(self):
        """Describe the dataset: feature schema, homepage, license and citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "document": datasets.Value("string"),
                    # Class index follows list order: 0 -> "부정", 1 -> "긍정".
                    "label": datasets.ClassLabel(names=["부정", "긍정"]),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the rating files and declare the TRAIN and TEST splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; it is asked
                to fetch every URL in ``_URLs``.

        Returns:
            A list with one ``SplitGenerator`` per split. Each ``gen_kwargs``
            carries the arguments passed on to ``_generate_examples``.
        """
        downloaded_files = dl_manager.download_and_extract(_URLs)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": downloaded_files["train"],
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    # `_URLs` only defines "train" and "test"; looking up
                    # "validation" here raised KeyError.
                    "filepath": downloaded_files["test"],
                    "split": "test",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(index, example)`` pairs read from a tab-separated file.

        Args:
            filepath: path of the downloaded ratings file.
            split: split name passed through ``gen_kwargs``; not used here.

        Yields:
            A running integer key and a dict with ``"id"``, ``"document"``
            and an integer ``"label"`` taken from the first three columns.
        """
        with open(filepath, encoding="utf-8") as f:
            # Skip the first line (column header); the default keeps an empty
            # file from raising StopIteration inside the generator.
            next(f, None)
            # Review text is free-form rather than CSV-quoted (assumption based
            # on the raw .txt source -- confirm), so quote handling is disabled;
            # otherwise a stray '"' can swallow tabs and line breaks.
            reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for id_, row in enumerate(reader):
                yield id_, {
                    "id": row[0],
                    "document": row[1],
                    "label": int(row[2]),
                }
lm_eval/models/gpt2.py
View file @
cc88a0cf
...
...
@@ -75,7 +75,7 @@ class HFLM(BaseLM):
def tok_encode(self, string: str):
    """Encode ``string`` with the tokenizer, without adding special tokens."""
    encoded = self.tokenizer.encode(string, add_special_tokens=False)
    return encoded
def tok_decode(self, tokens):
    """Decode ``tokens`` back into text using the tokenizer."""
    decoded = self.tokenizer.decode(tokens)
    return decoded
...
...
@@ -89,7 +89,7 @@ class HFLM(BaseLM):
"""
with
torch
.
no_grad
():
return
self
.
gpt2
(
inps
)[
0
][:,
:,
:
50257
]
def
_model_generate
(
self
,
context
,
max_length
,
eos_token_id
):
return
self
.
gpt2
.
generate
(
context
,
...
...
lm_eval/tasks/__init__.py
View file @
cc88a0cf
...
...
@@ -52,6 +52,7 @@ from . import asdiv
from
.
import
gsm8k
from
.
import
storycloze
from
.
import
klue
from
.
import
nsmc
########################################
# Translation tasks
...
...
@@ -103,13 +104,13 @@ TASK_REGISTRY = {
"record"
:
superglue
.
ReCoRD
,
"wic"
:
superglue
.
WordsInContext
,
"wsc"
:
superglue
.
SGWinogradSchemaChallenge
,
# Order by benchmark/genre?
"coqa"
:
coqa
.
CoQA
,
"drop"
:
drop
.
DROP
,
"lambada"
:
lambada
.
LAMBADA
,
"lambada_cloze"
:
lambada_cloze
.
LAMBADA_cloze
,
# multilingual lambada
**
lambada_multilingual
.
construct_tasks
(),
...
...
@@ -229,7 +230,7 @@ TASK_REGISTRY = {
"pile_ubuntu-irc"
:
pile
.
PileUbuntuIrc
,
"pile_wikipedia"
:
pile
.
PileWikipedia
,
"pile_youtubesubtitles"
:
pile
.
PileYoutubeSubtitles
,
# BLiMP
"blimp_adjunct_island"
:
blimp
.
BlimpAdjunctIsland
,
"blimp_anaphor_gender_agreement"
:
blimp
.
BlimpAnaphorGenderAgreement
,
...
...
@@ -298,14 +299,12 @@ TASK_REGISTRY = {
"blimp_wh_vs_that_no_gap_long_distance"
:
blimp
.
BlimpWhVsThatNoGapLongDistance
,
"blimp_wh_vs_that_with_gap"
:
blimp
.
BlimpWhVsThatWithGap
,
"blimp_wh_vs_that_with_gap_long_distance"
:
blimp
.
BlimpWhVsThatWithGapLongDistance
,
"klue_sts"
:
klue
.
STS
,
"nsmc"
:
nsmc
.
NSMC
,
# Requires manual download of data.
# "storycloze_2016": storycloze.StoryCloze2016,
# "storycloze_2018": storycloze.StoryCloze2018,
# "sat": sat.SATAnalogies,
# KLUE
"klue_sts"
:
klue
.
STS
}
...
...
@@ -325,7 +324,7 @@ def get_task_name_from_object(task_object):
for
name
,
class_
in
TASK_REGISTRY
.
items
():
if
class_
is
task_object
:
return
name
# this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
return
task_object
.
EVAL_HARNESS_NAME
if
hasattr
(
task_object
,
"EVAL_HARNESS_NAME"
)
else
type
(
task_object
).
__name__
...
...
lm_eval/tasks/nsmc.py
0 → 100644
View file @
cc88a0cf
"""
NSMC:
"""
import
numpy
as
np
from
lm_eval.base
import
rf
,
Task
from
lm_eval.metrics
import
mean
,
matthews_corrcoef
,
f1_score
,
yesno
from
lm_eval.utils
import
general_detokenize
# BibTeX entry for the Naver Sentiment Movie Corpus. The previous text was the
# HellaSwag paper's entry with "NSMC" substituted into the title.
_CITATION = """
@InProceedings{Park:2016,
  title        = "Naver Sentiment Movie Corpus",
  author       = "Lucy Park",
  year         = "2016",
  howpublished = {\\url{https://github.com/e9t/nsmc}}
}
"""
class NSMC(Task):
    """Binary sentiment task over the ``nsmc`` dataset.

    A document is scored by requesting the log-likelihood of the two answer
    strings " 긍정" and " 부정" after the prompt and comparing them.
    """

    VERSION = 0
    DATASET_PATH = "nsmc"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        """Return the "train" split as a list, materialising it on first use."""
        if self._training_docs is not None:
            return self._training_docs
        self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        """Return the dataset's "test" split, which serves as validation data."""
        return self.dataset["test"]

    def doc_to_text(self, doc):
        """Build the prompt: a question line, the review, then the answer cue."""
        prompt_template = "다음 문장은 긍정일까요 부정일까요?\n{}\n정답:"
        review = general_detokenize(doc["document"])
        return prompt_template.format(review)

    def doc_to_target(self, doc):
        """Return the gold answer string, prefixed with a single space."""
        answer_by_label = {1: "긍정", 0: "부정"}
        return " {}".format(answer_by_label[doc["label"]])

    def construct_requests(self, doc, ctx):
        """Create one log-likelihood request per candidate answer.

        Returns the pair ``(positive, negative)``; only the first element of
        each ``rf.loglikelihood`` result is kept.
        """
        positive_ll, _ = rf.loglikelihood(ctx, " 긍정")
        negative_ll, _ = rf.loglikelihood(ctx, " 부정")
        return positive_ll, negative_ll

    def process_results(self, doc, results):
        """Score one document: 1/True when the likelier answer matches the label."""
        positive_ll, negative_ll = results
        # True means "긍정" was likelier; this compares equal to label 1.
        predicted_positive = positive_ll > negative_ll
        return {"acc": predicted_positive == doc["label"]}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment