Unverified Commit b03d5dc5 authored by Fengzhe Zhou's avatar Fengzhe Zhou Committed by GitHub
Browse files

[Sync] Sync Internal (#941)

parent bbec7d87
......@@ -12,36 +12,9 @@ compassbench_v1_reason_groups = [
summarizer = dict(
dataset_abbrs=[
['reasonbench', 'acc_origin'],
['reasonbench_cn_circular', 'acc_origin'],
['reasonbench_en_circular', 'acc_origin'],
['reasonbench_cn_commonsense_circular', 'acc_origin'],
['reasonbench_cn_abductive_circular', 'acc_origin'],
['reasonbench_cn_deductive_circular', 'acc_origin'],
['reasonbench_cn_inductive_circular', 'acc_origin'],
['reasonbench_en_commonsense_circular', 'acc_origin'],
['reasonbench_en_abductive_circular', 'acc_origin'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
['reasonbench_en_inductive_circular', 'acc_origin'],
['reasonbench_cn_commonsense_circular', 'acc_origin'],
['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
['reasonbench_en_commonsense_circular', 'acc_origin'],
['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
['reasonbench_en_inductive_deer_circular', 'acc_origin'],
['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],
['reasonbench', 'perf_circular'],
['reasonbench_cn_circular', 'perf_circular'],
['reasonbench_en_circular', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_cn_abductive_circular', 'perf_circular'],
['reasonbench_cn_deductive_circular', 'perf_circular'],
......@@ -50,18 +23,6 @@ summarizer = dict(
['reasonbench_en_abductive_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_circular', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
['reasonbench_en_commonsense_circular', 'perf_circular'],
['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_deer_circular', 'perf_circular'],
['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
],
summary_groups=compassbench_v1_reason_groups,
)
......@@ -39,6 +39,22 @@ _base_summary_groups = [
['plugin_eval-review_str_v1', 'review_quality'],
],
},
{
'name': 'plugin_eval_one_review',
'subsets': [
['plugin_eval-instruct_v1', 'format_metric'],
['plugin_eval-instruct_v1', 'args_em_metric'],
['plugin_eval-plan_str_v1', 'f1_score'],
['plugin_eval-plan_json_v1', 'f1_score'],
['plugin_eval-reason_str_v1', 'thought'],
['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],
['plugin_eval-retrieve_str_v1', 'name'],
['plugin_eval-reason_retrieve_understand_json_v1', 'name'],
['plugin_eval-understand_str_v1', 'args'],
['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
['plugin_eval-review_str_v1', 'review_quality'],
]
},
{
'name': 'plugin_eval',
'subsets': [
......@@ -53,7 +69,6 @@ _base_summary_groups = [
['plugin_eval-understand_str_v1', 'args'],
['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
['plugin_eval-review_str_v1', 'review_quality'],
['copy_plugin_eval-review_str_v1', 'naive_average'], # a hack for review * 2
]
},
]
......
......@@ -20,16 +20,12 @@ import functools
import random
import re
try:
import immutabledict
except ImportError:
immutabledict = None
import nltk
WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 'agreement', 'studio', 'coach', 'assist', 'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower', 'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain', 'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war', 'figure', 'fee', 'spray', 'user', 'dirt', 'shot', 'task', 'stick', 'friend', 'software', 'promotion', 'interaction', 'surround', 'block', 'purpose', 'practice', 'conflict', 'routine', 'requirement', 'bonus', 'hole', 'state', 'junior', 'sweet', 'catch', 'tear', 'fold', 'wall', 'editor', 'life', 'position', 'pound', 'respect', 'bathroom', 'coat', 'script', 'job', 'teach', 'birth', 'view', 'resolve', 'theme', 'employee', 'doubt', 'market', 'education', 'serve', 'recover', 'tone', 'harm', 'miss', 'union', 'understanding', 'cow', 'river', 'association', 'concept', 'training', 'recipe', 'relationship', 'reserve', 'depression', 'proof', 'hair', 'revenue', 'independent', 'lift', 'assignment', 'temporary', 'amount', 'loss', 'edge', 'track', 'check', 'rope', 'estimate', 'pollution', 'stable', 'message', 'delivery', 'perspective', 'mirror', 'assistant', 'representative', 'witness', 'nature', 'judge', 'fruit', 'tip', 'devil', 'town', 'emergency', 'upper', 'drop', 'stay', 'human', 'neck', 'speaker', 'network', 'sing', 'resist', 'league', 'trip', 'signature', 'lawyer', 'importance', 'gas', 'choice', 'engineer', 'success', 'part', 'external', 'worker', 'simple', 'quarter', 'student', 'heart', 'pass', 'spite', 'shift', 'rough', 'lady', 'grass', 'community', 'garage', 'youth', 'standard', 'skirt', 'promise', 'blind', 'television', 'disease', 'commission', 'positive', 'energy', 'calm', 'presence', 'tune', 'basis', 'preference', 'head', 'common', 'cut', 'somewhere', 'presentation', 'current', 'thought', 'revolution', 'effort', 'master', 'implement', 'republic', 'floor', 'principle', 'stranger', 'shoulder', 'grade', 'button', 'tennis', 'police', 'collection', 'account', 'register', 'glove', 'divide', 'professor', 'chair', 'priority', 'combine', 'peace', 'extension', 'maybe', 'evening', 'frame', 'sister', 'wave', 'code', 'application', 'mouse', 'match', 'counter', 'bottle', 'half', 'cheek', 'resolution', 'back', 'knowledge', 'make', 'discussion', 'screw', 'length', 'accident', 'battle', 'dress', 'knee', 'log', 'package', 'it', 'turn', 'hearing', 'newspaper', 'layer', 'wealth', 'profile', 'imagination', 'answer', 'weekend', 'teacher', 'appearance', 'meet', 'bike', 'rise', 'belt', 'crash', 'bowl', 'equivalent', 'support', 'image', 'poem', 'risk', 'excitement', 'remote', 'secretary', 'public', 'produce', 'plane', 'display', 'money', 'sand', 'situation', 'punch', 'customer', 'title', 'shake', 'mortgage', 'option', 'number', 'pop', 'window', 'extent', 'nothing', 'experience', 'opinion', 'departure', 'dance', 'indication', 'boy', 'material', 'band', 'leader', 'sun', 'beautiful', 'muscle', 'farmer', 'variety', 'fat', 'handle', 'director', 'opportunity', 'calendar', 'outside', 'pace', 'bath', 'fish', 'consequence', 'put', 'owner', 'go', 'doctor', 'information', 'share', 'hurt', 'protection', 'career', 'finance', 'force', 'golf', 'garbage', 'aspect', 'kid', 'food', 'boot', 'milk', 'respond', 'objective', 'reality', 'raw', 'ring', 'mall', 'one', 'impact', 'area', 'news', 'international', 'series', 'impress', 'mother', 'shelter', 'strike', 'loan', 'month', 'seat', 'anything', 'entertainment', 'familiar', 'clue', 'year', 'glad', 'supermarket', 'natural', 'god', 'cost', 'conversation', 'tie', 'ruin', 'comfort', 'earth', 'storm', 'percentage', 'assistance', 'budget', 'strength', 'beginning', 'sleep', 'other', 'young', 'unit', 'fill', 'store', 'desire', 'hide', 'value', 'cup', 'maintenance', 'nurse', 'function', 'tower', 'role', 'class', 'camera', 'database', 'panic', 'nation', 'basket', 'ice', 'art', 'spirit', 'chart', 'exchange', 'feedback', 'statement', 'reputation', 'search', 'hunt', 'exercise', 'nasty', 'notice', 'male', 'yard', 'annual', 'collar', 'date', 'platform', 'plant', 'fortune', 'passion', 'friendship', 'spread', 'cancer', 'ticket', 'attitude', 'island', 'active', 'object', 'service', 'buyer', 'bite', 'card', 'face', 'steak', 'proposal', 'patient', 'heat', 'rule', 'resident', 'broad', 'politics', 'west', 'knife', 'expert', 'girl', 'design', 'salt', 'baseball', 'grab', 'inspection', 'cousin', 'couple', 'magazine', 'cook', 'dependent', 'security', 'chicken', 'version', 'currency', 'ladder', 'scheme', 'kitchen', 'employment', 'local', 'attention', 'manager', 'fact', 'cover', 'sad', 'guard', 'relative', 'county', 'rate', 'lunch', 'program', 'initiative', 'gear', 'bridge', 'breast', 'talk', 'dish', 'guarantee', 'beer', 'vehicle', 'reception', 'woman', 'substance', 'copy', 'lecture', 'advantage', 'park', 'cold', 'death', 'mix', 'hold', 'scale', 'tomorrow', 'blood', 'request', 'green', 'cookie', 'church', 'strip', 'forever', 'beyond', 'debt', 'tackle', 'wash', 'following', 'feel', 'maximum', 'sector', 'sea', 'property', 'economics', 'menu', 'bench', 'try', 'language', 'start', 'call', 'solid', 'address', 'income', 'foot', 'senior', 'honey', 'few', 'mixture', 'cash', 'grocery', 'link', 'map', 'form', 'factor', 'pot', 'model', 'writer', 'farm', 'winter', 'skill', 'anywhere', 'birthday', 'policy', 'release', 'husband', 'lab', 'hurry', 'mail', 'equipment', 'sink', 'pair', 'driver', 'consideration', 'leather', 'skin', 'blue', 'boat', 'sale', 'brick', 'two', 'feed', 'square', 'dot', 'rush', 'dream', 'location', 'afternoon', 'manufacturer', 'control', 'occasion', 'trouble', 'introduction', 'advice', 'bet', 'eat', 'kill', 'category', 'manner', 'office', 'estate', 'pride', 'awareness', 'slip', 'crack', 'client', 'nail', 'shoot', 'membership', 'soft', 'anybody', 'web', 'official', 'individual', 'pizza', 'interest', 'bag', 'spell', 'profession', 'queen', 'deal', 'resource', 'ship', 'guy', 'chocolate', 'joint', 'formal', 'upstairs', 'car', 'resort', 'abroad', 'dealer', 'associate', 'finger', 'surgery', 'comment', 'team', 'detail', 'crazy', 'path', 'tale', 'initial', 'arm', 'radio', 'demand', 'single', 'draw', 'yellow', 'contest', 'piece', 'quote', 'pull', 'commercial', 'shirt', 'contribution', 'cream', 'channel', 'suit', 'discipline', 'instruction', 'concert', 'speech', 'low', 'effective', 'hang', 'scratch', 'industry', 'breakfast', 'lay', 'join', 'metal', 'bedroom', 'minute', 'product', 'rest', 'temperature', 'many', 'give', 'argument', 'print', 'purple', 'laugh', 'health', 'credit', 'investment', 'sell', 'setting', 'lesson', 'egg', 'middle', 'marriage', 'level', 'evidence', 'phrase', 'love', 'self', 'benefit', 'guidance', 'affect', 'you', 'dad', 'anxiety', 'special', 'boyfriend', 'test', 'blank', 'payment', 'soup', 'obligation', 'reply', 'smile', 'deep', 'complaint', 'addition', 'review', 'box', 'towel', 'minor', 'fun', 'soil', 'issue', 'cigarette', 'internet', 'gain', 'tell', 'entry', 'spare', 'incident', 'family', 'refuse', 'branch', 'can', 'pen', 'grandfather', 'constant', 'tank', 'uncle', 'climate', 'ground', 'volume', 'communication', 'kind', 'poet', 'child', 'screen', 'mine', 'quit', 'gene', 'lack', 'charity', 'memory', 'tooth', 'fear', 'mention', 'marketing', 'reveal', 'reason', 'court', 'season', 'freedom', 'land', 'sport', 'audience', 'classroom', 'law', 'hook', 'win', 'carry', 'eye', 'smell', 'distribution', 'research', 'country', 'dare', 'hope', 'whereas', 'stretch', 'library', 'if', 'delay', 'college', 'plastic', 'book', 'present', 'use', 'worry', 'champion', 'goal', 'economy', 'march', 'election', 'reflection', 'midnight', 'slide', 'inflation', 'action', 'challenge', 'guitar', 'coast', 'apple', 'campaign', 'field', 'jacket', 'sense', 'way', 'visual', 'remove', 'weather', 'trash', 'cable', 'regret', 'buddy', 'beach', 'historian', 'courage', 'sympathy', 'truck', 'tension', 'permit', 'nose', 'bed', 'son', 'person', 'base', 'meat', 'usual', 'air', 'meeting', 'worth', 'game', 'independence', 'physical', 'brief', 'play', 'raise', 'board', 'she', 'key', 'writing', 'pick', 'command', 'party', 'yesterday', 'spring', 'candidate', 'physics', 'university', 'concern', 'development', 'change', 'string', 'target', 'instance', 'room', 'bitter', 'bird', 'football', 'normal', 'split', 'impression', 'wood', 'long', 'meaning', 'stock', 'cap', 'leadership', 'media', 'ambition', 'fishing', 'essay', 'salad', 'repair', 'today', 'designer', 'night', 'bank', 'drawing', 'inevitable', 'phase', 'vast', 'chip', 'anger', 'switch', 'cry', 'twist', 'personality', 'attempt', 'storage', 'being', 'preparation', 'bat', 'selection', 'white', 'technology', 'contract', 'side', 'section', 'station', 'till', 'structure', 'tongue', 'taste', 'truth', 'difficulty', 'group', 'limit', 'main', 'move', 'feeling', 'light', 'example', 'mission', 'might', 'wait', 'wheel', 'shop', 'host', 'classic', 'alternative', 'cause', 'agent', 'consist', 'table', 'airline', 'text', 'pool', 'craft', 'range', 'fuel', 'tool', 'partner', 'load', 'entrance', 'deposit', 'hate', 'article', 'video', 'summer', 'feature', 'extreme', 'mobile', 'hospital', 'flight', 'fall', 'pension', 'piano', 'fail', 'result', 'rub', 'gap', 'system', 'report', 'suck', 'ordinary', 'wind', 'nerve', 'ask', 'shine', 'note', 'line', 'mom', 'perception', 'brother', 'reference', 'bend', 'charge', 'treat', 'trick', 'term', 'homework', 'bake', 'bid', 'status', 'project', 'strategy', 'orange', 'let', 'enthusiasm', 'parent', 'concentrate', 'device', 'travel', 'poetry', 'business', 'society', 'kiss', 'end', 'vegetable', 'employ', 'schedule', 'hour', 'brave', 'focus', 'process', 'movie', 'illegal', 'general', 'coffee', 'ad', 'highway', 'chemistry', 'psychology', 'hire', 'bell', 'conference', 'relief', 'show', 'neat', 'funny', 'weight', 'quality', 'club', 'daughter', 'zone', 'touch', 'tonight', 'shock', 'burn', 'excuse', 'name', 'survey', 'landscape', 'advance', 'satisfaction', 'bread', 'disaster', 'item', 'hat', 'prior', 'shopping', 'visit', 'east', 'photo', 'home', 'idea', 'father', 'comparison', 'cat', 'pipe', 'winner', 'count', 'lake', 'fight', 'prize', 'foundation', 'dog', 'keep', 'ideal', 'fan', 'struggle', 'peak', 'safety', 'solution', 'hell', 'conclusion', 'population', 'strain', 'alarm', 'measurement', 'second', 'train', 'race', 'due', 'insurance', 'boss', 'tree', 'monitor', 'sick', 'course', 'drag', 'appointment', 'slice', 'still', 'care', 'patience', 'rich', 'escape', 'emotion', 'royal', 'female', 'childhood', 'government', 'picture', 'will', 'sock', 'big', 'gate', 'oil', 'cross', 'pin', 'improvement', 'championship', 'silly', 'help', 'sky', 'pitch', 'man', 'diamond', 'most', 'transition', 'work', 'science', 'committee', 'moment', 'fix', 'teaching', 'dig', 'specialist', 'complex', 'guide', 'people', 'dead', 'voice', 'original', 'break', 'topic', 'data', 'degree', 'reading', 'recording', 'bunch', 'reach', 'judgment', 'lie', 'regular', 'set', 'painting', 'mode', 'list', 'player', 'bear', 'north', 'wonder', 'carpet', 'heavy', 'officer', 'negative', 'clock', 'unique', 'baby', 'pain', 'assumption', 'disk', 'iron', 'bill', 'drawer', 'look', 'double', 'mistake', 'finish', 'future', 'brilliant', 'contact', 'math', 'rice', 'leave', 'restaurant', 'discount', 'sex', 'virus', 'bit', 'trust', 'event', 'wear', 'juice', 'failure', 'bug', 'context', 'mud', 'whole', 'wrap', 'intention', 'draft', 'pressure', 'cake', 'dark', 'explanation', 'space', 'angle', 'word', 'efficiency', 'management', 'habit', 'star', 'chance', 'finding', 'transportation', 'stand', 'criticism', 'flow', 'door', 'injury', 'insect', 'surprise', 'apartment'] # pylint: disable=line-too-long
# ISO 639-1 codes to language names.
LANGUAGE_CODES = immutabledict.immutabledict({
LANGUAGE_CODES = {
'en': 'English',
'es': 'Spanish',
'pt': 'Portuguese',
......@@ -60,7 +56,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
'pa': 'Punjabi',
'ml': 'Malayalam',
'fi': 'Finnish',
})
}
_ALPHABETS = '([A-Za-z])'
_PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
......
......@@ -24,3 +24,15 @@ def TheoremQA_postprocess(text: str) -> str:
else:
text = matches[0].strip().strip('.,?!\"\';:')
return text
def TheoremQA_postprocess_v2(text: str) -> str:
prediction = text.strip().strip('\n').split('\n')[-1]
tmp = ''
for entry in prediction.split(' ')[::-1]:
if entry == 'is' or entry == 'be' or entry == 'are' or entry.endswith(
':'):
break
tmp = entry + ' ' + tmp
prediction = tmp.strip().strip('.')
return prediction
import json
import os.path as osp
from datasets import Dataset
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
......@@ -71,6 +71,32 @@ class hellaswagDataset_V3(BaseDataset):
return dataset
@LOAD_DATASET.register_module()
class hellaswagDatasetwithICE(BaseDataset):
@staticmethod
def load(path):
dataset_dict = DatasetDict()
for split, filename in [
['train', 'hellaswag_train_sampled25.jsonl'],
['val', 'hellaswag.jsonl'],
]:
dataset = []
with open(osp.join(path, filename), 'r', encoding='utf-8') as f:
for line in f:
data = json.loads(line)
dataset.append({
'ctx': data['query'].split(': ', 1)[-1],
'A': data['choices'][0],
'B': data['choices'][1],
'C': data['choices'][2],
'D': data['choices'][3],
'label': 'ABCD'[data['gold']],
})
dataset_dict[split] = Dataset.from_list(dataset)
return dataset_dict
class hellaswagDatasetClean(BaseDataset):
# load the contamination annotations of CEval from
......
......@@ -156,10 +156,13 @@ def humaneval_postprocess_v2(text: str) -> str:
"""This is an advanced version of previous postprocess to handle more
situations, better to use this one."""
try:
# for chatGLM raw text
text = eval(text)
# for chatGLM related text
eval_text = eval(text)
except Exception:
pass
else:
if isinstance(eval_text, str):
text = eval_text
text = text.lstrip('\n')
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
......
......@@ -77,9 +77,10 @@ class NQEvaluator(BaseEvaluator):
cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans]))
if int(any([cand == pred for cand in cand_ans])):
detail['correct'] = True
# is_correct = any([cand == pred for cand in cand_ans])
is_correct = any([cand in pred for cand in cand_ans])
cnt += int(is_correct)
detail['correct'] = is_correct
details.append(detail)
score = cnt / len(predictions) * 100
......
import json
import os
from datasets import Dataset
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
......@@ -20,12 +20,12 @@ class winograndeDataset(BaseDataset):
for line in f:
line = json.loads(line)
prompt = line['sentence']
continue_prompt = prompt.split('_')
continue_prompt = prompt.split('_')[1]
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': line['answer'],
'cont': continue_prompt[1]
'cont': continue_prompt,
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
......@@ -44,13 +44,43 @@ class winograndeDataset_V2(BaseDataset):
for line in f:
line = json.loads(line)
prompt = line['sentence']
continue_prompt = prompt.split('_')[1]
answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': answer,
'cont': continue_prompt,
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list
@LOAD_DATASET.register_module()
class winograndeDataset_V3(BaseDataset):
"""Disconnect from Huggingface, winograndeDataset_V2."""
@staticmethod
def load(path):
dataset_dict = DatasetDict()
for split in ['train_xs', 'dev']:
filename = os.path.join(path, f'{split}.jsonl')
dataset_list = []
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
prompt = line['sentence']
continue_prompt = prompt.split('_')[1]
answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': answer,
'cont': continue_prompt,
}
dataset_list.append(data_item)
dataset_dict[split] = Dataset.from_list(dataset_list)
return dataset_dict
......@@ -13,6 +13,7 @@ from .huggingface import HuggingFace # noqa: F401, F403
from .huggingface import HuggingFaceCausalLM # noqa: F401, F403
from .huggingface import HuggingFaceChatGLM3 # noqa: F401, F403
from .intern_model import InternLM # noqa: F401, F403
from .krgpt_api import KrGPT # noqa: F401
from .lightllm_api import LightllmAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401, F403
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
......
import json
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
import requests
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
@MODELS.register_module()
class KrGPT(BaseAPIModel):
is_api: bool = True
def __init__(
self,
path: str = 'KrGPT',
url: str = 'http://101.69.162.5:9300/v1/chat/completions',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
generation_kwargs: Optional[Dict] = dict(),
):
super().__init__(
path=path,
max_seq_len=max_seq_len,
meta_template=meta_template,
retry=retry,
generation_kwargs=generation_kwargs,
)
self.logger = get_logger()
self.url = url
self.generation_kwargs = generation_kwargs
self.max_out_len = self.generation_kwargs.get('max_new_tokens', 1024)
def generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[str]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[self.max_out_len] * len(inputs)))
return results
def _generate(self,
input: PromptType,
max_out_len: int,
temperature: float = 0.0) -> str:
"""Generate results given a list of inputs.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
max_num_retries = 0
while max_num_retries < self.retry:
header = {'content-type': 'application/json'}
try:
data = dict(messages=messages)
raw_response = requests.post(self.url,
headers=header,
data=json.dumps(data))
except requests.ConnectionError:
self.logger.error('Got connection error, retrying...')
continue
try:
response = raw_response.json()
except requests.JSONDecodeError:
self.logger.error('JsonDecode error, got',
str(raw_response.content))
continue
try:
return response['choices'][0]['message']['content'].strip()
except KeyError:
self.logger.error('Find error message in response: ',
str(response))
# if 'error' in response:
# if response['error']['code'] == 'rate_limit_exceeded':
# time.sleep(1)
# continue
# elif response['error']['code'] == 'insufficient_quota':
# self.invalid_keys.add(key)
# self.logger.warn(f'insufficient_quota key: {key}')
# continue
# self.logger.error('Find error message in response: ',
# str(response['error']))
max_num_retries += 1
raise RuntimeError('Calling OpenAI failed after retrying for '
f'{max_num_retries} times. Check the logs for '
'details.')
......@@ -415,6 +415,13 @@ class OpenAIAllesAPIN(OpenAI):
self.logger.error(data)
else:
return choices[0]['message']['content'].strip()
try:
match = re.match(r'Error code: \d+ - (.*)', response['data'])
err = eval(match.group(1))['error']
if err['code'] == 'content_filter' and err['status'] == 400:
return err['message']
except Exception:
pass
self.logger.error(response['msg'])
self.logger.error(response)
time.sleep(1)
......
import datetime
import json
import os
import os.path as osp
import random
......@@ -38,6 +39,7 @@ class DLCRunner(BaseRunner):
task: ConfigDict,
aliyun_cfg: ConfigDict,
max_num_workers: int = 32,
eval_with_gpu: list = ['plugin_eval'],
retry: int = 2,
debug: bool = False,
lark_bot_url: str = None):
......@@ -46,6 +48,8 @@ class DLCRunner(BaseRunner):
self.max_num_workers = max_num_workers
self.retry = retry
self.eval_with_gpu = eval_with_gpu
logger = get_logger()
logger.warning(
'To ensure the integrity of the log results, the log displayed '
......@@ -93,19 +97,62 @@ class DLCRunner(BaseRunner):
num_gpus = task.num_gpus
task_name = task.name
is_eval_task = 'OpenICLEval' in task_name
if is_eval_task and num_gpus == 0:
for check_name in self.eval_with_gpu:
if check_name in task_name:
num_gpus = 1
break
# Dump task config to file
mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py'
pwd = os.getcwd()
try:
cfg.dump(param_file)
if self.aliyun_cfg.get('bashrc_path') is not None:
# using user's conda env
bashrc_path = self.aliyun_cfg['bashrc_path']
assert osp.exists(bashrc_path)
assert self.aliyun_cfg.get('conda_env_name') is not None
conda_env_name = self.aliyun_cfg['conda_env_name']
shell_cmd = (f'source {bashrc_path}; '
f'conda activate {conda_env_name}; ')
else:
# using public conda env
# users can also set `python_env_path` to their
# own env python path
assert self.aliyun_cfg.get('python_env_path') is not None
shell_cmd = (
f'export PATH={self.aliyun_cfg["python_env_path"]}/bin:$PATH; ' # noqa: E501
f'export PYTHONPATH={pwd}:$PYTHONPATH; ')
huggingface_cache = self.aliyun_cfg.get('huggingface_cache')
if huggingface_cache is not None:
# HUGGINGFACE_HUB_CACHE is a Legacy env variable, here we set
# `HF_HUB_CACHE` and `HUGGINGFACE_HUB_CACHE` for bc
shell_cmd += f'export HF_HUB_CACHE={huggingface_cache}; '
shell_cmd += f'export HUGGINGFACE_HUB_CACHE={huggingface_cache}; ' # noqa: E501
torch_cache = self.aliyun_cfg.get('torch_cache')
if torch_cache is not None:
shell_cmd += f'export TORCH_HOME={torch_cache}; '
hf_offline = self.aliyun_cfg.get('hf_offline', True)
if hf_offline:
shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; ' # noqa: E501
http_proxy = self.aliyun_cfg.get('http_proxy')
if http_proxy is not None:
shell_cmd += f'export http_proxy={http_proxy}; export https_proxy={http_proxy}; ' # noqa: E501
shell_cmd += f'export HTTP_PROXY={http_proxy}; export HTTPS_PROXY={http_proxy}; ' # noqa: E501
# Build up DLC command
pwd = os.getcwd()
shell_cmd = (
f'source {self.aliyun_cfg["bashrc_path"]}; '
f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
f'cd {pwd}; '
'{task_cmd}')
hf_endpoint = self.aliyun_cfg.get('hf_endpoint')
if hf_endpoint is not None:
shell_cmd += f'export HF_ENDPOINT={hf_endpoint}; '
shell_cmd += f'cd {pwd}; '
shell_cmd += '{task_cmd}'
tmpl = ('dlc create job'
f" --command '{shell_cmd}'"
......@@ -114,11 +161,10 @@ class DLCRunner(BaseRunner):
f" -c {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
' --worker_count 1'
f' --worker_cpu {max(num_gpus * 6, 8)}'
f' --worker_cpu {max(num_gpus * 8, 32)}'
f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 64, 48)}'
f" --worker_image {self.aliyun_cfg['worker_image']}"
' --interactive')
f' --worker_memory {max(num_gpus * 128, 256)}'
f" --worker_image {self.aliyun_cfg['worker_image']}")
get_cmd = partial(task.get_command,
cfg_path=param_file,
template=tmpl)
......@@ -139,77 +185,64 @@ class DLCRunner(BaseRunner):
time.sleep(random.randint(0, 10))
def _run_within_retry():
try:
process = subprocess.Popen(cmd,
shell=True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
job_id = None
job_allocated = False
job_finished = False
last_end_time = datetime.datetime.now().strftime(
'%Y-%m-%dT%H:%M:%SZ')
while True:
if not job_allocated:
line = process.stdout.readline()
if not line:
break
match = re.search(r'(dlc[0-9a-z]+)', line)
if match and job_id is None:
job_id = match.group(1)
stdout.write(line)
match = re.search(r'Job .* is \[Running\]', line)
if match:
job_allocated = True
else:
try:
process.wait(10)
except subprocess.TimeoutExpired:
pass
else:
job_finished = True
if job_finished:
this_end_time = datetime.datetime.now(
).strftime('%Y-%m-%dT%H:%M:%SZ')
else:
this_end_time = (
datetime.datetime.now() -
datetime.timedelta(seconds=10)
).strftime('%Y-%m-%dT%H:%M:%SZ')
logs_cmd = (
'dlc logs'
output = subprocess.getoutput(cmd)
match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output)
if match is None:
raise RuntimeError(
f'Failed to launch dlc job for {output}')
else:
job_id = match.group(1)
stdout.write(output)
pod_create_time = None
pri_time = None
initial_time = datetime.datetime.now()
while True:
# 1. Avoid to request dlc too frequently.
# 2. DLC job may not be ready immediately after creation.
for _ in range(5):
time.sleep(2)
try:
job_info = json.loads(
subprocess.getoutput(f'dlc get job {job_id}'))
break
except: # noqa: E722
pass
else:
raise RuntimeError(
f'Failed to get job info for {job_id}')
status = job_info['Status']
if status == 'Failed':
return -1
elif status == 'Succeeded':
return 0
elif status != 'Running':
continue
# The pod time could be different from the real time.
# Therefore we need to extract the pod start time from
# the `job_info` and calculate the `start_time` and
# `end_time` in pod.
if pod_create_time is None:
pod_create_time = job_info['GmtCreateTime']
pri_time = pod_create_time
pod_create_time = datetime.datetime.strptime(
pod_create_time, '%Y-%m-%dT%H:%M:%SZ')
elasped_time = datetime.datetime.now() - initial_time
cur_time = (pod_create_time +
elasped_time).strftime('%Y-%m-%dT%H:%M:%SZ')
logs_cmd = ('dlc logs'
f' {job_id} {job_id}-worker-0'
f' --start_time {last_end_time}'
f' --end_time {this_end_time}'
f" -c {self.aliyun_cfg['dlc_config_path']}")
log_process = subprocess.Popen(
logs_cmd,
shell=True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
log_output, log_err = log_process.communicate()
log_output = '\n'.join(log_output.split('\n')[2:])
stdout.write(log_output)
last_end_time = this_end_time
f" -c {self.aliyun_cfg['dlc_config_path']}"
f' --start_time {pri_time}'
f' --end_time {cur_time}')
log_output = subprocess.getoutput(logs_cmd)
if '[WARN] No logs found for the pod' not in log_output:
pri_time = cur_time
stdout.write(log_output)
stdout.flush()
if job_finished:
break
process.wait()
return process.returncode
finally:
if job_id is not None:
cancel_cmd = (
'dlc stop job'
f' {job_id}'
f" -c {self.aliyun_cfg['dlc_config_path']}"
' -f')
subprocess.run(cancel_cmd,
shell=True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return_code = _run_within_retry()
retry = self.retry
......
......@@ -6,7 +6,8 @@ from mmengine.config import Config, ConfigDict
from opencompass.openicl.icl_inferencer import (AgentInferencer,
ChatInferencer, CLPInferencer,
GenInferencer, PPLInferencer,
GenInferencer, LLInferencer,
PPLInferencer,
PPLOnlyInferencer)
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
from opencompass.utils import (Menu, build_dataset_from_cfg,
......@@ -81,14 +82,15 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
supported_inferencer = [
AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer,
PPLOnlyInferencer, ChatInferencer
PPLOnlyInferencer, ChatInferencer, LLInferencer
]
if infer_cfg.inferencer.type not in supported_inferencer:
print(f'Only {supported_inferencer} are supported')
return
for idx in range(min(count, len(ice_idx_list))):
if issubclass(infer_cfg.inferencer.type, PPLInferencer):
if issubclass(infer_cfg.inferencer.type,
(PPLInferencer, LLInferencer)):
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
ice = retriever.generate_ice(ice_idx_list[idx],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment