Unverified Commit b03d5dc5 authored by Fengzhe Zhou's avatar Fengzhe Zhou Committed by GitHub
Browse files

[Sync] Sync Internal (#941)

parent bbec7d87
...@@ -12,36 +12,9 @@ compassbench_v1_reason_groups = [ ...@@ -12,36 +12,9 @@ compassbench_v1_reason_groups = [
summarizer = dict( summarizer = dict(
dataset_abbrs=[ dataset_abbrs=[
['reasonbench', 'acc_origin'],
['reasonbench_cn_circular', 'acc_origin'],
['reasonbench_en_circular', 'acc_origin'],
['reasonbench_cn_commonsense_circular', 'acc_origin'],
['reasonbench_cn_abductive_circular', 'acc_origin'],
['reasonbench_cn_deductive_circular', 'acc_origin'],
['reasonbench_cn_inductive_circular', 'acc_origin'],
['reasonbench_en_commonsense_circular', 'acc_origin'],
['reasonbench_en_abductive_circular', 'acc_origin'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
['reasonbench_en_inductive_circular', 'acc_origin'],
['reasonbench_cn_commonsense_circular', 'acc_origin'],
['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
['reasonbench_en_commonsense_circular', 'acc_origin'],
['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
['reasonbench_en_inductive_deer_circular', 'acc_origin'],
['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],
['reasonbench', 'perf_circular'], ['reasonbench', 'perf_circular'],
['reasonbench_cn_circular', 'perf_circular'], ['reasonbench_cn_circular', 'perf_circular'],
['reasonbench_en_circular', 'perf_circular'], ['reasonbench_en_circular', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'], ['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_cn_abductive_circular', 'perf_circular'], ['reasonbench_cn_abductive_circular', 'perf_circular'],
['reasonbench_cn_deductive_circular', 'perf_circular'], ['reasonbench_cn_deductive_circular', 'perf_circular'],
...@@ -50,18 +23,6 @@ summarizer = dict( ...@@ -50,18 +23,6 @@ summarizer = dict(
['reasonbench_en_abductive_circular', 'perf_circular'], ['reasonbench_en_abductive_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'], ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_circular', 'perf_circular'], ['reasonbench_en_inductive_circular', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
['reasonbench_en_commonsense_circular', 'perf_circular'],
['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_deer_circular', 'perf_circular'],
['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
], ],
summary_groups=compassbench_v1_reason_groups, summary_groups=compassbench_v1_reason_groups,
) )
...@@ -39,6 +39,22 @@ _base_summary_groups = [ ...@@ -39,6 +39,22 @@ _base_summary_groups = [
['plugin_eval-review_str_v1', 'review_quality'], ['plugin_eval-review_str_v1', 'review_quality'],
], ],
}, },
{
'name': 'plugin_eval_one_review',
'subsets': [
['plugin_eval-instruct_v1', 'format_metric'],
['plugin_eval-instruct_v1', 'args_em_metric'],
['plugin_eval-plan_str_v1', 'f1_score'],
['plugin_eval-plan_json_v1', 'f1_score'],
['plugin_eval-reason_str_v1', 'thought'],
['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],
['plugin_eval-retrieve_str_v1', 'name'],
['plugin_eval-reason_retrieve_understand_json_v1', 'name'],
['plugin_eval-understand_str_v1', 'args'],
['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
['plugin_eval-review_str_v1', 'review_quality'],
]
},
{ {
'name': 'plugin_eval', 'name': 'plugin_eval',
'subsets': [ 'subsets': [
...@@ -53,7 +69,6 @@ _base_summary_groups = [ ...@@ -53,7 +69,6 @@ _base_summary_groups = [
['plugin_eval-understand_str_v1', 'args'], ['plugin_eval-understand_str_v1', 'args'],
['plugin_eval-reason_retrieve_understand_json_v1', 'args'], ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
['plugin_eval-review_str_v1', 'review_quality'], ['plugin_eval-review_str_v1', 'review_quality'],
['copy_plugin_eval-review_str_v1', 'naive_average'], # a hack for review * 2
] ]
}, },
] ]
......
...@@ -20,16 +20,12 @@ import functools ...@@ -20,16 +20,12 @@ import functools
import random import random
import re import re
try:
import immutabledict
except ImportError:
immutabledict = None
import nltk import nltk
WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 'agreement', 'studio', 'coach', 'assist', 'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower', 
'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain', 
'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war', 'figure', 'fee', 'spray', 'user', 'dirt', 'shot', 'task', 'stick', 'friend', 'software', 'promotion', 'interaction', 'surround', 'block', 'purpose', 'practice', 'conflict', 'routine', 'requirement', 'bonus', 'hole', 'state', 'junior', 'sweet', 'catch', 'tear', 'fold', 'wall', 'editor', 'life', 'position', 'pound', 'respect', 'bathroom', 'coat', 'script', 'job', 'teach', 'birth', 'view', 'resolve', 'theme', 'employee', 'doubt', 'market', 'education', 'serve', 'recover', 'tone', 'harm', 'miss', 'union', 'understanding', 'cow', 'river', 'association', 'concept', 'training', 'recipe', 'relationship', 'reserve', 'depression', 'proof', 'hair', 'revenue', 'independent', 'lift', 'assignment', 'temporary', 'amount', 'loss', 'edge', 'track', 'check', 'rope', 'estimate', 'pollution', 'stable', 'message', 'delivery', 'perspective', 'mirror', 'assistant', 'representative', 'witness', 'nature', 'judge', 'fruit', 'tip', 'devil', 'town', 'emergency', 'upper', 'drop', 'stay', 'human', 'neck', 'speaker', 'network', 'sing', 'resist', 'league', 'trip', 'signature', 'lawyer', 'importance', 'gas', 'choice', 'engineer', 'success', 'part', 'external', 'worker', 'simple', 'quarter', 'student', 'heart', 'pass', 'spite', 'shift', 'rough', 'lady', 'grass', 'community', 'garage', 'youth', 'standard', 'skirt', 'promise', 'blind', 'television', 'disease', 'commission', 'positive', 'energy', 'calm', 'presence', 'tune', 'basis', 'preference', 'head', 'common', 'cut', 'somewhere', 'presentation', 'current', 'thought', 'revolution', 'effort', 'master', 'implement', 'republic', 'floor', 'principle', 'stranger', 'shoulder', 'grade', 'button', 'tennis', 'police', 'collection', 'account', 'register', 'glove', 'divide', 'professor', 'chair', 'priority', 'combine', 'peace', 'extension', 'maybe', 'evening', 'frame', 'sister', 'wave', 'code', 'application', 'mouse', 'match', 'counter', 'bottle', 'half', 
'cheek', 'resolution', 'back', 'knowledge', 'make', 'discussion', 'screw', 'length', 'accident', 'battle', 'dress', 'knee', 'log', 'package', 'it', 'turn', 'hearing', 'newspaper', 'layer', 'wealth', 'profile', 'imagination', 'answer', 'weekend', 'teacher', 'appearance', 'meet', 'bike', 'rise', 'belt', 'crash', 'bowl', 'equivalent', 'support', 'image', 'poem', 'risk', 'excitement', 'remote', 'secretary', 'public', 'produce', 'plane', 'display', 'money', 'sand', 'situation', 'punch', 'customer', 'title', 'shake', 'mortgage', 'option', 'number', 'pop', 'window', 'extent', 'nothing', 'experience', 'opinion', 'departure', 'dance', 'indication', 'boy', 'material', 'band', 'leader', 'sun', 'beautiful', 'muscle', 'farmer', 'variety', 'fat', 'handle', 'director', 'opportunity', 'calendar', 'outside', 'pace', 'bath', 'fish', 'consequence', 'put', 'owner', 'go', 'doctor', 'information', 'share', 'hurt', 'protection', 'career', 'finance', 'force', 'golf', 'garbage', 'aspect', 'kid', 'food', 'boot', 'milk', 'respond', 'objective', 'reality', 'raw', 'ring', 'mall', 'one', 'impact', 'area', 'news', 'international', 'series', 'impress', 'mother', 'shelter', 'strike', 'loan', 'month', 'seat', 'anything', 'entertainment', 'familiar', 'clue', 'year', 'glad', 'supermarket', 'natural', 'god', 'cost', 'conversation', 'tie', 'ruin', 'comfort', 'earth', 'storm', 'percentage', 'assistance', 'budget', 'strength', 'beginning', 'sleep', 'other', 'young', 'unit', 'fill', 'store', 'desire', 'hide', 'value', 'cup', 'maintenance', 'nurse', 'function', 'tower', 'role', 'class', 'camera', 'database', 'panic', 'nation', 'basket', 'ice', 'art', 'spirit', 'chart', 'exchange', 'feedback', 'statement', 'reputation', 'search', 'hunt', 'exercise', 'nasty', 'notice', 'male', 'yard', 'annual', 'collar', 'date', 'platform', 'plant', 'fortune', 'passion', 'friendship', 'spread', 'cancer', 'ticket', 'attitude', 'island', 'active', 'object', 'service', 'buyer', 'bite', 'card', 'face', 'steak', 'proposal', 
'patient', 'heat', 'rule', 'resident', 'broad', 'politics', 'west', 'knife', 'expert', 'girl', 'design', 'salt', 'baseball', 'grab', 'inspection', 'cousin', 'couple', 'magazine', 'cook', 'dependent', 'security', 'chicken', 'version', 'currency', 'ladder', 'scheme', 'kitchen', 'employment', 'local', 'attention', 'manager', 'fact', 'cover', 'sad', 'guard', 'relative', 'county', 'rate', 'lunch', 'program', 'initiative', 'gear', 'bridge', 'breast', 'talk', 'dish', 'guarantee', 'beer', 'vehicle', 'reception', 'woman', 'substance', 'copy', 'lecture', 'advantage', 'park', 'cold', 'death', 'mix', 'hold', 'scale', 'tomorrow', 'blood', 'request', 'green', 'cookie', 'church', 'strip', 'forever', 'beyond', 'debt', 'tackle', 'wash', 'following', 'feel', 'maximum', 'sector', 'sea', 'property', 'economics', 'menu', 'bench', 'try', 'language', 'start', 'call', 'solid', 'address', 'income', 'foot', 'senior', 'honey', 'few', 'mixture', 'cash', 'grocery', 'link', 'map', 'form', 'factor', 'pot', 'model', 'writer', 'farm', 'winter', 'skill', 'anywhere', 'birthday', 'policy', 'release', 'husband', 'lab', 'hurry', 'mail', 'equipment', 'sink', 'pair', 'driver', 'consideration', 'leather', 'skin', 'blue', 'boat', 'sale', 'brick', 'two', 'feed', 'square', 'dot', 'rush', 'dream', 'location', 'afternoon', 'manufacturer', 'control', 'occasion', 'trouble', 'introduction', 'advice', 'bet', 'eat', 'kill', 'category', 'manner', 'office', 'estate', 'pride', 'awareness', 'slip', 'crack', 'client', 'nail', 'shoot', 'membership', 'soft', 'anybody', 'web', 'official', 'individual', 'pizza', 'interest', 'bag', 'spell', 'profession', 'queen', 'deal', 'resource', 'ship', 'guy', 'chocolate', 'joint', 'formal', 'upstairs', 'car', 'resort', 'abroad', 'dealer', 'associate', 'finger', 'surgery', 'comment', 'team', 'detail', 'crazy', 'path', 'tale', 'initial', 'arm', 'radio', 'demand', 'single', 'draw', 'yellow', 'contest', 'piece', 'quote', 'pull', 'commercial', 'shirt', 'contribution', 'cream', 'channel', 
'suit', 'discipline', 'instruction', 'concert', 'speech', 'low', 'effective', 'hang', 'scratch', 'industry', 'breakfast', 'lay', 'join', 'metal', 'bedroom', 'minute', 'product', 'rest', 'temperature', 'many', 'give', 'argument', 'print', 'purple', 'laugh', 'health', 'credit', 'investment', 'sell', 'setting', 'lesson', 'egg', 'middle', 'marriage', 'level', 'evidence', 'phrase', 'love', 'self', 'benefit', 'guidance', 'affect', 'you', 'dad', 'anxiety', 'special', 'boyfriend', 'test', 'blank', 'payment', 'soup', 'obligation', 'reply', 'smile', 'deep', 'complaint', 'addition', 'review', 'box', 'towel', 'minor', 'fun', 'soil', 'issue', 'cigarette', 'internet', 'gain', 'tell', 'entry', 'spare', 'incident', 'family', 'refuse', 'branch', 'can', 'pen', 'grandfather', 'constant', 'tank', 'uncle', 'climate', 'ground', 'volume', 'communication', 'kind', 'poet', 'child', 'screen', 'mine', 'quit', 'gene', 'lack', 'charity', 'memory', 'tooth', 'fear', 'mention', 'marketing', 'reveal', 'reason', 'court', 'season', 'freedom', 'land', 'sport', 'audience', 'classroom', 'law', 'hook', 'win', 'carry', 'eye', 'smell', 'distribution', 'research', 'country', 'dare', 'hope', 'whereas', 'stretch', 'library', 'if', 'delay', 'college', 'plastic', 'book', 'present', 'use', 'worry', 'champion', 'goal', 'economy', 'march', 'election', 'reflection', 'midnight', 'slide', 'inflation', 'action', 'challenge', 'guitar', 'coast', 'apple', 'campaign', 'field', 'jacket', 'sense', 'way', 'visual', 'remove', 'weather', 'trash', 'cable', 'regret', 'buddy', 'beach', 'historian', 'courage', 'sympathy', 'truck', 'tension', 'permit', 'nose', 'bed', 'son', 'person', 'base', 'meat', 'usual', 'air', 'meeting', 'worth', 'game', 'independence', 'physical', 'brief', 'play', 'raise', 'board', 'she', 'key', 'writing', 'pick', 'command', 'party', 'yesterday', 'spring', 'candidate', 'physics', 'university', 'concern', 'development', 'change', 'string', 'target', 'instance', 'room', 'bitter', 'bird', 'football', 'normal', 
'split', 'impression', 'wood', 'long', 'meaning', 'stock', 'cap', 'leadership', 'media', 'ambition', 'fishing', 'essay', 'salad', 'repair', 'today', 'designer', 'night', 'bank', 'drawing', 'inevitable', 'phase', 'vast', 'chip', 'anger', 'switch', 'cry', 'twist', 'personality', 'attempt', 'storage', 'being', 'preparation', 'bat', 'selection', 'white', 'technology', 'contract', 'side', 'section', 'station', 'till', 'structure', 'tongue', 'taste', 'truth', 'difficulty', 'group', 'limit', 'main', 'move', 'feeling', 'light', 'example', 'mission', 'might', 'wait', 'wheel', 'shop', 'host', 'classic', 'alternative', 'cause', 'agent', 'consist', 'table', 'airline', 'text', 'pool', 'craft', 'range', 'fuel', 'tool', 'partner', 'load', 'entrance', 'deposit', 'hate', 'article', 'video', 'summer', 'feature', 'extreme', 'mobile', 'hospital', 'flight', 'fall', 'pension', 'piano', 'fail', 'result', 'rub', 'gap', 'system', 'report', 'suck', 'ordinary', 'wind', 'nerve', 'ask', 'shine', 'note', 'line', 'mom', 'perception', 'brother', 'reference', 'bend', 'charge', 'treat', 'trick', 'term', 'homework', 'bake', 'bid', 'status', 'project', 'strategy', 'orange', 'let', 'enthusiasm', 'parent', 'concentrate', 'device', 'travel', 'poetry', 'business', 'society', 'kiss', 'end', 'vegetable', 'employ', 'schedule', 'hour', 'brave', 'focus', 'process', 'movie', 'illegal', 'general', 'coffee', 'ad', 'highway', 'chemistry', 'psychology', 'hire', 'bell', 'conference', 'relief', 'show', 'neat', 'funny', 'weight', 'quality', 'club', 'daughter', 'zone', 'touch', 'tonight', 'shock', 'burn', 'excuse', 'name', 'survey', 'landscape', 'advance', 'satisfaction', 'bread', 'disaster', 'item', 'hat', 'prior', 'shopping', 'visit', 'east', 'photo', 'home', 'idea', 'father', 'comparison', 'cat', 'pipe', 'winner', 'count', 'lake', 'fight', 'prize', 'foundation', 'dog', 'keep', 'ideal', 'fan', 'struggle', 'peak', 'safety', 'solution', 'hell', 'conclusion', 'population', 'strain', 'alarm', 'measurement', 'second', 
'train', 'race', 'due', 'insurance', 'boss', 'tree', 'monitor', 'sick', 'course', 'drag', 'appointment', 'slice', 'still', 'care', 'patience', 'rich', 'escape', 'emotion', 'royal', 'female', 'childhood', 'government', 'picture', 'will', 'sock', 'big', 'gate', 'oil', 'cross', 'pin', 'improvement', 'championship', 'silly', 'help', 'sky', 'pitch', 'man', 'diamond', 'most', 'transition', 'work', 'science', 'committee', 'moment', 'fix', 'teaching', 'dig', 'specialist', 'complex', 'guide', 'people', 'dead', 'voice', 'original', 'break', 'topic', 'data', 'degree', 'reading', 'recording', 'bunch', 'reach', 'judgment', 'lie', 'regular', 'set', 'painting', 'mode', 'list', 'player', 'bear', 'north', 'wonder', 'carpet', 'heavy', 'officer', 'negative', 'clock', 'unique', 'baby', 'pain', 'assumption', 'disk', 'iron', 'bill', 'drawer', 'look', 'double', 'mistake', 'finish', 'future', 'brilliant', 'contact', 'math', 'rice', 'leave', 'restaurant', 'discount', 'sex', 'virus', 'bit', 'trust', 'event', 'wear', 'juice', 'failure', 'bug', 'context', 'mud', 'whole', 'wrap', 'intention', 'draft', 'pressure', 'cake', 'dark', 'explanation', 'space', 'angle', 'word', 'efficiency', 'management', 'habit', 'star', 'chance', 'finding', 'transportation', 'stand', 'criticism', 'flow', 'door', 'injury', 'insect', 'surprise', 'apartment'] # pylint: disable=line-too-long WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 
'agreement', 'studio', 'coach', 'assist', 'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower', 'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 
'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain', 'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war', 'figure', 'fee', 'spray', 'user', 'dirt', 'shot', 'task', 'stick', 'friend', 'software', 'promotion', 'interaction', 'surround', 'block', 'purpose', 'practice', 'conflict', 'routine', 'requirement', 'bonus', 'hole', 'state', 'junior', 'sweet', 'catch', 'tear', 'fold', 'wall', 'editor', 'life', 'position', 'pound', 'respect', 'bathroom', 'coat', 'script', 'job', 'teach', 'birth', 'view', 'resolve', 'theme', 'employee', 'doubt', 'market', 'education', 'serve', 'recover', 'tone', 'harm', 'miss', 'union', 'understanding', 'cow', 
'river', 'association', 'concept', 'training', 'recipe', 'relationship', 'reserve', 'depression', 'proof', 'hair', 'revenue', 'independent', 'lift', 'assignment', 'temporary', 'amount', 'loss', 'edge', 'track', 'check', 'rope', 'estimate', 'pollution', 'stable', 'message', 'delivery', 'perspective', 'mirror', 'assistant', 'representative', 'witness', 'nature', 'judge', 'fruit', 'tip', 'devil', 'town', 'emergency', 'upper', 'drop', 'stay', 'human', 'neck', 'speaker', 'network', 'sing', 'resist', 'league', 'trip', 'signature', 'lawyer', 'importance', 'gas', 'choice', 'engineer', 'success', 'part', 'external', 'worker', 'simple', 'quarter', 'student', 'heart', 'pass', 'spite', 'shift', 'rough', 'lady', 'grass', 'community', 'garage', 'youth', 'standard', 'skirt', 'promise', 'blind', 'television', 'disease', 'commission', 'positive', 'energy', 'calm', 'presence', 'tune', 'basis', 'preference', 'head', 'common', 'cut', 'somewhere', 'presentation', 'current', 'thought', 'revolution', 'effort', 'master', 'implement', 'republic', 'floor', 'principle', 'stranger', 'shoulder', 'grade', 'button', 'tennis', 'police', 'collection', 'account', 'register', 'glove', 'divide', 'professor', 'chair', 'priority', 'combine', 'peace', 'extension', 'maybe', 'evening', 'frame', 'sister', 'wave', 'code', 'application', 'mouse', 'match', 'counter', 'bottle', 'half', 'cheek', 'resolution', 'back', 'knowledge', 'make', 'discussion', 'screw', 'length', 'accident', 'battle', 'dress', 'knee', 'log', 'package', 'it', 'turn', 'hearing', 'newspaper', 'layer', 'wealth', 'profile', 'imagination', 'answer', 'weekend', 'teacher', 'appearance', 'meet', 'bike', 'rise', 'belt', 'crash', 'bowl', 'equivalent', 'support', 'image', 'poem', 'risk', 'excitement', 'remote', 'secretary', 'public', 'produce', 'plane', 'display', 'money', 'sand', 'situation', 'punch', 'customer', 'title', 'shake', 'mortgage', 'option', 'number', 'pop', 'window', 'extent', 'nothing', 'experience', 'opinion', 'departure', 'dance', 
'indication', 'boy', 'material', 'band', 'leader', 'sun', 'beautiful', 'muscle', 'farmer', 'variety', 'fat', 'handle', 'director', 'opportunity', 'calendar', 'outside', 'pace', 'bath', 'fish', 'consequence', 'put', 'owner', 'go', 'doctor', 'information', 'share', 'hurt', 'protection', 'career', 'finance', 'force', 'golf', 'garbage', 'aspect', 'kid', 'food', 'boot', 'milk', 'respond', 'objective', 'reality', 'raw', 'ring', 'mall', 'one', 'impact', 'area', 'news', 'international', 'series', 'impress', 'mother', 'shelter', 'strike', 'loan', 'month', 'seat', 'anything', 'entertainment', 'familiar', 'clue', 'year', 'glad', 'supermarket', 'natural', 'god', 'cost', 'conversation', 'tie', 'ruin', 'comfort', 'earth', 'storm', 'percentage', 'assistance', 'budget', 'strength', 'beginning', 'sleep', 'other', 'young', 'unit', 'fill', 'store', 'desire', 'hide', 'value', 'cup', 'maintenance', 'nurse', 'function', 'tower', 'role', 'class', 'camera', 'database', 'panic', 'nation', 'basket', 'ice', 'art', 'spirit', 'chart', 'exchange', 'feedback', 'statement', 'reputation', 'search', 'hunt', 'exercise', 'nasty', 'notice', 'male', 'yard', 'annual', 'collar', 'date', 'platform', 'plant', 'fortune', 'passion', 'friendship', 'spread', 'cancer', 'ticket', 'attitude', 'island', 'active', 'object', 'service', 'buyer', 'bite', 'card', 'face', 'steak', 'proposal', 'patient', 'heat', 'rule', 'resident', 'broad', 'politics', 'west', 'knife', 'expert', 'girl', 'design', 'salt', 'baseball', 'grab', 'inspection', 'cousin', 'couple', 'magazine', 'cook', 'dependent', 'security', 'chicken', 'version', 'currency', 'ladder', 'scheme', 'kitchen', 'employment', 'local', 'attention', 'manager', 'fact', 'cover', 'sad', 'guard', 'relative', 'county', 'rate', 'lunch', 'program', 'initiative', 'gear', 'bridge', 'breast', 'talk', 'dish', 'guarantee', 'beer', 'vehicle', 'reception', 'woman', 'substance', 'copy', 'lecture', 'advantage', 'park', 'cold', 'death', 'mix', 'hold', 'scale', 'tomorrow', 'blood', 
'request', 'green', 'cookie', 'church', 'strip', 'forever', 'beyond', 'debt', 'tackle', 'wash', 'following', 'feel', 'maximum', 'sector', 'sea', 'property', 'economics', 'menu', 'bench', 'try', 'language', 'start', 'call', 'solid', 'address', 'income', 'foot', 'senior', 'honey', 'few', 'mixture', 'cash', 'grocery', 'link', 'map', 'form', 'factor', 'pot', 'model', 'writer', 'farm', 'winter', 'skill', 'anywhere', 'birthday', 'policy', 'release', 'husband', 'lab', 'hurry', 'mail', 'equipment', 'sink', 'pair', 'driver', 'consideration', 'leather', 'skin', 'blue', 'boat', 'sale', 'brick', 'two', 'feed', 'square', 'dot', 'rush', 'dream', 'location', 'afternoon', 'manufacturer', 'control', 'occasion', 'trouble', 'introduction', 'advice', 'bet', 'eat', 'kill', 'category', 'manner', 'office', 'estate', 'pride', 'awareness', 'slip', 'crack', 'client', 'nail', 'shoot', 'membership', 'soft', 'anybody', 'web', 'official', 'individual', 'pizza', 'interest', 'bag', 'spell', 'profession', 'queen', 'deal', 'resource', 'ship', 'guy', 'chocolate', 'joint', 'formal', 'upstairs', 'car', 'resort', 'abroad', 'dealer', 'associate', 'finger', 'surgery', 'comment', 'team', 'detail', 'crazy', 'path', 'tale', 'initial', 'arm', 'radio', 'demand', 'single', 'draw', 'yellow', 'contest', 'piece', 'quote', 'pull', 'commercial', 'shirt', 'contribution', 'cream', 'channel', 'suit', 'discipline', 'instruction', 'concert', 'speech', 'low', 'effective', 'hang', 'scratch', 'industry', 'breakfast', 'lay', 'join', 'metal', 'bedroom', 'minute', 'product', 'rest', 'temperature', 'many', 'give', 'argument', 'print', 'purple', 'laugh', 'health', 'credit', 'investment', 'sell', 'setting', 'lesson', 'egg', 'middle', 'marriage', 'level', 'evidence', 'phrase', 'love', 'self', 'benefit', 'guidance', 'affect', 'you', 'dad', 'anxiety', 'special', 'boyfriend', 'test', 'blank', 'payment', 'soup', 'obligation', 'reply', 'smile', 'deep', 'complaint', 'addition', 'review', 'box', 'towel', 'minor', 'fun', 'soil', 'issue', 
'cigarette', 'internet', 'gain', 'tell', 'entry', 'spare', 'incident', 'family', 'refuse', 'branch', 'can', 'pen', 'grandfather', 'constant', 'tank', 'uncle', 'climate', 'ground', 'volume', 'communication', 'kind', 'poet', 'child', 'screen', 'mine', 'quit', 'gene', 'lack', 'charity', 'memory', 'tooth', 'fear', 'mention', 'marketing', 'reveal', 'reason', 'court', 'season', 'freedom', 'land', 'sport', 'audience', 'classroom', 'law', 'hook', 'win', 'carry', 'eye', 'smell', 'distribution', 'research', 'country', 'dare', 'hope', 'whereas', 'stretch', 'library', 'if', 'delay', 'college', 'plastic', 'book', 'present', 'use', 'worry', 'champion', 'goal', 'economy', 'march', 'election', 'reflection', 'midnight', 'slide', 'inflation', 'action', 'challenge', 'guitar', 'coast', 'apple', 'campaign', 'field', 'jacket', 'sense', 'way', 'visual', 'remove', 'weather', 'trash', 'cable', 'regret', 'buddy', 'beach', 'historian', 'courage', 'sympathy', 'truck', 'tension', 'permit', 'nose', 'bed', 'son', 'person', 'base', 'meat', 'usual', 'air', 'meeting', 'worth', 'game', 'independence', 'physical', 'brief', 'play', 'raise', 'board', 'she', 'key', 'writing', 'pick', 'command', 'party', 'yesterday', 'spring', 'candidate', 'physics', 'university', 'concern', 'development', 'change', 'string', 'target', 'instance', 'room', 'bitter', 'bird', 'football', 'normal', 'split', 'impression', 'wood', 'long', 'meaning', 'stock', 'cap', 'leadership', 'media', 'ambition', 'fishing', 'essay', 'salad', 'repair', 'today', 'designer', 'night', 'bank', 'drawing', 'inevitable', 'phase', 'vast', 'chip', 'anger', 'switch', 'cry', 'twist', 'personality', 'attempt', 'storage', 'being', 'preparation', 'bat', 'selection', 'white', 'technology', 'contract', 'side', 'section', 'station', 'till', 'structure', 'tongue', 'taste', 'truth', 'difficulty', 'group', 'limit', 'main', 'move', 'feeling', 'light', 'example', 'mission', 'might', 'wait', 'wheel', 'shop', 'host', 'classic', 'alternative', 'cause', 'agent', 
'consist', 'table', 'airline', 'text', 'pool', 'craft', 'range', 'fuel', 'tool', 'partner', 'load', 'entrance', 'deposit', 'hate', 'article', 'video', 'summer', 'feature', 'extreme', 'mobile', 'hospital', 'flight', 'fall', 'pension', 'piano', 'fail', 'result', 'rub', 'gap', 'system', 'report', 'suck', 'ordinary', 'wind', 'nerve', 'ask', 'shine', 'note', 'line', 'mom', 'perception', 'brother', 'reference', 'bend', 'charge', 'treat', 'trick', 'term', 'homework', 'bake', 'bid', 'status', 'project', 'strategy', 'orange', 'let', 'enthusiasm', 'parent', 'concentrate', 'device', 'travel', 'poetry', 'business', 'society', 'kiss', 'end', 'vegetable', 'employ', 'schedule', 'hour', 'brave', 'focus', 'process', 'movie', 'illegal', 'general', 'coffee', 'ad', 'highway', 'chemistry', 'psychology', 'hire', 'bell', 'conference', 'relief', 'show', 'neat', 'funny', 'weight', 'quality', 'club', 'daughter', 'zone', 'touch', 'tonight', 'shock', 'burn', 'excuse', 'name', 'survey', 'landscape', 'advance', 'satisfaction', 'bread', 'disaster', 'item', 'hat', 'prior', 'shopping', 'visit', 'east', 'photo', 'home', 'idea', 'father', 'comparison', 'cat', 'pipe', 'winner', 'count', 'lake', 'fight', 'prize', 'foundation', 'dog', 'keep', 'ideal', 'fan', 'struggle', 'peak', 'safety', 'solution', 'hell', 'conclusion', 'population', 'strain', 'alarm', 'measurement', 'second', 'train', 'race', 'due', 'insurance', 'boss', 'tree', 'monitor', 'sick', 'course', 'drag', 'appointment', 'slice', 'still', 'care', 'patience', 'rich', 'escape', 'emotion', 'royal', 'female', 'childhood', 'government', 'picture', 'will', 'sock', 'big', 'gate', 'oil', 'cross', 'pin', 'improvement', 'championship', 'silly', 'help', 'sky', 'pitch', 'man', 'diamond', 'most', 'transition', 'work', 'science', 'committee', 'moment', 'fix', 'teaching', 'dig', 'specialist', 'complex', 'guide', 'people', 'dead', 'voice', 'original', 'break', 'topic', 'data', 'degree', 'reading', 'recording', 'bunch', 'reach', 'judgment', 'lie', 'regular', 
'set', 'painting', 'mode', 'list', 'player', 'bear', 'north', 'wonder', 'carpet', 'heavy', 'officer', 'negative', 'clock', 'unique', 'baby', 'pain', 'assumption', 'disk', 'iron', 'bill', 'drawer', 'look', 'double', 'mistake', 'finish', 'future', 'brilliant', 'contact', 'math', 'rice', 'leave', 'restaurant', 'discount', 'sex', 'virus', 'bit', 'trust', 'event', 'wear', 'juice', 'failure', 'bug', 'context', 'mud', 'whole', 'wrap', 'intention', 'draft', 'pressure', 'cake', 'dark', 'explanation', 'space', 'angle', 'word', 'efficiency', 'management', 'habit', 'star', 'chance', 'finding', 'transportation', 'stand', 'criticism', 'flow', 'door', 'injury', 'insect', 'surprise', 'apartment'] # pylint: disable=line-too-long
# ISO 639-1 codes to language names. # ISO 639-1 codes to language names.
LANGUAGE_CODES = immutabledict.immutabledict({ LANGUAGE_CODES = {
'en': 'English', 'en': 'English',
'es': 'Spanish', 'es': 'Spanish',
'pt': 'Portuguese', 'pt': 'Portuguese',
...@@ -60,7 +56,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({ ...@@ -60,7 +56,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
'pa': 'Punjabi', 'pa': 'Punjabi',
'ml': 'Malayalam', 'ml': 'Malayalam',
'fi': 'Finnish', 'fi': 'Finnish',
}) }
_ALPHABETS = '([A-Za-z])' _ALPHABETS = '([A-Za-z])'
_PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]' _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
......
...@@ -24,3 +24,15 @@ def TheoremQA_postprocess(text: str) -> str: ...@@ -24,3 +24,15 @@ def TheoremQA_postprocess(text: str) -> str:
else: else:
text = matches[0].strip().strip('.,?!\"\';:') text = matches[0].strip().strip('.,?!\"\';:')
return text return text
def TheoremQA_postprocess_v2(text: str) -> str:
    """Pull the trailing answer phrase out of the last line of ``text``.

    The last line of the stripped input is split on single spaces and
    scanned from right to left. Scanning stops at the first token that is
    exactly ``is``, ``be`` or ``are``, or that ends with a colon; the tokens
    seen before that point (in their original order) form the answer.

    Args:
        text: Raw text to post-process.

    Returns:
        The answer phrase with surrounding whitespace removed first and
        then any leading/trailing ``.`` characters removed.
    """
    # strip() already drops leading/trailing newlines, so only the final
    # line needs to be isolated here.
    last_line = text.strip().rsplit('\n', 1)[-1]

    stop_tokens = ('is', 'be', 'are')
    tail_tokens = []
    for token in reversed(last_line.split(' ')):
        if token in stop_tokens or token.endswith(':'):
            break
        tail_tokens.append(token)

    # Tokens were gathered right-to-left; restore reading order.
    tail_tokens.reverse()
    return ' '.join(tail_tokens).strip().strip('.')
import json import json
import os.path as osp import os.path as osp
from datasets import Dataset from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET from opencompass.registry import LOAD_DATASET
...@@ -71,6 +71,32 @@ class hellaswagDataset_V3(BaseDataset): ...@@ -71,6 +71,32 @@ class hellaswagDataset_V3(BaseDataset):
return dataset return dataset
@LOAD_DATASET.register_module()
class hellaswagDatasetwithICE(BaseDataset):
    """HellaSwag loader that returns a ``train`` and a ``val`` split.

    NOTE(review): judging by the class name, the ``train`` split is
    presumably the pool for in-context examples -- confirm against the
    dataset configs that use this class.
    """

    @staticmethod
    def load(path):
        """Read both jsonl files under ``path`` into a ``DatasetDict``.

        Args:
            path (str): Directory containing
                ``hellaswag_train_sampled25.jsonl`` and ``hellaswag.jsonl``.

        Returns:
            DatasetDict: ``train`` and ``val`` splits whose rows have the
            fields ``ctx``, ``A``, ``B``, ``C``, ``D`` and ``label``.
        """
        split_files = {
            'train': 'hellaswag_train_sampled25.jsonl',
            'val': 'hellaswag.jsonl',
        }
        dataset_dict = DatasetDict()
        for split, filename in split_files.items():
            rows = []
            with open(osp.join(path, filename), 'r',
                      encoding='utf-8') as f:
                for raw_line in f:
                    record = json.loads(raw_line)
                    options = record['choices']
                    # Keep only what follows the first ': ' of the query.
                    context = record['query'].split(': ', 1)[-1]
                    rows.append({
                        'ctx': context,
                        'A': options[0],
                        'B': options[1],
                        'C': options[2],
                        'D': options[3],
                        # ``gold`` is an index into the four options.
                        'label': 'ABCD'[record['gold']],
                    })
            dataset_dict[split] = Dataset.from_list(rows)
        return dataset_dict
class hellaswagDatasetClean(BaseDataset): class hellaswagDatasetClean(BaseDataset):
# load the contamination annotations of CEval from # load the contamination annotations of CEval from
......
...@@ -156,10 +156,13 @@ def humaneval_postprocess_v2(text: str) -> str: ...@@ -156,10 +156,13 @@ def humaneval_postprocess_v2(text: str) -> str:
"""This is an advanced version of previous postprocess to handle more """This is an advanced version of previous postprocess to handle more
situations, better to use this one.""" situations, better to use this one."""
try: try:
# for chatGLM raw text # for chatGLM related text
text = eval(text) eval_text = eval(text)
except Exception: except Exception:
pass pass
else:
if isinstance(eval_text, str):
text = eval_text
text = text.lstrip('\n') text = text.lstrip('\n')
if '```' in text: if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL) blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
......
...@@ -77,9 +77,10 @@ class NQEvaluator(BaseEvaluator): ...@@ -77,9 +77,10 @@ class NQEvaluator(BaseEvaluator):
cnt = 0 cnt = 0
for pred, cand_ans in zip(processed_predictions, processed_answers): for pred, cand_ans in zip(processed_predictions, processed_answers):
detail = {'pred': pred, 'answer': cand_ans, 'correct': False} detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
cnt += int(any([cand == pred for cand in cand_ans])) # is_correct = any([cand == pred for cand in cand_ans])
if int(any([cand == pred for cand in cand_ans])): is_correct = any([cand in pred for cand in cand_ans])
detail['correct'] = True cnt += int(is_correct)
detail['correct'] = is_correct
details.append(detail) details.append(detail)
score = cnt / len(predictions) * 100 score = cnt / len(predictions) * 100
......
import json import json
import os import os
from datasets import Dataset from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET from opencompass.registry import LOAD_DATASET
...@@ -20,12 +20,12 @@ class winograndeDataset(BaseDataset): ...@@ -20,12 +20,12 @@ class winograndeDataset(BaseDataset):
for line in f: for line in f:
line = json.loads(line) line = json.loads(line)
prompt = line['sentence'] prompt = line['sentence']
continue_prompt = prompt.split('_') continue_prompt = prompt.split('_')[1]
data_item = { data_item = {
'opt1': prompt.replace('_', line['option1']), 'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']), 'opt2': prompt.replace('_', line['option2']),
'answer': line['answer'], 'answer': line['answer'],
'cont': continue_prompt[1] 'cont': continue_prompt,
} }
dataset_list.append(data_item) dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list) dataset_list = Dataset.from_list(dataset_list)
...@@ -44,13 +44,43 @@ class winograndeDataset_V2(BaseDataset): ...@@ -44,13 +44,43 @@ class winograndeDataset_V2(BaseDataset):
for line in f: for line in f:
line = json.loads(line) line = json.loads(line)
prompt = line['sentence'] prompt = line['sentence']
continue_prompt = prompt.split('_')[1]
answer = line['answer'] answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL' answer = ' AB'[int(answer)] if answer != '' else 'NULL'
data_item = { data_item = {
'opt1': prompt.replace('_', line['option1']), 'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']), 'opt2': prompt.replace('_', line['option2']),
'answer': answer, 'answer': answer,
'cont': continue_prompt,
} }
dataset_list.append(data_item) dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list) dataset_list = Dataset.from_list(dataset_list)
return dataset_list return dataset_list
@LOAD_DATASET.register_module()
class winograndeDataset_V3(BaseDataset):
    """Winogrande loader reading local jsonl files for two splits.

    Rows have the same fields as those built by ``winograndeDataset_V2``,
    but both the ``train_xs`` and ``dev`` splits are loaded and returned
    together.
    """

    @staticmethod
    def load(path):
        """Load ``train_xs.jsonl`` and ``dev.jsonl`` from ``path``.

        Args:
            path (str): Directory containing the two jsonl files.

        Returns:
            DatasetDict: One ``Dataset`` per split with the fields ``opt1``,
            ``opt2``, ``answer`` and ``cont``.
        """
        dataset_dict = DatasetDict()
        for split in ('train_xs', 'dev'):
            rows = []
            split_file = os.path.join(path, f'{split}.jsonl')
            with open(split_file, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    record = json.loads(raw_line)
                    sentence = record['sentence']
                    # Text that follows the blank ('_') in the sentence.
                    continuation = sentence.split('_')[1]
                    raw_answer = record['answer']
                    # '1' -> 'A', '2' -> 'B'; an empty answer means the
                    # record carries no label.
                    if raw_answer != '':
                        label = ' AB'[int(raw_answer)]
                    else:
                        label = 'NULL'
                    rows.append({
                        'opt1': sentence.replace('_', record['option1']),
                        'opt2': sentence.replace('_', record['option2']),
                        'answer': label,
                        'cont': continuation,
                    })
            dataset_dict[split] = Dataset.from_list(rows)
        return dataset_dict
...@@ -13,6 +13,7 @@ from .huggingface import HuggingFace # noqa: F401, F403 ...@@ -13,6 +13,7 @@ from .huggingface import HuggingFace # noqa: F401, F403
from .huggingface import HuggingFaceCausalLM # noqa: F401, F403 from .huggingface import HuggingFaceCausalLM # noqa: F401, F403
from .huggingface import HuggingFaceChatGLM3 # noqa: F401, F403 from .huggingface import HuggingFaceChatGLM3 # noqa: F401, F403
from .intern_model import InternLM # noqa: F401, F403 from .intern_model import InternLM # noqa: F401, F403
from .krgpt_api import KrGPT # noqa: F401
from .lightllm_api import LightllmAPI # noqa: F401 from .lightllm_api import LightllmAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401, F403 from .llama2 import Llama2, Llama2Chat # noqa: F401, F403
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
......
import json
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
import requests
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
@MODELS.register_module()
class KrGPT(BaseAPIModel):
    """API model wrapper that posts chat messages to a KrGPT HTTP endpoint.

    Args:
        path (str): Model identifier forwarded to ``BaseAPIModel``.
        url (str): Endpoint that receives the JSON ``messages`` payload.
        max_seq_len (int): Forwarded to ``BaseAPIModel``.
        meta_template (Dict, optional): Forwarded to ``BaseAPIModel``.
        retry (int): Maximum number of request attempts per input.
        generation_kwargs (Dict, optional): Generation options. Only
            ``max_new_tokens`` is read here (default 1024). ``None`` is
            treated as an empty dict.
    """

    is_api: bool = True

    def __init__(
        self,
        path: str = 'KrGPT',
        url: str = 'http://101.69.162.5:9300/v1/chat/completions',
        max_seq_len: int = 2048,
        meta_template: Optional[Dict] = None,
        retry: int = 2,
        generation_kwargs: Optional[Dict] = None,
    ):
        # Avoid a shared mutable default and tolerate an explicit ``None``.
        if generation_kwargs is None:
            generation_kwargs = {}
        super().__init__(
            path=path,
            max_seq_len=max_seq_len,
            meta_template=meta_template,
            retry=retry,
            generation_kwargs=generation_kwargs,
        )
        self.logger = get_logger()
        self.url = url
        self.generation_kwargs = generation_kwargs
        self.max_out_len = self.generation_kwargs.get('max_new_tokens', 1024)

    def generate(self, inputs: List[str], max_out_len: int,
                 **kwargs) -> List[str]:
        """Generate results given a list of inputs.

        Requests are issued concurrently through a thread pool.

        Args:
            inputs (List[str]): A list of strings or PromptDicts.
                The PromptDict should be organized in OpenCompass'
                API format.
            max_out_len (int): The maximum length of the output. Accepted
                for interface compatibility; the value configured through
                ``generation_kwargs['max_new_tokens']`` is what gets passed
                on to ``_generate``.

        Returns:
            List[str]: A list of generated strings.
        """
        with ThreadPoolExecutor() as executor:
            results = list(
                executor.map(self._generate, inputs,
                             [self.max_out_len] * len(inputs)))
        return results

    def _generate(self,
                  input: PromptType,
                  max_out_len: int,
                  temperature: float = 0.0) -> str:
        """Generate a result for a single input.

        Args:
            input (PromptType): A string or PromptDict.
                The PromptDict should be organized in OpenCompass'
                API format.
            max_out_len (int): The maximum length of the output. Currently
                not included in the request payload.
            temperature (float): Sampling temperature. Currently not
                included in the request payload.

        Returns:
            str: The generated string.

        Raises:
            RuntimeError: If no usable response was obtained after
                ``self.retry`` attempts.
        """
        assert isinstance(input, (str, PromptList))

        if isinstance(input, str):
            messages = [{'role': 'user', 'content': input}]
        else:
            role_map = {
                'HUMAN': 'user',
                'BOT': 'assistant',
                'SYSTEM': 'system',
            }
            messages = []
            for item in input:
                msg = {'content': item['prompt']}
                # Unknown roles are left without a 'role' key.
                role = role_map.get(item['role'])
                if role is not None:
                    msg['role'] = role
                messages.append(msg)

        header = {'content-type': 'application/json'}
        payload = json.dumps(dict(messages=messages))

        # Every failed attempt counts towards the retry budget, including
        # connection and decode failures, so an unreachable or misbehaving
        # endpoint cannot trap the caller in an endless loop.
        for _ in range(self.retry):
            try:
                raw_response = requests.post(self.url,
                                             headers=header,
                                             data=payload)
            except requests.ConnectionError:
                self.logger.error('Got connection error, retrying...')
                continue
            try:
                response = raw_response.json()
            except requests.JSONDecodeError:
                self.logger.error('JsonDecode error, got %s',
                                  raw_response.content)
                continue
            try:
                return response['choices'][0]['message']['content'].strip()
            except (KeyError, IndexError, TypeError):
                # The body was valid JSON but not a completion, e.g. an
                # error object or an empty ``choices`` list.
                self.logger.error('Find error message in response: %s',
                                  response)

        raise RuntimeError('Calling KrGPT failed after retrying for '
                           f'{self.retry} times. Check the logs for '
                           'details.')
...@@ -415,6 +415,13 @@ class OpenAIAllesAPIN(OpenAI): ...@@ -415,6 +415,13 @@ class OpenAIAllesAPIN(OpenAI):
self.logger.error(data) self.logger.error(data)
else: else:
return choices[0]['message']['content'].strip() return choices[0]['message']['content'].strip()
try:
match = re.match(r'Error code: \d+ - (.*)', response['data'])
err = eval(match.group(1))['error']
if err['code'] == 'content_filter' and err['status'] == 400:
return err['message']
except Exception:
pass
self.logger.error(response['msg']) self.logger.error(response['msg'])
self.logger.error(response) self.logger.error(response)
time.sleep(1) time.sleep(1)
......
import datetime import datetime
import json
import os import os
import os.path as osp import os.path as osp
import random import random
...@@ -38,6 +39,7 @@ class DLCRunner(BaseRunner): ...@@ -38,6 +39,7 @@ class DLCRunner(BaseRunner):
task: ConfigDict, task: ConfigDict,
aliyun_cfg: ConfigDict, aliyun_cfg: ConfigDict,
max_num_workers: int = 32, max_num_workers: int = 32,
eval_with_gpu: list = ['plugin_eval'],
retry: int = 2, retry: int = 2,
debug: bool = False, debug: bool = False,
lark_bot_url: str = None): lark_bot_url: str = None):
...@@ -46,6 +48,8 @@ class DLCRunner(BaseRunner): ...@@ -46,6 +48,8 @@ class DLCRunner(BaseRunner):
self.max_num_workers = max_num_workers self.max_num_workers = max_num_workers
self.retry = retry self.retry = retry
self.eval_with_gpu = eval_with_gpu
logger = get_logger() logger = get_logger()
logger.warning( logger.warning(
'To ensure the integrity of the log results, the log displayed ' 'To ensure the integrity of the log results, the log displayed '
...@@ -93,19 +97,62 @@ class DLCRunner(BaseRunner): ...@@ -93,19 +97,62 @@ class DLCRunner(BaseRunner):
num_gpus = task.num_gpus num_gpus = task.num_gpus
task_name = task.name task_name = task.name
is_eval_task = 'OpenICLEval' in task_name
if is_eval_task and num_gpus == 0:
for check_name in self.eval_with_gpu:
if check_name in task_name:
num_gpus = 1
break
# Dump task config to file # Dump task config to file
mmengine.mkdir_or_exist('tmp/') mmengine.mkdir_or_exist('tmp/')
param_file = f'tmp/{os.getpid()}_params.py' param_file = f'tmp/{os.getpid()}_params.py'
pwd = os.getcwd()
try: try:
cfg.dump(param_file) cfg.dump(param_file)
if self.aliyun_cfg.get('bashrc_path') is not None:
# Build up DLC command # using user's conda env
pwd = os.getcwd() bashrc_path = self.aliyun_cfg['bashrc_path']
assert osp.exists(bashrc_path)
assert self.aliyun_cfg.get('conda_env_name') is not None
conda_env_name = self.aliyun_cfg['conda_env_name']
shell_cmd = (f'source {bashrc_path}; '
f'conda activate {conda_env_name}; ')
else:
# using public conda env
# users can also set `python_env_path` to their
# own env python path
assert self.aliyun_cfg.get('python_env_path') is not None
shell_cmd = ( shell_cmd = (
f'source {self.aliyun_cfg["bashrc_path"]}; ' f'export PATH={self.aliyun_cfg["python_env_path"]}/bin:$PATH; ' # noqa: E501
f'conda activate {self.aliyun_cfg["conda_env_name"]}; ' f'export PYTHONPATH={pwd}:$PYTHONPATH; ')
f'cd {pwd}; '
'{task_cmd}') huggingface_cache = self.aliyun_cfg.get('huggingface_cache')
if huggingface_cache is not None:
# HUGGINGFACE_HUB_CACHE is a Legacy env variable, here we set
# `HF_HUB_CACHE` and `HUGGINGFACE_HUB_CACHE` for bc
shell_cmd += f'export HF_HUB_CACHE={huggingface_cache}; '
shell_cmd += f'export HUGGINGFACE_HUB_CACHE={huggingface_cache}; ' # noqa: E501
torch_cache = self.aliyun_cfg.get('torch_cache')
if torch_cache is not None:
shell_cmd += f'export TORCH_HOME={torch_cache}; '
hf_offline = self.aliyun_cfg.get('hf_offline', True)
if hf_offline:
shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; ' # noqa: E501
http_proxy = self.aliyun_cfg.get('http_proxy')
if http_proxy is not None:
shell_cmd += f'export http_proxy={http_proxy}; export https_proxy={http_proxy}; ' # noqa: E501
shell_cmd += f'export HTTP_PROXY={http_proxy}; export HTTPS_PROXY={http_proxy}; ' # noqa: E501
hf_endpoint = self.aliyun_cfg.get('hf_endpoint')
if hf_endpoint is not None:
shell_cmd += f'export HF_ENDPOINT={hf_endpoint}; '
shell_cmd += f'cd {pwd}; '
shell_cmd += '{task_cmd}'
tmpl = ('dlc create job' tmpl = ('dlc create job'
f" --command '{shell_cmd}'" f" --command '{shell_cmd}'"
...@@ -114,11 +161,10 @@ class DLCRunner(BaseRunner): ...@@ -114,11 +161,10 @@ class DLCRunner(BaseRunner):
f" -c {self.aliyun_cfg['dlc_config_path']}" f" -c {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}" f" --workspace_id {self.aliyun_cfg['workspace_id']}"
' --worker_count 1' ' --worker_count 1'
f' --worker_cpu {max(num_gpus * 6, 8)}' f' --worker_cpu {max(num_gpus * 8, 32)}'
f' --worker_gpu {num_gpus}' f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 64, 48)}' f' --worker_memory {max(num_gpus * 128, 256)}'
f" --worker_image {self.aliyun_cfg['worker_image']}" f" --worker_image {self.aliyun_cfg['worker_image']}")
' --interactive')
get_cmd = partial(task.get_command, get_cmd = partial(task.get_command,
cfg_path=param_file, cfg_path=param_file,
template=tmpl) template=tmpl)
...@@ -139,77 +185,64 @@ class DLCRunner(BaseRunner): ...@@ -139,77 +185,64 @@ class DLCRunner(BaseRunner):
time.sleep(random.randint(0, 10)) time.sleep(random.randint(0, 10))
def _run_within_retry(): def _run_within_retry():
try: output = subprocess.getoutput(cmd)
process = subprocess.Popen(cmd, match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output)
shell=True, if match is None:
text=True, raise RuntimeError(
stdout=subprocess.PIPE, f'Failed to launch dlc job for {output}')
stderr=subprocess.PIPE)
job_id = None
job_allocated = False
job_finished = False
last_end_time = datetime.datetime.now().strftime(
'%Y-%m-%dT%H:%M:%SZ')
while True:
if not job_allocated:
line = process.stdout.readline()
if not line:
break
match = re.search(r'(dlc[0-9a-z]+)', line)
if match and job_id is None:
job_id = match.group(1)
stdout.write(line)
match = re.search(r'Job .* is \[Running\]', line)
if match:
job_allocated = True
else: else:
job_id = match.group(1)
stdout.write(output)
pod_create_time = None
pri_time = None
initial_time = datetime.datetime.now()
while True:
# 1. Avoid to request dlc too frequently.
# 2. DLC job may not be ready immediately after creation.
for _ in range(5):
time.sleep(2)
try: try:
process.wait(10) job_info = json.loads(
except subprocess.TimeoutExpired: subprocess.getoutput(f'dlc get job {job_id}'))
break
except: # noqa: E722
pass pass
else: else:
job_finished = True raise RuntimeError(
if job_finished: f'Failed to get job info for {job_id}')
this_end_time = datetime.datetime.now(
).strftime('%Y-%m-%dT%H:%M:%SZ') status = job_info['Status']
else: if status == 'Failed':
this_end_time = ( return -1
datetime.datetime.now() - elif status == 'Succeeded':
datetime.timedelta(seconds=10) return 0
).strftime('%Y-%m-%dT%H:%M:%SZ') elif status != 'Running':
logs_cmd = ( continue
'dlc logs'
# The pod time could be different from the real time.
# Therefore we need to extract the pod start time from
# the `job_info` and calculate the `start_time` and
# `end_time` in pod.
if pod_create_time is None:
pod_create_time = job_info['GmtCreateTime']
pri_time = pod_create_time
pod_create_time = datetime.datetime.strptime(
pod_create_time, '%Y-%m-%dT%H:%M:%SZ')
elasped_time = datetime.datetime.now() - initial_time
cur_time = (pod_create_time +
elasped_time).strftime('%Y-%m-%dT%H:%M:%SZ')
logs_cmd = ('dlc logs'
f' {job_id} {job_id}-worker-0' f' {job_id} {job_id}-worker-0'
f' --start_time {last_end_time}' f" -c {self.aliyun_cfg['dlc_config_path']}"
f' --end_time {this_end_time}' f' --start_time {pri_time}'
f" -c {self.aliyun_cfg['dlc_config_path']}") f' --end_time {cur_time}')
log_process = subprocess.Popen( log_output = subprocess.getoutput(logs_cmd)
logs_cmd,
shell=True, if '[WARN] No logs found for the pod' not in log_output:
text=True, pri_time = cur_time
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
log_output, log_err = log_process.communicate()
log_output = '\n'.join(log_output.split('\n')[2:])
stdout.write(log_output) stdout.write(log_output)
last_end_time = this_end_time
stdout.flush() stdout.flush()
if job_finished:
break
process.wait()
return process.returncode
finally:
if job_id is not None:
cancel_cmd = (
'dlc stop job'
f' {job_id}'
f" -c {self.aliyun_cfg['dlc_config_path']}"
' -f')
subprocess.run(cancel_cmd,
shell=True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return_code = _run_within_retry() return_code = _run_within_retry()
retry = self.retry retry = self.retry
......
...@@ -6,7 +6,8 @@ from mmengine.config import Config, ConfigDict ...@@ -6,7 +6,8 @@ from mmengine.config import Config, ConfigDict
from opencompass.openicl.icl_inferencer import (AgentInferencer, from opencompass.openicl.icl_inferencer import (AgentInferencer,
ChatInferencer, CLPInferencer, ChatInferencer, CLPInferencer,
GenInferencer, PPLInferencer, GenInferencer, LLInferencer,
PPLInferencer,
PPLOnlyInferencer) PPLOnlyInferencer)
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
from opencompass.utils import (Menu, build_dataset_from_cfg, from opencompass.utils import (Menu, build_dataset_from_cfg,
...@@ -81,14 +82,15 @@ def print_prompts(model_cfg, dataset_cfg, count=1): ...@@ -81,14 +82,15 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
supported_inferencer = [ supported_inferencer = [
AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer, AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer,
PPLOnlyInferencer, ChatInferencer PPLOnlyInferencer, ChatInferencer, LLInferencer
] ]
if infer_cfg.inferencer.type not in supported_inferencer: if infer_cfg.inferencer.type not in supported_inferencer:
print(f'Only {supported_inferencer} are supported') print(f'Only {supported_inferencer} are supported')
return return
for idx in range(min(count, len(ice_idx_list))): for idx in range(min(count, len(ice_idx_list))):
if issubclass(infer_cfg.inferencer.type, PPLInferencer): if issubclass(infer_cfg.inferencer.type,
(PPLInferencer, LLInferencer)):
labels = retriever.get_labels(ice_template=ice_template, labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template) prompt_template=prompt_template)
ice = retriever.generate_ice(ice_idx_list[idx], ice = retriever.generate_ice(ice_idx_list[idx],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment