[Sync] Sync Internal (#941)

b03d5dc5 · Fengzhe Zhou · GitHub · bbec7d87 · b03d5dc5 · b03d5dc5
Unverified Commit b03d5dc5 authored Mar 04, 2024 by Fengzhe Zhou Committed by GitHub Mar 04, 2024
13 changed files
--- a/configs/summarizers/compassbench_v1_reason.py
+++ b/configs/summarizers/compassbench_v1_reason.py
@@ -12,36 +12,9 @@ compassbench_v1_reason_groups = [

 summarizer = dict(
    dataset_abbrs=[
-        ['reasonbench', 'acc_origin'],
-        ['reasonbench_cn_circular', 'acc_origin'],
-        ['reasonbench_en_circular', 'acc_origin'],
-
-        ['reasonbench_cn_commonsense_circular', 'acc_origin'],
-        ['reasonbench_cn_abductive_circular', 'acc_origin'],
-        ['reasonbench_cn_deductive_circular', 'acc_origin'],
-        ['reasonbench_cn_inductive_circular', 'acc_origin'],
-        ['reasonbench_en_commonsense_circular', 'acc_origin'],
-        ['reasonbench_en_abductive_circular', 'acc_origin'],
-        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
-        ['reasonbench_en_inductive_circular', 'acc_origin'],
-
-        ['reasonbench_cn_commonsense_circular', 'acc_origin'],
-        ['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
-        ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
-        ['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
-        ['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
-        ['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
-        ['reasonbench_en_commonsense_circular', 'acc_origin'],
-        ['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
-        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
-        ['reasonbench_en_inductive_deer_circular', 'acc_origin'],
-        ['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],
-
-
        ['reasonbench', 'perf_circular'],
        ['reasonbench_cn_circular', 'perf_circular'],
        ['reasonbench_en_circular', 'perf_circular'],
-
        ['reasonbench_cn_commonsense_circular', 'perf_circular'],
        ['reasonbench_cn_abductive_circular', 'perf_circular'],
        ['reasonbench_cn_deductive_circular', 'perf_circular'],
@@ -50,18 +23,6 @@ summarizer = dict(
        ['reasonbench_en_abductive_circular', 'perf_circular'],
        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
        ['reasonbench_en_inductive_circular', 'perf_circular'],
-
-        ['reasonbench_cn_commonsense_circular', 'perf_circular'],
-        ['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
-        ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
-        ['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
-        ['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
-        ['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
-        ['reasonbench_en_commonsense_circular', 'perf_circular'],
-        ['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
-        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
-        ['reasonbench_en_inductive_deer_circular', 'perf_circular'],
-        ['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
    ],
    summary_groups=compassbench_v1_reason_groups,
 )
--- a/configs/summarizers/groups/plugineval.py
+++ b/configs/summarizers/groups/plugineval.py
@@ -39,6 +39,22 @@ _base_summary_groups = [
            ['plugin_eval-review_str_v1', 'review_quality'],
        ],
    },
+    {
+        'name': 'plugin_eval_one_review',
+        'subsets': [
+            ['plugin_eval-instruct_v1', 'format_metric'],
+            ['plugin_eval-instruct_v1', 'args_em_metric'],
+            ['plugin_eval-plan_str_v1', 'f1_score'],
+            ['plugin_eval-plan_json_v1', 'f1_score'],
+            ['plugin_eval-reason_str_v1', 'thought'],
+            ['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],
+            ['plugin_eval-retrieve_str_v1', 'name'],
+            ['plugin_eval-reason_retrieve_understand_json_v1', 'name'],
+            ['plugin_eval-understand_str_v1', 'args'],
+            ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
+            ['plugin_eval-review_str_v1', 'review_quality'],
+        ]
+    },
    {
        'name': 'plugin_eval',
        'subsets': [
@@ -53,7 +69,6 @@ _base_summary_groups = [
            ['plugin_eval-understand_str_v1', 'args'],
            ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
            ['plugin_eval-review_str_v1', 'review_quality'],
-            ['copy_plugin_eval-review_str_v1', 'naive_average'],  # a hack for review * 2
        ]
    },
 ]

--- a/opencompass/datasets/IFEval/instructions_util.py
+++ b/opencompass/datasets/IFEval/instructions_util.py
@@ -20,16 +20,12 @@ import functools
 import random
 import re

-try:
-    import immutabledict
-except ImportError:
-    immutabledict = None
 import nltk

 WORD_LIST = ['western', 'sentence', 'signal', 'dump', 'spot', 'opposite', 'bottom', 'potato', 'administration', 'working', 'welcome', 'morning', 'good', 'agency', 'primary', 'wish', 'responsibility', 'press', 'problem', 'president', 'steal', 'brush', 'read', 'type', 'beat', 'trainer', 'growth', 'lock', 'bone', 'case', 'equal', 'comfortable', 'region', 'replacement', 'performance', 'mate', 'walk', 'medicine', 'film', 'thing', 'rock', 'tap', 'total', 'competition', 'ease', 'south', 'establishment', 'gather', 'parking', 'world', 'plenty', 'breath', 'claim', 'alcohol', 'trade', 'dear', 'highlight', 'street', 'matter', 'decision', 'mess', 'agreement', 'studio', 'coach', 'assist', 'brain', 'wing', 'style', 'private', 'top', 'brown', 'leg', 'buy', 'procedure', 'method', 'speed', 'high', 'company', 'valuable', 'pie', 'analyst', 'session', 'pattern', 'district', 'pleasure', 'dinner', 'swimming', 'joke', 'order', 'plate', 'department', 'motor', 'cell', 'spend', 'cabinet', 'difference', 'power', 'examination', 'engine', 'horse', 'dimension', 'pay', 'toe', 'curve', 'literature', 'bother', 'fire', 'possibility', 'debate', 'activity', 'passage', 'hello', 'cycle', 'background', 'quiet', 'author', 'effect', 'actor', 'page', 'bicycle', 'error', 'throat', 'attack', 'character', 'phone', 'tea', 'increase', 'outcome', 'file', 'specific', 'inspector', 'internal', 'potential', 'staff', 'building', 'employer', 'shoe', 'hand', 'direction', 'garden', 'purchase', 'interview', 'study', 'recognition', 'member', 'spiritual', 'oven', 'sandwich', 'weird', 'passenger', 'particular', 'response', 'reaction', 'size', 'variation', 'a', 'cancel', 'candy', 'exit', 'guest', 'condition', 'fly', 'price', 'weakness', 'convert', 'hotel', 'great', 'mouth', 'mind', 'song', 'sugar', 'suspect', 'telephone', 'ear', 'roof', 'paint', 'refrigerator', 'organization', 'jury', 'reward', 'engineering', 'day', 'possession', 'crew', 'bar', 'road', 'description', 'celebration', 'score', 'mark', 'letter', 'shower', 
'suggestion', 'sir', 'luck', 'national', 'progress', 'hall', 'stroke', 'theory', 'offer', 'story', 'tax', 'definition', 'history', 'ride', 'medium', 'opening', 'glass', 'elevator', 'stomach', 'question', 'ability', 'leading', 'village', 'computer', 'city', 'grand', 'confidence', 'candle', 'priest', 'recommendation', 'point', 'necessary', 'body', 'desk', 'secret', 'horror', 'noise', 'culture', 'warning', 'water', 'round', 'diet', 'flower', 'bus', 'tough', 'permission', 'week', 'prompt', 'connection', 'abuse', 'height', 'save', 'corner', 'border', 'stress', 'drive', 'stop', 'rip', 'meal', 'listen', 'confusion', 'girlfriend', 'living', 'relation', 'significance', 'plan', 'creative', 'atmosphere', 'blame', 'invite', 'housing', 'paper', 'drink', 'roll', 'silver', 'drunk', 'age', 'damage', 'smoke', 'environment', 'pack', 'savings', 'influence', 'tourist', 'rain', 'post', 'sign', 'grandmother', 'run', 'profit', 'push', 'clerk', 'final', 'wine', 'swim', 'pause', 'stuff', 'singer', 'funeral', 'average', 'source', 'scene', 'tradition', 'personal', 'snow', 'nobody', 'distance', 'sort', 'sensitive', 'animal', 'major', 'negotiation', 'click', 'mood', 'period', 'arrival', 'expression', 'holiday', 'repeat', 'dust', 'closet', 'gold', 'bad', 'sail', 'combination', 'clothes', 'emphasis', 'duty', 'black', 'step', 'school', 'jump', 'document', 'professional', 'lip', 'chemical', 'front', 'wake', 'while', 'inside', 'watch', 'row', 'subject', 'penalty', 'balance', 'possible', 'adult', 'aside', 'sample', 'appeal', 'wedding', 'depth', 'king', 'award', 'wife', 'blow', 'site', 'camp', 'music', 'safe', 'gift', 'fault', 'guess', 'act', 'shame', 'drama', 'capital', 'exam', 'stupid', 'record', 'sound', 'swing', 'novel', 'minimum', 'ratio', 'machine', 'shape', 'lead', 'operation', 'salary', 'cloud', 'affair', 'hit', 'chapter', 'stage', 'quantity', 'access', 'army', 'chain', 'traffic', 'kick', 'analysis', 'airport', 'time', 'vacation', 'philosophy', 'ball', 'chest', 'thanks', 'place', 'mountain', 
'advertising', 'red', 'past', 'rent', 'return', 'tour', 'house', 'construction', 'net', 'native', 'war', 'figure', 'fee', 'spray', 'user', 'dirt', 'shot', 'task', 'stick', 'friend', 'software', 'promotion', 'interaction', 'surround', 'block', 'purpose', 'practice', 'conflict', 'routine', 'requirement', 'bonus', 'hole', 'state', 'junior', 'sweet', 'catch', 'tear', 'fold', 'wall', 'editor', 'life', 'position', 'pound', 'respect', 'bathroom', 'coat', 'script', 'job', 'teach', 'birth', 'view', 'resolve', 'theme', 'employee', 'doubt', 'market', 'education', 'serve', 'recover', 'tone', 'harm', 'miss', 'union', 'understanding', 'cow', 'river', 'association', 'concept', 'training', 'recipe', 'relationship', 'reserve', 'depression', 'proof', 'hair', 'revenue', 'independent', 'lift', 'assignment', 'temporary', 'amount', 'loss', 'edge', 'track', 'check', 'rope', 'estimate', 'pollution', 'stable', 'message', 'delivery', 'perspective', 'mirror', 'assistant', 'representative', 'witness', 'nature', 'judge', 'fruit', 'tip', 'devil', 'town', 'emergency', 'upper', 'drop', 'stay', 'human', 'neck', 'speaker', 'network', 'sing', 'resist', 'league', 'trip', 'signature', 'lawyer', 'importance', 'gas', 'choice', 'engineer', 'success', 'part', 'external', 'worker', 'simple', 'quarter', 'student', 'heart', 'pass', 'spite', 'shift', 'rough', 'lady', 'grass', 'community', 'garage', 'youth', 'standard', 'skirt', 'promise', 'blind', 'television', 'disease', 'commission', 'positive', 'energy', 'calm', 'presence', 'tune', 'basis', 'preference', 'head', 'common', 'cut', 'somewhere', 'presentation', 'current', 'thought', 'revolution', 'effort', 'master', 'implement', 'republic', 'floor', 'principle', 'stranger', 'shoulder', 'grade', 'button', 'tennis', 'police', 'collection', 'account', 'register', 'glove', 'divide', 'professor', 'chair', 'priority', 'combine', 'peace', 'extension', 'maybe', 'evening', 'frame', 'sister', 'wave', 'code', 'application', 'mouse', 'match', 'counter', 'bottle', 'half', 
'cheek', 'resolution', 'back', 'knowledge', 'make', 'discussion', 'screw', 'length', 'accident', 'battle', 'dress', 'knee', 'log', 'package', 'it', 'turn', 'hearing', 'newspaper', 'layer', 'wealth', 'profile', 'imagination', 'answer', 'weekend', 'teacher', 'appearance', 'meet', 'bike', 'rise', 'belt', 'crash', 'bowl', 'equivalent', 'support', 'image', 'poem', 'risk', 'excitement', 'remote', 'secretary', 'public', 'produce', 'plane', 'display', 'money', 'sand', 'situation', 'punch', 'customer', 'title', 'shake', 'mortgage', 'option', 'number', 'pop', 'window', 'extent', 'nothing', 'experience', 'opinion', 'departure', 'dance', 'indication', 'boy', 'material', 'band', 'leader', 'sun', 'beautiful', 'muscle', 'farmer', 'variety', 'fat', 'handle', 'director', 'opportunity', 'calendar', 'outside', 'pace', 'bath', 'fish', 'consequence', 'put', 'owner', 'go', 'doctor', 'information', 'share', 'hurt', 'protection', 'career', 'finance', 'force', 'golf', 'garbage', 'aspect', 'kid', 'food', 'boot', 'milk', 'respond', 'objective', 'reality', 'raw', 'ring', 'mall', 'one', 'impact', 'area', 'news', 'international', 'series', 'impress', 'mother', 'shelter', 'strike', 'loan', 'month', 'seat', 'anything', 'entertainment', 'familiar', 'clue', 'year', 'glad', 'supermarket', 'natural', 'god', 'cost', 'conversation', 'tie', 'ruin', 'comfort', 'earth', 'storm', 'percentage', 'assistance', 'budget', 'strength', 'beginning', 'sleep', 'other', 'young', 'unit', 'fill', 'store', 'desire', 'hide', 'value', 'cup', 'maintenance', 'nurse', 'function', 'tower', 'role', 'class', 'camera', 'database', 'panic', 'nation', 'basket', 'ice', 'art', 'spirit', 'chart', 'exchange', 'feedback', 'statement', 'reputation', 'search', 'hunt', 'exercise', 'nasty', 'notice', 'male', 'yard', 'annual', 'collar', 'date', 'platform', 'plant', 'fortune', 'passion', 'friendship', 'spread', 'cancer', 'ticket', 'attitude', 'island', 'active', 'object', 'service', 'buyer', 'bite', 'card', 'face', 'steak', 'proposal', 
'patient', 'heat', 'rule', 'resident', 'broad', 'politics', 'west', 'knife', 'expert', 'girl', 'design', 'salt', 'baseball', 'grab', 'inspection', 'cousin', 'couple', 'magazine', 'cook', 'dependent', 'security', 'chicken', 'version', 'currency', 'ladder', 'scheme', 'kitchen', 'employment', 'local', 'attention', 'manager', 'fact', 'cover', 'sad', 'guard', 'relative', 'county', 'rate', 'lunch', 'program', 'initiative', 'gear', 'bridge', 'breast', 'talk', 'dish', 'guarantee', 'beer', 'vehicle', 'reception', 'woman', 'substance', 'copy', 'lecture', 'advantage', 'park', 'cold', 'death', 'mix', 'hold', 'scale', 'tomorrow', 'blood', 'request', 'green', 'cookie', 'church', 'strip', 'forever', 'beyond', 'debt', 'tackle', 'wash', 'following', 'feel', 'maximum', 'sector', 'sea', 'property', 'economics', 'menu', 'bench', 'try', 'language', 'start', 'call', 'solid', 'address', 'income', 'foot', 'senior', 'honey', 'few', 'mixture', 'cash', 'grocery', 'link', 'map', 'form', 'factor', 'pot', 'model', 'writer', 'farm', 'winter', 'skill', 'anywhere', 'birthday', 'policy', 'release', 'husband', 'lab', 'hurry', 'mail', 'equipment', 'sink', 'pair', 'driver', 'consideration', 'leather', 'skin', 'blue', 'boat', 'sale', 'brick', 'two', 'feed', 'square', 'dot', 'rush', 'dream', 'location', 'afternoon', 'manufacturer', 'control', 'occasion', 'trouble', 'introduction', 'advice', 'bet', 'eat', 'kill', 'category', 'manner', 'office', 'estate', 'pride', 'awareness', 'slip', 'crack', 'client', 'nail', 'shoot', 'membership', 'soft', 'anybody', 'web', 'official', 'individual', 'pizza', 'interest', 'bag', 'spell', 'profession', 'queen', 'deal', 'resource', 'ship', 'guy', 'chocolate', 'joint', 'formal', 'upstairs', 'car', 'resort', 'abroad', 'dealer', 'associate', 'finger', 'surgery', 'comment', 'team', 'detail', 'crazy', 'path', 'tale', 'initial', 'arm', 'radio', 'demand', 'single', 'draw', 'yellow', 'contest', 'piece', 'quote', 'pull', 'commercial', 'shirt', 'contribution', 'cream', 'channel', 
'suit', 'discipline', 'instruction', 'concert', 'speech', 'low', 'effective', 'hang', 'scratch', 'industry', 'breakfast', 'lay', 'join', 'metal', 'bedroom', 'minute', 'product', 'rest', 'temperature', 'many', 'give', 'argument', 'print', 'purple', 'laugh', 'health', 'credit', 'investment', 'sell', 'setting', 'lesson', 'egg', 'middle', 'marriage', 'level', 'evidence', 'phrase', 'love', 'self', 'benefit', 'guidance', 'affect', 'you', 'dad', 'anxiety', 'special', 'boyfriend', 'test', 'blank', 'payment', 'soup', 'obligation', 'reply', 'smile', 'deep', 'complaint', 'addition', 'review', 'box', 'towel', 'minor', 'fun', 'soil', 'issue', 'cigarette', 'internet', 'gain', 'tell', 'entry', 'spare', 'incident', 'family', 'refuse', 'branch', 'can', 'pen', 'grandfather', 'constant', 'tank', 'uncle', 'climate', 'ground', 'volume', 'communication', 'kind', 'poet', 'child', 'screen', 'mine', 'quit', 'gene', 'lack', 'charity', 'memory', 'tooth', 'fear', 'mention', 'marketing', 'reveal', 'reason', 'court', 'season', 'freedom', 'land', 'sport', 'audience', 'classroom', 'law', 'hook', 'win', 'carry', 'eye', 'smell', 'distribution', 'research', 'country', 'dare', 'hope', 'whereas', 'stretch', 'library', 'if', 'delay', 'college', 'plastic', 'book', 'present', 'use', 'worry', 'champion', 'goal', 'economy', 'march', 'election', 'reflection', 'midnight', 'slide', 'inflation', 'action', 'challenge', 'guitar', 'coast', 'apple', 'campaign', 'field', 'jacket', 'sense', 'way', 'visual', 'remove', 'weather', 'trash', 'cable', 'regret', 'buddy', 'beach', 'historian', 'courage', 'sympathy', 'truck', 'tension', 'permit', 'nose', 'bed', 'son', 'person', 'base', 'meat', 'usual', 'air', 'meeting', 'worth', 'game', 'independence', 'physical', 'brief', 'play', 'raise', 'board', 'she', 'key', 'writing', 'pick', 'command', 'party', 'yesterday', 'spring', 'candidate', 'physics', 'university', 'concern', 'development', 'change', 'string', 'target', 'instance', 'room', 'bitter', 'bird', 'football', 'normal', 
'split', 'impression', 'wood', 'long', 'meaning', 'stock', 'cap', 'leadership', 'media', 'ambition', 'fishing', 'essay', 'salad', 'repair', 'today', 'designer', 'night', 'bank', 'drawing', 'inevitable', 'phase', 'vast', 'chip', 'anger', 'switch', 'cry', 'twist', 'personality', 'attempt', 'storage', 'being', 'preparation', 'bat', 'selection', 'white', 'technology', 'contract', 'side', 'section', 'station', 'till', 'structure', 'tongue', 'taste', 'truth', 'difficulty', 'group', 'limit', 'main', 'move', 'feeling', 'light', 'example', 'mission', 'might', 'wait', 'wheel', 'shop', 'host', 'classic', 'alternative', 'cause', 'agent', 'consist', 'table', 'airline', 'text', 'pool', 'craft', 'range', 'fuel', 'tool', 'partner', 'load', 'entrance', 'deposit', 'hate', 'article', 'video', 'summer', 'feature', 'extreme', 'mobile', 'hospital', 'flight', 'fall', 'pension', 'piano', 'fail', 'result', 'rub', 'gap', 'system', 'report', 'suck', 'ordinary', 'wind', 'nerve', 'ask', 'shine', 'note', 'line', 'mom', 'perception', 'brother', 'reference', 'bend', 'charge', 'treat', 'trick', 'term', 'homework', 'bake', 'bid', 'status', 'project', 'strategy', 'orange', 'let', 'enthusiasm', 'parent', 'concentrate', 'device', 'travel', 'poetry', 'business', 'society', 'kiss', 'end', 'vegetable', 'employ', 'schedule', 'hour', 'brave', 'focus', 'process', 'movie', 'illegal', 'general', 'coffee', 'ad', 'highway', 'chemistry', 'psychology', 'hire', 'bell', 'conference', 'relief', 'show', 'neat', 'funny', 'weight', 'quality', 'club', 'daughter', 'zone', 'touch', 'tonight', 'shock', 'burn', 'excuse', 'name', 'survey', 'landscape', 'advance', 'satisfaction', 'bread', 'disaster', 'item', 'hat', 'prior', 'shopping', 'visit', 'east', 'photo', 'home', 'idea', 'father', 'comparison', 'cat', 'pipe', 'winner', 'count', 'lake', 'fight', 'prize', 'foundation', 'dog', 'keep', 'ideal', 'fan', 'struggle', 'peak', 'safety', 'solution', 'hell', 'conclusion', 'population', 'strain', 'alarm', 'measurement', 'second', 
'train', 'race', 'due', 'insurance', 'boss', 'tree', 'monitor', 'sick', 'course', 'drag', 'appointment', 'slice', 'still', 'care', 'patience', 'rich', 'escape', 'emotion', 'royal', 'female', 'childhood', 'government', 'picture', 'will', 'sock', 'big', 'gate', 'oil', 'cross', 'pin', 'improvement', 'championship', 'silly', 'help', 'sky', 'pitch', 'man', 'diamond', 'most', 'transition', 'work', 'science', 'committee', 'moment', 'fix', 'teaching', 'dig', 'specialist', 'complex', 'guide', 'people', 'dead', 'voice', 'original', 'break', 'topic', 'data', 'degree', 'reading', 'recording', 'bunch', 'reach', 'judgment', 'lie', 'regular', 'set', 'painting', 'mode', 'list', 'player', 'bear', 'north', 'wonder', 'carpet', 'heavy', 'officer', 'negative', 'clock', 'unique', 'baby', 'pain', 'assumption', 'disk', 'iron', 'bill', 'drawer', 'look', 'double', 'mistake', 'finish', 'future', 'brilliant', 'contact', 'math', 'rice', 'leave', 'restaurant', 'discount', 'sex', 'virus', 'bit', 'trust', 'event', 'wear', 'juice', 'failure', 'bug', 'context', 'mud', 'whole', 'wrap', 'intention', 'draft', 'pressure', 'cake', 'dark', 'explanation', 'space', 'angle', 'word', 'efficiency', 'management', 'habit', 'star', 'chance', 'finding', 'transportation', 'stand', 'criticism', 'flow', 'door', 'injury', 'insect', 'surprise', 'apartment']  # pylint: disable=line-too-long

 # ISO 639-1 codes to language names.
-LANGUAGE_CODES = immutabledict.immutabledict({
+LANGUAGE_CODES = {
    'en': 'English',
    'es': 'Spanish',
    'pt': 'Portuguese',
@@ -60,7 +56,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
    'pa': 'Punjabi',
    'ml': 'Malayalam',
    'fi': 'Finnish',
-})
+}

 _ALPHABETS = '([A-Za-z])'
 _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'

--- a/opencompass/datasets/TheoremQA.py
+++ b/opencompass/datasets/TheoremQA.py
@@ -24,3 +24,15 @@ def TheoremQA_postprocess(text: str) -> str:
    else:
        text = matches[0].strip().strip('.,?!\"\';:')
        return text
+
+
+def TheoremQA_postprocess_v2(text: str) -> str:
+    """Extract the trailing answer phrase from the last line of ``text``.
+
+    The last line of the stripped text is split on single spaces and read
+    from right to left. Reading stops at the first token that equals
+    ``is``, ``be`` or ``are``, or that ends with a colon; the tokens to the
+    right of that stop token form the answer.
+
+    Args:
+        text (str): Raw model output.
+
+    Returns:
+        str: The extracted answer, with surrounding whitespace and then
+            leading/trailing periods removed. When no stop token occurs,
+            the whole last line is returned, cleaned the same way.
+    """
+    last_line = text.strip().strip('\n').split('\n')[-1]
+    kept = []
+    for token in reversed(last_line.split(' ')):
+        if token in ('is', 'be', 'are') or token.endswith(':'):
+            break
+        kept.append(token)
+    # Tokens were gathered right-to-left, so flip them back before joining.
+    answer = ' '.join(reversed(kept))
+    return answer.strip().strip('.')
--- a/opencompass/datasets/hellaswag.py
+++ b/opencompass/datasets/hellaswag.py
 import json
 import os.path as osp

-from datasets import Dataset
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -71,6 +71,32 @@ class hellaswagDataset_V3(BaseDataset):
        return dataset


+@LOAD_DATASET.register_module()
+class hellaswagDatasetwithICE(BaseDataset):
+    """HellaSwag loader that returns both a ``train`` and a ``val`` split.
+
+    NOTE(review): "ICE" presumably stands for in-context examples, with the
+    ``train`` split serving as the example pool - confirm against the
+    dataset configs that use this class.
+    """
+
+    @staticmethod
+    def load(path):
+        """Read the two HellaSwag jsonl files found under ``path``.
+
+        Args:
+            path (str): Directory containing
+                ``hellaswag_train_sampled25.jsonl`` and ``hellaswag.jsonl``.
+
+        Returns:
+            DatasetDict: ``train`` and ``val`` splits whose rows hold the
+                keys ``ctx``, ``A``, ``B``, ``C``, ``D`` and ``label``.
+        """
+        split_files = {
+            'train': 'hellaswag_train_sampled25.jsonl',
+            'val': 'hellaswag.jsonl',
+        }
+        dataset_dict = DatasetDict()
+        for split, filename in split_files.items():
+            rows = []
+            with open(osp.join(path, filename), 'r', encoding='utf-8') as f:
+                for raw in f:
+                    record = json.loads(raw)
+                    # Keep only what follows the first ': ' in the query.
+                    context = record['query'].split(': ', 1)[-1]
+                    choices = record['choices']
+                    row = {
+                        'ctx': context,
+                        'A': choices[0],
+                        'B': choices[1],
+                        'C': choices[2],
+                        'D': choices[3],
+                        # ``gold`` is an index into the four choices.
+                        'label': 'ABCD'[record['gold']],
+                    }
+                    rows.append(row)
+            dataset_dict[split] = Dataset.from_list(rows)
+        return dataset_dict
+
+
 class hellaswagDatasetClean(BaseDataset):

    # load the contamination annotations of CEval from

--- a/opencompass/datasets/humaneval.py
+++ b/opencompass/datasets/humaneval.py
@@ -156,10 +156,13 @@ def humaneval_postprocess_v2(text: str) -> str:
    """This is an advanced version of previous postprocess to handle more
    situations, better to use this one."""
    try:
-        # for chatGLM raw text
-        text = eval(text)
+        # for chatGLM related text
+        eval_text = eval(text)
    except Exception:
        pass
+    else:
+        if isinstance(eval_text, str):
+            text = eval_text
    text = text.lstrip('\n')
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)

--- a/opencompass/datasets/natural_question.py
+++ b/opencompass/datasets/natural_question.py
@@ -77,9 +77,10 @@ class NQEvaluator(BaseEvaluator):
        cnt = 0
        for pred, cand_ans in zip(processed_predictions, processed_answers):
            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
-            cnt += int(any([cand == pred for cand in cand_ans]))
-            if int(any([cand == pred for cand in cand_ans])):
-                detail['correct'] = True
+            # is_correct = any([cand == pred for cand in cand_ans])
+            is_correct = any([cand in pred for cand in cand_ans])
+            cnt += int(is_correct)
+            detail['correct'] = is_correct
            details.append(detail)
        score = cnt / len(predictions) * 100


--- a/opencompass/datasets/winogrande.py
+++ b/opencompass/datasets/winogrande.py
 import json
 import os

-from datasets import Dataset
+from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET

@@ -20,12 +20,12 @@ class winograndeDataset(BaseDataset):
            for line in f:
                line = json.loads(line)
                prompt = line['sentence']
-                continue_prompt = prompt.split('_')
+                continue_prompt = prompt.split('_')[1]
                data_item = {
                    'opt1': prompt.replace('_', line['option1']),
                    'opt2': prompt.replace('_', line['option2']),
                    'answer': line['answer'],
-                    'cont': continue_prompt[1]
+                    'cont': continue_prompt,
                }
                dataset_list.append(data_item)
        dataset_list = Dataset.from_list(dataset_list)
@@ -44,13 +44,43 @@ class winograndeDataset_V2(BaseDataset):
            for line in f:
                line = json.loads(line)
                prompt = line['sentence']
+                continue_prompt = prompt.split('_')[1]
                answer = line['answer']
                answer = ' AB'[int(answer)] if answer != '' else 'NULL'
                data_item = {
                    'opt1': prompt.replace('_', line['option1']),
                    'opt2': prompt.replace('_', line['option2']),
                    'answer': answer,
+                    'cont': continue_prompt,
                }
                dataset_list.append(data_item)
        dataset_list = Dataset.from_list(dataset_list)
        return dataset_list
+
+
+@LOAD_DATASET.register_module()
+class winograndeDataset_V3(BaseDataset):
+    """Winogrande loader that reads local jsonl files for two splits.
+
+    Rows have the same fields as those built by ``winograndeDataset_V2``,
+    but ``load`` returns a ``DatasetDict`` with ``train_xs`` and ``dev``.
+    """
+
+    @staticmethod
+    def load(path):
+        """Load ``train_xs.jsonl`` and ``dev.jsonl`` from ``path``.
+
+        Args:
+            path (str): Directory holding the two jsonl files.
+
+        Returns:
+            DatasetDict: One ``Dataset`` per split; every row carries
+                ``opt1``, ``opt2``, ``answer`` and ``cont``.
+        """
+        dataset_dict = DatasetDict()
+        for split in ('train_xs', 'dev'):
+            jsonl_path = os.path.join(path, f'{split}.jsonl')
+            rows = []
+            with open(jsonl_path, 'r', encoding='utf-8') as f:
+                for raw in f:
+                    record = json.loads(raw)
+                    sentence = record['sentence']
+                    # Text between the first and second '_' (normally the
+                    # remainder of the sentence after the blank).
+                    continuation = sentence.split('_')[1]
+                    raw_answer = record['answer']
+                    if raw_answer != '':
+                        # '1' -> 'A', '2' -> 'B'.
+                        answer = ' AB'[int(raw_answer)]
+                    else:
+                        answer = 'NULL'
+                    rows.append({
+                        'opt1': sentence.replace('_', record['option1']),
+                        'opt2': sentence.replace('_', record['option2']),
+                        'answer': answer,
+                        'cont': continuation,
+                    })
+            dataset_dict[split] = Dataset.from_list(rows)
+        return dataset_dict
--- a/opencompass/models/__init__.py
+++ b/opencompass/models/__init__.py
@@ -13,6 +13,7 @@ from .huggingface import HuggingFace  # noqa: F401, F403
 from .huggingface import HuggingFaceCausalLM  # noqa: F401, F403
 from .huggingface import HuggingFaceChatGLM3  # noqa: F401, F403
 from .intern_model import InternLM  # noqa: F401, F403
+from .krgpt_api import KrGPT  # noqa: F401
 from .lightllm_api import LightllmAPI  # noqa: F401
 from .llama2 import Llama2, Llama2Chat  # noqa: F401, F403
 from .lmdeploy_pytorch import LmdeployPytorchModel  # noqa: F401

--- a/opencompass/models/krgpt_api.py
+++ b/opencompass/models/krgpt_api.py
+import json
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Union
+
+import requests
+
+from opencompass.registry import MODELS
+from opencompass.utils.logging import get_logger
+from opencompass.utils.prompt import PromptList
+
+from .base_api import BaseAPIModel
+
+PromptType = Union[PromptList, str]
+
+
+@MODELS.register_module()
+class KrGPT(BaseAPIModel):
+    """API model wrapper that POSTs chat messages to a KrGPT endpoint.
+
+    Args:
+        path (str): Model identifier forwarded to the base class.
+            Defaults to 'KrGPT'.
+        url (str): Endpoint the chat requests are POSTed to.
+        max_seq_len (int): Forwarded to the base class. Defaults to 2048.
+        meta_template (Dict, optional): Forwarded to the base class.
+        retry (int): Forwarded to the base class; ``_generate`` reads
+            ``self.retry`` (presumably set there) as the number of attempts
+            made per prompt. Defaults to 2.
+        generation_kwargs (Dict, optional): Generation options. Only
+            ``max_new_tokens`` is read by this class (default 1024).
+            ``None`` is treated as an empty dict.
+    """
+
+    is_api: bool = True
+
+    def __init__(
+            self,
+            path: str = 'KrGPT',
+            url: str = 'http://101.69.162.5:9300/v1/chat/completions',
+            max_seq_len: int = 2048,
+            meta_template: Optional[Dict] = None,
+            retry: int = 2,
+            generation_kwargs: Optional[Dict] = None,
+    ):
+        # Build a fresh dict per instance rather than sharing one mutable
+        # default object between every KrGPT instance.
+        if generation_kwargs is None:
+            generation_kwargs = {}
+        super().__init__(
+            path=path,
+            max_seq_len=max_seq_len,
+            meta_template=meta_template,
+            retry=retry,
+            generation_kwargs=generation_kwargs,
+        )
+        self.logger = get_logger()
+        self.url = url
+        self.generation_kwargs = generation_kwargs
+        self.max_out_len = self.generation_kwargs.get('max_new_tokens', 1024)
+
+    def generate(self, inputs: List[str], max_out_len: int,
+                 **kwargs) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[str]): A list of strings or PromptDicts.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): Accepted for interface compatibility; the
+                value handed to ``_generate`` is ``self.max_out_len``.
+
+        Returns:
+            List[str]: A list of generated strings, in input order.
+        """
+
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._generate, inputs,
+                             [self.max_out_len] * len(inputs)))
+        return results
+
+    def _generate(self,
+                  input: PromptType,
+                  max_out_len: int,
+                  temperature: float = 0.0) -> str:
+        """Generate the reply for a single prompt.
+
+        Args:
+            input (PromptType): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output. Not
+                included in the request body, which carries only
+                ``messages``.
+            temperature (float): Sampling temperature. Not included in
+                the request body either.
+
+        Returns:
+            str: The generated string.
+
+        Raises:
+            RuntimeError: If none of the ``self.retry`` attempts produced
+                a usable response.
+        """
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            role_map = {
+                'HUMAN': 'user',
+                'BOT': 'assistant',
+                'SYSTEM': 'system',
+            }
+            messages = []
+            for item in input:
+                msg = {'content': item['prompt']}
+                # Items with an unrecognised role are sent without a
+                # 'role' key.
+                role = role_map.get(item['role'])
+                if role is not None:
+                    msg['role'] = role
+                messages.append(msg)
+
+        header = {'content-type': 'application/json'}
+        payload = json.dumps(dict(messages=messages))
+
+        # Every failed attempt consumes one retry, whatever the failure
+        # mode, so an unreachable or misbehaving endpoint cannot keep this
+        # loop spinning forever.
+        for _ in range(self.retry):
+            try:
+                raw_response = requests.post(self.url,
+                                             headers=header,
+                                             data=payload)
+            except requests.ConnectionError:
+                self.logger.error('Got connection error, retrying...')
+                continue
+            try:
+                response = raw_response.json()
+            except requests.JSONDecodeError:
+                self.logger.error('JsonDecode error, got %s',
+                                  str(raw_response.content))
+                continue
+            try:
+                return response['choices'][0]['message']['content'].strip()
+            except (KeyError, IndexError, TypeError, AttributeError):
+                # The body was valid JSON but not a completion payload
+                # (e.g. an error object or an empty ``choices`` list).
+                self.logger.error('Find error message in response: %s',
+                                  str(response))
+
+        raise RuntimeError('Calling KrGPT failed after retrying for '
+                           f'{self.retry} times. Check the logs for '
+                           'details.')
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -415,6 +415,21 @@ class OpenAIAllesAPIN(OpenAI):
                     self.logger.error(data)
                 else:
                     return choices[0]['message']['content'].strip()
+            # A content-filter rejection is reported in response['data'] as
+            # 'Error code: <n> - <dict literal>'; return its message instead
+            # of falling through to the error logging below.
+            try:
+                # The payload is text supplied by the server, so it is parsed
+                # as a literal rather than executed with eval().
+                import ast
+                match = re.match(r'Error code: \d+ - (.*)', response['data'])
+                err = ast.literal_eval(match.group(1))['error']
+                if err['code'] == 'content_filter' and err['status'] == 400:
+                    return err['message']
+            except Exception:
+                # Best effort: any response that does not match this shape
+                # is handled by the generic error path below.
+                pass
             self.logger.error(response['msg'])
             self.logger.error(response)
             time.sleep(1)

--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
 import datetime
+import json
 import os
 import os.path as osp
 import random
@@ -38,6 +39,7 @@ class DLCRunner(BaseRunner):
                 task: ConfigDict,
                 aliyun_cfg: ConfigDict,
                 max_num_workers: int = 32,
+                 eval_with_gpu: list = ['plugin_eval'],
                 retry: int = 2,
                 debug: bool = False,
                 lark_bot_url: str = None):
@@ -46,6 +48,8 @@ class DLCRunner(BaseRunner):
        self.max_num_workers = max_num_workers
        self.retry = retry

+        self.eval_with_gpu = eval_with_gpu
+
        logger = get_logger()
        logger.warning(
            'To ensure the integrity of the log results, the log displayed '
@@ -93,19 +97,62 @@ class DLCRunner(BaseRunner):
        num_gpus = task.num_gpus
        task_name = task.name

+        is_eval_task = 'OpenICLEval' in task_name
+        if is_eval_task and num_gpus == 0:
+            for check_name in self.eval_with_gpu:
+                if check_name in task_name:
+                    num_gpus = 1
+                    break
+
        # Dump task config to file
        mmengine.mkdir_or_exist('tmp/')
        param_file = f'tmp/{os.getpid()}_params.py'
+        pwd = os.getcwd()
        try:
            cfg.dump(param_file)
+            if self.aliyun_cfg.get('bashrc_path') is not None:
+                # using user's conda env
+                bashrc_path = self.aliyun_cfg['bashrc_path']
+                assert osp.exists(bashrc_path)
+                assert self.aliyun_cfg.get('conda_env_name') is not None
+                conda_env_name = self.aliyun_cfg['conda_env_name']
+                shell_cmd = (f'source {bashrc_path}; '
+                             f'conda activate {conda_env_name}; ')
+            else:
+                # using public conda env
+                # users can also set `python_env_path` to their
+                # own env python path
+                assert self.aliyun_cfg.get('python_env_path') is not None
+                shell_cmd = (
+                    f'export PATH={self.aliyun_cfg["python_env_path"]}/bin:$PATH; '  # noqa: E501
+                    f'export PYTHONPATH={pwd}:$PYTHONPATH; ')
+
+            huggingface_cache = self.aliyun_cfg.get('huggingface_cache')
+            if huggingface_cache is not None:
+                # HUGGINGFACE_HUB_CACHE is a Legacy env variable, here we set
+                # `HF_HUB_CACHE` and `HUGGINGFACE_HUB_CACHE` for bc
+                shell_cmd += f'export HF_HUB_CACHE={huggingface_cache}; '
+                shell_cmd += f'export HUGGINGFACE_HUB_CACHE={huggingface_cache}; '  # noqa: E501
+
+            torch_cache = self.aliyun_cfg.get('torch_cache')
+            if torch_cache is not None:
+                shell_cmd += f'export TORCH_HOME={torch_cache}; '
+
+            hf_offline = self.aliyun_cfg.get('hf_offline', True)
+            if hf_offline:
+                shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; '  # noqa: E501
+
+            http_proxy = self.aliyun_cfg.get('http_proxy')
+            if http_proxy is not None:
+                shell_cmd += f'export http_proxy={http_proxy}; export https_proxy={http_proxy}; '  # noqa: E501
+                shell_cmd += f'export HTTP_PROXY={http_proxy}; export HTTPS_PROXY={http_proxy}; '  # noqa: E501

-            # Build up DLC command
-            pwd = os.getcwd()
-            shell_cmd = (
-                f'source {self.aliyun_cfg["bashrc_path"]}; '
-                f'conda activate {self.aliyun_cfg["conda_env_name"]}; '
-                f'cd {pwd}; '
-                '{task_cmd}')
+            hf_endpoint = self.aliyun_cfg.get('hf_endpoint')
+            if hf_endpoint is not None:
+                shell_cmd += f'export HF_ENDPOINT={hf_endpoint}; '
+
+            shell_cmd += f'cd {pwd}; '
+            shell_cmd += '{task_cmd}'

            tmpl = ('dlc create job'
                    f" --command '{shell_cmd}'"
@@ -114,11 +161,10 @@ class DLCRunner(BaseRunner):
                    f" -c {self.aliyun_cfg['dlc_config_path']}"
                    f" --workspace_id {self.aliyun_cfg['workspace_id']}"
                    ' --worker_count 1'
-                    f' --worker_cpu {max(num_gpus * 6, 8)}'
+                    f' --worker_cpu {max(num_gpus * 8, 32)}'
                    f' --worker_gpu {num_gpus}'
-                    f' --worker_memory {max(num_gpus * 64, 48)}'
-                    f" --worker_image {self.aliyun_cfg['worker_image']}"
-                    ' --interactive')
+                    f' --worker_memory {max(num_gpus * 128, 256)}'
+                    f" --worker_image {self.aliyun_cfg['worker_image']}")
            get_cmd = partial(task.get_command,
                              cfg_path=param_file,
                              template=tmpl)
@@ -139,77 +185,92 @@ class DLCRunner(BaseRunner):
                 time.sleep(random.randint(0, 10))

             def _run_within_retry():
-                try:
-                    process = subprocess.Popen(cmd,
-                                               shell=True,
-                                               text=True,
-                                               stdout=subprocess.PIPE,
-                                               stderr=subprocess.PIPE)
-                    job_id = None
-                    job_allocated = False
-                    job_finished = False
-                    last_end_time = datetime.datetime.now().strftime(
-                        '%Y-%m-%dT%H:%M:%SZ')
-                    while True:
-                        if not job_allocated:
-                            line = process.stdout.readline()
-                            if not line:
-                                break
-                            match = re.search(r'(dlc[0-9a-z]+)', line)
-                            if match and job_id is None:
-                                job_id = match.group(1)
-                            stdout.write(line)
-                            match = re.search(r'Job .* is \[Running\]', line)
-                            if match:
-                                job_allocated = True
-                        else:
-                            try:
-                                process.wait(10)
-                            except subprocess.TimeoutExpired:
-                                pass
-                            else:
-                                job_finished = True
-                            if job_finished:
-                                this_end_time = datetime.datetime.now(
-                                ).strftime('%Y-%m-%dT%H:%M:%SZ')
-                            else:
-                                this_end_time = (
-                                    datetime.datetime.now() -
-                                    datetime.timedelta(seconds=10)
-                                ).strftime('%Y-%m-%dT%H:%M:%SZ')
-                            logs_cmd = (
-                                'dlc logs'
+                """Submit the DLC job, write its logs to `stdout` and wait.
+
+                Returns:
+                    int: 0 if the job status becomes ``Succeeded``, -1 if
+                        it becomes ``Failed`` or ``Stopped``.
+
+                Raises:
+                    RuntimeError: If no job id can be parsed from the
+                        submission output, or the job info cannot be parsed
+                        as JSON in 5 consecutive attempts.
+
+                Note:
+                    Nothing here stops the submitted job if this function
+                    exits early (e.g. through an exception).
+                """
+                output = subprocess.getoutput(cmd)
+                match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output)
+                if match is None:
+                    raise RuntimeError(
+                        f'Failed to launch dlc job for {output}')
+                else:
+                    job_id = match.group(1)
+                stdout.write(output)
+
+                # Query the job with the same dlc config file that is used
+                # to create it and to fetch its logs.
+                info_cmd = ('dlc get job'
+                            f' {job_id}'
+                            f" -c {self.aliyun_cfg['dlc_config_path']}")
+
+                pod_create_time = None
+                pri_time = None
+                initial_time = datetime.datetime.now()
+                while True:
+                    # 1. Avoid requesting dlc too frequently.
+                    # 2. DLC job may not be ready immediately after creation.
+                    for _ in range(5):
+                        time.sleep(2)
+                        try:
+                            job_info = json.loads(
+                                subprocess.getoutput(info_cmd))
+                            break
+                        except ValueError:
+                            # Output was not valid JSON; try again. A bare
+                            # `except` would also swallow KeyboardInterrupt.
+                            pass
+                    else:
+                        raise RuntimeError(
+                            f'Failed to get job info for {job_id}')
+
+                    status = job_info['Status']
+                    # `Stopped` is a terminal status as well; without it the
+                    # loop would keep polling a cancelled job forever.
+                    if status in ('Failed', 'Stopped'):
+                        return -1
+                    elif status == 'Succeeded':
+                        return 0
+                    elif status != 'Running':
+                        # Neither finished nor running: poll again.
+                        continue
+
+                    # The pod time could be different from the real time.
+                    # Therefore we need to extract the pod start time from
+                    # the `job_info` and calculate the `start_time` and
+                    # `end_time` in pod.
+                    if pod_create_time is None:
+                        pod_create_time = job_info['GmtCreateTime']
+                        pri_time = pod_create_time
+                        pod_create_time = datetime.datetime.strptime(
+                            pod_create_time, '%Y-%m-%dT%H:%M:%SZ')
+                    elapsed_time = datetime.datetime.now() - initial_time
+                    cur_time = (pod_create_time +
+                                elapsed_time).strftime('%Y-%m-%dT%H:%M:%SZ')
+                    logs_cmd = ('dlc logs'
                                 f' {job_id} {job_id}-worker-0'
-                                f' --start_time {last_end_time}'
-                                f' --end_time {this_end_time}'
-                                f" -c {self.aliyun_cfg['dlc_config_path']}")
-                            log_process = subprocess.Popen(
-                                logs_cmd,
-                                shell=True,
-                                text=True,
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE)
-                            log_output, log_err = log_process.communicate()
-                            log_output = '\n'.join(log_output.split('\n')[2:])
-                            stdout.write(log_output)
-                            last_end_time = this_end_time
+                                f" -c {self.aliyun_cfg['dlc_config_path']}"
+                                f' --start_time {pri_time}'
+                                f' --end_time {cur_time}')
+                    log_output = subprocess.getoutput(logs_cmd)
+
+                    # Advance the window only when logs came back, so the
+                    # next request starts where this one ended.
+                    if '[WARN] No logs found for the pod' not in log_output:
+                        pri_time = cur_time
+                        stdout.write(log_output)
                         stdout.flush()
-                        if job_finished:
-                            break
-                    process.wait()
-                    return process.returncode
-                finally:
-                    if job_id is not None:
-                        cancel_cmd = (
-                            'dlc stop job'
-                            f' {job_id}'
-                            f" -c {self.aliyun_cfg['dlc_config_path']}"
-                            ' -f')
-                        subprocess.run(cancel_cmd,
-                                       shell=True,
-                                       text=True,
-                                       stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE)

             return_code = _run_within_retry()
             retry = self.retry

--- a/tools/prompt_viewer.py
+++ b/tools/prompt_viewer.py
@@ -6,7 +6,8 @@ from mmengine.config import Config, ConfigDict

 from opencompass.openicl.icl_inferencer import (AgentInferencer,
                                                ChatInferencer, CLPInferencer,
-                                                GenInferencer, PPLInferencer,
+                                                GenInferencer, LLInferencer,
+                                                PPLInferencer,
                                                PPLOnlyInferencer)
 from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
 from opencompass.utils import (Menu, build_dataset_from_cfg,
@@ -81,14 +82,15 @@ def print_prompts(model_cfg, dataset_cfg, count=1):

    supported_inferencer = [
        AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer,
-        PPLOnlyInferencer, ChatInferencer
+        PPLOnlyInferencer, ChatInferencer, LLInferencer
    ]
    if infer_cfg.inferencer.type not in supported_inferencer:
        print(f'Only {supported_inferencer} are supported')
        return

    for idx in range(min(count, len(ice_idx_list))):
-        if issubclass(infer_cfg.inferencer.type, PPLInferencer):
+        if issubclass(infer_cfg.inferencer.type,
+                      (PPLInferencer, LLInferencer)):
            labels = retriever.get_labels(ice_template=ice_template,
                                          prompt_template=prompt_template)
            ice = retriever.generate_ice(ice_idx_list[idx],