
# meraGPT/mera-mix-4x7B results

## openllm results

| Benchmark | Model | gsm8k | arc | hellaswag | mmlu | truthfulqa | winogrande | Average |
|-----------|-------|-------|-----|-----------|------|------------|------------|---------|
| openllm | mera-mix-4x7B | 72.93 | 71.76 | 88.92 | 63.8 | 77.6 | 84.53 | 76.59 |
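
The metric names below (`acc,none`, `exact_match,strict-match`) and the task version numbers match the output format of EleutherAI's lm-evaluation-harness v0.4, so a run along the following lines should reproduce these tables. This is a hedged sketch, not the exact command used here: the few-shot counts follow the Open LLM Leaderboard convention, and the `dtype` and `batch_size` settings are assumptions.

```python
# Hedged reproduction sketch, NOT the exact invocation behind this README.
# Assumptions: lm-evaluation-harness v0.4, Open LLM Leaderboard few-shot
# counts, and bfloat16 weights.
import lm_eval

FEWSHOT = {
    "arc_challenge": 25,
    "hellaswag": 10,
    "mmlu": 5,
    "truthfulqa": 0,
    "winogrande": 5,
    "gsm8k": 5,
}

for task, shots in FEWSHOT.items():
    out = lm_eval.simple_evaluate(
        model="hf",
        model_args="pretrained=meraGPT/mera-mix-4x7B,dtype=bfloat16",
        tasks=[task],
        num_fewshot=shots,
        batch_size="auto",
    )
    print(task, out["results"][task])
```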

## gsm8k

| Task | Version | Metric | Value | Stderr |
|------|---------|--------|-------|--------|
| gsm8k | 3 | exact_match,strict-match | 0.73 | 0.01 |
| | | exact_match,flexible-extract | 0.73 | 0.01 |

Average: 72.93% (exact match)

## arc

| Task | Version | Metric | Value | Stderr |
|------|---------|--------|-------|--------|
| arc_challenge | 1 | acc,none | 0.70 | 0.01 |
| | | acc_norm,none | 0.72 | 0.01 |

Average: 71.76% (acc_norm)

## hellaswag

| Task | Version | Metric | Value | Stderr |
|------|---------|--------|-------|--------|
| hellaswag | 1 | acc,none | 0.72 | 0 |
| | | acc_norm,none | 0.89 | 0 |

Average: 88.92% (acc_norm)

## mmlu

Rows with version `N/A` are aggregates computed by the harness: the overall `mmlu` score and the four category groups (humanities, other, social_sciences, stem).

| Task | Version | Metric | Value | Stderr |
|------|---------|--------|-------|--------|
| mmlu | N/A | acc,none | 0.64 | 0 |
| mmlu_abstract_algebra | 0 | acc,none | 0.33 | 0.05 |
| mmlu_anatomy | 0 | acc,none | 0.63 | 0.04 |
| mmlu_astronomy | 0 | acc,none | 0.70 | 0.04 |
| mmlu_business_ethics | 0 | acc,none | 0.61 | 0.05 |
| mmlu_clinical_knowledge | 0 | acc,none | 0.70 | 0.03 |
| mmlu_college_biology | 0 | acc,none | 0.76 | 0.04 |
| mmlu_college_chemistry | 0 | acc,none | 0.46 | 0.05 |
| mmlu_college_computer_science | 0 | acc,none | 0.59 | 0.05 |
| mmlu_college_mathematics | 0 | acc,none | 0.33 | 0.05 |
| mmlu_college_medicine | 0 | acc,none | 0.65 | 0.04 |
| mmlu_college_physics | 0 | acc,none | 0.36 | 0.05 |
| mmlu_computer_security | 0 | acc,none | 0.76 | 0.04 |
| mmlu_conceptual_physics | 0 | acc,none | 0.57 | 0.03 |
| mmlu_econometrics | 0 | acc,none | 0.48 | 0.05 |
| mmlu_electrical_engineering | 0 | acc,none | 0.56 | 0.04 |
| mmlu_elementary_mathematics | 0 | acc,none | 0.42 | 0.03 |
| mmlu_formal_logic | 0 | acc,none | 0.48 | 0.04 |
| mmlu_global_facts | 0 | acc,none | 0.34 | 0.05 |
| mmlu_high_school_biology | 0 | acc,none | 0.77 | 0.02 |
| mmlu_high_school_chemistry | 0 | acc,none | 0.50 | 0.04 |
| mmlu_high_school_computer_science | 0 | acc,none | 0.70 | 0.05 |
| mmlu_high_school_european_history | 0 | acc,none | 0.77 | 0.03 |
| mmlu_high_school_geography | 0 | acc,none | 0.80 | 0.03 |
| mmlu_high_school_government_and_politics | 0 | acc,none | 0.91 | 0.02 |
| mmlu_high_school_macroeconomics | 0 | acc,none | 0.65 | 0.02 |
| mmlu_high_school_mathematics | 0 | acc,none | 0.32 | 0.03 |
| mmlu_high_school_microeconomics | 0 | acc,none | 0.67 | 0.03 |
| mmlu_high_school_physics | 0 | acc,none | 0.35 | 0.04 |
| mmlu_high_school_psychology | 0 | acc,none | 0.84 | 0.02 |
| mmlu_high_school_statistics | 0 | acc,none | 0.51 | 0.03 |
| mmlu_high_school_us_history | 0 | acc,none | 0.85 | 0.02 |
| mmlu_high_school_world_history | 0 | acc,none | 0.81 | 0.03 |
| mmlu_human_aging | 0 | acc,none | 0.67 | 0.03 |
| mmlu_human_sexuality | 0 | acc,none | 0.80 | 0.03 |
| mmlu_humanities | N/A | acc,none | 0.60 | 0.01 |
| mmlu_international_law | 0 | acc,none | 0.78 | 0.04 |
| mmlu_jurisprudence | 0 | acc,none | 0.81 | 0.04 |
| mmlu_logical_fallacies | 0 | acc,none | 0.79 | 0.03 |
| mmlu_machine_learning | 0 | acc,none | 0.45 | 0.05 |
| mmlu_management | 0 | acc,none | 0.78 | 0.04 |
| mmlu_marketing | 0 | acc,none | 0.88 | 0.02 |
| mmlu_medical_genetics | 0 | acc,none | 0.71 | 0.05 |
| mmlu_miscellaneous | 0 | acc,none | 0.83 | 0.01 |
| mmlu_moral_disputes | 0 | acc,none | 0.73 | 0.02 |
| mmlu_moral_scenarios | 0 | acc,none | 0.43 | 0.02 |
| mmlu_nutrition | 0 | acc,none | 0.73 | 0.03 |
| mmlu_other | N/A | acc,none | 0.70 | 0.01 |
| mmlu_philosophy | 0 | acc,none | 0.69 | 0.03 |
| mmlu_prehistory | 0 | acc,none | 0.73 | 0.02 |
| mmlu_professional_accounting | 0 | acc,none | 0.48 | 0.03 |
| mmlu_professional_law | 0 | acc,none | 0.47 | 0.01 |
| mmlu_professional_medicine | 0 | acc,none | 0.68 | 0.03 |
| mmlu_professional_psychology | 0 | acc,none | 0.67 | 0.02 |
| mmlu_public_relations | 0 | acc,none | 0.66 | 0.05 |
| mmlu_security_studies | 0 | acc,none | 0.73 | 0.03 |
| mmlu_social_sciences | N/A | acc,none | 0.74 | 0.01 |
| mmlu_sociology | 0 | acc,none | 0.84 | 0.03 |
| mmlu_stem | N/A | acc,none | 0.53 | 0.01 |
| mmlu_us_foreign_policy | 0 | acc,none | 0.84 | 0.04 |
| mmlu_virology | 0 | acc,none | 0.55 | 0.04 |
| mmlu_world_religions | 0 | acc,none | 0.85 | 0.03 |

Average: 63.8%
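
As a sanity check on the aggregate, the sketch below takes the plain mean of the 57 subtask accuracies from the table above (rounded to two decimals as shown). It lands near 0.645, slightly above the reported 0.638; that gap is consistent with the harness's default group aggregation, which weights subtasks by question count (the large professional_law split, roughly 1.5k questions at acc 0.47, pulls the weighted mean down). The weighting explanation is an inference from lm-evaluation-harness behavior, not something stated in this README.

```python
# Unweighted mean of the 57 mmlu subtask accuracies copied from the table
# above (group rows with version N/A excluded).
subtask_acc = [
    0.33, 0.63, 0.70, 0.61, 0.70, 0.76, 0.46, 0.59, 0.33, 0.65,
    0.36, 0.76, 0.57, 0.48, 0.56, 0.42, 0.48, 0.34, 0.77, 0.50,
    0.70, 0.77, 0.80, 0.91, 0.65, 0.32, 0.67, 0.35, 0.84, 0.51,
    0.85, 0.81, 0.67, 0.80, 0.78, 0.81, 0.79, 0.45, 0.78, 0.88,
    0.71, 0.83, 0.73, 0.43, 0.73, 0.69, 0.73, 0.48, 0.47, 0.68,
    0.67, 0.66, 0.73, 0.84, 0.84, 0.55, 0.85,
]
print(len(subtask_acc))                     # 57 subtasks
print(sum(subtask_acc) / len(subtask_acc))  # ~0.645, vs 0.638 weighted
```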

## truthfulqa

| Task | Version | Metric | Value | Stderr |
|------|---------|--------|-------|--------|
| truthfulqa | N/A | acc,none | 0.70 | 0.01 |
| | | bleu_max,none | 10.03 | 0.42 |
| | | bleu_acc,none | 0.51 | 0.02 |
| | | bleu_diff,none | 1.03 | 0.28 |
| | | rouge1_max,none | 33.43 | 0.62 |
| | | rouge1_acc,none | 0.56 | 0.02 |
| | | rouge1_diff,none | 2.28 | 0.47 |
| | | rouge2_max,none | 17.85 | 0.63 |
| | | rouge2_acc,none | 0.39 | 0.02 |
| | | rouge2_diff,none | 1.66 | 0.48 |
| | | rougeL_max,none | 29.69 | 0.62 |
| | | rougeL_acc,none | 0.53 | 0.02 |
| | | rougeL_diff,none | 2.13 | 0.45 |
| truthfulqa_gen | 3 | bleu_max,none | 10.03 | 0.42 |
| | | bleu_acc,none | 0.51 | 0.02 |
| | | bleu_diff,none | 1.03 | 0.28 |
| | | rouge1_max,none | 33.43 | 0.62 |
| | | rouge1_acc,none | 0.56 | 0.02 |
| | | rouge1_diff,none | 2.28 | 0.47 |
| | | rouge2_max,none | 17.85 | 0.63 |
| | | rouge2_acc,none | 0.39 | 0.02 |
| | | rouge2_diff,none | 1.66 | 0.48 |
| | | rougeL_max,none | 29.69 | 0.62 |
| | | rougeL_acc,none | 0.53 | 0.02 |
| | | rougeL_diff,none | 2.13 | 0.45 |
| truthfulqa_mc1 | 2 | acc,none | 0.62 | 0.02 |
| truthfulqa_mc2 | 2 | acc,none | 0.78 | 0.01 |

The `truthfulqa` group row aggregates the three subtasks: its generation metrics repeat those of `truthfulqa_gen`, and its `acc` of 0.70 matches the mean of the mc1 and mc2 accuracies.

Average: 77.6% (this matches the `truthfulqa_mc2` accuracy at full precision; the Open LLM Leaderboard scores TruthfulQA by mc2)

## winogrande

| Task | Version | Metric | Value | Stderr |
|------|---------|--------|-------|--------|
| winogrande | 1 | acc,none | 0.85 | 0.01 |

Average: 84.53%

Average score: 76.59%
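
As a quick check, the headline score is the unweighted mean of the six per-benchmark averages reported above:

```python
# Overall openllm score = plain mean of the six per-benchmark averages.
scores = {
    "gsm8k": 72.93,
    "arc": 71.76,
    "hellaswag": 88.92,
    "mmlu": 63.8,
    "truthfulqa": 77.6,
    "winogrande": 84.53,
}
print(f"{sum(scores.values()) / len(scores):.2f}%")  # 76.59%
```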

Metadata: elapsed time 07:13:04 on an NVIDIA A100 80GB PCIe.