{"id": 820642, "name": "MMLU avg", "unit": "%", "createdAt": "2024-02-15T16:16:43.000Z", "updatedAt": "2025-09-25T19:29:50.000Z", "coverage": "", "timespan": "2019-2023", "datasetId": 6385, "shortUnit": "%", "columnOrder": 0, "shortName": "mmlu_avg", "catalogPath": "grapher/artificial_intelligence/2024-02-15/epoch_llms/epoch_llms#mmlu_avg", "type": "float", "dataChecksum": "10889697038403301654", "metadataChecksum": "-4876574886571220142", "datasetName": "Large Language Model Performance and Compute", "datasetVersion": "2024-02-15", "nonRedistributable": false, "display": {"name": "Knowledge tests (MMLU)", "unit": "%", "shortUnit": "%", "numDecimalPlaces": 1}, "schemaVersion": 2, "processingLevel": "minor", "presentation": {"titlePublic": "Performance on knowledge tests (MMLU)", "topicTagsLinks": ["Artificial Intelligence"]}, "descriptionKey": ["The MMLU benchmark covers a wide range of 57 subjects, including STEM, humanities, social sciences, and more.", "It encompasses subjects of varying difficulty levels, spanning from elementary concepts to advanced professional topics. This comprehensive benchmark assesses not only world knowledge but also problem-solving abilities."], "dimensions": {"years": {"values": [{"id": 2022}, {"id": 2023}, {"id": 2019}, {"id": 2020}, {"id": 2021}]}, "entities": {"values": [{"id": 367072, "name": "BLOOM", "code": null}, {"id": 367081, "name": "BloombergGPT", "code": null}, {"id": 273166, "name": "Chinchilla", "code": null}, {"id": 365992, "name": "GLM-130B", "code": null}, {"id": 367086, "name": "GPT-2 (finetuned)", "code": null}, {"id": 367068, "name": "GPT-3 (davinci)", "code": null}, {"id": 367084, "name": "GPT-3.5", "code": null}, {"id": 363052, "name": "GPT-4", "code": null}, {"id": 257096, "name": "GPT-NeoX-20B", "code": null}, {"id": 368727, "name": "Gemini Ultra", "code": null}, {"id": 367254, "name": "Gopher (0.4B)", "code": null}, {"id": 367258, "name": "Gopher (1.4B)", "code": null}, {"id": 367255, "name": "Gopher (280B)", "code": null}, {"id": 367257, "name": "Gopher (7B)", "code": null}, {"id": 367426, "name": "LLaMA (13B)", "code": null}, {"id": 367427, "name": "LLaMA (33B)", "code": null}, {"id": 367304, "name": "LLaMA (65B)", "code": null}, {"id": 367425, "name": "LLaMA (7B)", "code": null}, {"id": 367085, "name": "OPT", "code": null}, {"id": 273167, "name": "PaLM (540B)", "code": null}, {"id": 367253, "name": "PaLM (62B)", "code": null}, {"id": 367252, "name": "PaLM (62B+)", "code": null}, {"id": 367256, "name": "PaLM (8B)", "code": null}, {"id": 367071, "name": "PaLM-2", "code": null}, {"id": 367083, "name": "code-davinci-002", "code": null}]}}, "origins": [{"id": 8745, "title": "Large Language Model Performance and Compute", "description": "Epoch dataset on how performance on a MMLU language benchmark scales with computational resources.", "producer": "Epoch AI", "citationFull": "Owen, David. (2023). Large Language Model performance and compute, Epoch (2023) [Data set]. In Extrapolating performance in language modeling benchmarks. Published online at epoch.ai. Retrieved from: 'https://epoch.ai/blog/extrapolating-performance-in-language-modelling-benchmarks' [online resource].", "urlMain": "https://epoch.ai/blog/extrapolating-performance-in-language-modelling-benchmarks", "dateAccessed": "2024-02-15", "datePublished": "2023-07-12", "license": {"url": "https://epoch.ai/blog/extrapolating-performance-in-language-modelling-benchmarks", "name": "Creative Commons BY 4.0"}}]}