{"id": 1012327, "name": "All knowledge tests", "unit": "%", "createdAt": "2025-02-17T13:09:24.000Z", "updatedAt": "2025-04-22T21:23:29.000Z", "coverage": "", "timespan": "", "datasetId": 6947, "shortUnit": "%", "columnOrder": 0, "shortName": "performance_language_average", "catalogPath": "grapher/artificial_intelligence/2025-02-17/papers_with_code_language/papers_with_code_language#performance_language_average", "type": "float", "dataChecksum": "15881659677963828352", "metadataChecksum": "-2155909434739955854", "datasetName": "AI Performance on Language Tasks", "datasetVersion": "2025-02-17", "nonRedistributable": false, "display": {"unit": "%", "zeroDay": "2019-07-26", "shortUnit": "%", "yearIsDay": true, "numDecimalPlaces": 1}, "schemaVersion": 2, "processingLevel": "major", "presentation": {"topicTagsLinks": ["Artificial Intelligence"]}, "descriptionKey": ["This benchmark assesses the average accuracy of models across all subjects based on the MMLU benchmark.", "The MMLU benchmark covers a wide range of 57 subjects, including STEM, humanities, social sciences, and more. It encompasses subjects of varying difficulty levels, spanning from elementary concepts to advanced professional topics. This comprehensive benchmark assesses not only world knowledge but also problem-solving abilities."], "dimensions": {"years": {"values": [{"id": 0}, {"id": 102}, {"id": 281}, {"id": 866}, {"id": 977}, {"id": 984}, {"id": 1182}, {"id": 1328}, {"id": 1644}, {"id": 1683}, {"id": 1753}]}, "entities": {"values": [{"id": 369310, "name": "State of the art", "code": null}]}}, "origins": [{"id": 2885, "title": "AI Performance on Language Tasks", "descriptionSnapshot": "MMLU (Massive Multitask Language Understanding) is a new benchmark designed to measure knowledge acquired during pretraining by evaluating models exclusively in zero-shot and few-shot settings. This makes the benchmark more challenging and more similar to how we evaluate humans. 
The benchmark covers 57 subjects across STEM, the humanities, the social sciences, and more. It ranges in difficulty from an elementary level to an advanced professional level, and it tests both world knowledge and problem solving ability. Subjects range from traditional areas, such as mathematics and history, to more specialized areas like law and ethics. The granularity and breadth of the subjects makes the benchmark ideal for identifying a model\u2019s blind spots.\n", "producer": "Papers with Code", "citationFull": "Multi-task Language Understanding on MMLU. Papers with Code (2025)", "urlMain": "https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu", "urlDownload": "https://paperswithcode.com/sota/multi-task-language-understanding-on-mmlu", "dateAccessed": "2025-02-17", "datePublished": "2025-02-17", "license": {"url": "https://creativecommons.org/licenses/by-sa/4.0/", "name": "CC BY-SA 4.0"}}]}